#!/usr/bin/env python3
"""Render per-(model, language, configuration) F1 summaries from prompt score sheets.

Usage:
    python Gen_llm_eval_output.py --p1 csv_files/llm_scores_p1.xlsx --p2 csv_files/llm_scores_p2.xlsx --p3 csv_files/llm_scores_p3.xlsx --output-dir csv_files/outputs
"""
import argparse
import os
import re
import math

import pandas as pd
import numpy as np

REQUIRED_COLS = ["model", "task", "language", "configuration", "prompts", "f1"]

# Fixed header rows of the rendered table.
_TABLE_HEADER = "|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|"
_TABLE_RULE = "|-------|-------|------|------|------|----|------|---|------|"

# Matches configuration labels such as "0shot" / "10shot"; group 1 is the shot count.
_FEWSHOT_RE = re.compile(r"(\d+)shot")


def read_scores(path: str) -> pd.DataFrame:
    """Load one score sheet and return only the required columns.

    Column names are stripped and lower-cased. A ``prompt`` column is accepted
    as an alias for ``prompts``. Rows whose ``f1`` cannot be parsed as a number
    are dropped.

    Args:
        path: Path of the Excel file, read with ``pandas.read_excel``.

    Returns:
        A DataFrame holding exactly ``REQUIRED_COLS`` with a numeric ``f1``.

    Raises:
        ValueError: If any required column is missing after normalization.
    """
    df = pd.read_excel(path)
    # str() guards against non-string headers (e.g. numeric column labels).
    df.columns = [str(c).strip().lower() for c in df.columns]
    if "prompts" not in df.columns and "prompt" in df.columns:
        df["prompts"] = df["prompt"]
    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"{path} is missing required columns: {missing}")
    df = df[REQUIRED_COLS].copy()
    df["f1"] = pd.to_numeric(df["f1"], errors="coerce")
    return df.dropna(subset=["f1"])


def sanitize_filename(s: str) -> str:
    """Replace every run of characters outside ``[0-9A-Za-z._+-]`` with ``_``."""
    return re.sub(r"[^0-9A-Za-z._\-+]+", "_", str(s).strip())


def format_float(x):
    """Format *x* with four decimals; ``None``, NaN and infinities give ``"nan"``."""
    if x is None or (isinstance(x, float) and (math.isnan(x) or math.isinf(x))):
        return "nan"
    return f"{x:.4f}"


def prompt_order_key(label: str):
    """Return a sort key that orders labels by their first embedded integer.

    Labels containing a number sort first, by that number; labels without one
    sort afterwards, alphabetically.
    """
    m = re.search(r"(\d+)", str(label))
    return (0, int(m.group(1))) if m else (1, str(label))


def render_group_table(g: pd.DataFrame, model: str, language: str, configuration: str) -> str:
    """Render the text table for one (model, language, configuration) group.

    For every task (in order of first appearance) the table holds one row with
    the mean F1 over the task's prompts, followed by one row per prompt sorted
    with ``prompt_order_key``.

    Args:
        g: Rows of the group; must provide ``task``, ``prompts`` and ``f1``.
        model: Model identifier. The first ``__`` is turned back into ``/``;
            identifiers without ``__`` are used unchanged.
        language: Unused; kept so existing callers keep working.
        configuration: Few-shot label. ``"<N>shot"`` is rendered as ``N``;
            any other value is rendered as given.

    Returns:
        The header line followed by the table, terminated by a newline.
    """
    shots = _FEWSHOT_RE.fullmatch(str(configuration))
    if shots:
        configuration = shots.group(1)

    # partition() never raises on ids without "__" and keeps any text that
    # follows a second "__" instead of dropping it.
    org, sep, name = str(model).partition("__")
    if sep:
        model = f"{org}/{name}"

    header = f"hf (pretrained={model} ), num_fewshot: {configuration}, batch_size: 1"
    lines = [_TABLE_HEADER, _TABLE_RULE]

    for task, df_task in g.groupby("task", sort=False):
        f1s = df_task["f1"].to_numpy(dtype=float)
        task_mean = float(np.mean(f1s)) if f1s.size else float("nan")
        lines.append(f"| - {str(task).upper()} | | | |f1 | | {format_float(task_mean)} | |0 |")

        # sorted() is stable, so prompts sharing a key keep their input order.
        prompt_rows = sorted(zip(df_task["prompts"], f1s), key=lambda row: prompt_order_key(row[0]))
        for prompt_label, f1 in prompt_rows:
            lines.append(f"| - {prompt_label} | | | |f1 | | {format_float(float(f1))} | | 0 |")

    return header + "\n" + "\n".join(lines) + "\n"


def main(argv=None) -> None:
    """Read the three prompt sheets and write one summary file per group.

    Args:
        argv: Command-line arguments; ``None`` makes argparse read ``sys.argv``.
    """
    ap = argparse.ArgumentParser(
        description="Build per-(model,language,configuration) summaries from three prompt Excel files."
    )
    ap.add_argument("--p1", required=True, help="Path to llm_scores_p1.xlsx")
    ap.add_argument("--p2", required=True, help="Path to llm_scores_p2.xlsx")
    ap.add_argument("--p3", required=True, help="Path to llm_scores_p3.xlsx")
    ap.add_argument("--output-dir", required=True, help="Directory to write output files")
    args = ap.parse_args(argv)

    os.makedirs(args.output_dir, exist_ok=True)

    df = pd.concat([read_scores(p) for p in (args.p1, args.p2, args.p3)], ignore_index=True)

    # One file per (model, language, configuration)
    for (model, language, config), g in df.groupby(["model", "language", "configuration"], sort=False):
        content = render_group_table(g, model, language, config)
        fname = f"{sanitize_filename(model)}__{sanitize_filename(language)}__{sanitize_filename(config)}.txt"
        out_path = os.path.join(args.output_dir, fname)
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(content)


if __name__ == "__main__":
    main()
diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..f38df7c717e152c32a40642146efb5db18c70532 --- /dev/null +++ b/app.py @@ -0,0 +1,1144 @@ +import gradio as gr +from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns +import pandas as pd +from apscheduler.schedulers.background import BackgroundScheduler +from huggingface_hub import snapshot_download +from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE +from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION +from src.display.css_html_js import custom_css +from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, WeightType, Precision +from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN +from src.populate import get_evaluation_queue_df, get_leaderboard_df +from src.submission.submit import add_new_eval +import random +import matplotlib.pyplot as plt +import re +import plotly.express as px +import plotly.graph_objects as go +import numpy as np + + + + +# === NEW: helper for prompt sensitivity (simple: only NER/REL and 3 prompts) === +def calculate_prompt_sensitivity(dataframe, tasks, prompt_ids): + """ + Computes a simple Prompt Sensitivity Index (PSI) over the tasks + using the distribution of 'Best Prompt Id' across the provided prompt_ids. 
+ """ + cv_per_task = [] + for task in tasks: + prompt_col = f"{task} Best Prompt Id" + task_accuracies = [] + for pid in prompt_ids: + total = len(dataframe[prompt_col].dropna()) if prompt_col in dataframe.columns else 0 + count = (dataframe[prompt_col] == pid).sum() if prompt_col in dataframe.columns else 0 + acc = (count / total * 100) if total > 0 else 0 + task_accuracies.append(acc) + if task_accuracies: + mean_acc = np.mean(task_accuracies) + std_acc = np.std(task_accuracies) + cv_per_task.append((std_acc / mean_acc) if mean_acc > 0 else 0) + else: + cv_per_task.append(0) + mean_cv = np.mean(cv_per_task) if cv_per_task else 0 + psi = 1.0 if mean_cv >= 0.5 else (mean_cv / 0.5) + return psi, mean_cv, cv_per_task + +def create_best_model_comparison_table(dataframe, lang: str | None = None, shot: str | None = None): + """ + Table with best overall model per task and the model with the best prompt score. + Applies optional filters: + - lang in {EN, IT, SL, SK, GR, PL} or None/"All" + - shot in {"0","10"} or None/"All" (mapped to IS_FS False/True) + """ + tasks = ["NER", "REL", "RML", "HIS", "DIA"] + df = dataframe.copy() + + if lang and lang != "All" and "LANG" in df.columns: + df = df[df["LANG"] == lang] + if shot and shot != "All" and "IS_FS" in df.columns: + df = df[df["IS_FS"] == (shot == "10")] + + table_data = {'Task': [], 'Best Overall Model': [], 'CPS': [], 'Best Prompt Model': [], 'Acc.': []} + + for task in tasks: + if task not in df.columns or df.empty: + continue + # Best overall on task + #max_idx = df[task].idxmax() + max_idx = pd.to_numeric(df[task], errors='coerce').idxmax() + try: + model_raw = df.loc[max_idx, 'Model'] + except Exception as e: + break + + if isinstance(model_raw, str) and '<' in model_raw: + match = re.search(r'>([^<]+)<', model_raw) + model_name = match.group(1) if match else model_raw + else: + model_name = str(model_raw) + comb_perf_value = df.loc[max_idx, task] + + # Best prompt row for task + best_prompt_column = f"{task} 
Best Prompt" + if best_prompt_column in df.columns and df[best_prompt_column].notna().any(): + best_prompt_idx= pd.to_numeric(df[best_prompt_column],errors='coerce').idxmax() + try: + best_prompt_model_raw = df.loc[best_prompt_idx, 'Model'] + except Exception as e: + break + if isinstance(best_prompt_model_raw, str) and '<' in best_prompt_model_raw: + match = re.search(r'>([^<]+)<', best_prompt_model_raw) + best_prompt_model = match.group(1) if match else best_prompt_model_raw + else: + best_prompt_model = str(best_prompt_model_raw) + best_prompt_accuracy = df.loc[best_prompt_idx, best_prompt_column] + else: + best_prompt_model = "n/a" + best_prompt_accuracy = float('nan') + + table_data['Task'].append(task) + table_data['Best Overall Model'].append(model_name) + table_data['CPS'].append(f"{comb_perf_value:.2f}") + table_data['Best Prompt Model'].append(best_prompt_model) + table_data['Acc.'].append(f"{best_prompt_accuracy:.2f}" if isinstance(best_prompt_accuracy, (int, float)) else "n/a") + + fig = go.Figure(data=[go.Table( + columnwidth=[60, 220, 60, 220, 60], + header=dict( + values=[f'{col}' for col in table_data.keys()], + fill_color=['#2171b5', '#2171b5', '#2171b5', '#4292c6', '#4292c6'], + font=dict(color='white', size=12, family='Arial'), + align='center', height=30 + ), + cells=dict( + values=list(table_data.values()), + fill_color=[['#f0f0f0' if i % 2 == 0 else 'white' for i in range(len(table_data['Task']))]], + font=dict(color='#2c3e50', size=11, family='Arial'), + align=['center', 'left', 'center', 'left', 'center'], + height=30 + ) + )]) + + subtitle = [] + subtitle.append(lang if (lang and lang != "All") else "All languages") + subtitle.append(f"{shot}-shot" if (shot and shot != "All") else "All shots") + + fig.update_layout( + title={'text': f"Top Model per Task: CPS & Best Prompt — {', '.join(subtitle)}", + 'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}}, + font=dict(family="Arial", size=11), + height=420, margin=dict(l=20, r=20, t=50, 
b=80) + ) + return fig + + + +# === NEW: Best-model comparison table (only NER, REL) === +def create_best_model_comparison_table_without_lang(dataframe): + """ + Table with the best overall model per task (NER, REL,) and the model that + achieves the best score with its own best prompt. + """ + tasks = ["NER", "REL", "RML", "HIS", "DIA"] + table_data = {'Task': [], 'Best Overall Model': [], 'CPS': [], 'Best Prompt Model': [], 'Acc.': []} + + for task in tasks: + if task not in dataframe.columns: + continue + + # Best overall on the task's combined performance + max_idx = dataframe[task].idxmax() + model_raw = dataframe.loc[max_idx, 'Model'] + if isinstance(model_raw, str) and '<' in model_raw: + match = re.search(r'>([^<]+)<', model_raw) + model_name = match.group(1) if match else model_raw + else: + model_name = str(model_raw) + comb_perf_value = dataframe.loc[max_idx, task] + + # Model with the best prompt for this task + best_prompt_column = f"{task} Best Prompt" + if best_prompt_column in dataframe.columns: + best_prompt_idx = dataframe[best_prompt_column].idxmax() + best_prompt_model_raw = dataframe.loc[best_prompt_idx, 'Model'] + if isinstance(best_prompt_model_raw, str) and '<' in best_prompt_model_raw: + match = re.search(r'>([^<]+)<', best_prompt_model_raw) + best_prompt_model = match.group(1) if match else best_prompt_model_raw + else: + best_prompt_model = str(best_prompt_model_raw) + best_prompt_accuracy = dataframe.loc[best_prompt_idx, best_prompt_column] + else: + best_prompt_model = "n/a" + best_prompt_accuracy = float('nan') + + table_data['Task'].append(task) + table_data['Best Overall Model'].append(model_name) + table_data['CPS'].append(f"{comb_perf_value:.2f}") + table_data['Best Prompt Model'].append(best_prompt_model) + table_data['Acc.'].append(f"{best_prompt_accuracy:.2f}" if isinstance(best_prompt_accuracy, (int, float)) else "n/a") + + fig = go.Figure(data=[go.Table( + columnwidth=[60, 220, 60, 220, 60], + header=dict( + values=[f'{col}' 
for col in table_data.keys()], + fill_color=['#2171b5', '#2171b5', '#2171b5', '#4292c6', '#4292c6'], + font=dict(color='white', size=12, family='Arial'), + align='center', height=30 + ), + cells=dict( + values=list(table_data.values()), + fill_color=[['#f0f0f0' if i % 2 == 0 else 'white' for i in range(len(table_data['Task']))]], + font=dict(color='#2c3e50', size=11, family='Arial'), + align=['center', 'left', 'center', 'left', 'center'], + height=30 + ) + )]) + fig.update_layout( + title={'text': "Top Model per Task: CPS & Best Prompt (NER/REL)", + 'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}}, + font=dict(family="Arial", size=11), + height=420, margin=dict(l=20, r=20, t=50, b=80) + ) + fig.add_annotation( + text=("Best Overall Model uses the task's primary metric (CPS). " + "Best Prompt Model is the one whose own best prompt yields the highest score."), + xref="paper", yref="paper", x=0.5, y=-0.20, showarrow=False, + font=dict(size=11, color="gray", family="Arial"), align="center", xanchor="center" + ) + return fig + +def create_prompt_heatmap(dataframe, lang: str | None = None, shot: str | None = None): + """ + Heatmap of share (%) of models whose BEST prompt is each pid, for NER/REL with prompts p1..p3. 
+ Optional filters: + - lang: None or one of EN/IT/SL/SK/GR/PL (None means All) + - shot: None or "0"/"10" (None means All) mapped to IS_FS False/True + """ + tasks = ["NER", "REL", "RML", "HIS", "DIA"] + + df = dataframe.copy() + # Language filter + if lang and lang != "All" and "LANG" in df.columns: + df = df[df["LANG"] == lang] + # Shot filter -> IS_FS (10-shot=True, 0-shot=False) + if shot and shot != "All" and "IS_FS" in df.columns: + df = df[df["IS_FS"] == (shot == "10")] + + # Collect prompt ids present, normalize labels to p1..p3 + def label_for(pid): + if isinstance(pid, str): return pid + try: return f"p{int(pid)}" + except Exception: return str(pid) + + all_ids = set() + for task in tasks: + col = f"{task} Best Prompt Id" + if col in df.columns: + all_ids.update(df[col].dropna().unique()) + prompt_ids_raw = sorted(list(all_ids), key=lambda x: int(re.sub(r'[^0-9]', '', str(x)) or 0)) + prompt_ids_raw = [pid for pid in prompt_ids_raw if label_for(pid) in {"p1", "p2", "p3"}] or [1, 2, 3] + y_tick_labels = [label_for(pid) for pid in prompt_ids_raw] + + matrix, hovers = [], [] + for pid in prompt_ids_raw: + row, hover_row = [], [] + for task in tasks: + col = f"{task} Best Prompt Id" + if col in df.columns and len(df[col].dropna()) > 0: + series = df[col].dropna() + + def same_pid(v): + a = re.sub(r'[^0-9]', '', str(v)) + b = re.sub(r'[^0-9]', '', str(pid)) + return a == b and a != "" + + total = len(series) + count = sum(same_pid(v) for v in series) + pct = (count / total * 100) if total > 0 else 0 + row.append(pct) + hover_row.append(f"{task} — {label_for(pid)}
Models: {count}/{total}
Percentage: {pct:.1f}%") + else: + row.append(0); hover_row.append(f"{task} — {label_for(pid)}
No data") + matrix.append(row); hovers.append(hover_row) + + fig = go.Figure(data=go.Heatmap( + z=matrix, x=tasks, y=y_tick_labels, + colorscale=[[0,'#f7fbff'],[0.2,'#deebf7'],[0.4,'#9ecae1'],[0.6,'#4292c6'],[0.8,'#2171b5'],[1,'#08519c']], + text=[[f"{val:.0f}%" if val is not None else "" for val in row] for row in matrix], + texttemplate="%{text}", textfont={"size": 11, "family": "Arial"}, + hovertemplate='%{customdata}', customdata=hovers, + colorbar=dict(title="% Models", ticksuffix="%"), + zmin=0, zmax=100 + )) + + title_parts = [] + title_parts.append(lang if (lang and lang != "All") else "All languages") + title_parts.append(f"{shot}-shot" if (shot and shot != "All") else "All shots") + fig.update_layout( + title={'text': f"Most Effective Prompts — {', '.join(title_parts)}", + 'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}}, + xaxis_title="Task", yaxis_title="Prompt", + font=dict(family="Arial", size=11), margin=dict(b=100), + template="plotly_white", dragmode=False, height=420 + ) + fig.update_xaxes(fixedrange=True); fig.update_yaxes(fixedrange=True) + return fig + + +# === NEW: Prompt heatmap (only NER, REL; 3 prompts p1, p2, p3) === +def create_prompt_heatmap_without_lang(dataframe): + """ + Heatmap of the share of models (in %) whose BEST prompt for the task is each prompt id, + for tasks NER and REL, with exactly 3 prompts (p1, p2, p3). It supports columns storing + ids as integers (1/2/3) or strings ('p1'/'p2'/'p3'). 
+ """ + tasks = ["NER", "REL", "RML", "HIS", "DIA"] + + # Collect unique prompt ids as they appear (int or 'pX'); restrict to 3 prompts + all_ids = set() + for task in tasks: + col = f"{task} Best Prompt Id" + if col in dataframe.columns: + all_ids.update(dataframe[col].dropna().unique()) + + # Normalize to display labels and preserve the original values as keys + def label_for(pid): + if isinstance(pid, str): + return pid # e.g., 'p1' + try: + return f"p{int(pid)}" + except Exception: + return str(pid) + + prompt_ids_raw = sorted(list(all_ids), key=lambda x: int(re.sub(r'[^0-9]', '', str(x)) or 0)) + # Optional: hard-limit to p1/p2/p3 if extra noise exists + prompt_ids_raw = [pid for pid in prompt_ids_raw if label_for(pid) in {"p1", "p2", "p3"}] + + if not prompt_ids_raw: + # Fallback to p1..p3 + prompt_ids_raw = [1, 2, 3] + + y_tick_labels = [label_for(pid) for pid in prompt_ids_raw] + + matrix, hovers = [], [] + for pid in prompt_ids_raw: + row, hover_row = [], [] + for task in tasks: + col = f"{task} Best Prompt Id" + if col in dataframe.columns: + series = dataframe[col].dropna() + # match values regardless of 'p1' vs 1 vs '1' + def same_pid(v): + a = re.sub(r'[^0-9]', '', str(v)) + b = re.sub(r'[^0-9]', '', str(pid)) + return a == b and a != "" + total = len(series) + count = sum(same_pid(v) for v in series) + pct = (count / total * 100) if total > 0 else 0 + row.append(pct) + hover_row.append( + f"{task} — {label_for(pid)}
Models: {count}/{total}
Percentage: {pct:.1f}%" + ) + else: + row.append(0); hover_row.append(f"{task} — {label_for(pid)}
No data") + matrix.append(row) + hovers.append(hover_row) + + fig = go.Figure(data=go.Heatmap( + z=matrix, x=tasks, y=y_tick_labels, + colorscale=[[0,'#f7fbff'],[0.2,'#deebf7'],[0.4,'#9ecae1'],[0.6,'#4292c6'],[0.8,'#2171b5'],[1,'#08519c']], + text=[[f"{val:.0f}%" if val is not None else "" for val in row] for row in matrix], + texttemplate="%{text}", + textfont={"size": 11, "family": "Arial"}, + hovertemplate='%{customdata}', + customdata=hovers, + colorbar=dict(title="% Models", ticksuffix="%"), + zmin=0, zmax=100 + )) + fig.update_layout( + title={'text': "Most Effective Prompts Across Models (NER/REL)", + 'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}}, + xaxis_title="Task", yaxis_title="Prompt", + font=dict(family="Arial", size=11), + margin=dict(b=120), template="plotly_white", dragmode=False, height=420 + ) + + # PSI (optional info line) + psi, mean_cv, _ = calculate_prompt_sensitivity( + dataframe, tasks, prompt_ids_raw + ) + fig.add_annotation( + text=f"Prompt Sensitivity (mean CV): {mean_cv:.2f}", + xref="paper", yref="paper", x=0.3, y=1.12, showarrow=False, + font=dict(size=11, color="#2c3e50", family="Arial") + ) + + fig.update_xaxes(fixedrange=True); fig.update_yaxes(fixedrange=True) + return fig + + + + + + + +def mean_of_max_per_field(df): + """ + Calcola il massimo per ciascun campo e poi la media dei massimi. 
+ + Args: + df (pd.DataFrame): DataFrame con colonne TE, SA, HS, AT, WIC, FAQ, LS, SU, NER, REL, RML, DIA, HIS + + Returns: + float: media dei valori massimi dei campi + """ + #fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"] + fields = ["NER", "REL", "RML", "DIA", "HIS"] + #print(df.columns) + + # Controlla che tutte le colonne esistano nel DataFrame + missing = [f for f in fields if f not in df.columns] + if missing: + raise ValueError(f"Le seguenti colonne mancano nel DataFrame: {missing}") + + # Calcola il massimo per ciascun campo + max_values = df[fields].apply(pd.to_numeric, errors='coerce').max(skipna=True) + + # Calcola la media dei massimi + mean_max = max_values.mean() + + return mean_max + + +def barplot_mean_few_minus_zero_shot(dataframe, tasks=None): + if tasks is None: + tasks = [ "NER", "REL", "RML", "DIA", "HIS"] + + task_means = {} + + for task in tasks: + if task not in dataframe.columns: + continue + + # Separa few-shot e zero-shot + few_shot = dataframe[dataframe['IS_FS'] == True][["Model", task]] + zero_shot = dataframe[dataframe['IS_FS'] == False][["Model", task]] + + # Allinea i modelli + merged = pd.merge(few_shot, zero_shot, on="Model", suffixes=("_few", "_zero")) + + # Rimuovi righe con valori mancanti + merged = merged.dropna(subset=[f"{task}_few", f"{task}_zero"]) + + if merged.empty: + continue + + # Calcola differenza few - zero + diff = merged[f"{task}_few"] - merged[f"{task}_zero"] + + # Calcola la media + task_means[task] = diff.mean() + + # Crea barplot + fig = go.Figure([go.Bar( + x=list(task_means.keys()), + y=list(task_means.values()), + marker_color="#ff7f0e", + text=[f"{v:.2f}" for v in task_means.values()], + textposition="outside", + hovertemplate="%{x}
Mean Delta Accuracy: %{y:.2f}%" + )]) + + # Linea di riferimento a 0 + ''' + fig.add_shape( + type="line", + x0=-0.5, x1=len(task_means) - 0.5, + y0=0, y1=0, + line=dict(color="black", width=2, dash="dash"), + xref="x", yref="y" + ) + ''' + + fig.update_layout( + title="Mean Accuracy Difference (Few-shot − Zero-shot) per Task", + xaxis_title="", + yaxis_title="Mean Delta Combined Performance", + template="plotly_white", + font=dict(family="Arial", size=13), + #margin=dict(b=100) + ) + + fig.add_annotation( + text="10-shot learning generally outperforms zero-shot.
" + "", + xref="paper", yref="paper", + x=0, y=-0.2, + showarrow=False, + font=dict(size=11, color="gray"), + align="left" + ) + + return fig + + +def boxplot_per_task(dataframe=None, baselines=None, references=None): + + #print(dataframe.columns) + + #tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"] + tasks =["NER", "REL", "RML", "HIS", "DIA"] + if dataframe is None: + np.random.seed(42) + dataframe = pd.DataFrame({ + task: np.random.uniform(0.4, 0.9, 20) * 100 + for task in tasks + }) + + if baselines is None: + baselines = {task: np.random.randint(50, 70) for task in tasks} + + colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", + "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"] + + fig = go.Figure() + + for i, task in enumerate(tasks): + if task in dataframe.columns: + y_data = dataframe[task].dropna().tolist() + + # boxplot + fig.add_trace(go.Box( + y=y_data, + name=task, + marker=dict(color=colors[i]), + line=dict(color="black", width=2), + fillcolor=colors[i], + opacity=0.7, + hovertemplate=""+task+"
Accuracy: %{y:.2f}%", + width=0.6, + whiskerwidth=0.2, + quartilemethod="linear" + )) + + # baseline + #if task in baselines and baselines[task] is not None: + #fig.add_shape( + # type="line", + # x0=i - 0.3, x1=i + 0.3, + # y0=baselines[task], y1=baselines[task], + # line=dict(color="black", width=2, dash="dot"), # più visibile + # xref="x", yref="y" + #) + #''' + #fig.add_annotation( + #x=i, y=baselines[task], + #text=f"{baselines[task]}%", + #showarrow=False, + #yshift=10, + #font=dict(size=10, color="black") + #) + #''' + + # reference GPT-4o + # if task in references and references[task] is not None: + # fig.add_shape( + # type="line", + # x0=i - 0.3, x1=i + 0.3, + # y0=references[task], y1=references[task], + # line=dict(color="red", width=2, dash="dashdot"), + # xref="x", yref="y" + # ) + + fig.update_layout( + title="Distribution of Model Accuracy by Task", + xaxis_title="Task", + yaxis_title="Combined Performance", + template="plotly_white", + boxmode="group", + dragmode=False, + font=dict(family="Arial", size=10), + margin=dict(b=80), + ) + + fig.add_annotation( + text=("" + #"In tasks like TE and SA, models approach the accuracy of supervised
" + #"models at EVALITA (dashed black line); in NER and REL they remain lower.
" + # "Dashed red lines show GPT-4o reference results for generative tasks." + ), + xref="paper", yref="paper", + x=0.5, y=-0.30, + showarrow=False, + font=dict(size=11, color="gray"), + align="left" + ) + + fig.update_yaxes(range=[0, 100], fixedrange=True) + + return fig + +# EVALITA results +BASELINES = { + "TE":71.00, "SA": 66.38, "HS": 80.88, "AT": 82.40, "WIC": 85.00, + "LS": 38.82, "SU": 38.91, "NER":88.00, "REL": 62.99 +} + +# GPT-4o +REFERENCES = { + "NER": 79.11, + "REL": 63.32, + "LS": 59.25, + "SU": 33.04 + +} + + +def boxplot_prompts_per_task(dataframe, tasks=None): + if tasks is None: + tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"] + + # Lista delle colonne da aggiornare + cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"] + # Applichiamo la trasformazione + for col in cols_to_update: + dataframe[col] = dataframe[col].replace({1: 7, 2: 8}) + + fig = go.Figure() + + # Liste per creare una sola voce in legenda per Average e Best + avg_x, avg_y = [], [] + best_x, best_y, best_text = [], [], [] + + for task in tasks: + avg_col = f"{task} Prompt Average" + best_col = f"{task} Best Prompt" + best_id_col = f"{task} Best Prompt Id" + + if all(col in dataframe.columns for col in [avg_col, best_col, best_id_col]): + avg_value = dataframe[avg_col].mean() + avg_x.append(task) + avg_y.append(avg_value) + + best_value = dataframe[best_col].mean() + best_x.append(task) + best_y.append(best_value) + best_id = dataframe[best_id_col].mode()[0] # Most frequent best prompt id + best_text.append(f"P:{best_id}") + + # Barre Average Accuracy (azzurro) + fig.add_trace(go.Bar( + x=avg_x, + y=avg_y, + name="Avg. 
Accuracy", + marker_color="#1f77b4", + )) + + # Barre Best Prompt (rosso) + fig.add_trace(go.Bar( + x=best_x, + y=best_y, + name="Best Prompt", + marker_color="#d62728", + )) + + # Testo sopra barre Best Prompt con ID + for x, y, text in zip(best_x, best_y, best_text): + fig.add_annotation( + x=x, + y=y + 3, # leggermente sopra la barra + text=text, + showarrow=False, + font=dict(size=12, color="black") + ) + + fig.update_layout( + title= "Prompt Accuracy: Avg vs Best", + xaxis_title="Task", + yaxis_title="Combined Performance", + barmode='group', + template="plotly_white", + font=dict(family="Arial", size=10), + yaxis=dict(range=[0, 100], fixedrange=True) + ) + + # caption come annotazione separata + fig.add_annotation( + text="There is no single prompt that performs best across all tasks.
def line_chart(dataframe):
    """Build the "Avg. Combined Performance vs #Params" scatter plot.

    One marker is drawn per leaderboard row, split into two traces by the
    ``IS_FS`` flag ("10-Shot" in blue, "0-Shot" in red). Marker size grows
    with the parameter count, and the best-scoring row is called out with an
    arrow annotation.

    Args:
        dataframe (pd.DataFrame): leaderboard rows providing the columns
            ``IS_FS``, ``#Params (B)``, ``Avg. Comb. Perf. ⬆️`` and ``Model``.

    Returns:
        go.Figure: the configured figure with zoom/drag disabled.
    """

    def scale_sizes(values, min_size=8, max_size=30):
        """Map values linearly onto marker sizes in [min_size, max_size]."""
        if not values:
            # min()/max() raise on an empty sequence; an empty trace needs no sizes.
            return []
        vmin, vmax = min(values), max(values)
        if not vmax > vmin:
            # All values equal (or not comparable): use the mid size for every marker.
            return [(min_size + max_size) / 2 for _ in values]
        span = vmax - vmin
        return [min_size + (val - vmin) / span * (max_size - min_size) for val in values]

    def display_name(model_cell):
        """Return the text between '>' and '<' in the cell, or the raw cell text."""
        found = re.search(r'>([^<]+)<', str(model_cell))
        # Fall back to the raw value instead of crashing when the cell is not
        # wrapped in markup.
        return found.group(1) if found else str(model_cell)

    fig = go.Figure()
    all_x, all_y, all_labels = [], [], []

    # (IS_FS value, legend name, marker colour); the few-shot trace is added first.
    for is_fs, trace_name, color in ((True, '10-Shot', 'blue'), (False, '0-Shot', 'red')):
        subset = dataframe[dataframe['IS_FS'] == is_fs]
        xs = subset['#Params (B)'].tolist()
        ys = subset['Avg. Comb. Perf. ⬆️'].tolist()
        labels = [display_name(cell) for cell in subset['Model'].tolist()]

        fig.add_trace(go.Scatter(
            x=xs,
            y=ys,
            mode='markers',
            name=trace_name,
            marker=dict(color=color, size=scale_sizes(xs)),
            # NOTE(review): the line-break markup inside this template was lost in
            # the reviewed copy and is restored here as <br> — confirm against the
            # deployed app (any bold/<extra> tags would need restoring too).
            hovertemplate='%{customdata}<br>#Params: %{x}<br>Performance: %{y}',
            customdata=labels,
        ))

        all_x.extend(xs)
        all_y.extend(ys)
        all_labels.extend(labels)

    # Highlight the best-scoring row; skipped when there is nothing to plot.
    if all_y:
        max_idx = all_y.index(max(all_y))
        fig.add_annotation(
            x=all_x[max_idx],
            y=all_y[max_idx],
            text=f"{all_labels[max_idx]}",
            showarrow=True,
            arrowhead=2,
            arrowsize=1,
            arrowwidth=2,
            arrowcolor="black",
            font=dict(size=11, color="black"),
            xshift=10,
            yshift=10,
            ax=-30, ay=-20,    # place the label up and to the left of the point
            xanchor="right",   # right-align the label with respect to the point
        )

    fig.update_layout(
        title="Avg. Combined Performance vs #Params",
        xaxis_title="#Params (B)",
        yaxis_title="Avg. Combined Performance",
        template="plotly_white",
        hovermode="closest",
        font=dict(family="Arial", size=10),
        dragmode=False,
        xaxis=dict(
            tickvals=[0, 25, 50, 75, 100, 125],
            # One label per tick value (the original list was one label short).
            ticktext=["0", "25", "50", "75", "100", "125"],
        ),
        yaxis=dict(
            tickvals=[0, 20, 40, 60, 80, 100],  # fixed ticks
            range=[0, 100],                     # locked range
        ),
    )

    # Caption, centred below the plot area.
    fig.add_annotation(
        text="Accuracy generally rises with #Params, but smaller models<br>"
             "with 10-shot can outperform larger zero-shot models.",
        xref="paper", yref="paper",
        x=0.5, y=-0.3,
        showarrow=False,
        font=dict(size=11, color="gray"),
        align="center",
        xanchor="center",
    )

    fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
    fig.update_yaxes(fixedrange=True)

    return fig
⬆️", ascending=False) + + sorted_dataframe = sorted_dataframe.reset_index(drop=True) + sorted_dataframe["Rank"] = sorted_dataframe.index + 1 + + # Flag per sapere se la medaglia è già stata assegnata per categoria e tipo + large_medal_fs_assigned = False + medium_medal_fs_assigned = False + small_medal_fs_assigned = False + + large_medal_0shot_assigned = False + medium_medal_0shot_assigned = False + small_medal_0shot_assigned = False + + # Lista temporanea per salvare i nuovi valori della colonna Model + new_model_column = [] + + for _, row in sorted_dataframe.iterrows(): + if row['IS_FS']: # 10-Few-Shot + if row["Size"] == "🔵🔵🔵" and not large_medal_fs_assigned: + new_model_column.append(f"{row['Model']} 🔵🔵🔵🏆") + large_medal_fs_assigned = True + elif row["Size"] == "🔵🔵" and not medium_medal_fs_assigned: + new_model_column.append(f"{row['Model']} 🔵🔵🏆") + medium_medal_fs_assigned = True + elif row["Size"] == "🔵" and not small_medal_fs_assigned: + new_model_column.append(f"{row['Model']} 🔵🏆") + small_medal_fs_assigned = True + else: + new_model_column.append(row["Model"]) + else: # 0-Shot + if row["Size"] == "🔵🔵🔵" and not large_medal_0shot_assigned: + new_model_column.append(f"{row['Model']} 🔵🔵🔵🎖️") + large_medal_0shot_assigned = True + elif row["Size"] == "🔵🔵" and not medium_medal_0shot_assigned: + new_model_column.append(f"{row['Model']} 🔵🔵🎖️") + medium_medal_0shot_assigned = True + elif row["Size"] == "🔵" and not small_medal_0shot_assigned: + new_model_column.append(f"{row['Model']} 🔵🎖️") + small_medal_0shot_assigned = True + else: + new_model_column.append(row["Model"]) + + # Lista delle colonne da aggiornare + #cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"] + # Applichiamo la trasformazione + #for col in cols_to_update: + # dataframe[col] = dataframe[col].replace({1: 7, 2: 8}) + + # Aggiorna la colonna Model + sorted_dataframe["Model"] = new_model_column + + field_list = fields(AutoEvalColumn) + + return 
def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
    """Build the leaderboard shown when a specific task tab is selected.

    Rows whose "Combined Performance" is missing, non-numeric or zero are
    dropped; the remaining rows are sorted by that column (descending) and
    given a 1-based "Rank". The first row of every (few-shot flag, size)
    combination gets a medal appended to its "Model" cell.

    Args:
        dataframe (pd.DataFrame): rows with at least the columns
            "Combined Performance", "IS_FS", "Size" and "Model".
        default_selection: accepted for interface compatibility; not used here.
        hidden_columns (list[str] | None): columns to hide. When falsy, the
            columns flagged ``hidden`` in ``AutoEvalColumn`` are hidden.

    Returns:
        Leaderboard: the configured, non-interactive leaderboard component.

    Raises:
        ValueError: if ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Coerce scores to numbers, then keep only rows with a real, non-zero score.
    scores = pd.to_numeric(dataframe["Combined Performance"], errors="coerce")
    clean_df = dataframe.assign(**{"Combined Performance": scores})
    clean_df = clean_df[scores.notna() & (scores != 0)]

    sorted_dataframe = clean_df.sort_values(by="Combined Performance", ascending=False)

    # Rank follows the position in the sorted table.
    sorted_dataframe = sorted_dataframe.reset_index(drop=True)
    sorted_dataframe["Rank"] = sorted_dataframe.index + 1

    # One medal per (few-shot?, size) pair, given to the first (best) row seen.
    medal_by_mode = {True: "🏆", False: "🎖️"}  # few-shot / zero-shot
    medal_sizes = ("🔵🔵🔵", "🔵🔵", "🔵")
    awarded = set()
    new_model_column = []

    for _, row in sorted_dataframe.iterrows():
        is_few_shot = bool(row["IS_FS"])
        size = row["Size"]
        if size in medal_sizes and (is_few_shot, size) not in awarded:
            awarded.add((is_few_shot, size))
            new_model_column.append(f"{row['Model']} {size}{medal_by_mode[is_few_shot]}")
        else:
            new_model_column.append(row["Model"])

    sorted_dataframe["Model"] = new_model_column

    # Only affects how pandas prints frames; kept so debug output stays untruncated.
    pd.set_option('display.max_colwidth', None)

    field_list = fields(AutoEvalColumn)

    return Leaderboard(
        value=sorted_dataframe,
        # One extra int entry on top of the declared columns
        # (presumably for the added "Rank" column — TODO confirm).
        datatype=[c.type for c in field_list] + [int],
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS): "),
            ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages: "),
            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100],
                         label="Select the number of parameters (B)"),
        ],
        bool_checkboxgroup_label="Evaluation Mode",
        interactive=False,
    )
+

+ ECREAM-LLM Leaderboard +

+
+ """ + ) + gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") + + # ⬇️ QUI aggiungiamo i grafici subito sotto la barra del titolo e sopra le tabs + with gr.Row(): + gr.Plot(value=line_chart(LEADERBOARD_DF), elem_id="line-chart") + gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task") + + # === NEW: second row with the 2 extra plots (NER/REL + p1..p3) === + #with gr.Row(): + #gr.Plot(value=create_prompt_heatmap(LEADERBOARD_DF), elem_id="prompt-heatmap") + #gr.Plot(value=create_best_model_comparison_table(LEADERBOARD_DF), elem_id="best-model-table") + # === NEW: gray background wrapper for combos === + with gr.Row(elem_id="filters-wrap"): + lang_dd = gr.Dropdown( + choices=["All", "EN", "IT", "SL", "SK", "GR", "PL"], + value="All", label="Language: ", scale=1 + ) + shot_dd = gr.Dropdown( + choices=["All", "0", "10"], + value="All", label="N-Shot: ", scale=1 + ) + + with gr.Row(): + heatmap_plot = gr.Plot(value=create_prompt_heatmap(LEADERBOARD_DF, None, None), elem_id="prompt-heatmap") + table_plot = gr.Plot(value=create_best_model_comparison_table(LEADERBOARD_DF, None, None), elem_id="best-model-table") + + def _update_both(lang, shot): + return ( + create_prompt_heatmap(LEADERBOARD_DF, None if lang == "All" else lang, None if shot == "All" else shot), + create_best_model_comparison_table(LEADERBOARD_DF, None if lang == "All" else lang, None if shot == "All" else shot) + ) + + lang_dd.change(_update_both, inputs=[lang_dd, shot_dd], outputs=[heatmap_plot, table_plot]) + shot_dd.change(_update_both, inputs=[lang_dd, shot_dd], outputs=[heatmap_plot, table_plot]) + + + + with gr.Tabs(elem_classes="tab-buttons") as tabs: + + # Main leaderboard tab + with gr.TabItem("🏅 Benchmark"): + + leaderboard = init_leaderboard( + LEADERBOARD_DF, + default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. 
⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL", "RML", "DIA", "HIS"], + hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL", "RML", "DIA", "HIS"]] + ) + + + # About tab + with gr.TabItem("📝 About"): + gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") + + # Task-specific leaderboards + for task, metadata in TASK_METADATA_MULTIPLECHOICE.items(): + + with gr.TabItem(f"{metadata['icon']}{task}"): + + task_description = TASK_DESCRIPTIONS.get(task, "Description not available.") + gr.Markdown(task_description, elem_classes="markdown-text") + + leaderboard = update_task_leaderboard( + LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Prompt Std": "Prompt Std", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}), + default_selection=['Rank', 'Size','LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id'], + hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size','LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id']] + ) + + # About tab + with gr.TabItem("│", interactive=False): + gr.Markdown("", elem_classes="markdown-text") + + # Task-specific leaderboards + for task, metadata in TASK_METADATA_GENERATIVE.items(): + with gr.TabItem(f"{metadata['icon']}{task}"): + task_description = TASK_DESCRIPTIONS.get(task, "Description not available.") + gr.Markdown(task_description, elem_classes="markdown-text1") + #print (LEADERBOARD_DF) + leaderboard = update_task_leaderboard( + LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", + f"{task} Prompt Std": "Prompt Std", + f"{task} Best Prompt": "Best Prompt", + f"{task} Best Prompt Id": "Best Prompt Id", + 
task: "Combined Performance"}), + default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', + 'Best Prompt Id'], + hidden_columns=[col for col in LEADERBOARD_DF.columns if + col not in ['Rank', 'Size','LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', + 'Best Prompt', 'Best Prompt Id']] + ) + + # Citation section + with gr.Accordion("📙 Citation", open=False): + gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True) + + with gr.Accordion("📙 Credits", open=False): + gr.Markdown( + """ + ***This project has been funded by the European Union under: + + Horizon Europe eCREAM Project (Grant Agreement No.101057726) + """ + ) + +# Background job to restart space +scheduler = BackgroundScheduler() +scheduler.add_job(restart_space, "interval", seconds=1800) +scheduler.start() + +# Launch the app with concurrent queueing +demo.queue(default_concurrency_limit=40).launch(debug=True, # Enable Gradio debug mode + show_error=True) \ No newline at end of file diff --git a/app_17_10_2025.py b/app_17_10_2025.py new file mode 100644 index 0000000000000000000000000000000000000000..bfc58ad1c422019017f845125f491be834d1f1e9 --- /dev/null +++ b/app_17_10_2025.py @@ -0,0 +1,815 @@ +import gradio as gr +from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns +import pandas as pd +from apscheduler.schedulers.background import BackgroundScheduler +from huggingface_hub import snapshot_download +from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE +from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION +from src.display.css_html_js import custom_css +from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, WeightType, Precision +from src.envs import API, 
def mean_of_max_per_field(df, columns=None):
    """Return the mean of the per-column maxima of ``df``.

    For every requested column the maximum value is taken, and the maxima are
    then averaged.

    Args:
        df (pd.DataFrame): frame holding one score column per task.
        columns (list[str] | None): columns to aggregate. Defaults to
            ``["NER", "REL"]`` when omitted.

    Returns:
        float: mean of the column maxima.

    Raises:
        ValueError: if any requested column is missing from ``df``.
    """
    # Named task_columns so the imported ``fields`` helper is not shadowed.
    task_columns = ["NER", "REL"] if columns is None else list(columns)

    # Fail early with the full list of missing columns.
    missing = [name for name in task_columns if name not in df.columns]
    if missing:
        raise ValueError(f"Le seguenti colonne mancano nel DataFrame: {missing}")

    # Maximum of each column, then the mean of those maxima.
    return df[task_columns].max().mean()
y=list(task_means.values()), + marker_color="#ff7f0e", + text=[f"{v:.2f}" for v in task_means.values()], + textposition="outside", + hovertemplate="%{x}
Mean Delta Accuracy: %{y:.2f}%" + )]) + + # Linea di riferimento a 0 + ''' + fig.add_shape( + type="line", + x0=-0.5, x1=len(task_means) - 0.5, + y0=0, y1=0, + line=dict(color="black", width=2, dash="dash"), + xref="x", yref="y" + ) + ''' + + fig.update_layout( + title="Mean Accuracy Difference (Few-shot − Zero-shot) per Task", + xaxis_title="", + yaxis_title="Mean Delta Combined Performance", + template="plotly_white", + font=dict(family="Arial", size=13), + #margin=dict(b=100) + ) + + fig.add_annotation( + text="10-shot learning generally outperforms zero-shot.
def boxplot_per_task(dataframe=None, baselines=None, references=None):
    """Draw one box per task showing the distribution of model scores.

    For every task column present in ``dataframe`` a box is added; optional
    horizontal marker lines show a baseline (black, dotted) and a reference
    value (red, dash-dotted) over that box.

    Args:
        dataframe (pd.DataFrame | None): one score column per task. When
            ``None``, random demo data is generated (this reseeds NumPy's
            global RNG with 42).
        baselines (dict | None): task -> baseline value. When ``None``, random
            demo baselines are drawn.
        references (dict | None): task -> reference value. ``None`` means no
            reference lines.

    Returns:
        go.Figure: the configured figure with a fixed 0-100 y axis.
    """
    tasks = ["NER", "REL"]

    if dataframe is None:
        np.random.seed(42)
        dataframe = pd.DataFrame({
            task: np.random.uniform(0.4, 0.9, 20) * 100
            for task in tasks
        })

    if baselines is None:
        baselines = {task: np.random.randint(50, 70) for task in tasks}

    # The default None used to crash on ``task in references``.
    if references is None:
        references = {}

    colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
              "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]

    fig = go.Figure()

    def add_marker_line(position, level, color, dash):
        """Add a short horizontal line centred on the box at ``position``."""
        fig.add_shape(
            type="line",
            x0=position - 0.3, x1=position + 0.3,
            y0=level, y1=level,
            line=dict(color=color, width=2, dash=dash),
            xref="x", yref="y",
        )

    for i, task in enumerate(tasks):
        if task not in dataframe.columns:
            continue

        # Boxes are laid out in the order they are added, so the marker lines
        # must follow the number of boxes drawn so far, not the task index
        # (the two differ as soon as a task column is missing).
        position = len(fig.data)
        y_data = dataframe[task].dropna().tolist()

        fig.add_trace(go.Box(
            y=y_data,
            name=task,
            marker=dict(color=colors[i]),
            line=dict(color="black", width=2),
            fillcolor=colors[i],
            opacity=0.7,
            # NOTE(review): the markup in this template was lost in the reviewed
            # copy; the line break is restored as <br> — confirm against the
            # deployed app.
            hovertemplate=task + "<br>Accuracy: %{y:.2f}%",
            width=0.6,
            whiskerwidth=0.2,
            quartilemethod="linear",
        ))

        # Baseline marker.
        if task in baselines and baselines[task] is not None:
            add_marker_line(position, baselines[task], "black", "dot")

        # Reference marker.
        if task in references and references[task] is not None:
            add_marker_line(position, references[task], "red", "dashdot")

    fig.update_layout(
        title="Distribution of Model Accuracy by Task",
        xaxis_title="Task",
        yaxis_title="Combined Performance",
        template="plotly_white",
        boxmode="group",
        dragmode=False,
        font=dict(family="Arial", size=10),
        margin=dict(b=80),
    )

    # Caption slot below the plot; the caption text is currently empty.
    fig.add_annotation(
        text="",
        xref="paper", yref="paper",
        x=0.5, y=-0.30,
        showarrow=False,
        font=dict(size=11, color="gray"),
        align="left",
    )

    fig.update_yaxes(range=[0, 100], fixedrange=True)

    return fig
Accuracy: %{y:.2f}%", + )) + + # Barre Best Prompt (rosso) + fig.add_trace(go.Bar( + x=best_x, + y=best_y, + name="Best Prompt", + marker_color="#d62728", + #hovertemplate="%{y:.2f}%" + #hovertemplate = "" + task + "
Accuracy: %{y:.2f}%", + )) + + # Testo sopra barre Best Prompt con ID + for x, y, text in zip(best_x, best_y, best_text): + fig.add_annotation( + x=x, + y=y + 3, # leggermente sopra la barra + text=text, + showarrow=False, + font=dict(size=12, color="black") + ) + + fig.update_layout( + title= "Prompt Accuracy: Avg vs Best", + xaxis_title="Task", + yaxis_title="Combined Performance", + barmode='group', + template="plotly_white", + font=dict(family="Arial", size=10), + yaxis=dict(range=[0, 100], fixedrange=True) + ) + + # caption come annotazione separata + fig.add_annotation( + text="There is no single prompt that performs best across all tasks.
" + "Different prompts achieve the highest accuracy on different tasks.", + xref="paper", yref="paper", + x=0.5, y=-0.3, + showarrow=False, + font=dict(size=11, color="gray"), + align="center", + xanchor="center" + ) + + return fig + + +def line_chart(dataframe): + + # Normalizza le dimensioni per avere marker non troppo piccoli né enormi + def scale_sizes(values, min_size=8, max_size=30): + vmin, vmax = min(values), max(values) + return [ + min_size + (val - vmin) / (vmax - vmin) * (max_size - min_size) if vmax > vmin else (min_size + max_size) / 2 + for val in values + ] + + # dati in base a IS_FS + df_true = dataframe[dataframe['IS_FS'] == True] + df_false = dataframe[dataframe['IS_FS'] == False] + + # Estrai valori x, y e labels + x_true = df_true['#Params (B)'].tolist() + y_true = df_true['Avg. Comb. Perf. ⬆️'].tolist() + labels_true = [re.search(r'>([^<]+)<', m).group(1) for m in df_true['Model'].tolist()] + + x_false = df_false['#Params (B)'].tolist() + y_false = df_false['Avg. Comb. Perf. ⬆️'].tolist() + labels_false = [re.search(r'>([^<]+)<', m).group(1) for m in df_false['Model'].tolist()] + + fig = go.Figure() + + # Punti IS_FS=True + fig.add_trace(go.Scatter( + x=x_true, + y=y_true, + mode='markers', + name='10-Shot', + marker=dict( + color='blue', + size=scale_sizes(x_true) + ), + hovertemplate='%{customdata}
#Params: %{x}
Performance: %{y}', + customdata=labels_true + )) + + # Punti IS_FS=False + fig.add_trace(go.Scatter( + x=x_false, + y=y_false, + mode='markers', + name='0-Shot', + marker=dict( + color='red', + size=scale_sizes(x_false) + ), + hovertemplate='%{customdata}
#Params: %{x}
Performance: %{y}', + customdata=labels_false + )) + + # Trova il massimo tra tutti i modelli + all_y = y_true + y_false + all_x = x_true + x_false + all_labels = labels_true + labels_false + max_idx = all_y.index(max(all_y)) + max_x = all_x[max_idx] + max_y = all_y[max_idx] + max_label = all_labels[max_idx] + + # Aggiungi annotazione visibile per il modello migliore + fig.add_annotation( + x=max_x, + y=max_y, + #text=f"Top: {max_label} ({max_y:.1f}%)", + text=f"{max_label}", + showarrow=True, + arrowhead=2, + arrowsize=1, + arrowwidth=2, + arrowcolor="black", + font=dict(size=11, color="black"), + xshift=10, + yshift=10, + ax = -30, ay = -20, # sposta la label a sinistra e sopra il punto + xanchor = "right" # allinea la label a destra rispetto al punto + ) + + fig.update_layout( + title="Avg. Combined Performance vs #Params", + xaxis_title="#Params (B)", + yaxis_title="Avg. Combined Performance", + template="plotly_white", + hovermode="closest", + font=dict(family="Arial", size=10), + dragmode=False, + xaxis=dict( + tickvals=[0, 25, 50, 75, 100, 125], + ticktext=["0", "25", "50", "75", "100"] + ), + yaxis=dict( + tickvals=[0, 20, 40, 60, 80, 100], # 👈 tick fissi + range=[0, 100] # 👈 range bloccato + ) + ) + + # Caption + fig.add_annotation( + text="Accuracy generally rises with #Params, but smaller models
def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
    """Build the main leaderboard shown on first load / on the benchmark tab.

    Rows are sorted by "Avg. Comb. Perf. ⬆️" (descending) and given a 1-based
    "Rank". The first row of every (few-shot flag, size) combination gets a
    medal appended to its "Model" cell.

    Args:
        dataframe (pd.DataFrame): rows with at least the columns
            "Avg. Comb. Perf. ⬆️", "IS_FS", "Size" and "Model".
        default_selection: accepted for interface compatibility; not used here.
        hidden_columns (list[str] | None): columns to hide. When falsy, the
            columns flagged ``hidden`` in ``AutoEvalColumn`` are hidden.

    Returns:
        Leaderboard: the configured, non-interactive leaderboard component.

    Raises:
        ValueError: if ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    sorted_dataframe = dataframe.sort_values(by="Avg. Comb. Perf. ⬆️", ascending=False)
    sorted_dataframe = sorted_dataframe.reset_index(drop=True)
    sorted_dataframe["Rank"] = sorted_dataframe.index + 1

    # Medal glyph per evaluation mode; only these three size badges compete.
    medal_by_mode = {True: "🏆", False: "🎖️"}  # few-shot / zero-shot
    medal_sizes = ("🔵🔵🔵", "🔵🔵", "🔵")

    # The table is already sorted, so the first row met for a
    # (few-shot?, size) pair is the best one and takes the medal.
    awarded = set()
    new_model_column = []
    for _, row in sorted_dataframe.iterrows():
        category = (bool(row["IS_FS"]), row["Size"])
        if category[1] in medal_sizes and category not in awarded:
            awarded.add(category)
            new_model_column.append(
                f"{row['Model']} {category[1]}{medal_by_mode[category[0]]}"
            )
        else:
            new_model_column.append(row["Model"])

    sorted_dataframe["Model"] = new_model_column

    field_list = fields(AutoEvalColumn)

    return Leaderboard(
        value=sorted_dataframe,
        datatype=[c.type for c in field_list],
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"),
            ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages "),
            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100],
                         label="Select the number of parameters (B)"),
        ],
        bool_checkboxgroup_label="Evaluation Mode",
        interactive=False,
    )
+ """ + if dataframe is None or dataframe.empty: + raise ValueError("Leaderboard DataFrame is empty or None.") + + sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False) + + # aggiungo la colonna rank in base alla posizione + sorted_dataframe = sorted_dataframe.reset_index(drop=True) + sorted_dataframe["Rank"] = sorted_dataframe.index + 1 + + # Flag per sapere se la medaglia è già stata assegnata per categoria e tipo + large_medal_fs_assigned = False + medium_medal_fs_assigned = False + small_medal_fs_assigned = False + + large_medal_0shot_assigned = False + medium_medal_0shot_assigned = False + small_medal_0shot_assigned = False + + # Lista temporanea per salvare i nuovi valori della colonna Model + new_model_column = [] + + for _, row in sorted_dataframe.iterrows(): + if row['IS_FS']: # 5-Few-Shot + if row["Size"] == "🔵🔵🔵" and not large_medal_fs_assigned: + new_model_column.append(f"{row['Model']} 🔵🔵🔵🏆") + large_medal_fs_assigned = True + elif row["Size"] == "🔵🔵" and not medium_medal_fs_assigned: + new_model_column.append(f"{row['Model']} 🔵🔵🏆") + medium_medal_fs_assigned = True + elif row["Size"] == "🔵" and not small_medal_fs_assigned: + new_model_column.append(f"{row['Model']} 🔵🏆") + small_medal_fs_assigned = True + else: + new_model_column.append(row["Model"]) + else: # 0-Shot + if row["Size"] == "🔵🔵🔵" and not large_medal_0shot_assigned: + new_model_column.append(f"{row['Model']} 🔵🔵🔵🎖️") + large_medal_0shot_assigned = True + elif row["Size"] == "🔵🔵" and not medium_medal_0shot_assigned: + new_model_column.append(f"{row['Model']} 🔵🔵🎖️") + medium_medal_0shot_assigned = True + elif row["Size"] == "🔵" and not small_medal_0shot_assigned: + new_model_column.append(f"{row['Model']} 🔵🎖️") + small_medal_0shot_assigned = True + else: + new_model_column.append(row["Model"]) + + # Aggiorna la colonna Model + sorted_dataframe["Model"] = new_model_column + + pd.set_option('display.max_colwidth', None) + #print("========================", 
dataframe['Model']) + + #print(sorted_dataframe['Combined Performance']) + + field_list = fields(AutoEvalColumn) + + return Leaderboard( + value=sorted_dataframe, + #datatype=[c.type for c in field_list], + datatype=[c.type for c in field_list] + [int], + #select_columns=SelectColumns( + # default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default], + # cant_deselect=[c.name for c in field_list if c.never_hidden], + # label="Select Columns to Display:", + #), + search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name], + hide_columns=hidden_columns or [c.name for c in field_list if c.hidden], + filter_columns=[ + ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS)"), + ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languges "), + + ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100], + label="Select the number of parameters (B)"), + ], + bool_checkboxgroup_label="Evaluation Mode", + interactive=False + ) + +''' +# Helper function for leaderboard initialization +def init_leaderboard(dataframe, default_selection=None, hidden_columns=None): + """Initialize and return a leaderboard.""" + if dataframe is None or dataframe.empty: + raise ValueError("Leaderboard DataFrame is empty or None.") + + return Leaderboard( + value=dataframe, + datatype=[c.type for c in fields(AutoEvalColumn)], + select_columns=SelectColumns( + default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default], + cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden], + label="Select Columns to Display:", + ), + search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name], + hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden], + filter_columns=[ + ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"), 
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"), + ], + bool_checkboxgroup_label="Hide models", + interactive=False, + ) +''' + +def download_snapshot(repo, local_dir): + """Try to download a snapshot from Hugging Face Hub.""" + try: + print(f"Downloading from {repo} to {local_dir}...") + snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN) + except Exception as e: + print(f"Error downloading {repo}: {e}") + restart_space() + + +# Initialize the app by downloading snapshots +download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH) +download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH) + +# Load leaderboard data +LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS) +finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS) +#print(LEADERBOARD_DF.columns.tolist()) + +theoretical_max_combined_perf = mean_of_max_per_field(LEADERBOARD_DF) + +# Prepare the main interface +demo = gr.Blocks(css=custom_css) +with demo: + #gr.HTML(TITLE) + gr.HTML( + """ +
+

+ ECREAM-LLM Leaderboard +

+
+ """ + ) + gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") + + # ⬇️ QUI aggiungiamo i grafici subito sotto la barra del titolo e sopra le tabs + with gr.Row(): + gr.Plot(value=line_chart(LEADERBOARD_DF), elem_id="line-chart") + gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task") + #gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF), elem_id="boxplot-prompt-task") + + with gr.Tabs(elem_classes="tab-buttons") as tabs: + + # Main leaderboard tab + with gr.TabItem("🏅 Benchmark"): + + leaderboard = init_leaderboard( + LEADERBOARD_DF, + default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"], + hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]] + ) + + # gr.HTML( + # f""" + #
+ # Theoretical performance of a model that scores the highest on every individual task: {theoretical_max_combined_perf:.2f} + #
+ # $ """ + # ) + + ''' + with gr.TabItem("📈 Charts"): + #gr.Plot(value=line_chart(LEADERBOARD_DF), label="Andamento di esempio") + #gr.Plot(value=line_chart_interactive_test(), label="Andamento interattivo") + gr.Plot(value=line_chart(LEADERBOARD_DF)) + gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES)) + gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF)) + gr.Plot(value=barplot_mean_few_minus_zero_shot(LEADERBOARD_DF)) + ''' + + # About tab + with gr.TabItem("📝 About"): + gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") + + # About tab + #with gr.TabItem("║", interactive=False): + # gr.Markdown("", elem_classes="markdown-text") + + + # Task-specific leaderboards + for task, metadata in TASK_METADATA_MULTIPLECHOICE.items(): + + with gr.TabItem(f"{metadata['icon']}{task}"): + + task_description = TASK_DESCRIPTIONS.get(task, "Description not available.") + gr.Markdown(task_description, elem_classes="markdown-text") + + leaderboard = update_task_leaderboard( + LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Prompt Std": "Prompt Std", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}), + default_selection=['Rank', 'Size','LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id'], + hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size','LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id']] + ) + + # About tab + with gr.TabItem("│", interactive=False): + gr.Markdown("", elem_classes="markdown-text") + + # Task-specific leaderboards + for task, metadata in TASK_METADATA_GENERATIVE.items(): + with gr.TabItem(f"{metadata['icon']}{task}"): + task_description = TASK_DESCRIPTIONS.get(task, "Description not available.") + gr.Markdown(task_description, elem_classes="markdown-text1") + #print (LEADERBOARD_DF) + 
leaderboard = update_task_leaderboard( + LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", + f"{task} Prompt Std": "Prompt Std", + f"{task} Best Prompt": "Best Prompt", + f"{task} Best Prompt Id": "Best Prompt Id", + task: "Combined Performance"}), + default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', + 'Best Prompt Id'], + hidden_columns=[col for col in LEADERBOARD_DF.columns if + col not in ['Rank', 'Size','LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', + 'Best Prompt', 'Best Prompt Id']] + ) + + # Citation section + with gr.Accordion("📙 Citation", open=False): + gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True) + + with gr.Accordion("📙 Credits", open=False): + gr.Markdown( + """ + ***This project has been funded by the European Union under: + + Horizon Europe eCREAM Project (Grant Agreement No.101057726) + """ + ) + +# Background job to restart space +scheduler = BackgroundScheduler() +scheduler.add_job(restart_space, "interval", seconds=1800) +scheduler.start() + +# Launch the app with concurrent queueing +demo.queue(default_concurrency_limit=40).launch(debug=True, # Enable Gradio debug mode + show_error=True) \ No newline at end of file diff --git a/csv_files/llm_scores_p1.xlsx b/csv_files/llm_scores_p1.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..a6d5790fd6e7b7ce5f7d819925710d57f375f3ce Binary files /dev/null and b/csv_files/llm_scores_p1.xlsx differ diff --git a/csv_files/llm_scores_p2.xlsx b/csv_files/llm_scores_p2.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..26af80a6cd45b0ec82097dafc6d4b8105945ecda Binary files /dev/null and b/csv_files/llm_scores_p2.xlsx differ diff --git a/csv_files/llm_scores_p3.xlsx b/csv_files/llm_scores_p3.xlsx new file mode 100644 index 
0000000000000000000000000000000000000000..3621768a991e0d757b6e5af462af6dc618f9f293 Binary files /dev/null and b/csv_files/llm_scores_p3.xlsx differ diff --git a/csv_files/outputs/.ipynb_checkpoints/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot-checkpoint.txt b/csv_files/outputs/.ipynb_checkpoints/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot-checkpoint.txt new file mode 100644 index 0000000000000000000000000000000000000000..37a0a0b3fcec4413d915fb8b17302fe6c93286c4 --- /dev/null +++ b/csv_files/outputs/.ipynb_checkpoints/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot-checkpoint.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2877 | |0 | +| - p1 | | | |f1 | | 0.1963 | | 0 | +| - p2 | | | |f1 | | 0.3459 | | 0 | +| - p3 | | | |f1 | | 0.3208 | | 0 | +| - RE | | | |f1 | | 0.4430 | |0 | +| - p1 | | | |f1 | | 0.4487 | | 0 | +| - p2 | | | |f1 | | 0.4492 | | 0 | +| - p3 | | | |f1 | | 0.4311 | | 0 | diff --git a/csv_files/outputs/.ipynb_checkpoints/epfl-llm__meditron-7b__it__10shot-checkpoint.txt b/csv_files/outputs/.ipynb_checkpoints/epfl-llm__meditron-7b__it__10shot-checkpoint.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2aa7fd7dc8637dbd14ef01f078eceecddd04f15 --- /dev/null +++ b/csv_files/outputs/.ipynb_checkpoints/epfl-llm__meditron-7b__it__10shot-checkpoint.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3288 | |0 | +| - p1 | | | |f1 | | 0.2991 | | 0 | +| - p2 | | | |f1 | | 0.3563 | | 0 | +| - p3 | | | |f1 | | 0.3311 | | 0 | +| - RE | | | |f1 | | 0.0896 | |0 | +| - p1 | | | |f1 | | 0.0832 | | 0 | +| - p2 | | | |f1 | | 0.0887 | | 0 | 
+| - p3 | | | |f1 | | 0.0968 | | 0 | diff --git a/csv_files/outputs/Henrychur__MMed-Llama-3-8B__en__0shot.txt b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..adb6649f69212f89096131405d496954e948d037 --- /dev/null +++ b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0918 | |0 | +| - p1 | | | |f1 | | 0.0629 | | 0 | +| - p2 | | | |f1 | | 0.1041 | | 0 | +| - p3 | | | |f1 | | 0.1083 | | 0 | +| - RE | | | |f1 | | 0.2604 | |0 | +| - p1 | | | |f1 | | 0.1287 | | 0 | +| - p2 | | | |f1 | | 0.3394 | | 0 | +| - p3 | | | |f1 | | 0.3131 | | 0 | diff --git a/csv_files/outputs/Henrychur__MMed-Llama-3-8B__en__10shot.txt b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..970790d8c37ee624a941d7838baf752c1418f32c --- /dev/null +++ b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2142 | |0 | +| - p1 | | | |f1 | | 0.2189 | | 0 | +| - p2 | | | |f1 | | 0.2243 | | 0 | +| - p3 | | | |f1 | | 0.1994 | | 0 | +| - RE | | | |f1 | | 0.1681 | |0 | +| - p1 | | | |f1 | | 0.1189 | | 0 | +| - p2 | | | |f1 | | 0.1668 | | 0 | +| - p3 | | | |f1 | | 0.2185 | | 0 | diff --git a/csv_files/outputs/Henrychur__MMed-Llama-3-8B__gr__0shot.txt b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..256806ae1aae91613bb15e7f61973bb2c3d373e9 --- /dev/null +++ 
b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0611 | |0 | +| - p1 | | | |f1 | | 0.0620 | | 0 | +| - p2 | | | |f1 | | 0.0592 | | 0 | +| - p3 | | | |f1 | | 0.0620 | | 0 | +| - RE | | | |f1 | | 0.0863 | |0 | +| - p1 | | | |f1 | | 0.1017 | | 0 | +| - p2 | | | |f1 | | 0.0506 | | 0 | +| - p3 | | | |f1 | | 0.1065 | | 0 | diff --git a/csv_files/outputs/Henrychur__MMed-Llama-3-8B__gr__10shot.txt b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1968c70df7ae59de71b96c9719693f1041cc591 --- /dev/null +++ b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1474 | |0 | +| - p1 | | | |f1 | | 0.1667 | | 0 | +| - p2 | | | |f1 | | 0.1089 | | 0 | +| - p3 | | | |f1 | | 0.1667 | | 0 | +| - RE | | | |f1 | | 0.0970 | |0 | +| - p1 | | | |f1 | | 0.0821 | | 0 | +| - p2 | | | |f1 | | 0.1053 | | 0 | +| - p3 | | | |f1 | | 0.1036 | | 0 | diff --git a/csv_files/outputs/Henrychur__MMed-Llama-3-8B__it__0shot.txt b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..de76936f997964ba608d49c32bded3be64711fed --- /dev/null +++ b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__it__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0416 | |0 | +| - p1 | | | |f1 | 
| 0.0435 | | 0 | +| - p2 | | | |f1 | | 0.0429 | | 0 | +| - p3 | | | |f1 | | 0.0384 | | 0 | +| - RE | | | |f1 | | 0.1413 | |0 | +| - p1 | | | |f1 | | 0.0672 | | 0 | +| - p2 | | | |f1 | | 0.2266 | | 0 | +| - p3 | | | |f1 | | 0.1300 | | 0 | diff --git a/csv_files/outputs/Henrychur__MMed-Llama-3-8B__it__10shot.txt b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..1324843bcf3efaa493662c82d523957a6e202c45 --- /dev/null +++ b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__it__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3753 | |0 | +| - p1 | | | |f1 | | 0.3299 | | 0 | +| - p2 | | | |f1 | | 0.4023 | | 0 | +| - p3 | | | |f1 | | 0.3938 | | 0 | +| - RE | | | |f1 | | 0.1331 | |0 | +| - p1 | | | |f1 | | 0.0977 | | 0 | +| - p2 | | | |f1 | | 0.1226 | | 0 | +| - p3 | | | |f1 | | 0.1789 | | 0 | diff --git a/csv_files/outputs/Henrychur__MMed-Llama-3-8B__pl__0shot.txt b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..922dce80469337edc75e7835aa6a600369523091 --- /dev/null +++ b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0379 | |0 | +| - p1 | | | |f1 | | 0.0379 | | 0 | +| - p2 | | | |f1 | | 0.0378 | | 0 | +| - p3 | | | |f1 | | 0.0379 | | 0 | +| - RE | | | |f1 | | 0.0891 | |0 | +| - p1 | | | |f1 | | 0.0602 | | 0 | +| - p2 | | | |f1 | | 0.1293 | | 0 | +| - p3 | | | |f1 | | 0.0778 | | 0 | diff --git a/csv_files/outputs/Henrychur__MMed-Llama-3-8B__pl__10shot.txt 
b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9dee6185f81350fdc85c72cb4a61be93b071cc61 --- /dev/null +++ b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3966 | |0 | +| - p1 | | | |f1 | | 0.3992 | | 0 | +| - p2 | | | |f1 | | 0.3916 | | 0 | +| - p3 | | | |f1 | | 0.3992 | | 0 | +| - RE | | | |f1 | | 0.1003 | |0 | +| - p1 | | | |f1 | | 0.0998 | | 0 | +| - p2 | | | |f1 | | 0.1055 | | 0 | +| - p3 | | | |f1 | | 0.0956 | | 0 | diff --git a/csv_files/outputs/Henrychur__MMed-Llama-3-8B__sk__0shot.txt b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a25091bbb80e0a2681548322565c52cb0858b07 --- /dev/null +++ b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0385 | |0 | +| - p1 | | | |f1 | | 0.0387 | | 0 | +| - p2 | | | |f1 | | 0.0380 | | 0 | +| - p3 | | | |f1 | | 0.0387 | | 0 | +| - RE | | | |f1 | | 0.0174 | |0 | +| - p1 | | | |f1 | | 0.0121 | | 0 | +| - p2 | | | |f1 | | 0.0280 | | 0 | +| - p3 | | | |f1 | | 0.0121 | | 0 | diff --git a/csv_files/outputs/Henrychur__MMed-Llama-3-8B__sk__10shot.txt b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c591c7f0a88ced816e237245a16bdc6d688db83 --- /dev/null +++ b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 
10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3507 | |0 | +| - p1 | | | |f1 | | 0.3444 | | 0 | +| - p2 | | | |f1 | | 0.3632 | | 0 | +| - p3 | | | |f1 | | 0.3444 | | 0 | +| - RE | | | |f1 | | 0.0884 | |0 | +| - p1 | | | |f1 | | 0.0734 | | 0 | +| - p2 | | | |f1 | | 0.1045 | | 0 | +| - p3 | | | |f1 | | 0.0875 | | 0 | diff --git a/csv_files/outputs/Henrychur__MMed-Llama-3-8B__sl__0shot.txt b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..af66f9c26430a2440fce61f08cdf1c00204b2cf0 --- /dev/null +++ b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0438 | |0 | +| - p1 | | | |f1 | | 0.0429 | | 0 | +| - p2 | | | |f1 | | 0.0456 | | 0 | +| - p3 | | | |f1 | | 0.0429 | | 0 | +| - RE | | | |f1 | | 0.1278 | |0 | +| - p1 | | | |f1 | | 0.0967 | | 0 | +| - p2 | | | |f1 | | 0.1900 | | 0 | +| - p3 | | | |f1 | | 0.0967 | | 0 | diff --git a/csv_files/outputs/Henrychur__MMed-Llama-3-8B__sl__10shot.txt b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5a52d295a6f2f02b23f1a057560f2abba92d1b8 --- /dev/null +++ b/csv_files/outputs/Henrychur__MMed-Llama-3-8B__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3720 | |0 | +| - p1 | | | |f1 | | 0.3558 | | 0 | +| - p2 | | | |f1 | | 0.4045 | | 0 | +| - p3 | | | |f1 | | 0.3558 | | 0 | +| - RE | | | |f1 | | 0.0762 | |0 | +| - p1 | 
| | |f1 | | 0.0787 | | 0 | +| - p2 | | | |f1 | | 0.0781 | | 0 | +| - p3 | | | |f1 | | 0.0719 | | 0 | diff --git a/csv_files/outputs/HiTZ__Medical-mT5-large__en__0shot.txt b/csv_files/outputs/HiTZ__Medical-mT5-large__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f2276bf179a07b2a459292322afb0dd5fbc5788 --- /dev/null +++ b/csv_files/outputs/HiTZ__Medical-mT5-large__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0578 | |0 | +| - p1 | | | |f1 | | 0.0940 | | 0 | +| - p2 | | | |f1 | | 0.0331 | | 0 | +| - p3 | | | |f1 | | 0.0464 | | 0 | +| - RE | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_files/outputs/HiTZ__Medical-mT5-large__en__10shot.txt b/csv_files/outputs/HiTZ__Medical-mT5-large__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..765d090e468a522437f45edb65fc5d65485264b7 --- /dev/null +++ b/csv_files/outputs/HiTZ__Medical-mT5-large__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1317 | |0 | +| - p1 | | | |f1 | | 0.1215 | | 0 | +| - p2 | | | |f1 | | 0.1415 | | 0 | +| - p3 | | | |f1 | | 0.1322 | | 0 | +| - RE | | | |f1 | | 0.0031 | |0 | +| - p1 | | | |f1 | | 0.0028 | | 0 | +| - p2 | | | |f1 | | 0.0016 | | 0 | +| - p3 | | | |f1 | | 0.0049 | | 0 | diff --git a/csv_files/outputs/HiTZ__Medical-mT5-large__gr__0shot.txt b/csv_files/outputs/HiTZ__Medical-mT5-large__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cc7d4a784cb754ed058341765da74fe59e4950e --- /dev/null 
+++ b/csv_files/outputs/HiTZ__Medical-mT5-large__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0769 | |0 | +| - p1 | | | |f1 | | 0.0859 | | 0 | +| - p2 | | | |f1 | | 0.0591 | | 0 | +| - p3 | | | |f1 | | 0.0859 | | 0 | +| - RE | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_files/outputs/HiTZ__Medical-mT5-large__gr__10shot.txt b/csv_files/outputs/HiTZ__Medical-mT5-large__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..4edd50dc0d05a279ed9a6be3efb12660fc646344 --- /dev/null +++ b/csv_files/outputs/HiTZ__Medical-mT5-large__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1448 | |0 | +| - p1 | | | |f1 | | 0.1455 | | 0 | +| - p2 | | | |f1 | | 0.1434 | | 0 | +| - p3 | | | |f1 | | 0.1455 | | 0 | +| - RE | | | |f1 | | 0.0010 | |0 | +| - p1 | | | |f1 | | 0.0024 | | 0 | +| - p2 | | | |f1 | | 0.0007 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_files/outputs/HiTZ__Medical-mT5-large__it__0shot.txt b/csv_files/outputs/HiTZ__Medical-mT5-large__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..89aa974a3e785f05de8765a3214c2c1b54683fa4 --- /dev/null +++ b/csv_files/outputs/HiTZ__Medical-mT5-large__it__0shot.txt @@ -0,0 +1,10 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0812 | |0 | +| - p1 | | | |f1 | | 0.0770 | | 0 | +| - p2 | 
| | |f1 | | 0.0920 | | 0 | +| - p3 | | | |f1 | | 0.0747 | | 0 | +| - RE | | | |f1 | | 0.0000 | |0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_files/outputs/HiTZ__Medical-mT5-large__it__10shot.txt b/csv_files/outputs/HiTZ__Medical-mT5-large__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b5e4935064fdb46893c4e273f664a6bc5a4bf79 --- /dev/null +++ b/csv_files/outputs/HiTZ__Medical-mT5-large__it__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1694 | |0 | +| - p1 | | | |f1 | | 0.1616 | | 0 | +| - p2 | | | |f1 | | 0.1774 | | 0 | +| - p3 | | | |f1 | | 0.1690 | | 0 | +| - RE | | | |f1 | | 0.0048 | |0 | +| - p1 | | | |f1 | | 0.0035 | | 0 | +| - p2 | | | |f1 | | 0.0064 | | 0 | +| - p3 | | | |f1 | | 0.0046 | | 0 | diff --git a/csv_files/outputs/HiTZ__Medical-mT5-large__pl__0shot.txt b/csv_files/outputs/HiTZ__Medical-mT5-large__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3febb68ea3e21f9230cb485500075ba859f318f --- /dev/null +++ b/csv_files/outputs/HiTZ__Medical-mT5-large__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0308 | |0 | +| - p1 | | | |f1 | | 0.0244 | | 0 | +| - p2 | | | |f1 | | 0.0436 | | 0 | +| - p3 | | | |f1 | | 0.0244 | | 0 | +| - RE | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_files/outputs/HiTZ__Medical-mT5-large__pl__10shot.txt b/csv_files/outputs/HiTZ__Medical-mT5-large__pl__10shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..9c96e416317f7b151616c4982e8c0640322bb615 --- /dev/null +++ b/csv_files/outputs/HiTZ__Medical-mT5-large__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1516 | |0 | +| - p1 | | | |f1 | | 0.1500 | | 0 | +| - p2 | | | |f1 | | 0.1548 | | 0 | +| - p3 | | | |f1 | | 0.1500 | | 0 | +| - RE | | | |f1 | | 0.0032 | |0 | +| - p1 | | | |f1 | | 0.0040 | | 0 | +| - p2 | | | |f1 | | 0.0023 | | 0 | +| - p3 | | | |f1 | | 0.0034 | | 0 | diff --git a/csv_files/outputs/HiTZ__Medical-mT5-large__sk__0shot.txt b/csv_files/outputs/HiTZ__Medical-mT5-large__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..75cf3c4ce337fe7f13221bf8b230f9c267ae3639 --- /dev/null +++ b/csv_files/outputs/HiTZ__Medical-mT5-large__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0712 | |0 | +| - p1 | | | |f1 | | 0.0880 | | 0 | +| - p2 | | | |f1 | | 0.0375 | | 0 | +| - p3 | | | |f1 | | 0.0880 | | 0 | +| - RE | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_files/outputs/HiTZ__Medical-mT5-large__sk__10shot.txt b/csv_files/outputs/HiTZ__Medical-mT5-large__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce7ca5e76b585007a9dc187a6dd14ae6e22f17cc --- /dev/null +++ b/csv_files/outputs/HiTZ__Medical-mT5-large__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| 
+|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1444 | |0 | +| - p1 | | | |f1 | | 0.1485 | | 0 | +| - p2 | | | |f1 | | 0.1360 | | 0 | +| - p3 | | | |f1 | | 0.1485 | | 0 | +| - RE | | | |f1 | | 0.0027 | |0 | +| - p1 | | | |f1 | | 0.0038 | | 0 | +| - p2 | | | |f1 | | 0.0024 | | 0 | +| - p3 | | | |f1 | | 0.0020 | | 0 | diff --git a/csv_files/outputs/HiTZ__Medical-mT5-large__sl__0shot.txt b/csv_files/outputs/HiTZ__Medical-mT5-large__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8811248dde3d8e1e5d3e5bd0c4d11888b8adad09 --- /dev/null +++ b/csv_files/outputs/HiTZ__Medical-mT5-large__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0711 | |0 | +| - p1 | | | |f1 | | 0.0777 | | 0 | +| - p2 | | | |f1 | | 0.0579 | | 0 | +| - p3 | | | |f1 | | 0.0777 | | 0 | +| - RE | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_files/outputs/HiTZ__Medical-mT5-large__sl__10shot.txt b/csv_files/outputs/HiTZ__Medical-mT5-large__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..97237b461fcde9621e1b414675820a8989f1add9 --- /dev/null +++ b/csv_files/outputs/HiTZ__Medical-mT5-large__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1422 | |0 | +| - p1 | | | |f1 | | 0.1470 | | 0 | +| - p2 | | | |f1 | | 0.1325 | | 0 | +| - p3 | | | |f1 | | 0.1470 | | 0 | +| - RE | | | |f1 | | 0.0080 | |0 | +| - p1 | | | |f1 | | 0.0073 | | 0 | +| - p2 | | | |f1 | | 0.0074 | | 0 | +| - p3 | | | |f1 | | 0.0093 | | 0 
| diff --git a/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__en__0shot.txt b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e6243a974bbae1a9614d27b4d2e11522417cb4d --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2500 | |0 | +| - p1 | | | |f1 | | 0.3425 | | 0 | +| - p2 | | | |f1 | | 0.1181 | | 0 | +| - p3 | | | |f1 | | 0.2893 | | 0 | +| - RE | | | |f1 | | 0.4075 | |0 | +| - p1 | | | |f1 | | 0.4135 | | 0 | +| - p2 | | | |f1 | | 0.3917 | | 0 | +| - p3 | | | |f1 | | 0.4172 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__en__10shot.txt b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a27ebddd1aefdbbdb6edd25f8352b15456cf81a7 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5993 | |0 | +| - p1 | | | |f1 | | 0.6091 | | 0 | +| - p2 | | | |f1 | | 0.5646 | | 0 | +| - p3 | | | |f1 | | 0.6243 | | 0 | +| - RE | | | |f1 | | 0.6164 | |0 | +| - p1 | | | |f1 | | 0.6332 | | 0 | +| - p2 | | | |f1 | | 0.6025 | | 0 | +| - p3 | | | |f1 | | 0.6133 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__gr__0shot.txt b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..32e68359dde026f73f4bdc753c7293e1d097dd76 --- /dev/null +++ 
b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1290 | |0 | +| - p1 | | | |f1 | | 0.1339 | | 0 | +| - p2 | | | |f1 | | 0.1191 | | 0 | +| - p3 | | | |f1 | | 0.1339 | | 0 | +| - RE | | | |f1 | | 0.3957 | |0 | +| - p1 | | | |f1 | | 0.3796 | | 0 | +| - p2 | | | |f1 | | 0.4266 | | 0 | +| - p3 | | | |f1 | | 0.3810 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__gr__10shot.txt b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8435fa43de5b6d649de6e305295728062df17d85 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6028 | |0 | +| - p1 | | | |f1 | | 0.6119 | | 0 | +| - p2 | | | |f1 | | 0.5847 | | 0 | +| - p3 | | | |f1 | | 0.6119 | | 0 | +| - RE | | | |f1 | | 0.6056 | |0 | +| - p1 | | | |f1 | | 0.5962 | | 0 | +| - p2 | | | |f1 | | 0.6024 | | 0 | +| - p3 | | | |f1 | | 0.6183 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__it__0shot.txt b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..273a22dfa278f3ac9935c7789f1efc8bc0c51068 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__it__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 
0.2137 | |0 | +| - p1 | | | |f1 | | 0.2467 | | 0 | +| - p2 | | | |f1 | | 0.1709 | | 0 | +| - p3 | | | |f1 | | 0.2234 | | 0 | +| - RE | | | |f1 | | 0.4016 | |0 | +| - p1 | | | |f1 | | 0.4173 | | 0 | +| - p2 | | | |f1 | | 0.3770 | | 0 | +| - p3 | | | |f1 | | 0.4106 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__it__10shot.txt b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e78b0cd1d68e177e93c6ae63d60ff33e934b1cd --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__it__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6569 | |0 | +| - p1 | | | |f1 | | 0.6719 | | 0 | +| - p2 | | | |f1 | | 0.6327 | | 0 | +| - p3 | | | |f1 | | 0.6661 | | 0 | +| - RE | | | |f1 | | 0.5952 | |0 | +| - p1 | | | |f1 | | 0.5767 | | 0 | +| - p2 | | | |f1 | | 0.5998 | | 0 | +| - p3 | | | |f1 | | 0.6093 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__pl__0shot.txt b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..2bbaa4e441dac6b8c9ed99f717bd896a34a45e3d --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0586 | |0 | +| - p1 | | | |f1 | | 0.0697 | | 0 | +| - p2 | | | |f1 | | 0.0364 | | 0 | +| - p3 | | | |f1 | | 0.0697 | | 0 | +| - RE | | | |f1 | | 0.4022 | |0 | +| - p1 | | | |f1 | | 0.3803 | | 0 | +| - p2 | | | |f1 | | 0.4464 | | 0 | +| - p3 | | | |f1 | | 0.3800 | | 0 | diff --git 
a/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__pl__10shot.txt b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..465d16af61fd9338c7188c53fbf60f164ed3aac6 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6092 | |0 | +| - p1 | | | |f1 | | 0.6226 | | 0 | +| - p2 | | | |f1 | | 0.5824 | | 0 | +| - p3 | | | |f1 | | 0.6226 | | 0 | +| - RE | | | |f1 | | 0.5944 | |0 | +| - p1 | | | |f1 | | 0.5991 | | 0 | +| - p2 | | | |f1 | | 0.5466 | | 0 | +| - p3 | | | |f1 | | 0.6375 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sk__0shot.txt b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8660df7e3f0f119e44cf5a67e7a942f913b8aa4d --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0955 | |0 | +| - p1 | | | |f1 | | 0.1220 | | 0 | +| - p2 | | | |f1 | | 0.0426 | | 0 | +| - p3 | | | |f1 | | 0.1220 | | 0 | +| - RE | | | |f1 | | 0.4116 | |0 | +| - p1 | | | |f1 | | 0.4027 | | 0 | +| - p2 | | | |f1 | | 0.4294 | | 0 | +| - p3 | | | |f1 | | 0.4027 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sk__10shot.txt b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..63b5158840c219e67fbf758e2ed730ca530afe7d --- /dev/null +++ 
b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6419 | |0 | +| - p1 | | | |f1 | | 0.6386 | | 0 | +| - p2 | | | |f1 | | 0.6486 | | 0 | +| - p3 | | | |f1 | | 0.6386 | | 0 | +| - RE | | | |f1 | | 0.5899 | |0 | +| - p1 | | | |f1 | | 0.5894 | | 0 | +| - p2 | | | |f1 | | 0.5845 | | 0 | +| - p3 | | | |f1 | | 0.5959 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sl__0shot.txt b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..52a254555d051acdd5ed2169b161e4db6559e7f6 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3398 | |0 | +| - p1 | | | |f1 | | 0.3910 | | 0 | +| - p2 | | | |f1 | | 0.2375 | | 0 | +| - p3 | | | |f1 | | 0.3910 | | 0 | +| - RE | | | |f1 | | 0.3777 | |0 | +| - p1 | | | |f1 | | 0.3775 | | 0 | +| - p2 | | | |f1 | | 0.3783 | | 0 | +| - p3 | | | |f1 | | 0.3775 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sl__10shot.txt b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..11a5d3eb944b1de7399b5736ad5127c36767eac5 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-14B-Instruct-1M__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 
0.6371 | |0 | +| - p1 | | | |f1 | | 0.6467 | | 0 | +| - p2 | | | |f1 | | 0.6178 | | 0 | +| - p3 | | | |f1 | | 0.6467 | | 0 | +| - RE | | | |f1 | | 0.5837 | |0 | +| - p1 | | | |f1 | | 0.5949 | | 0 | +| - p2 | | | |f1 | | 0.5782 | | 0 | +| - p3 | | | |f1 | | 0.5781 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__en__0shot.txt b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d46997ae1107c126bb8219af5d19f4b69f60a6d --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3279 | |0 | +| - p1 | | | |f1 | | 0.3804 | | 0 | +| - p2 | | | |f1 | | 0.3068 | | 0 | +| - p3 | | | |f1 | | 0.2964 | | 0 | +| - RE | | | |f1 | | 0.4658 | |0 | +| - p1 | | | |f1 | | 0.4734 | | 0 | +| - p2 | | | |f1 | | 0.4649 | | 0 | +| - p3 | | | |f1 | | 0.4591 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__en__10shot.txt b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..5071ef1d6625c2e56e68013ab891e5757b1af187 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5895 | |0 | +| - p1 | | | |f1 | | 0.5970 | | 0 | +| - p2 | | | |f1 | | 0.5602 | | 0 | +| - p3 | | | |f1 | | 0.6113 | | 0 | +| - RE | | | |f1 | | 0.6440 | |0 | +| - p1 | | | |f1 | | 0.6482 | | 0 | +| - p2 | | | |f1 | | 0.6469 | | 0 | +| - p3 | | | |f1 | | 0.6370 | | 0 | diff --git 
a/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__gr__0shot.txt b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d091d2dd0d08ddd7d9ae2f74d581e4787f4ebf9 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4506 | |0 | +| - p1 | | | |f1 | | 0.5976 | | 0 | +| - p2 | | | |f1 | | 0.1568 | | 0 | +| - p3 | | | |f1 | | 0.5976 | | 0 | +| - RE | | | |f1 | | 0.4104 | |0 | +| - p1 | | | |f1 | | 0.4393 | | 0 | +| - p2 | | | |f1 | | 0.4083 | | 0 | +| - p3 | | | |f1 | | 0.3834 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__gr__10shot.txt b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa6241f9b435b69937d53ca833cc5a27fa25c2c0 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6175 | |0 | +| - p1 | | | |f1 | | 0.6196 | | 0 | +| - p2 | | | |f1 | | 0.6131 | | 0 | +| - p3 | | | |f1 | | 0.6196 | | 0 | +| - RE | | | |f1 | | 0.5840 | |0 | +| - p1 | | | |f1 | | 0.5913 | | 0 | +| - p2 | | | |f1 | | 0.5896 | | 0 | +| - p3 | | | |f1 | | 0.5710 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__it__0shot.txt b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb5936448f68978fe3b04ed6fcf29377b929e3d0 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__it__0shot.txt @@ -0,0 +1,11 @@ 
+hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2734 | |0 | +| - p1 | | | |f1 | | 0.3758 | | 0 | +| - p2 | | | |f1 | | 0.1647 | | 0 | +| - p3 | | | |f1 | | 0.2796 | | 0 | +| - RE | | | |f1 | | 0.4370 | |0 | +| - p1 | | | |f1 | | 0.4505 | | 0 | +| - p2 | | | |f1 | | 0.4159 | | 0 | +| - p3 | | | |f1 | | 0.4447 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__it__10shot.txt b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fd68069be248fb9602424d0ab5e675f83263e82 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__it__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.7005 | |0 | +| - p1 | | | |f1 | | 0.6934 | | 0 | +| - p2 | | | |f1 | | 0.7152 | | 0 | +| - p3 | | | |f1 | | 0.6930 | | 0 | +| - RE | | | |f1 | | 0.5641 | |0 | +| - p1 | | | |f1 | | 0.5801 | | 0 | +| - p2 | | | |f1 | | 0.5595 | | 0 | +| - p3 | | | |f1 | | 0.5526 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__pl__0shot.txt b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4f8030e0178b97b248945b2973d52689441048e --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2428 | |0 | +| - p1 | | | |f1 | | 0.2486 | | 0 | +| - p2 | | | |f1 | | 0.2311 | | 0 | +| - p3 | | | |f1 | | 
0.2486 | | 0 | +| - RE | | | |f1 | | 0.4074 | |0 | +| - p1 | | | |f1 | | 0.3865 | | 0 | +| - p2 | | | |f1 | | 0.4569 | | 0 | +| - p3 | | | |f1 | | 0.3788 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__pl__10shot.txt b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0657f5bc039e0ef6c46d0a9ab79ea5c33277f47 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6006 | |0 | +| - p1 | | | |f1 | | 0.6008 | | 0 | +| - p2 | | | |f1 | | 0.6004 | | 0 | +| - p3 | | | |f1 | | 0.6008 | | 0 | +| - RE | | | |f1 | | 0.5888 | |0 | +| - p1 | | | |f1 | | 0.5858 | | 0 | +| - p2 | | | |f1 | | 0.5868 | | 0 | +| - p3 | | | |f1 | | 0.5938 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__sk__0shot.txt b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c2c921f81dfd861433916d7a82eae8f0794ee40 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3375 | |0 | +| - p1 | | | |f1 | | 0.3578 | | 0 | +| - p2 | | | |f1 | | 0.2968 | | 0 | +| - p3 | | | |f1 | | 0.3578 | | 0 | +| - RE | | | |f1 | | 0.4031 | |0 | +| - p1 | | | |f1 | | 0.3971 | | 0 | +| - p2 | | | |f1 | | 0.4152 | | 0 | +| - p3 | | | |f1 | | 0.3971 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__sk__10shot.txt b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__sk__10shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..ccd3f8f6a3d5adfc50bb93253d2b1a2baddb48ea --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6720 | |0 | +| - p1 | | | |f1 | | 0.6743 | | 0 | +| - p2 | | | |f1 | | 0.6673 | | 0 | +| - p3 | | | |f1 | | 0.6743 | | 0 | +| - RE | | | |f1 | | 0.5643 | |0 | +| - p1 | | | |f1 | | 0.5733 | | 0 | +| - p2 | | | |f1 | | 0.5586 | | 0 | +| - p3 | | | |f1 | | 0.5609 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__sl__0shot.txt b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..12d98d519252a33b36dae0af4719974c3d12e5c2 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3183 | |0 | +| - p1 | | | |f1 | | 0.3344 | | 0 | +| - p2 | | | |f1 | | 0.2863 | | 0 | +| - p3 | | | |f1 | | 0.3344 | | 0 | +| - RE | | | |f1 | | 0.4048 | |0 | +| - p1 | | | |f1 | | 0.3979 | | 0 | +| - p2 | | | |f1 | | 0.4186 | | 0 | +| - p3 | | | |f1 | | 0.3979 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__sl__10shot.txt b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..45927874109b49e1ce1db253c58c78ab3ea1a926 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen2.5-32B-Instruct__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| 
+|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6373 | |0 | +| - p1 | | | |f1 | | 0.6253 | | 0 | +| - p2 | | | |f1 | | 0.6615 | | 0 | +| - p3 | | | |f1 | | 0.6253 | | 0 | +| - RE | | | |f1 | | 0.5727 | |0 | +| - p1 | | | |f1 | | 0.5992 | | 0 | +| - p2 | | | |f1 | | 0.5849 | | 0 | +| - p3 | | | |f1 | | 0.5339 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__en__0shot.txt b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..6420cfe9d439aeff3b1a1d5a016a08fc48833326 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__en__0shot.txt @@ -0,0 +1,10 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - RE | | | |f1 | | 0.4141 | |0 | +| - p1 | | | |f1 | | 0.4394 | | 0 | +| - p2 | | | |f1 | | 0.4031 | | 0 | +| - p3 | | | |f1 | | 0.3997 | | 0 | +| - NER | | | |f1 | | 0.4445 | |0 | +| - p2 | | | |f1 | | 0.4162 | | 0 | +| - p3 | | | |f1 | | 0.4729 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__en__10shot.txt b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..09d6c20e438665ebd65645a507c560c6cb20a278 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5907 | |0 | +| - p1 | | | |f1 | | 0.5986 | | 0 | +| - p2 | | | |f1 | | 0.5593 | | 0 | +| - p3 | | | |f1 | | 0.6143 | | 0 | +| - RE | | | |f1 | | 0.5259 | |0 | +| - p1 | | | |f1 | | 0.5150 | | 0 | +| - p2 | | | |f1 | | 0.5261 
| | 0 | +| - p3 | | | |f1 | | 0.5364 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__gr__0shot.txt b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9679a42ef05e59d1976f28f381ab016e9bd01f2b --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4368 | |0 | +| - p1 | | | |f1 | | 0.4291 | | 0 | +| - p2 | | | |f1 | | 0.4521 | | 0 | +| - p3 | | | |f1 | | 0.4291 | | 0 | +| - RE | | | |f1 | | 0.3776 | |0 | +| - p1 | | | |f1 | | 0.3733 | | 0 | +| - p2 | | | |f1 | | 0.3799 | | 0 | +| - p3 | | | |f1 | | 0.3798 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__gr__10shot.txt b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7b5e451b837ea640578ca02095bc52a621c7ee1 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5999 | |0 | +| - p1 | | | |f1 | | 0.6164 | | 0 | +| - p2 | | | |f1 | | 0.5669 | | 0 | +| - p3 | | | |f1 | | 0.6164 | | 0 | +| - RE | | | |f1 | | 0.5149 | |0 | +| - p1 | | | |f1 | | 0.5015 | | 0 | +| - p2 | | | |f1 | | 0.5209 | | 0 | +| - p3 | | | |f1 | | 0.5223 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__it__0shot.txt b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__it__0shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..d32b05f0de3224777b7aae2d6385c38711a575a8 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__it__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3572 | |0 | +| - p1 | | | |f1 | | 0.0885 | | 0 | +| - p2 | | | |f1 | | 0.5316 | | 0 | +| - p3 | | | |f1 | | 0.4514 | | 0 | +| - RE | | | |f1 | | 0.3959 | |0 | +| - p1 | | | |f1 | | 0.3784 | | 0 | +| - p2 | | | |f1 | | 0.4123 | | 0 | +| - p3 | | | |f1 | | 0.3972 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__it__10shot.txt b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9dbb95dc7c82ed188142cf9ef0a295bfd27bb3e5 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__it__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6673 | |0 | +| - p1 | | | |f1 | | 0.6793 | | 0 | +| - p2 | | | |f1 | | 0.6447 | | 0 | +| - p3 | | | |f1 | | 0.6778 | | 0 | +| - RE | | | |f1 | | 0.5982 | |0 | +| - p1 | | | |f1 | | 0.6041 | | 0 | +| - p2 | | | |f1 | | 0.5838 | | 0 | +| - p3 | | | |f1 | | 0.6065 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__pl__0shot.txt b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6358c6bb902fb22713a21ed802c947dd78e7ea6 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 0, batch_size: 1 
+|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4235 | |0 | +| - p1 | | | |f1 | | 0.4332 | | 0 | +| - p2 | | | |f1 | | 0.4043 | | 0 | +| - p3 | | | |f1 | | 0.4332 | | 0 | +| - RE | | | |f1 | | 0.4186 | |0 | +| - p1 | | | |f1 | | 0.4152 | | 0 | +| - p2 | | | |f1 | | 0.4220 | | 0 | +| - p3 | | | |f1 | | 0.4187 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__pl__10shot.txt b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f116f8c7deee3f443c689514bb8a23fdb8d305c --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6118 | |0 | +| - p1 | | | |f1 | | 0.6276 | | 0 | +| - p2 | | | |f1 | | 0.5803 | | 0 | +| - p3 | | | |f1 | | 0.6276 | | 0 | +| - RE | | | |f1 | | 0.5166 | |0 | +| - p1 | | | |f1 | | 0.5103 | | 0 | +| - p2 | | | |f1 | | 0.5200 | | 0 | +| - p3 | | | |f1 | | 0.5195 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__sk__0shot.txt b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..82f30fd004efa2674df97a1dae911f0a92ff3e26 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3287 | |0 | +| - p1 | | | |f1 | | 0.3231 | | 0 | +| - p2 | | | |f1 | | 0.3398 | | 0 | +| - p3 | | | |f1 | | 0.3231 | | 0 | +| - RE 
| | | |f1 | | 0.3943 | |0 | +| - p1 | | | |f1 | | 0.3980 | | 0 | +| - p2 | | | |f1 | | 0.3867 | | 0 | +| - p3 | | | |f1 | | 0.3980 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__sk__10shot.txt b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..652672223f87eeb324263928437a787e75b87b20 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6030 | |0 | +| - p1 | | | |f1 | | 0.6085 | | 0 | +| - p2 | | | |f1 | | 0.5919 | | 0 | +| - p3 | | | |f1 | | 0.6085 | | 0 | +| - RE | | | |f1 | | 0.5106 | |0 | +| - p1 | | | |f1 | | 0.4920 | | 0 | +| - p2 | | | |f1 | | 0.5025 | | 0 | +| - p3 | | | |f1 | | 0.5373 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__sl__0shot.txt b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..126b784a0d6414a7ebb39eb6954f1444f9a726e9 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4501 | |0 | +| - p1 | | | |f1 | | 0.4486 | | 0 | +| - p2 | | | |f1 | | 0.4531 | | 0 | +| - p3 | | | |f1 | | 0.4486 | | 0 | +| - RE | | | |f1 | | 0.4118 | |0 | +| - p1 | | | |f1 | | 0.4115 | | 0 | +| - p2 | | | |f1 | | 0.4126 | | 0 | +| - p3 | | | |f1 | | 0.4115 | | 0 | diff --git a/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__sl__10shot.txt 
b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..827b5e0d0dd790eea628cc4c77b18800829dd3d5 --- /dev/null +++ b/csv_files/outputs/Qwen__Qwen3-30B-A3B-Instruct-2507__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6391 | |0 | +| - p1 | | | |f1 | | 0.6615 | | 0 | +| - p2 | | | |f1 | | 0.5944 | | 0 | +| - p3 | | | |f1 | | 0.6615 | | 0 | +| - RE | | | |f1 | | 0.5356 | |0 | +| - p1 | | | |f1 | | 0.5062 | | 0 | +| - p2 | | | |f1 | | 0.5576 | | 0 | +| - p3 | | | |f1 | | 0.5429 | | 0 | diff --git a/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot.txt b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..37a0a0b3fcec4413d915fb8b17302fe6c93286c4 --- /dev/null +++ b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2877 | |0 | +| - p1 | | | |f1 | | 0.1963 | | 0 | +| - p2 | | | |f1 | | 0.3459 | | 0 | +| - p3 | | | |f1 | | 0.3208 | | 0 | +| - RE | | | |f1 | | 0.4430 | |0 | +| - p1 | | | |f1 | | 0.4487 | | 0 | +| - p2 | | | |f1 | | 0.4492 | | 0 | +| - p3 | | | |f1 | | 0.4311 | | 0 | diff --git a/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__10shot.txt b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1353488cb49ca2c17d3d3e38c7a9b5efe1528a4 --- /dev/null +++ 
b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5963 | |0 | +| - p1 | | | |f1 | | 0.6024 | | 0 | +| - p2 | | | |f1 | | 0.5929 | | 0 | +| - p3 | | | |f1 | | 0.5935 | | 0 | +| - RE | | | |f1 | | 0.5221 | |0 | +| - p1 | | | |f1 | | 0.5191 | | 0 | +| - p2 | | | |f1 | | 0.5199 | | 0 | +| - p3 | | | |f1 | | 0.5273 | | 0 | diff --git a/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__gr__0shot.txt b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..79aa97329e86a168483edd679e8cc64109aed7a6 --- /dev/null +++ b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3421 | |0 | +| - p1 | | | |f1 | | 0.3455 | | 0 | +| - p2 | | | |f1 | | 0.3354 | | 0 | +| - p3 | | | |f1 | | 0.3455 | | 0 | +| - RE | | | |f1 | | 0.3485 | |0 | +| - p1 | | | |f1 | | 0.2406 | | 0 | +| - p2 | | | |f1 | | 0.3947 | | 0 | +| - p3 | | | |f1 | | 0.4102 | | 0 | diff --git a/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__gr__10shot.txt b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..d10be9c5334b54f3adcb1cee0c3d5a9defc21084 --- /dev/null +++ b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 10, batch_size: 1 +|Tasks 
|Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5884 | |0 | +| - p1 | | | |f1 | | 0.5928 | | 0 | +| - p2 | | | |f1 | | 0.5796 | | 0 | +| - p3 | | | |f1 | | 0.5928 | | 0 | +| - RE | | | |f1 | | 0.4415 | |0 | +| - p1 | | | |f1 | | 0.4467 | | 0 | +| - p2 | | | |f1 | | 0.4210 | | 0 | +| - p3 | | | |f1 | | 0.4569 | | 0 | diff --git a/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__it__0shot.txt b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..5db52b32f4fc4b7478fd7b3ec2f0063bd67713ec --- /dev/null +++ b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__it__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3220 | |0 | +| - p1 | | | |f1 | | 0.2678 | | 0 | +| - p2 | | | |f1 | | 0.3568 | | 0 | +| - p3 | | | |f1 | | 0.3414 | | 0 | +| - RE | | | |f1 | | 0.4452 | |0 | +| - p1 | | | |f1 | | 0.4519 | | 0 | +| - p2 | | | |f1 | | 0.4611 | | 0 | +| - p3 | | | |f1 | | 0.4227 | | 0 | diff --git a/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__it__10shot.txt b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..df97d37a91573d214913b317e4b83ba9899e389a --- /dev/null +++ b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__it__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6864 | |0 | +| - p1 | | | |f1 | | 0.6982 | | 0 | +| - p2 | | | |f1 | | 
0.6679 | | 0 | +| - p3 | | | |f1 | | 0.6930 | | 0 | +| - RE | | | |f1 | | 0.5530 | |0 | +| - p1 | | | |f1 | | 0.5546 | | 0 | +| - p2 | | | |f1 | | 0.5526 | | 0 | +| - p3 | | | |f1 | | 0.5518 | | 0 | diff --git a/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__pl__0shot.txt b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..16a21b3d60d0b28e03e3f22502e906d4f9d2586d --- /dev/null +++ b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3379 | |0 | +| - p1 | | | |f1 | | 0.3204 | | 0 | +| - p2 | | | |f1 | | 0.3728 | | 0 | +| - p3 | | | |f1 | | 0.3204 | | 0 | +| - RE | | | |f1 | | 0.4131 | |0 | +| - p1 | | | |f1 | | 0.3983 | | 0 | +| - p2 | | | |f1 | | 0.4327 | | 0 | +| - p3 | | | |f1 | | 0.4083 | | 0 | diff --git a/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__pl__10shot.txt b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..7bbdde1853e3107c0e3fa26a80be768aedf20a06 --- /dev/null +++ b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6189 | |0 | +| - p1 | | | |f1 | | 0.6214 | | 0 | +| - p2 | | | |f1 | | 0.6140 | | 0 | +| - p3 | | | |f1 | | 0.6214 | | 0 | +| - RE | | | |f1 | | 0.5023 | |0 | +| - p1 | | | |f1 | | 0.4863 | | 0 | +| - p2 | | | |f1 | | 0.5129 | | 0 | +| - p3 | | | |f1 | | 0.5076 | | 0 | diff --git 
a/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sk__0shot.txt b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffa46c1c5e22e9d8e038069c447c1f026cfc61f6 --- /dev/null +++ b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2521 | |0 | +| - p1 | | | |f1 | | 0.2829 | | 0 | +| - p2 | | | |f1 | | 0.1905 | | 0 | +| - p3 | | | |f1 | | 0.2829 | | 0 | +| - RE | | | |f1 | | 0.3959 | |0 | +| - p1 | | | |f1 | | 0.3893 | | 0 | +| - p2 | | | |f1 | | 0.4091 | | 0 | +| - p3 | | | |f1 | | 0.3893 | | 0 | diff --git a/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sk__10shot.txt b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..6516944fdac3db3ed3380f5c97391fae7dbc061d --- /dev/null +++ b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6302 | |0 | +| - p1 | | | |f1 | | 0.6347 | | 0 | +| - p2 | | | |f1 | | 0.6211 | | 0 | +| - p3 | | | |f1 | | 0.6347 | | 0 | +| - RE | | | |f1 | | 0.4646 | |0 | +| - p1 | | | |f1 | | 0.4799 | | 0 | +| - p2 | | | |f1 | | 0.4451 | | 0 | +| - p3 | | | |f1 | | 0.4689 | | 0 | diff --git a/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sl__0shot.txt b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sl__0shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..c0a0fe07a3970ef5a820c76cb9751944d12fdab2 --- /dev/null +++ b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2604 | |0 | +| - p1 | | | |f1 | | 0.2810 | | 0 | +| - p2 | | | |f1 | | 0.2192 | | 0 | +| - p3 | | | |f1 | | 0.2810 | | 0 | +| - RE | | | |f1 | | 0.4116 | |0 | +| - p1 | | | |f1 | | 0.4116 | | 0 | +| - p2 | | | |f1 | | 0.4115 | | 0 | +| - p3 | | | |f1 | | 0.4116 | | 0 | diff --git a/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sl__10shot.txt b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..698e2379856e9df40de4014fdbd473b61395c81b --- /dev/null +++ b/csv_files/outputs/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6026 | |0 | +| - p1 | | | |f1 | | 0.6015 | | 0 | +| - p2 | | | |f1 | | 0.6049 | | 0 | +| - p3 | | | |f1 | | 0.6015 | | 0 | +| - RE | | | |f1 | | 0.4911 | |0 | +| - p1 | | | |f1 | | 0.5137 | | 0 | +| - p2 | | | |f1 | | 0.4674 | | 0 | +| - p3 | | | |f1 | | 0.4923 | | 0 | diff --git a/csv_files/outputs/epfl-llm__meditron-7b__en__0shot.txt b/csv_files/outputs/epfl-llm__meditron-7b__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a378a1ac602f249f8f7988c361155b300f3edd65 --- /dev/null +++ b/csv_files/outputs/epfl-llm__meditron-7b__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 0, batch_size: 1 
+|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0612 | |0 | +| - p1 | | | |f1 | | 0.0578 | | 0 | +| - p2 | | | |f1 | | 0.0410 | | 0 | +| - p3 | | | |f1 | | 0.0848 | | 0 | +| - RE | | | |f1 | | 0.0313 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0442 | | 0 | +| - p3 | | | |f1 | | 0.0497 | | 0 | diff --git a/csv_files/outputs/epfl-llm__meditron-7b__en__10shot.txt b/csv_files/outputs/epfl-llm__meditron-7b__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..45de7e54ed94cfb5fcc9ab2c265f192e7aa9f981 --- /dev/null +++ b/csv_files/outputs/epfl-llm__meditron-7b__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1245 | |0 | +| - p1 | | | |f1 | | 0.0803 | | 0 | +| - p2 | | | |f1 | | 0.1479 | | 0 | +| - p3 | | | |f1 | | 0.1454 | | 0 | +| - RE | | | |f1 | | 0.0692 | |0 | +| - p1 | | | |f1 | | 0.0722 | | 0 | +| - p2 | | | |f1 | | 0.0692 | | 0 | +| - p3 | | | |f1 | | 0.0663 | | 0 | diff --git a/csv_files/outputs/epfl-llm__meditron-7b__gr__0shot.txt b/csv_files/outputs/epfl-llm__meditron-7b__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b733c326cb013320727e13c717645ad3b4ff775e --- /dev/null +++ b/csv_files/outputs/epfl-llm__meditron-7b__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2426 | |0 | +| - p1 | | | |f1 | | 0.2417 | | 0 | +| - p2 | | | |f1 | | 0.2443 | | 0 | +| - p3 | | | |f1 | | 0.2417 | | 0 | +| - RE | | | |f1 | | 0.0592 | |0 | +| - p1 | | | |f1 | | 0.1556 | | 0 | +| - p2 | | | |f1 | | 0.0161 | 
| 0 | +| - p3 | | | |f1 | | 0.0058 | | 0 | diff --git a/csv_files/outputs/epfl-llm__meditron-7b__gr__10shot.txt b/csv_files/outputs/epfl-llm__meditron-7b__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..87b319e4253b8aba65bfcf2e4ade2615fc2ae10e --- /dev/null +++ b/csv_files/outputs/epfl-llm__meditron-7b__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - RE | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_files/outputs/epfl-llm__meditron-7b__it__0shot.txt b/csv_files/outputs/epfl-llm__meditron-7b__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a0d1cb5b5e49884090c31b808c2bc2a7f01cf4c --- /dev/null +++ b/csv_files/outputs/epfl-llm__meditron-7b__it__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0639 | |0 | +| - p1 | | | |f1 | | 0.0773 | | 0 | +| - p2 | | | |f1 | | 0.0612 | | 0 | +| - p3 | | | |f1 | | 0.0531 | | 0 | +| - RE | | | |f1 | | 0.1072 | |0 | +| - p1 | | | |f1 | | 0.0020 | | 0 | +| - p2 | | | |f1 | | 0.1929 | | 0 | +| - p3 | | | |f1 | | 0.1268 | | 0 | diff --git a/csv_files/outputs/epfl-llm__meditron-7b__it__10shot.txt b/csv_files/outputs/epfl-llm__meditron-7b__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2aa7fd7dc8637dbd14ef01f078eceecddd04f15 --- /dev/null +++ b/csv_files/outputs/epfl-llm__meditron-7b__it__10shot.txt @@ -0,0 +1,11 
@@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3288 | |0 | +| - p1 | | | |f1 | | 0.2991 | | 0 | +| - p2 | | | |f1 | | 0.3563 | | 0 | +| - p3 | | | |f1 | | 0.3311 | | 0 | +| - RE | | | |f1 | | 0.0896 | |0 | +| - p1 | | | |f1 | | 0.0832 | | 0 | +| - p2 | | | |f1 | | 0.0887 | | 0 | +| - p3 | | | |f1 | | 0.0968 | | 0 | diff --git a/csv_files/outputs/epfl-llm__meditron-7b__pl__0shot.txt b/csv_files/outputs/epfl-llm__meditron-7b__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c763fab18fe2af421c37a99965e57159fb9f0dd --- /dev/null +++ b/csv_files/outputs/epfl-llm__meditron-7b__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1161 | |0 | +| - p1 | | | |f1 | | 0.1140 | | 0 | +| - p2 | | | |f1 | | 0.1203 | | 0 | +| - p3 | | | |f1 | | 0.1140 | | 0 | +| - RE | | | |f1 | | 0.0025 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0076 | | 0 | diff --git a/csv_files/outputs/epfl-llm__meditron-7b__pl__10shot.txt b/csv_files/outputs/epfl-llm__meditron-7b__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..14675a45035d0e69895142e3b0f6800ec9197583 --- /dev/null +++ b/csv_files/outputs/epfl-llm__meditron-7b__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3222 | |0 | +| - p1 | | | |f1 | | 0.3184 | | 0 | +| - p2 | | | |f1 | | 0.3297 | | 0 | +| - p3 | | | |f1 | | 0.3184 | | 0 | +| - RE | | | |f1 | | 0.0510 
| |0 | +| - p1 | | | |f1 | | 0.0533 | | 0 | +| - p2 | | | |f1 | | 0.0461 | | 0 | +| - p3 | | | |f1 | | 0.0535 | | 0 | diff --git a/csv_files/outputs/epfl-llm__meditron-7b__sk__0shot.txt b/csv_files/outputs/epfl-llm__meditron-7b__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..93e3d58f1c091c0ef7928d528ba2d95cfc046831 --- /dev/null +++ b/csv_files/outputs/epfl-llm__meditron-7b__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0778 | |0 | +| - p1 | | | |f1 | | 0.0874 | | 0 | +| - p2 | | | |f1 | | 0.0586 | | 0 | +| - p3 | | | |f1 | | 0.0874 | | 0 | +| - RE | | | |f1 | | 0.0034 | |0 | +| - p1 | | | |f1 | | 0.0036 | | 0 | +| - p2 | | | |f1 | | 0.0031 | | 0 | +| - p3 | | | |f1 | | 0.0036 | | 0 | diff --git a/csv_files/outputs/epfl-llm__meditron-7b__sk__10shot.txt b/csv_files/outputs/epfl-llm__meditron-7b__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..bad9a6c35cda030096e0a1ffe1e020b004d5263a --- /dev/null +++ b/csv_files/outputs/epfl-llm__meditron-7b__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2993 | |0 | +| - p1 | | | |f1 | | 0.3004 | | 0 | +| - p2 | | | |f1 | | 0.2970 | | 0 | +| - p3 | | | |f1 | | 0.3004 | | 0 | +| - RE | | | |f1 | | 0.0404 | |0 | +| - p1 | | | |f1 | | 0.0445 | | 0 | +| - p2 | | | |f1 | | 0.0393 | | 0 | +| - p3 | | | |f1 | | 0.0375 | | 0 | diff --git a/csv_files/outputs/epfl-llm__meditron-7b__sl__0shot.txt b/csv_files/outputs/epfl-llm__meditron-7b__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..cfb36aa25fe8478844f4e4741701e7cd84df3e6c --- /dev/null +++ 
b/csv_files/outputs/epfl-llm__meditron-7b__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0951 | |0 | +| - p1 | | | |f1 | | 0.1197 | | 0 | +| - p2 | | | |f1 | | 0.0460 | | 0 | +| - p3 | | | |f1 | | 0.1197 | | 0 | +| - RE | | | |f1 | | 0.0445 | |0 | +| - p1 | | | |f1 | | 0.0598 | | 0 | +| - p2 | | | |f1 | | 0.0137 | | 0 | +| - p3 | | | |f1 | | 0.0598 | | 0 | diff --git a/csv_files/outputs/epfl-llm__meditron-7b__sl__10shot.txt b/csv_files/outputs/epfl-llm__meditron-7b__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..eab52b0c7040bdf63e365ec759ff69b327922c10 --- /dev/null +++ b/csv_files/outputs/epfl-llm__meditron-7b__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3052 | |0 | +| - p1 | | | |f1 | | 0.3119 | | 0 | +| - p2 | | | |f1 | | 0.2916 | | 0 | +| - p3 | | | |f1 | | 0.3119 | | 0 | +| - RE | | | |f1 | | 0.0502 | |0 | +| - p1 | | | |f1 | | 0.0477 | | 0 | +| - p2 | | | |f1 | | 0.0501 | | 0 | +| - p3 | | | |f1 | | 0.0528 | | 0 | diff --git a/csv_files/outputs/google__gemma-2-9b-it__en__0shot.txt b/csv_files/outputs/google__gemma-2-9b-it__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..568fb0bef70896f666a03925b261a19ecc918295 --- /dev/null +++ b/csv_files/outputs/google__gemma-2-9b-it__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4603 | |0 | +| - p1 | | | |f1 | | 0.3267 | | 0 | +| - p2 | | | |f1 | | 0.5174 | | 
0 | +| - p3 | | | |f1 | | 0.5370 | | 0 | +| - RE | | | |f1 | | 0.4211 | |0 | +| - p1 | | | |f1 | | 0.4360 | | 0 | +| - p2 | | | |f1 | | 0.4205 | | 0 | +| - p3 | | | |f1 | | 0.4067 | | 0 | diff --git a/csv_files/outputs/google__gemma-2-9b-it__en__10shot.txt b/csv_files/outputs/google__gemma-2-9b-it__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd9ea572467adfa2c454f1539b60635facd6d39f --- /dev/null +++ b/csv_files/outputs/google__gemma-2-9b-it__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5919 | |0 | +| - p1 | | | |f1 | | 0.6200 | | 0 | +| - p2 | | | |f1 | | 0.5639 | | 0 | +| - p3 | | | |f1 | | 0.5918 | | 0 | +| - RE | | | |f1 | | 0.5303 | |0 | +| - p1 | | | |f1 | | 0.5163 | | 0 | +| - p2 | | | |f1 | | 0.5337 | | 0 | +| - p3 | | | |f1 | | 0.5409 | | 0 | diff --git a/csv_files/outputs/google__gemma-2-9b-it__gr__0shot.txt b/csv_files/outputs/google__gemma-2-9b-it__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..87226c5046c2278e0f2a6e57fa83aa395a61ca52 --- /dev/null +++ b/csv_files/outputs/google__gemma-2-9b-it__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5292 | |0 | +| - p1 | | | |f1 | | 0.5549 | | 0 | +| - p2 | | | |f1 | | 0.4777 | | 0 | +| - p3 | | | |f1 | | 0.5549 | | 0 | +| - RE | | | |f1 | | 0.4008 | |0 | +| - p1 | | | |f1 | | 0.4124 | | 0 | +| - p2 | | | |f1 | | 0.3957 | | 0 | +| - p3 | | | |f1 | | 0.3943 | | 0 | diff --git a/csv_files/outputs/google__gemma-2-9b-it__gr__10shot.txt b/csv_files/outputs/google__gemma-2-9b-it__gr__10shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..739bad8c7a5639671141f53c0413696e38d96592 --- /dev/null +++ b/csv_files/outputs/google__gemma-2-9b-it__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5943 | |0 | +| - p1 | | | |f1 | | 0.6083 | | 0 | +| - p2 | | | |f1 | | 0.5663 | | 0 | +| - p3 | | | |f1 | | 0.6083 | | 0 | +| - RE | | | |f1 | | 0.5162 | |0 | +| - p1 | | | |f1 | | 0.5070 | | 0 | +| - p2 | | | |f1 | | 0.4971 | | 0 | +| - p3 | | | |f1 | | 0.5444 | | 0 | diff --git a/csv_files/outputs/google__gemma-2-9b-it__it__0shot.txt b/csv_files/outputs/google__gemma-2-9b-it__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c1e51ad7fa14eeadd44b3fdc9a10c6ff1ae5784 --- /dev/null +++ b/csv_files/outputs/google__gemma-2-9b-it__it__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6158 | |0 | +| - p1 | | | |f1 | | 0.5739 | | 0 | +| - p2 | | | |f1 | | 0.6524 | | 0 | +| - p3 | | | |f1 | | 0.6210 | | 0 | +| - RE | | | |f1 | | 0.4298 | |0 | +| - p1 | | | |f1 | | 0.4585 | | 0 | +| - p2 | | | |f1 | | 0.4113 | | 0 | +| - p3 | | | |f1 | | 0.4196 | | 0 | diff --git a/csv_files/outputs/google__gemma-2-9b-it__it__10shot.txt b/csv_files/outputs/google__gemma-2-9b-it__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..51580ad2fec3d1b525363e7391cd794ab01ea59f --- /dev/null +++ b/csv_files/outputs/google__gemma-2-9b-it__it__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - 
NER | | | |f1 | | 0.6707 | |0 | +| - p1 | | | |f1 | | 0.6910 | | 0 | +| - p2 | | | |f1 | | 0.6643 | | 0 | +| - p3 | | | |f1 | | 0.6569 | | 0 | +| - RE | | | |f1 | | 0.5209 | |0 | +| - p1 | | | |f1 | | 0.4958 | | 0 | +| - p2 | | | |f1 | | 0.5365 | | 0 | +| - p3 | | | |f1 | | 0.5305 | | 0 | diff --git a/csv_files/outputs/google__gemma-2-9b-it__pl__0shot.txt b/csv_files/outputs/google__gemma-2-9b-it__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..f187b39ad69bd8bb0dbf697be691b129eadee340 --- /dev/null +++ b/csv_files/outputs/google__gemma-2-9b-it__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4092 | |0 | +| - p1 | | | |f1 | | 0.4060 | | 0 | +| - p2 | | | |f1 | | 0.4155 | | 0 | +| - p3 | | | |f1 | | 0.4060 | | 0 | +| - RE | | | |f1 | | 0.3891 | |0 | +| - p1 | | | |f1 | | 0.3674 | | 0 | +| - p2 | | | |f1 | | 0.4271 | | 0 | +| - p3 | | | |f1 | | 0.3729 | | 0 | diff --git a/csv_files/outputs/google__gemma-2-9b-it__pl__10shot.txt b/csv_files/outputs/google__gemma-2-9b-it__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ac1adba3779f7f68d4bfbf2cf88b163e3b84f4b --- /dev/null +++ b/csv_files/outputs/google__gemma-2-9b-it__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5893 | |0 | +| - p1 | | | |f1 | | 0.5908 | | 0 | +| - p2 | | | |f1 | | 0.5862 | | 0 | +| - p3 | | | |f1 | | 0.5908 | | 0 | +| - RE | | | |f1 | | 0.5033 | |0 | +| - p1 | | | |f1 | | 0.5168 | | 0 | +| - p2 | | | |f1 | | 0.4808 | | 0 | +| - p3 | | | |f1 | | 0.5124 | | 0 | diff --git a/csv_files/outputs/google__gemma-2-9b-it__sk__0shot.txt 
b/csv_files/outputs/google__gemma-2-9b-it__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f0bc370ed50b39ebc88b5e7e85f8a110f45a283 --- /dev/null +++ b/csv_files/outputs/google__gemma-2-9b-it__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4775 | |0 | +| - p1 | | | |f1 | | 0.4875 | | 0 | +| - p2 | | | |f1 | | 0.4575 | | 0 | +| - p3 | | | |f1 | | 0.4875 | | 0 | +| - RE | | | |f1 | | 0.4106 | |0 | +| - p1 | | | |f1 | | 0.3989 | | 0 | +| - p2 | | | |f1 | | 0.4340 | | 0 | +| - p3 | | | |f1 | | 0.3989 | | 0 | diff --git a/csv_files/outputs/google__gemma-2-9b-it__sk__10shot.txt b/csv_files/outputs/google__gemma-2-9b-it__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b909c3988f118020a3d985678e092a54be2f61f1 --- /dev/null +++ b/csv_files/outputs/google__gemma-2-9b-it__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6135 | |0 | +| - p1 | | | |f1 | | 0.6141 | | 0 | +| - p2 | | | |f1 | | 0.6122 | | 0 | +| - p3 | | | |f1 | | 0.6141 | | 0 | +| - RE | | | |f1 | | 0.5007 | |0 | +| - p1 | | | |f1 | | 0.5153 | | 0 | +| - p2 | | | |f1 | | 0.4754 | | 0 | +| - p3 | | | |f1 | | 0.5114 | | 0 | diff --git a/csv_files/outputs/google__gemma-2-9b-it__sl__0shot.txt b/csv_files/outputs/google__gemma-2-9b-it__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..98ce20cc51f351ff18c9e416c89a29a920d6bacd --- /dev/null +++ b/csv_files/outputs/google__gemma-2-9b-it__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| 
|Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4487 | |0 | +| - p1 | | | |f1 | | 0.4707 | | 0 | +| - p2 | | | |f1 | | 0.4046 | | 0 | +| - p3 | | | |f1 | | 0.4707 | | 0 | +| - RE | | | |f1 | | 0.4058 | |0 | +| - p1 | | | |f1 | | 0.4079 | | 0 | +| - p2 | | | |f1 | | 0.4016 | | 0 | +| - p3 | | | |f1 | | 0.4079 | | 0 | diff --git a/csv_files/outputs/google__gemma-2-9b-it__sl__10shot.txt b/csv_files/outputs/google__gemma-2-9b-it__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..e956839ba21e57d9f60059817766b32fe88d80a2 --- /dev/null +++ b/csv_files/outputs/google__gemma-2-9b-it__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6156 | |0 | +| - p1 | | | |f1 | | 0.6365 | | 0 | +| - p2 | | | |f1 | | 0.5737 | | 0 | +| - p3 | | | |f1 | | 0.6365 | | 0 | +| - RE | | | |f1 | | 0.4883 | |0 | +| - p1 | | | |f1 | | 0.4801 | | 0 | +| - p2 | | | |f1 | | 0.4878 | | 0 | +| - p3 | | | |f1 | | 0.4972 | | 0 | diff --git a/csv_files/outputs/google__gemma-3-27b-it__en__0shot.txt b/csv_files/outputs/google__gemma-3-27b-it__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..11bb43caf4d6194fe781b6d25ed4e6d8ba98ec60 --- /dev/null +++ b/csv_files/outputs/google__gemma-3-27b-it__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5490 | |0 | +| - p1 | | | |f1 | | 0.5446 | | 0 | +| - p2 | | | |f1 | | 0.5830 | | 0 | +| - p3 | | | |f1 | | 0.5194 | | 0 | +| - RE | | | |f1 | | 0.4623 | |0 | +| - p1 | | | |f1 | | 0.4543 | | 0 | +| - p2 | | | |f1 | | 0.4582 | | 0 | +| - p3 | | | |f1 | | 0.4743 
| | 0 | diff --git a/csv_files/outputs/google__gemma-3-27b-it__en__10shot.txt b/csv_files/outputs/google__gemma-3-27b-it__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b6e27e1e65dc4e4d80d829f031471763c651a1f --- /dev/null +++ b/csv_files/outputs/google__gemma-3-27b-it__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6187 | |0 | +| - p1 | | | |f1 | | 0.6160 | | 0 | +| - p2 | | | |f1 | | 0.6308 | | 0 | +| - p3 | | | |f1 | | 0.6094 | | 0 | +| - RE | | | |f1 | | 0.5518 | |0 | +| - p1 | | | |f1 | | 0.5191 | | 0 | +| - p2 | | | |f1 | | 0.5600 | | 0 | +| - p3 | | | |f1 | | 0.5764 | | 0 | diff --git a/csv_files/outputs/google__gemma-3-27b-it__gr__0shot.txt b/csv_files/outputs/google__gemma-3-27b-it__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..93e9d713d614c2dbaa4d17389ea32a9b3021a3cf --- /dev/null +++ b/csv_files/outputs/google__gemma-3-27b-it__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5151 | |0 | +| - p1 | | | |f1 | | 0.4866 | | 0 | +| - p2 | | | |f1 | | 0.5721 | | 0 | +| - p3 | | | |f1 | | 0.4866 | | 0 | +| - RE | | | |f1 | | 0.4473 | |0 | +| - p1 | | | |f1 | | 0.3955 | | 0 | +| - p2 | | | |f1 | | 0.4695 | | 0 | +| - p3 | | | |f1 | | 0.4769 | | 0 | diff --git a/csv_files/outputs/google__gemma-3-27b-it__gr__10shot.txt b/csv_files/outputs/google__gemma-3-27b-it__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1b60273ff3047ec635fd913fc4fe0db8a2ca133 --- /dev/null +++ b/csv_files/outputs/google__gemma-3-27b-it__gr__10shot.txt @@ -0,0 +1,11 @@ +hf 
(pretrained=google/gemma-3-27b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6570 | |0 | +| - p1 | | | |f1 | | 0.6551 | | 0 | +| - p2 | | | |f1 | | 0.6608 | | 0 | +| - p3 | | | |f1 | | 0.6551 | | 0 | +| - RE | | | |f1 | | 0.5405 | |0 | +| - p1 | | | |f1 | | 0.5083 | | 0 | +| - p2 | | | |f1 | | 0.5550 | | 0 | +| - p3 | | | |f1 | | 0.5581 | | 0 | diff --git a/csv_files/outputs/google__gemma-3-27b-it__it__0shot.txt b/csv_files/outputs/google__gemma-3-27b-it__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..20b323e8a1c0fdd3626eab05d6d99055375834fe --- /dev/null +++ b/csv_files/outputs/google__gemma-3-27b-it__it__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6065 | |0 | +| - p1 | | | |f1 | | 0.5543 | | 0 | +| - p2 | | | |f1 | | 0.6697 | | 0 | +| - p3 | | | |f1 | | 0.5954 | | 0 | +| - RE | | | |f1 | | 0.4737 | |0 | +| - p1 | | | |f1 | | 0.4390 | | 0 | +| - p2 | | | |f1 | | 0.4895 | | 0 | +| - p3 | | | |f1 | | 0.4927 | | 0 | diff --git a/csv_files/outputs/google__gemma-3-27b-it__it__10shot.txt b/csv_files/outputs/google__gemma-3-27b-it__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..f498e38336fb8c77988c54dcd070e31a0ac9a220 --- /dev/null +++ b/csv_files/outputs/google__gemma-3-27b-it__it__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.7115 | |0 | +| - p1 | | | |f1 | | 0.7142 | | 0 | +| - p2 | | | |f1 | | 0.6992 | | 0 | +| - p3 | | | |f1 | | 0.7212 | | 0 | +| - RE | | | |f1 | | 
0.5615 | |0 | +| - p1 | | | |f1 | | 0.5223 | | 0 | +| - p2 | | | |f1 | | 0.5837 | | 0 | +| - p3 | | | |f1 | | 0.5786 | | 0 | diff --git a/csv_files/outputs/google__gemma-3-27b-it__pl__0shot.txt b/csv_files/outputs/google__gemma-3-27b-it__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9f1519cf73039b96aee8faf352b110818910761 --- /dev/null +++ b/csv_files/outputs/google__gemma-3-27b-it__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4508 | |0 | +| - p1 | | | |f1 | | 0.4506 | | 0 | +| - p2 | | | |f1 | | 0.4511 | | 0 | +| - p3 | | | |f1 | | 0.4506 | | 0 | +| - RE | | | |f1 | | 0.4307 | |0 | +| - p1 | | | |f1 | | 0.4384 | | 0 | +| - p2 | | | |f1 | | 0.4267 | | 0 | +| - p3 | | | |f1 | | 0.4271 | | 0 | diff --git a/csv_files/outputs/google__gemma-3-27b-it__pl__10shot.txt b/csv_files/outputs/google__gemma-3-27b-it__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9eed39172f7a13688201364ca71ab665a6378bda --- /dev/null +++ b/csv_files/outputs/google__gemma-3-27b-it__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6618 | |0 | +| - p1 | | | |f1 | | 0.6591 | | 0 | +| - p2 | | | |f1 | | 0.6672 | | 0 | +| - p3 | | | |f1 | | 0.6591 | | 0 | +| - RE | | | |f1 | | 0.5592 | |0 | +| - p1 | | | |f1 | | 0.5795 | | 0 | +| - p2 | | | |f1 | | 0.5601 | | 0 | +| - p3 | | | |f1 | | 0.5380 | | 0 | diff --git a/csv_files/outputs/google__gemma-3-27b-it__sk__0shot.txt b/csv_files/outputs/google__gemma-3-27b-it__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..27cb80b8b7a5de7165aa52e89c5f70d0ac61dc23 
--- /dev/null +++ b/csv_files/outputs/google__gemma-3-27b-it__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2841 | |0 | +| - p1 | | | |f1 | | 0.3183 | | 0 | +| - p2 | | | |f1 | | 0.2157 | | 0 | +| - p3 | | | |f1 | | 0.3183 | | 0 | +| - RE | | | |f1 | | 0.4369 | |0 | +| - p1 | | | |f1 | | 0.4373 | | 0 | +| - p2 | | | |f1 | | 0.4360 | | 0 | +| - p3 | | | |f1 | | 0.4373 | | 0 | diff --git a/csv_files/outputs/google__gemma-3-27b-it__sk__10shot.txt b/csv_files/outputs/google__gemma-3-27b-it__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f8297965f5ccbab4e4581425fdb9d9628f5cc8c --- /dev/null +++ b/csv_files/outputs/google__gemma-3-27b-it__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6786 | |0 | +| - p1 | | | |f1 | | 0.6737 | | 0 | +| - p2 | | | |f1 | | 0.6885 | | 0 | +| - p3 | | | |f1 | | 0.6737 | | 0 | +| - RE | | | |f1 | | 0.5095 | |0 | +| - p1 | | | |f1 | | 0.5121 | | 0 | +| - p2 | | | |f1 | | 0.5061 | | 0 | +| - p3 | | | |f1 | | 0.5103 | | 0 | diff --git a/csv_files/outputs/google__gemma-3-27b-it__sl__0shot.txt b/csv_files/outputs/google__gemma-3-27b-it__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..39bd48a7d0643c98d4e640b58b7343ee908f2d64 --- /dev/null +++ b/csv_files/outputs/google__gemma-3-27b-it__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4508 | |0 | +| - p1 | | | |f1 | | 0.4370 | | 0 | +| - 
p2 | | | |f1 | | 0.4783 | | 0 | +| - p3 | | | |f1 | | 0.4370 | | 0 | +| - RE | | | |f1 | | 0.4301 | |0 | +| - p1 | | | |f1 | | 0.4255 | | 0 | +| - p2 | | | |f1 | | 0.4391 | | 0 | +| - p3 | | | |f1 | | 0.4255 | | 0 | diff --git a/csv_files/outputs/google__gemma-3-27b-it__sl__10shot.txt b/csv_files/outputs/google__gemma-3-27b-it__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed1c6926d8cdf2d22c23adf3393d14f1da9cd4d9 --- /dev/null +++ b/csv_files/outputs/google__gemma-3-27b-it__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6806 | |0 | +| - p1 | | | |f1 | | 0.6750 | | 0 | +| - p2 | | | |f1 | | 0.6918 | | 0 | +| - p3 | | | |f1 | | 0.6750 | | 0 | +| - RE | | | |f1 | | 0.4999 | |0 | +| - p1 | | | |f1 | | 0.5149 | | 0 | +| - p2 | | | |f1 | | 0.4703 | | 0 | +| - p3 | | | |f1 | | 0.5145 | | 0 | diff --git a/csv_files/outputs/google__medgemma-27b-text-it__en__0shot.txt b/csv_files/outputs/google__medgemma-27b-text-it__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..15bfbd55479583afff9de0e140bb5e0cd5970dd2 --- /dev/null +++ b/csv_files/outputs/google__medgemma-27b-text-it__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5011 | |0 | +| - p1 | | | |f1 | | 0.3842 | | 0 | +| - p2 | | | |f1 | | 0.6035 | | 0 | +| - p3 | | | |f1 | | 0.5156 | | 0 | +| - RE | | | |f1 | | 0.4681 | |0 | +| - p1 | | | |f1 | | 0.4836 | | 0 | +| - p2 | | | |f1 | | 0.4763 | | 0 | +| - p3 | | | |f1 | | 0.4443 | | 0 | diff --git a/csv_files/outputs/google__medgemma-27b-text-it__en__10shot.txt 
b/csv_files/outputs/google__medgemma-27b-text-it__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..0166f0aea940a1e11dbe79c1f2cf729f6c18c859 --- /dev/null +++ b/csv_files/outputs/google__medgemma-27b-text-it__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6324 | |0 | +| - p1 | | | |f1 | | 0.6355 | | 0 | +| - p2 | | | |f1 | | 0.6161 | | 0 | +| - p3 | | | |f1 | | 0.6455 | | 0 | +| - RE | | | |f1 | | 0.5540 | |0 | +| - p1 | | | |f1 | | 0.5562 | | 0 | +| - p2 | | | |f1 | | 0.5494 | | 0 | +| - p3 | | | |f1 | | 0.5565 | | 0 | diff --git a/csv_files/outputs/google__medgemma-27b-text-it__gr__0shot.txt b/csv_files/outputs/google__medgemma-27b-text-it__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..d065c4984ef73926a5399a10a41f28371f93fc00 --- /dev/null +++ b/csv_files/outputs/google__medgemma-27b-text-it__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5585 | |0 | +| - p1 | | | |f1 | | 0.5314 | | 0 | +| - p2 | | | |f1 | | 0.6126 | | 0 | +| - p3 | | | |f1 | | 0.5314 | | 0 | +| - RE | | | |f1 | | 0.4199 | |0 | +| - p1 | | | |f1 | | 0.4069 | | 0 | +| - p2 | | | |f1 | | 0.4332 | | 0 | +| - p3 | | | |f1 | | 0.4197 | | 0 | diff --git a/csv_files/outputs/google__medgemma-27b-text-it__gr__10shot.txt b/csv_files/outputs/google__medgemma-27b-text-it__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..70f88e2528d3a1ff7cf33f95ceb671fc00a7aa14 --- /dev/null +++ b/csv_files/outputs/google__medgemma-27b-text-it__gr__10shot.txt @@ -0,0 +1,11 @@ +hf 
(pretrained=google/medgemma-27b-text-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6839 | |0 | +| - p1 | | | |f1 | | 0.6836 | | 0 | +| - p2 | | | |f1 | | 0.6846 | | 0 | +| - p3 | | | |f1 | | 0.6836 | | 0 | +| - RE | | | |f1 | | 0.5680 | |0 | +| - p1 | | | |f1 | | 0.5392 | | 0 | +| - p2 | | | |f1 | | 0.5867 | | 0 | +| - p3 | | | |f1 | | 0.5780 | | 0 | diff --git a/csv_files/outputs/google__medgemma-27b-text-it__it__0shot.txt b/csv_files/outputs/google__medgemma-27b-text-it__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4af53b4a7ac4e873081dbac1012b7e5319e4714 --- /dev/null +++ b/csv_files/outputs/google__medgemma-27b-text-it__it__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5351 | |0 | +| - p1 | | | |f1 | | 0.4261 | | 0 | +| - p2 | | | |f1 | | 0.6212 | | 0 | +| - p3 | | | |f1 | | 0.5582 | | 0 | +| - RE | | | |f1 | | 0.4521 | |0 | +| - p1 | | | |f1 | | 0.4042 | | 0 | +| - p2 | | | |f1 | | 0.4916 | | 0 | +| - p3 | | | |f1 | | 0.4604 | | 0 | diff --git a/csv_files/outputs/google__medgemma-27b-text-it__it__10shot.txt b/csv_files/outputs/google__medgemma-27b-text-it__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..de0f1bbac03a767e292f5c6d52f4319e367c44c0 --- /dev/null +++ b/csv_files/outputs/google__medgemma-27b-text-it__it__10shot.txt @@ -0,0 +1,10 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.7133 | |0 | +| - p1 | | | |f1 | | 0.7262 | | 0 | +| - p2 | | | |f1 | | 0.7005 | | 0 | +| - RE | 
| | |f1 | | 0.5960 | |0 | +| - p1 | | | |f1 | | 0.5919 | | 0 | +| - p2 | | | |f1 | | 0.6235 | | 0 | +| - p3 | | | |f1 | | 0.5726 | | 0 | diff --git a/csv_files/outputs/google__medgemma-27b-text-it__pl__0shot.txt b/csv_files/outputs/google__medgemma-27b-text-it__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9eba07f4234c3cc6cc8a0f05a5cd8f0cd620ac8 --- /dev/null +++ b/csv_files/outputs/google__medgemma-27b-text-it__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4245 | |0 | +| - p1 | | | |f1 | | 0.4216 | | 0 | +| - p2 | | | |f1 | | 0.4303 | | 0 | +| - p3 | | | |f1 | | 0.4216 | | 0 | +| - RE | | | |f1 | | 0.4332 | |0 | +| - p1 | | | |f1 | | 0.4325 | | 0 | +| - p2 | | | |f1 | | 0.4424 | | 0 | +| - p3 | | | |f1 | | 0.4246 | | 0 | diff --git a/csv_files/outputs/google__medgemma-27b-text-it__pl__10shot.txt b/csv_files/outputs/google__medgemma-27b-text-it__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..0276a07fab4408898651f3f147a274d5d5df3c97 --- /dev/null +++ b/csv_files/outputs/google__medgemma-27b-text-it__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6791 | |0 | +| - p1 | | | |f1 | | 0.6829 | | 0 | +| - p2 | | | |f1 | | 0.6715 | | 0 | +| - p3 | | | |f1 | | 0.6829 | | 0 | +| - RE | | | |f1 | | 0.5997 | |0 | +| - p1 | | | |f1 | | 0.5940 | | 0 | +| - p2 | | | |f1 | | 0.6133 | | 0 | +| - p3 | | | |f1 | | 0.5918 | | 0 | diff --git a/csv_files/outputs/google__medgemma-27b-text-it__sk__0shot.txt b/csv_files/outputs/google__medgemma-27b-text-it__sk__0shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..bd8b98a582ba732f34685230d5c2af7c07ed3a59 --- /dev/null +++ b/csv_files/outputs/google__medgemma-27b-text-it__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2336 | |0 | +| - p1 | | | |f1 | | 0.2971 | | 0 | +| - p2 | | | |f1 | | 0.1066 | | 0 | +| - p3 | | | |f1 | | 0.2971 | | 0 | +| - RE | | | |f1 | | 0.4440 | |0 | +| - p1 | | | |f1 | | 0.4395 | | 0 | +| - p2 | | | |f1 | | 0.4531 | | 0 | +| - p3 | | | |f1 | | 0.4395 | | 0 | diff --git a/csv_files/outputs/google__medgemma-27b-text-it__sk__10shot.txt b/csv_files/outputs/google__medgemma-27b-text-it__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..003be3f4ed88a6a499b89ed958c415cc485b70c3 --- /dev/null +++ b/csv_files/outputs/google__medgemma-27b-text-it__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.7137 | |0 | +| - p1 | | | |f1 | | 0.7143 | | 0 | +| - p2 | | | |f1 | | 0.7127 | | 0 | +| - p3 | | | |f1 | | 0.7143 | | 0 | +| - RE | | | |f1 | | 0.5156 | |0 | +| - p1 | | | |f1 | | 0.5111 | | 0 | +| - p2 | | | |f1 | | 0.5188 | | 0 | +| - p3 | | | |f1 | | 0.5171 | | 0 | diff --git a/csv_files/outputs/google__medgemma-27b-text-it__sl__0shot.txt b/csv_files/outputs/google__medgemma-27b-text-it__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c70cd19fab747d42922e7a5ecdf7736a81004f9 --- /dev/null +++ b/csv_files/outputs/google__medgemma-27b-text-it__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| 
+|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4863 | |0 | +| - p1 | | | |f1 | | 0.4675 | | 0 | +| - p2 | | | |f1 | | 0.5238 | | 0 | +| - p3 | | | |f1 | | 0.4675 | | 0 | +| - RE | | | |f1 | | 0.4201 | |0 | +| - p1 | | | |f1 | | 0.4182 | | 0 | +| - p2 | | | |f1 | | 0.4239 | | 0 | +| - p3 | | | |f1 | | 0.4182 | | 0 | diff --git a/csv_files/outputs/google__medgemma-27b-text-it__sl__10shot.txt b/csv_files/outputs/google__medgemma-27b-text-it__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c8ad321754c222e65360614da1f6192f3387c7c --- /dev/null +++ b/csv_files/outputs/google__medgemma-27b-text-it__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6887 | |0 | +| - p1 | | | |f1 | | 0.6947 | | 0 | +| - p2 | | | |f1 | | 0.6765 | | 0 | +| - p3 | | | |f1 | | 0.6947 | | 0 | +| - RE | | | |f1 | | 0.5469 | |0 | +| - p1 | | | |f1 | | 0.5323 | | 0 | +| - p2 | | | |f1 | | 0.5590 | | 0 | +| - p3 | | | |f1 | | 0.5494 | | 0 | diff --git a/csv_files/outputs/google__medgemma-4b-it__en__0shot.txt b/csv_files/outputs/google__medgemma-4b-it__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d18ba6fc1bdc423039c71bd42a10e8a09e997db --- /dev/null +++ b/csv_files/outputs/google__medgemma-4b-it__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2625 | |0 | +| - p1 | | | |f1 | | 0.2635 | | 0 | +| - p2 | | | |f1 | | 0.2503 | | 0 | +| - p3 | | | |f1 | | 0.2737 | | 0 | +| - RE | | | |f1 | | 0.2851 | |0 | +| - p1 | | | |f1 | | 0.2095 | | 0 | +| - p2 | | | |f1 | | 0.3257 | | 0 | +| - p3 | | | |f1 
| | 0.3203 | | 0 | diff --git a/csv_files/outputs/google__medgemma-4b-it__en__10shot.txt b/csv_files/outputs/google__medgemma-4b-it__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..4bc79186d7029b363381303d141376c728294ab7 --- /dev/null +++ b/csv_files/outputs/google__medgemma-4b-it__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4930 | |0 | +| - p1 | | | |f1 | | 0.4833 | | 0 | +| - p2 | | | |f1 | | 0.5005 | | 0 | +| - p3 | | | |f1 | | 0.4951 | | 0 | +| - RE | | | |f1 | | 0.1198 | |0 | +| - p1 | | | |f1 | | 0.0964 | | 0 | +| - p2 | | | |f1 | | 0.1237 | | 0 | +| - p3 | | | |f1 | | 0.1391 | | 0 | diff --git a/csv_files/outputs/google__medgemma-4b-it__gr__0shot.txt b/csv_files/outputs/google__medgemma-4b-it__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0f048d8fb01e8ed8352829fa0179010381f66ca --- /dev/null +++ b/csv_files/outputs/google__medgemma-4b-it__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2688 | |0 | +| - p1 | | | |f1 | | 0.2705 | | 0 | +| - p2 | | | |f1 | | 0.2654 | | 0 | +| - p3 | | | |f1 | | 0.2705 | | 0 | +| - RE | | | |f1 | | 0.2053 | |0 | +| - p1 | | | |f1 | | 0.2381 | | 0 | +| - p2 | | | |f1 | | 0.3024 | | 0 | +| - p3 | | | |f1 | | 0.0754 | | 0 | diff --git a/csv_files/outputs/google__medgemma-4b-it__gr__10shot.txt b/csv_files/outputs/google__medgemma-4b-it__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..4352edec2156fd74172ffb93bfc7069ed935cce2 --- /dev/null +++ b/csv_files/outputs/google__medgemma-4b-it__gr__10shot.txt @@ -0,0 +1,11 @@ +hf 
(pretrained=google/medgemma-4b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4953 | |0 | +| - p1 | | | |f1 | | 0.4910 | | 0 | +| - p2 | | | |f1 | | 0.5039 | | 0 | +| - p3 | | | |f1 | | 0.4910 | | 0 | +| - RE | | | |f1 | | 0.1453 | |0 | +| - p1 | | | |f1 | | 0.1204 | | 0 | +| - p2 | | | |f1 | | 0.1605 | | 0 | +| - p3 | | | |f1 | | 0.1551 | | 0 | diff --git a/csv_files/outputs/google__medgemma-4b-it__it__0shot.txt b/csv_files/outputs/google__medgemma-4b-it__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..0726976105566a084aa8c1c51c84c700da3b7752 --- /dev/null +++ b/csv_files/outputs/google__medgemma-4b-it__it__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2929 | |0 | +| - p1 | | | |f1 | | 0.3157 | | 0 | +| - p2 | | | |f1 | | 0.2627 | | 0 | +| - p3 | | | |f1 | | 0.3004 | | 0 | +| - RE | | | |f1 | | 0.1767 | |0 | +| - p1 | | | |f1 | | 0.2154 | | 0 | +| - p2 | | | |f1 | | 0.2461 | | 0 | +| - p3 | | | |f1 | | 0.0688 | | 0 | diff --git a/csv_files/outputs/google__medgemma-4b-it__it__10shot.txt b/csv_files/outputs/google__medgemma-4b-it__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b3a8e23520bec5746d9090322128b48cf1af41f --- /dev/null +++ b/csv_files/outputs/google__medgemma-4b-it__it__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5454 | |0 | +| - p1 | | | |f1 | | 0.5633 | | 0 | +| - p2 | | | |f1 | | 0.5377 | | 0 | +| - p3 | | | |f1 | | 0.5352 | | 0 | +| - RE | | | |f1 | | 
0.1753 | |0 | +| - p1 | | | |f1 | | 0.1592 | | 0 | +| - p2 | | | |f1 | | 0.1917 | | 0 | +| - p3 | | | |f1 | | 0.1751 | | 0 | diff --git a/csv_files/outputs/google__medgemma-4b-it__pl__0shot.txt b/csv_files/outputs/google__medgemma-4b-it__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..1da79eb1fdbb0c791458ff4a9cce9c33a7da6497 --- /dev/null +++ b/csv_files/outputs/google__medgemma-4b-it__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2231 | |0 | +| - p1 | | | |f1 | | 0.2255 | | 0 | +| - p2 | | | |f1 | | 0.2183 | | 0 | +| - p3 | | | |f1 | | 0.2255 | | 0 | +| - RE | | | |f1 | | 0.1173 | |0 | +| - p1 | | | |f1 | | 0.1150 | | 0 | +| - p2 | | | |f1 | | 0.1314 | | 0 | +| - p3 | | | |f1 | | 0.1054 | | 0 | diff --git a/csv_files/outputs/google__medgemma-4b-it__pl__10shot.txt b/csv_files/outputs/google__medgemma-4b-it__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..79c82263014a0069b6c825385d95cf6477004a4a --- /dev/null +++ b/csv_files/outputs/google__medgemma-4b-it__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5193 | |0 | +| - p1 | | | |f1 | | 0.5186 | | 0 | +| - p2 | | | |f1 | | 0.5206 | | 0 | +| - p3 | | | |f1 | | 0.5186 | | 0 | +| - RE | | | |f1 | | 0.1055 | |0 | +| - p1 | | | |f1 | | 0.1171 | | 0 | +| - p2 | | | |f1 | | 0.0997 | | 0 | +| - p3 | | | |f1 | | 0.0997 | | 0 | diff --git a/csv_files/outputs/google__medgemma-4b-it__sk__0shot.txt b/csv_files/outputs/google__medgemma-4b-it__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bea1720e28d395cc9d1f6ee52968a6965a98c84 
--- /dev/null +++ b/csv_files/outputs/google__medgemma-4b-it__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2427 | |0 | +| - p1 | | | |f1 | | 0.2447 | | 0 | +| - p2 | | | |f1 | | 0.2387 | | 0 | +| - p3 | | | |f1 | | 0.2447 | | 0 | +| - RE | | | |f1 | | 0.1212 | |0 | +| - p1 | | | |f1 | | 0.1119 | | 0 | +| - p2 | | | |f1 | | 0.1399 | | 0 | +| - p3 | | | |f1 | | 0.1119 | | 0 | diff --git a/csv_files/outputs/google__medgemma-4b-it__sk__10shot.txt b/csv_files/outputs/google__medgemma-4b-it__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..19c6346c5007538093d4b83f945e14ee4616490c --- /dev/null +++ b/csv_files/outputs/google__medgemma-4b-it__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4654 | |0 | +| - p1 | | | |f1 | | 0.4756 | | 0 | +| - p2 | | | |f1 | | 0.4449 | | 0 | +| - p3 | | | |f1 | | 0.4756 | | 0 | +| - RE | | | |f1 | | 0.1035 | |0 | +| - p1 | | | |f1 | | 0.1095 | | 0 | +| - p2 | | | |f1 | | 0.1009 | | 0 | +| - p3 | | | |f1 | | 0.1000 | | 0 | diff --git a/csv_files/outputs/google__medgemma-4b-it__sl__0shot.txt b/csv_files/outputs/google__medgemma-4b-it__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..56dbab87e4f7fed3a562f04e90d5511d788bdc34 --- /dev/null +++ b/csv_files/outputs/google__medgemma-4b-it__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2569 | |0 | +| - p1 | | | |f1 | | 0.2574 | | 0 | +| - 
p2 | | | |f1 | | 0.2558 | | 0 | +| - p3 | | | |f1 | | 0.2574 | | 0 | +| - RE | | | |f1 | | 0.1012 | |0 | +| - p1 | | | |f1 | | 0.0973 | | 0 | +| - p2 | | | |f1 | | 0.1089 | | 0 | +| - p3 | | | |f1 | | 0.0973 | | 0 | diff --git a/csv_files/outputs/google__medgemma-4b-it__sl__10shot.txt b/csv_files/outputs/google__medgemma-4b-it__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc424f90dde2288be7dda70e93c0e761287409da --- /dev/null +++ b/csv_files/outputs/google__medgemma-4b-it__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5063 | |0 | +| - p1 | | | |f1 | | 0.5117 | | 0 | +| - p2 | | | |f1 | | 0.4955 | | 0 | +| - p3 | | | |f1 | | 0.5117 | | 0 | +| - RE | | | |f1 | | 0.1260 | |0 | +| - p1 | | | |f1 | | 0.1178 | | 0 | +| - p2 | | | |f1 | | 0.1101 | | 0 | +| - p3 | | | |f1 | | 0.1501 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Clinical__en__0shot.txt b/csv_files/outputs/microsoft__MediPhi-Clinical__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..76cdb91c718afaf7089efd18094132a0bf5f121a --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Clinical__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2786 | |0 | +| - p1 | | | |f1 | | 0.2502 | | 0 | +| - p2 | | | |f1 | | 0.3089 | | 0 | +| - p3 | | | |f1 | | 0.2768 | | 0 | +| - RE | | | |f1 | | 0.3248 | |0 | +| - p1 | | | |f1 | | 0.2274 | | 0 | +| - p2 | | | |f1 | | 0.3929 | | 0 | +| - p3 | | | |f1 | | 0.3542 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Clinical__en__10shot.txt 
b/csv_files/outputs/microsoft__MediPhi-Clinical__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb75acf6585f103c2dc138b976cb9d036ecee6c6 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Clinical__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5008 | |0 | +| - p1 | | | |f1 | | 0.5009 | | 0 | +| - p2 | | | |f1 | | 0.4966 | | 0 | +| - p3 | | | |f1 | | 0.5049 | | 0 | +| - RE | | | |f1 | | 0.1125 | |0 | +| - p1 | | | |f1 | | 0.1175 | | 0 | +| - p2 | | | |f1 | | 0.1095 | | 0 | +| - p3 | | | |f1 | | 0.1107 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Clinical__gr__0shot.txt b/csv_files/outputs/microsoft__MediPhi-Clinical__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0c5c0a5a00e2cdf36cc9f66ad99c89cd41760ac --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Clinical__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1717 | |0 | +| - p1 | | | |f1 | | 0.1641 | | 0 | +| - p2 | | | |f1 | | 0.1869 | | 0 | +| - p3 | | | |f1 | | 0.1641 | | 0 | +| - RE | | | |f1 | | 0.0977 | |0 | +| - p1 | | | |f1 | | 0.0736 | | 0 | +| - p2 | | | |f1 | | 0.0778 | | 0 | +| - p3 | | | |f1 | | 0.1418 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Clinical__gr__10shot.txt b/csv_files/outputs/microsoft__MediPhi-Clinical__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..eba564920c2c92d7800080552c6a59b8def8c9b7 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Clinical__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), 
num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3384 | |0 | +| - p1 | | | |f1 | | 0.3375 | | 0 | +| - p2 | | | |f1 | | 0.3403 | | 0 | +| - p3 | | | |f1 | | 0.3375 | | 0 | +| - RE | | | |f1 | | 0.0606 | |0 | +| - p1 | | | |f1 | | 0.0427 | | 0 | +| - p2 | | | |f1 | | 0.0681 | | 0 | +| - p3 | | | |f1 | | 0.0711 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Clinical__it__0shot.txt b/csv_files/outputs/microsoft__MediPhi-Clinical__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..2179db033abe54c1a3211347f00f4fb25ebca628 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Clinical__it__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3307 | |0 | +| - p1 | | | |f1 | | 0.3397 | | 0 | +| - p2 | | | |f1 | | 0.3300 | | 0 | +| - p3 | | | |f1 | | 0.3226 | | 0 | +| - RE | | | |f1 | | 0.0792 | |0 | +| - p1 | | | |f1 | | 0.1489 | | 0 | +| - p2 | | | |f1 | | 0.0736 | | 0 | +| - p3 | | | |f1 | | 0.0149 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Clinical__it__10shot.txt b/csv_files/outputs/microsoft__MediPhi-Clinical__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c42fd3d91bc50bf07de543792051bd0fbd08f0c --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Clinical__it__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5257 | |0 | +| - p1 | | | |f1 | | 0.5195 | | 0 | +| - p2 | | | |f1 | | 0.5301 | | 0 | +| - p3 | | | |f1 | | 0.5275 | | 0 | +| - RE | | | |f1 | | 
0.1499 | |0 | +| - p1 | | | |f1 | | 0.2114 | | 0 | +| - p2 | | | |f1 | | 0.0961 | | 0 | +| - p3 | | | |f1 | | 0.1422 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Clinical__pl__0shot.txt b/csv_files/outputs/microsoft__MediPhi-Clinical__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..398a394b401f757945eec334c5bd3723685aa80b --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Clinical__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2831 | |0 | +| - p1 | | | |f1 | | 0.2815 | | 0 | +| - p2 | | | |f1 | | 0.2861 | | 0 | +| - p3 | | | |f1 | | 0.2815 | | 0 | +| - RE | | | |f1 | | 0.2693 | |0 | +| - p1 | | | |f1 | | 0.2109 | | 0 | +| - p2 | | | |f1 | | 0.2908 | | 0 | +| - p3 | | | |f1 | | 0.3061 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Clinical__pl__10shot.txt b/csv_files/outputs/microsoft__MediPhi-Clinical__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fc3f8a0bc1e4a0d38f2f3729779f31828e7b70b --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Clinical__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3986 | |0 | +| - p1 | | | |f1 | | 0.3913 | | 0 | +| - p2 | | | |f1 | | 0.4132 | | 0 | +| - p3 | | | |f1 | | 0.3913 | | 0 | +| - RE | | | |f1 | | 0.1366 | |0 | +| - p1 | | | |f1 | | 0.1255 | | 0 | +| - p2 | | | |f1 | | 0.1207 | | 0 | +| - p3 | | | |f1 | | 0.1636 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Clinical__sk__0shot.txt b/csv_files/outputs/microsoft__MediPhi-Clinical__sk__0shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..9aaca4c790d98eb11b6ab172e1b2f9ba297b0a56 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Clinical__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2710 | |0 | +| - p1 | | | |f1 | | 0.2571 | | 0 | +| - p2 | | | |f1 | | 0.2987 | | 0 | +| - p3 | | | |f1 | | 0.2571 | | 0 | +| - RE | | | |f1 | | 0.1062 | |0 | +| - p1 | | | |f1 | | 0.1554 | | 0 | +| - p2 | | | |f1 | | 0.0077 | | 0 | +| - p3 | | | |f1 | | 0.1554 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Clinical__sk__10shot.txt b/csv_files/outputs/microsoft__MediPhi-Clinical__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b031c9c0bd0f5fe6669a53563e3681aa1a74d890 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Clinical__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4025 | |0 | +| - p1 | | | |f1 | | 0.4106 | | 0 | +| - p2 | | | |f1 | | 0.3861 | | 0 | +| - p3 | | | |f1 | | 0.4106 | | 0 | +| - RE | | | |f1 | | 0.0613 | |0 | +| - p1 | | | |f1 | | 0.0509 | | 0 | +| - p2 | | | |f1 | | 0.0606 | | 0 | +| - p3 | | | |f1 | | 0.0724 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Clinical__sl__0shot.txt b/csv_files/outputs/microsoft__MediPhi-Clinical__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd2d143234ccb78abac42a387aa128d3314e802f --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Clinical__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| 
+|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2892 | |0 | +| - p1 | | | |f1 | | 0.2998 | | 0 | +| - p2 | | | |f1 | | 0.2680 | | 0 | +| - p3 | | | |f1 | | 0.2998 | | 0 | +| - RE | | | |f1 | | 0.0304 | |0 | +| - p1 | | | |f1 | | 0.0395 | | 0 | +| - p2 | | | |f1 | | 0.0121 | | 0 | +| - p3 | | | |f1 | | 0.0395 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Clinical__sl__10shot.txt b/csv_files/outputs/microsoft__MediPhi-Clinical__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..891a18854ac34e549e2ead223a4f5c50fa589fb3 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Clinical__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4021 | |0 | +| - p1 | | | |f1 | | 0.4036 | | 0 | +| - p2 | | | |f1 | | 0.3990 | | 0 | +| - p3 | | | |f1 | | 0.4036 | | 0 | +| - RE | | | |f1 | | 0.0748 | |0 | +| - p1 | | | |f1 | | 0.0829 | | 0 | +| - p2 | | | |f1 | | 0.0674 | | 0 | +| - p3 | | | |f1 | | 0.0742 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Instruct__en__0shot.txt b/csv_files/outputs/microsoft__MediPhi-Instruct__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..d109d8f49c2b55f83170cc9751275a72ff6e1387 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Instruct__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1598 | |0 | +| - p1 | | | |f1 | | 0.0761 | | 0 | +| - p2 | | | |f1 | | 0.2410 | | 0 | +| - p3 | | | |f1 | | 0.1625 | | 0 | +| - RE | | | |f1 | | 0.2982 | |0 | +| - p1 | | | |f1 | | 0.1135 | | 0 | +| - p2 | | | |f1 | | 0.4006 | | 0 | +| 
- p3 | | | |f1 | | 0.3804 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Instruct__en__10shot.txt b/csv_files/outputs/microsoft__MediPhi-Instruct__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..91d54c4d91a393447e1a9033fe06a44c2ea83264 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Instruct__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5216 | |0 | +| - p1 | | | |f1 | | 0.5357 | | 0 | +| - p2 | | | |f1 | | 0.5227 | | 0 | +| - p3 | | | |f1 | | 0.5063 | | 0 | +| - RE | | | |f1 | | 0.1719 | |0 | +| - p1 | | | |f1 | | 0.1432 | | 0 | +| - p2 | | | |f1 | | 0.1888 | | 0 | +| - p3 | | | |f1 | | 0.1836 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Instruct__gr__0shot.txt b/csv_files/outputs/microsoft__MediPhi-Instruct__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..033006f71175ad0e7ba4e4f9b2b91bd4b604c058 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Instruct__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1159 | |0 | +| - p1 | | | |f1 | | 0.1294 | | 0 | +| - p2 | | | |f1 | | 0.0890 | | 0 | +| - p3 | | | |f1 | | 0.1294 | | 0 | +| - RE | | | |f1 | | 0.1184 | |0 | +| - p1 | | | |f1 | | 0.0962 | | 0 | +| - p2 | | | |f1 | | 0.0673 | | 0 | +| - p3 | | | |f1 | | 0.1916 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Instruct__gr__10shot.txt b/csv_files/outputs/microsoft__MediPhi-Instruct__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..116bb08deaa20d0ae0a961c362d4802b12d2add2 --- /dev/null +++ 
b/csv_files/outputs/microsoft__MediPhi-Instruct__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2881 | |0 | +| - p1 | | | |f1 | | 0.2822 | | 0 | +| - p2 | | | |f1 | | 0.2999 | | 0 | +| - p3 | | | |f1 | | 0.2822 | | 0 | +| - RE | | | |f1 | | 0.0675 | |0 | +| - p1 | | | |f1 | | 0.0576 | | 0 | +| - p2 | | | |f1 | | 0.0674 | | 0 | +| - p3 | | | |f1 | | 0.0777 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Instruct__it__0shot.txt b/csv_files/outputs/microsoft__MediPhi-Instruct__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd0e6008d2a3198f8ea3a505176acf5dd7c54ac8 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Instruct__it__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2023 | |0 | +| - p1 | | | |f1 | | 0.0867 | | 0 | +| - p2 | | | |f1 | | 0.2484 | | 0 | +| - p3 | | | |f1 | | 0.2717 | | 0 | +| - RE | | | |f1 | | 0.2623 | |0 | +| - p1 | | | |f1 | | 0.1712 | | 0 | +| - p2 | | | |f1 | | 0.2896 | | 0 | +| - p3 | | | |f1 | | 0.3261 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Instruct__it__10shot.txt b/csv_files/outputs/microsoft__MediPhi-Instruct__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..751b2811277dbd2d73fca6f47f097140e427c007 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Instruct__it__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5715 | |0 | +| - p1 
| | | |f1 | | 0.5729 | | 0 | +| - p2 | | | |f1 | | 0.5627 | | 0 | +| - p3 | | | |f1 | | 0.5790 | | 0 | +| - RE | | | |f1 | | 0.2679 | |0 | +| - p1 | | | |f1 | | 0.2873 | | 0 | +| - p2 | | | |f1 | | 0.2307 | | 0 | +| - p3 | | | |f1 | | 0.2858 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Instruct__pl__0shot.txt b/csv_files/outputs/microsoft__MediPhi-Instruct__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c6bc50efa744a67b9f472d4fdd237432f562068 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Instruct__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1567 | |0 | +| - p1 | | | |f1 | | 0.1510 | | 0 | +| - p2 | | | |f1 | | 0.1680 | | 0 | +| - p3 | | | |f1 | | 0.1510 | | 0 | +| - RE | | | |f1 | | 0.2881 | |0 | +| - p1 | | | |f1 | | 0.2683 | | 0 | +| - p2 | | | |f1 | | 0.3126 | | 0 | +| - p3 | | | |f1 | | 0.2832 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Instruct__pl__10shot.txt b/csv_files/outputs/microsoft__MediPhi-Instruct__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..72e10fb742c76990d605ee3a2c3c4ef35b670091 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Instruct__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4447 | |0 | +| - p1 | | | |f1 | | 0.4417 | | 0 | +| - p2 | | | |f1 | | 0.4506 | | 0 | +| - p3 | | | |f1 | | 0.4417 | | 0 | +| - RE | | | |f1 | | 0.2291 | |0 | +| - p1 | | | |f1 | | 0.1525 | | 0 | +| - p2 | | | |f1 | | 0.2686 | | 0 | +| - p3 | | | |f1 | | 0.2662 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Instruct__sk__0shot.txt 
b/csv_files/outputs/microsoft__MediPhi-Instruct__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3c42b94084e72cbff224a863d3f392fb1b26463 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Instruct__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1788 | |0 | +| - p1 | | | |f1 | | 0.1641 | | 0 | +| - p2 | | | |f1 | | 0.2081 | | 0 | +| - p3 | | | |f1 | | 0.1641 | | 0 | +| - RE | | | |f1 | | 0.1221 | |0 | +| - p1 | | | |f1 | | 0.1776 | | 0 | +| - p2 | | | |f1 | | 0.0112 | | 0 | +| - p3 | | | |f1 | | 0.1776 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Instruct__sk__10shot.txt b/csv_files/outputs/microsoft__MediPhi-Instruct__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d69cd7d7928f4a6c643567b723646b0ea9b62cc --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Instruct__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4226 | |0 | +| - p1 | | | |f1 | | 0.4327 | | 0 | +| - p2 | | | |f1 | | 0.4023 | | 0 | +| - p3 | | | |f1 | | 0.4327 | | 0 | +| - RE | | | |f1 | | 0.1313 | |0 | +| - p1 | | | |f1 | | 0.1070 | | 0 | +| - p2 | | | |f1 | | 0.1395 | | 0 | +| - p3 | | | |f1 | | 0.1473 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Instruct__sl__0shot.txt b/csv_files/outputs/microsoft__MediPhi-Instruct__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d1efab70337f7f4131da606243432c236e0a589 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Instruct__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), 
num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1792 | |0 | +| - p1 | | | |f1 | | 0.1758 | | 0 | +| - p2 | | | |f1 | | 0.1860 | | 0 | +| - p3 | | | |f1 | | 0.1758 | | 0 | +| - RE | | | |f1 | | 0.1325 | |0 | +| - p1 | | | |f1 | | 0.1446 | | 0 | +| - p2 | | | |f1 | | 0.1084 | | 0 | +| - p3 | | | |f1 | | 0.1446 | | 0 | diff --git a/csv_files/outputs/microsoft__MediPhi-Instruct__sl__10shot.txt b/csv_files/outputs/microsoft__MediPhi-Instruct__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..679149a7bff8efd0e260ce4a2032de17a86487c6 --- /dev/null +++ b/csv_files/outputs/microsoft__MediPhi-Instruct__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3837 | |0 | +| - p1 | | | |f1 | | 0.3973 | | 0 | +| - p2 | | | |f1 | | 0.3564 | | 0 | +| - p3 | | | |f1 | | 0.3973 | | 0 | +| - RE | | | |f1 | | 0.1550 | |0 | +| - p1 | | | |f1 | | 0.1155 | | 0 | +| - p2 | | | |f1 | | 0.1468 | | 0 | +| - p3 | | | |f1 | | 0.2027 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__en__0shot.txt b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4ee3894a133ddaa7d959a5cc3ee2a1e1a1647e2 --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2278 | |0 | +| - p1 | | | |f1 | | 0.2529 | | 0 | +| - p2 | | | |f1 | | 0.2144 | | 0 | +| - p3 | | | |f1 | | 0.2162 | 
| 0 | +| - RE | | | |f1 | | 0.3007 | |0 | +| - p1 | | | |f1 | | 0.3688 | | 0 | +| - p2 | | | |f1 | | 0.3642 | | 0 | +| - p3 | | | |f1 | | 0.1693 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__en__10shot.txt b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..e691afc144b5537fc7c41225a0f9323b2628b66e --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4753 | |0 | +| - p1 | | | |f1 | | 0.4725 | | 0 | +| - p2 | | | |f1 | | 0.4730 | | 0 | +| - p3 | | | |f1 | | 0.4805 | | 0 | +| - RE | | | |f1 | | 0.3592 | |0 | +| - p1 | | | |f1 | | 0.2593 | | 0 | +| - p2 | | | |f1 | | 0.4034 | | 0 | +| - p3 | | | |f1 | | 0.4148 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__gr__0shot.txt b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..0017f0d9afa0d196c05a399d60bf6b69bf801441 --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1705 | |0 | +| - p1 | | | |f1 | | 0.1603 | | 0 | +| - p2 | | | |f1 | | 0.1909 | | 0 | +| - p3 | | | |f1 | | 0.1603 | | 0 | +| - RE | | | |f1 | | 0.0592 | |0 | +| - p1 | | | |f1 | | 0.0432 | | 0 | +| - p2 | | | |f1 | | 0.0348 | | 0 | +| - p3 | | | |f1 | | 0.0994 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__gr__10shot.txt 
b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7051f2deb4230647fb3c9bbe0580a2fe84de6d8 --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3548 | |0 | +| - p1 | | | |f1 | | 0.3498 | | 0 | +| - p2 | | | |f1 | | 0.3648 | | 0 | +| - p3 | | | |f1 | | 0.3498 | | 0 | +| - RE | | | |f1 | | 0.1862 | |0 | +| - p1 | | | |f1 | | 0.1055 | | 0 | +| - p2 | | | |f1 | | 0.2343 | | 0 | +| - p3 | | | |f1 | | 0.2189 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__it__0shot.txt b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4e44b2cf5da34e762e0db150b8246e857eae345 --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__it__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2433 | |0 | +| - p1 | | | |f1 | | 0.2788 | | 0 | +| - p2 | | | |f1 | | 0.2030 | | 0 | +| - p3 | | | |f1 | | 0.2481 | | 0 | +| - RE | | | |f1 | | 0.0561 | |0 | +| - p1 | | | |f1 | | 0.1382 | | 0 | +| - p2 | | | |f1 | | 0.0163 | | 0 | +| - p3 | | | |f1 | | 0.0140 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__it__10shot.txt b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a1dc9d3cb03c6344b978ea84a9310c9d638cfb5 --- /dev/null +++ 
b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__it__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5176 | |0 | +| - p1 | | | |f1 | | 0.5147 | | 0 | +| - p2 | | | |f1 | | 0.5232 | | 0 | +| - p3 | | | |f1 | | 0.5149 | | 0 | +| - RE | | | |f1 | | 0.3958 | |0 | +| - p1 | | | |f1 | | 0.3092 | | 0 | +| - p2 | | | |f1 | | 0.4530 | | 0 | +| - p3 | | | |f1 | | 0.4252 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__pl__0shot.txt b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..77b2c7212ed034a17baff6431293cdb59c42592c --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2953 | |0 | +| - p1 | | | |f1 | | 0.3024 | | 0 | +| - p2 | | | |f1 | | 0.2811 | | 0 | +| - p3 | | | |f1 | | 0.3024 | | 0 | +| - RE | | | |f1 | | 0.1006 | |0 | +| - p1 | | | |f1 | | 0.0863 | | 0 | +| - p2 | | | |f1 | | 0.1292 | | 0 | +| - p3 | | | |f1 | | 0.0863 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__pl__10shot.txt b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a825a34d1e4073d2ae5da7e22e86582b980912c --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| 
+|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4956 | |0 | +| - p1 | | | |f1 | | 0.4911 | | 0 | +| - p2 | | | |f1 | | 0.5046 | | 0 | +| - p3 | | | |f1 | | 0.4911 | | 0 | +| - RE | | | |f1 | | 0.3296 | |0 | +| - p1 | | | |f1 | | 0.3895 | | 0 | +| - p2 | | | |f1 | | 0.3311 | | 0 | +| - p3 | | | |f1 | | 0.2683 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__sk__0shot.txt b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..6eeaac48bff7f356aa2168ad7755b879c69be13a --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2144 | |0 | +| - p1 | | | |f1 | | 0.2143 | | 0 | +| - p2 | | | |f1 | | 0.2146 | | 0 | +| - p3 | | | |f1 | | 0.2143 | | 0 | +| - RE | | | |f1 | | 0.0782 | |0 | +| - p1 | | | |f1 | | 0.0756 | | 0 | +| - p2 | | | |f1 | | 0.0835 | | 0 | +| - p3 | | | |f1 | | 0.0756 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__sk__10shot.txt b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..3952afe3c0317b08c9e06f3caff5ae01eb9aa4e2 --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3951 | |0 | +| - p1 | | | |f1 | | 0.4029 | | 0 | +| - p2 | | | |f1 | | 0.3794 | | 0 | +| - p3 | | | |f1 | | 0.4029 | | 0 | +| - RE | | | |f1 | | 0.2132 | |0 | +| - p1 | | 
| |f1 | | 0.2155 | | 0 | +| - p2 | | | |f1 | | 0.1948 | | 0 | +| - p3 | | | |f1 | | 0.2293 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__sl__0shot.txt b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0158432acf19d26d277f1b57deb076edd05514a --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1826 | |0 | +| - p1 | | | |f1 | | 0.1766 | | 0 | +| - p2 | | | |f1 | | 0.1947 | | 0 | +| - p3 | | | |f1 | | 0.1766 | | 0 | +| - RE | | | |f1 | | 0.1076 | |0 | +| - p1 | | | |f1 | | 0.0766 | | 0 | +| - p2 | | | |f1 | | 0.1695 | | 0 | +| - p3 | | | |f1 | | 0.0766 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__sl__10shot.txt b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0a21e1f1e602067347fb6c7ae3af7c47c220eb9 --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-7B-Instruct-v0.2__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4194 | |0 | +| - p1 | | | |f1 | | 0.4204 | | 0 | +| - p2 | | | |f1 | | 0.4174 | | 0 | +| - p3 | | | |f1 | | 0.4204 | | 0 | +| - RE | | | |f1 | | 0.2018 | |0 | +| - p1 | | | |f1 | | 0.1990 | | 0 | +| - p2 | | | |f1 | | 0.1950 | | 0 | +| - p3 | | | |f1 | | 0.2115 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__en__0shot.txt b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__en__0shot.txt new 
file mode 100644 index 0000000000000000000000000000000000000000..d2af72d7f2124d9fb87aba16aa7ec5979b1490e4 --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2271 | |0 | +| - p1 | | | |f1 | | 0.2767 | | 0 | +| - p2 | | | |f1 | | 0.2299 | | 0 | +| - p3 | | | |f1 | | 0.1748 | | 0 | +| - RE | | | |f1 | | 0.3472 | |0 | +| - p1 | | | |f1 | | 0.3694 | | 0 | +| - p2 | | | |f1 | | 0.3482 | | 0 | +| - p3 | | | |f1 | | 0.3241 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__en__10shot.txt b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e232099737212c32a37983c482f6e7bf1aee5d9 --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5762 | |0 | +| - p1 | | | |f1 | | 0.5777 | | 0 | +| - p2 | | | |f1 | | 0.5841 | | 0 | +| - p3 | | | |f1 | | 0.5668 | | 0 | +| - RE | | | |f1 | | 0.4313 | |0 | +| - p1 | | | |f1 | | 0.3482 | | 0 | +| - p2 | | | |f1 | | 0.5008 | | 0 | +| - p3 | | | |f1 | | 0.4449 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__gr__0shot.txt b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6f9827c9bfca0e3f2fe140aa5bb7f63e64551b9 --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__gr__0shot.txt @@ -0,0 +1,11 @@ +hf 
(pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0717 | |0 | +| - p1 | | | |f1 | | 0.0732 | | 0 | +| - p2 | | | |f1 | | 0.0687 | | 0 | +| - p3 | | | |f1 | | 0.0732 | | 0 | +| - RE | | | |f1 | | 0.2326 | |0 | +| - p1 | | | |f1 | | 0.1575 | | 0 | +| - p2 | | | |f1 | | 0.2117 | | 0 | +| - p3 | | | |f1 | | 0.3287 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__gr__10shot.txt b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c23dd02177855715602246da1ab145d4750a511 --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5050 | |0 | +| - p1 | | | |f1 | | 0.5081 | | 0 | +| - p2 | | | |f1 | | 0.4988 | | 0 | +| - p3 | | | |f1 | | 0.5081 | | 0 | +| - RE | | | |f1 | | 0.2549 | |0 | +| - p1 | | | |f1 | | 0.2029 | | 0 | +| - p2 | | | |f1 | | 0.2296 | | 0 | +| - p3 | | | |f1 | | 0.3323 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__it__0shot.txt b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..94723d98b89e2a142ae06a3699ea88924444d65a --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__it__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1960 | |0 | +| - 
p1 | | | |f1 | | 0.2792 | | 0 | +| - p2 | | | |f1 | | 0.1772 | | 0 | +| - p3 | | | |f1 | | 0.1316 | | 0 | +| - RE | | | |f1 | | 0.2365 | |0 | +| - p1 | | | |f1 | | 0.2849 | | 0 | +| - p2 | | | |f1 | | 0.2384 | | 0 | +| - p3 | | | |f1 | | 0.1861 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__it__10shot.txt b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc5b1148df6fbd0d32f982b72350b9baed9e392d --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__it__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6441 | |0 | +| - p1 | | | |f1 | | 0.6430 | | 0 | +| - p2 | | | |f1 | | 0.6437 | | 0 | +| - p3 | | | |f1 | | 0.6457 | | 0 | +| - RE | | | |f1 | | 0.3556 | |0 | +| - p1 | | | |f1 | | 0.2708 | | 0 | +| - p2 | | | |f1 | | 0.4099 | | 0 | +| - p3 | | | |f1 | | 0.3860 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__pl__0shot.txt b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a7d8e28e086f27eb9637e6ad992ba36f0de390a --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0468 | |0 | +| - p1 | | | |f1 | | 0.0483 | | 0 | +| - p2 | | | |f1 | | 0.0439 | | 0 | +| - p3 | | | |f1 | | 0.0483 | | 0 | +| - RE | | | |f1 | | 0.1823 | |0 | +| - p1 | | | |f1 | | 0.2123 | | 0 | +| - p2 | | | |f1 | | 0.1686 | | 0 | +| - p3 | | | |f1 | | 
0.1661 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__pl__10shot.txt b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..824dea14ef26e4fd07539f5cdc57cb0d72d7a869 --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5375 | |0 | +| - p1 | | | |f1 | | 0.5352 | | 0 | +| - p2 | | | |f1 | | 0.5421 | | 0 | +| - p3 | | | |f1 | | 0.5352 | | 0 | +| - RE | | | |f1 | | 0.1906 | |0 | +| - p1 | | | |f1 | | 0.1863 | | 0 | +| - p2 | | | |f1 | | 0.1855 | | 0 | +| - p3 | | | |f1 | | 0.2001 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__sk__0shot.txt b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c38472f1e9cb2c7693144b17179e9dcfe88f159 --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0738 | |0 | +| - p1 | | | |f1 | | 0.0685 | | 0 | +| - p2 | | | |f1 | | 0.0844 | | 0 | +| - p3 | | | |f1 | | 0.0685 | | 0 | +| - RE | | | |f1 | | 0.1596 | |0 | +| - p1 | | | |f1 | | 0.1696 | | 0 | +| - p2 | | | |f1 | | 0.1396 | | 0 | +| - p3 | | | |f1 | | 0.1696 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__sk__10shot.txt b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__sk__10shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..6e3e1b2ef67ae845db75704f414dd97a01bc4d8a --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5030 | |0 | +| - p1 | | | |f1 | | 0.5025 | | 0 | +| - p2 | | | |f1 | | 0.5040 | | 0 | +| - p3 | | | |f1 | | 0.5025 | | 0 | +| - RE | | | |f1 | | 0.1832 | |0 | +| - p1 | | | |f1 | | 0.1237 | | 0 | +| - p2 | | | |f1 | | 0.2166 | | 0 | +| - p3 | | | |f1 | | 0.2094 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__sl__0shot.txt b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..832ca83ee26ae570b1c4e4d781100383be94e147 --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0842 | |0 | +| - p1 | | | |f1 | | 0.0861 | | 0 | +| - p2 | | | |f1 | | 0.0805 | | 0 | +| - p3 | | | |f1 | | 0.0861 | | 0 | +| - RE | | | |f1 | | 0.1905 | |0 | +| - p1 | | | |f1 | | 0.2309 | | 0 | +| - p2 | | | |f1 | | 0.1096 | | 0 | +| - p3 | | | |f1 | | 0.2309 | | 0 | diff --git a/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__sl__10shot.txt b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2b96898a18579d2b16376fa5e4d1159ed4fc544 --- /dev/null +++ b/csv_files/outputs/mistralai__Mistral-Nemo-Instruct-2407__sl__10shot.txt @@ -0,0 +1,11 @@ +hf 
(pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5327 | |0 | +| - p1 | | | |f1 | | 0.5323 | | 0 | +| - p2 | | | |f1 | | 0.5335 | | 0 | +| - p3 | | | |f1 | | 0.5323 | | 0 | +| - RE | | | |f1 | | 0.1725 | |0 | +| - p1 | | | |f1 | | 0.1390 | | 0 | +| - p2 | | | |f1 | | 0.2057 | | 0 | +| - p3 | | | |f1 | | 0.1727 | | 0 | diff --git a/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__en__0shot.txt b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..0cdeef408ee3cfe2f6230ee84b4f0d454d6847c9 --- /dev/null +++ b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2658 | |0 | +| - p1 | | | |f1 | | 0.2270 | | 0 | +| - p2 | | | |f1 | | 0.2709 | | 0 | +| - p3 | | | |f1 | | 0.2996 | | 0 | +| - RE | | | |f1 | | 0.3280 | |0 | +| - p1 | | | |f1 | | 0.2157 | | 0 | +| - p2 | | | |f1 | | 0.3835 | | 0 | +| - p3 | | | |f1 | | 0.3848 | | 0 | diff --git a/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__en__10shot.txt b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8852cb62bd0e04b214587916191c6b150f925661 --- /dev/null +++ b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5730 | |0 | +| - p1 | | | |f1 | | 0.5840 | | 0 | +| - p2 | | | |f1 | | 0.5421 | | 0 | 
+| - p3 | | | |f1 | | 0.5928 | | 0 | +| - RE | | | |f1 | | 0.5145 | |0 | +| - p1 | | | |f1 | | 0.4335 | | 0 | +| - p2 | | | |f1 | | 0.5586 | | 0 | +| - p3 | | | |f1 | | 0.5515 | | 0 | diff --git a/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__gr__0shot.txt b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7516c1517104e5be8c21dca55222faa85473fdc --- /dev/null +++ b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1585 | |0 | +| - p1 | | | |f1 | | 0.2130 | | 0 | +| - p2 | | | |f1 | | 0.0495 | | 0 | +| - p3 | | | |f1 | | 0.2130 | | 0 | +| - RE | | | |f1 | | 0.0506 | |0 | +| - p1 | | | |f1 | | 0.0401 | | 0 | +| - p2 | | | |f1 | | 0.0250 | | 0 | +| - p3 | | | |f1 | | 0.0867 | | 0 | diff --git a/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__gr__10shot.txt b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba5c002a9264a1d56a51c72b4dc642ee87b8c605 --- /dev/null +++ b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3448 | |0 | +| - p1 | | | |f1 | | 0.3345 | | 0 | +| - p2 | | | |f1 | | 0.3655 | | 0 | +| - p3 | | | |f1 | | 0.3345 | | 0 | +| - RE | | | |f1 | | 0.3591 | |0 | +| - p1 | | | |f1 | | 0.3749 | | 0 | +| - p2 | | | |f1 | | 0.3755 | | 0 | +| - p3 | | | |f1 | | 0.3268 | | 0 | diff --git a/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__it__0shot.txt 
b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bfdea743aa846d49f2acc6c0ba67e678ce8c4b0 --- /dev/null +++ b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__it__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2011 | |0 | +| - p1 | | | |f1 | | 0.1261 | | 0 | +| - p2 | | | |f1 | | 0.2327 | | 0 | +| - p3 | | | |f1 | | 0.2444 | | 0 | +| - RE | | | |f1 | | 0.1865 | |0 | +| - p1 | | | |f1 | | 0.2404 | | 0 | +| - p2 | | | |f1 | | 0.1699 | | 0 | +| - p3 | | | |f1 | | 0.1492 | | 0 | diff --git a/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__it__10shot.txt b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8b141d311893ba8c76e8ed7cce50f6f06752573 --- /dev/null +++ b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__it__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5625 | |0 | +| - p1 | | | |f1 | | 0.5821 | | 0 | +| - p2 | | | |f1 | | 0.5432 | | 0 | +| - p3 | | | |f1 | | 0.5622 | | 0 | +| - RE | | | |f1 | | 0.5226 | |0 | +| - p1 | | | |f1 | | 0.4622 | | 0 | +| - p2 | | | |f1 | | 0.5458 | | 0 | +| - p3 | | | |f1 | | 0.5597 | | 0 | diff --git a/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__pl__0shot.txt b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..edaf86b247e1c20e0f7c4138f96ec19d5a571ae4 --- /dev/null +++ b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__pl__0shot.txt @@ -0,0 +1,11 @@ +hf 
(pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2414 | |0 | +| - p1 | | | |f1 | | 0.2452 | | 0 | +| - p2 | | | |f1 | | 0.2338 | | 0 | +| - p3 | | | |f1 | | 0.2452 | | 0 | +| - RE | | | |f1 | | 0.0963 | |0 | +| - p1 | | | |f1 | | 0.1501 | | 0 | +| - p2 | | | |f1 | | 0.0123 | | 0 | +| - p3 | | | |f1 | | 0.1264 | | 0 | diff --git a/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__pl__10shot.txt b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..068f9654a4427b28cd68c4493756660bf40e63a0 --- /dev/null +++ b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4244 | |0 | +| - p1 | | | |f1 | | 0.4304 | | 0 | +| - p2 | | | |f1 | | 0.4123 | | 0 | +| - p3 | | | |f1 | | 0.4304 | | 0 | +| - RE | | | |f1 | | 0.5396 | |0 | +| - p1 | | | |f1 | | 0.5129 | | 0 | +| - p2 | | | |f1 | | 0.5571 | | 0 | +| - p3 | | | |f1 | | 0.5489 | | 0 | diff --git a/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__sk__0shot.txt b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..6caf6d188e37dd8d852231914da9bca9053abf92 --- /dev/null +++ b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2871 | |0 | +| - p1 | | | |f1 | | 0.2717 | | 0 | +| - p2 | | | |f1 | | 0.3178 | | 0 | +| - p3 | | 
| |f1 | | 0.2717 | | 0 | +| - RE | | | |f1 | | 0.0182 | |0 | +| - p1 | | | |f1 | | 0.0143 | | 0 | +| - p2 | | | |f1 | | 0.0260 | | 0 | +| - p3 | | | |f1 | | 0.0143 | | 0 | diff --git a/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__sk__10shot.txt b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc5594c73257786e6edd1f8c852ad343d66e7f30 --- /dev/null +++ b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4402 | |0 | +| - p1 | | | |f1 | | 0.4545 | | 0 | +| - p2 | | | |f1 | | 0.4116 | | 0 | +| - p3 | | | |f1 | | 0.4545 | | 0 | +| - RE | | | |f1 | | 0.4261 | |0 | +| - p1 | | | |f1 | | 0.3750 | | 0 | +| - p2 | | | |f1 | | 0.4695 | | 0 | +| - p3 | | | |f1 | | 0.4338 | | 0 | diff --git a/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__sl__0shot.txt b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d586cd4e31f21369f0d3c8873dba4eb0ce073b5 --- /dev/null +++ b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2297 | |0 | +| - p1 | | | |f1 | | 0.2519 | | 0 | +| - p2 | | | |f1 | | 0.1853 | | 0 | +| - p3 | | | |f1 | | 0.2519 | | 0 | +| - RE | | | |f1 | | 0.0050 | |0 | +| - p1 | | | |f1 | | 0.0047 | | 0 | +| - p2 | | | |f1 | | 0.0058 | | 0 | +| - p3 | | | |f1 | | 0.0047 | | 0 | diff --git a/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__sl__10shot.txt b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__sl__10shot.txt 
new file mode 100644 index 0000000000000000000000000000000000000000..999a14510d1ea855adb9835bc9235c19f1a60783 --- /dev/null +++ b/csv_files/outputs/tiiuae__Falcon3-10B-Instruct__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4050 | |0 | +| - p1 | | | |f1 | | 0.4121 | | 0 | +| - p2 | | | |f1 | | 0.3909 | | 0 | +| - p3 | | | |f1 | | 0.4121 | | 0 | +| - RE | | | |f1 | | 0.3133 | |0 | +| - p1 | | | |f1 | | 0.2323 | | 0 | +| - p2 | | | |f1 | | 0.3012 | | 0 | +| - p3 | | | |f1 | | 0.4063 | | 0 | diff --git a/csv_files/outputs/unsloth__phi-4__en__0shot.txt b/csv_files/outputs/unsloth__phi-4__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba5d152519748cd98fa31e6e3d3083ba897a70ba --- /dev/null +++ b/csv_files/outputs/unsloth__phi-4__en__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0275 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0252 | | 0 | +| - p3 | | | |f1 | | 0.0572 | | 0 | +| - RE | | | |f1 | | 0.4090 | |0 | +| - p1 | | | |f1 | | 0.4022 | | 0 | +| - p2 | | | |f1 | | 0.4219 | | 0 | +| - p3 | | | |f1 | | 0.4030 | | 0 | diff --git a/csv_files/outputs/unsloth__phi-4__en__10shot.txt b/csv_files/outputs/unsloth__phi-4__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f769aedc59b9b6be40222cca25d09daf6ffd0b3 --- /dev/null +++ b/csv_files/outputs/unsloth__phi-4__en__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 
| | 0.5984 | |0 | +| - p1 | | | |f1 | | 0.6098 | | 0 | +| - p2 | | | |f1 | | 0.5711 | | 0 | +| - p3 | | | |f1 | | 0.6141 | | 0 | +| - RE | | | |f1 | | 0.5364 | |0 | +| - p1 | | | |f1 | | 0.4912 | | 0 | +| - p2 | | | |f1 | | 0.5626 | | 0 | +| - p3 | | | |f1 | | 0.5554 | | 0 | diff --git a/csv_files/outputs/unsloth__phi-4__gr__0shot.txt b/csv_files/outputs/unsloth__phi-4__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a17e8d575c703b7ccebc72cd8ff6aeca0397f1cc --- /dev/null +++ b/csv_files/outputs/unsloth__phi-4__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - RE | | | |f1 | | 0.2011 | |0 | +| - p1 | | | |f1 | | 0.2901 | | 0 | +| - p2 | | | |f1 | | 0.2208 | | 0 | +| - p3 | | | |f1 | | 0.0925 | | 0 | diff --git a/csv_files/outputs/unsloth__phi-4__gr__10shot.txt b/csv_files/outputs/unsloth__phi-4__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f48b7d4235602d80223abd071f8764d2a1a5bfc --- /dev/null +++ b/csv_files/outputs/unsloth__phi-4__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5682 | |0 | +| - p1 | | | |f1 | | 0.5717 | | 0 | +| - p2 | | | |f1 | | 0.5611 | | 0 | +| - p3 | | | |f1 | | 0.5717 | | 0 | +| - RE | | | |f1 | | 0.5291 | |0 | +| - p1 | | | |f1 | | 0.4935 | | 0 | +| - p2 | | | |f1 | | 0.5261 | | 0 | +| - p3 | | | |f1 | | 0.5678 | | 0 | diff --git a/csv_files/outputs/unsloth__phi-4__it__0shot.txt b/csv_files/outputs/unsloth__phi-4__it__0shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..85e165655342f1ca2d8da14464c516405b5a51c6 --- /dev/null +++ b/csv_files/outputs/unsloth__phi-4__it__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1717 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.1724 | | 0 | +| - p3 | | | |f1 | | 0.3428 | | 0 | +| - RE | | | |f1 | | 0.3589 | |0 | +| - p1 | | | |f1 | | 0.3354 | | 0 | +| - p2 | | | |f1 | | 0.3737 | | 0 | +| - p3 | | | |f1 | | 0.3677 | | 0 | diff --git a/csv_files/outputs/unsloth__phi-4__it__10shot.txt b/csv_files/outputs/unsloth__phi-4__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..be3c82925982b576686dd2db835c6bbc58fc89f7 --- /dev/null +++ b/csv_files/outputs/unsloth__phi-4__it__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6759 | |0 | +| - p1 | | | |f1 | | 0.6647 | | 0 | +| - p2 | | | |f1 | | 0.6732 | | 0 | +| - p3 | | | |f1 | | 0.6897 | | 0 | +| - RE | | | |f1 | | 0.5705 | |0 | +| - p1 | | | |f1 | | 0.5608 | | 0 | +| - p2 | | | |f1 | | 0.5820 | | 0 | +| - p3 | | | |f1 | | 0.5688 | | 0 | diff --git a/csv_files/outputs/unsloth__phi-4__pl__0shot.txt b/csv_files/outputs/unsloth__phi-4__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..50d734915f57e7a4713da8e3d4cb6ae9a653a9a1 --- /dev/null +++ b/csv_files/outputs/unsloth__phi-4__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0279 | |0 | +| - p1 | | | |f1 | | 0.0236 | | 0 | +| 
- p2 | | | |f1 | | 0.0366 | | 0 | +| - p3 | | | |f1 | | 0.0236 | | 0 | +| - RE | | | |f1 | | 0.3814 | |0 | +| - p1 | | | |f1 | | 0.3799 | | 0 | +| - p2 | | | |f1 | | 0.3829 | | 0 | +| - p3 | | | |f1 | | 0.3813 | | 0 | diff --git a/csv_files/outputs/unsloth__phi-4__pl__10shot.txt b/csv_files/outputs/unsloth__phi-4__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..13c70462fcbbc4333d7e40ab047995e60782311c --- /dev/null +++ b/csv_files/outputs/unsloth__phi-4__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5474 | |0 | +| - p1 | | | |f1 | | 0.5549 | | 0 | +| - p2 | | | |f1 | | 0.5324 | | 0 | +| - p3 | | | |f1 | | 0.5549 | | 0 | +| - RE | | | |f1 | | 0.5718 | |0 | +| - p1 | | | |f1 | | 0.5423 | | 0 | +| - p2 | | | |f1 | | 0.5760 | | 0 | +| - p3 | | | |f1 | | 0.5972 | | 0 | diff --git a/csv_files/outputs/unsloth__phi-4__sk__0shot.txt b/csv_files/outputs/unsloth__phi-4__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..609bfee5abd16055de50dbbc8a5b5e54bf628dde --- /dev/null +++ b/csv_files/outputs/unsloth__phi-4__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0567 | |0 | +| - p1 | | | |f1 | | 0.0316 | | 0 | +| - p2 | | | |f1 | | 0.1070 | | 0 | +| - p3 | | | |f1 | | 0.0316 | | 0 | +| - RE | | | |f1 | | 0.3277 | |0 | +| - p1 | | | |f1 | | 0.3252 | | 0 | +| - p2 | | | |f1 | | 0.3326 | | 0 | +| - p3 | | | |f1 | | 0.3252 | | 0 | diff --git a/csv_files/outputs/unsloth__phi-4__sk__10shot.txt b/csv_files/outputs/unsloth__phi-4__sk__10shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..e55439f603a7ee43ebc4fb2b6489d94a69f17b05 --- /dev/null +++ b/csv_files/outputs/unsloth__phi-4__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5524 | |0 | +| - p1 | | | |f1 | | 0.5561 | | 0 | +| - p2 | | | |f1 | | 0.5449 | | 0 | +| - p3 | | | |f1 | | 0.5561 | | 0 | +| - RE | | | |f1 | | 0.5214 | |0 | +| - p1 | | | |f1 | | 0.5106 | | 0 | +| - p2 | | | |f1 | | 0.4994 | | 0 | +| - p3 | | | |f1 | | 0.5541 | | 0 | diff --git a/csv_files/outputs/unsloth__phi-4__sl__0shot.txt b/csv_files/outputs/unsloth__phi-4__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..29578b4d5063f990ad13a10dcac7d69a04c24725 --- /dev/null +++ b/csv_files/outputs/unsloth__phi-4__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2241 | |0 | +| - p1 | | | |f1 | | 0.2870 | | 0 | +| - p2 | | | |f1 | | 0.0981 | | 0 | +| - p3 | | | |f1 | | 0.2870 | | 0 | +| - RE | | | |f1 | | 0.2721 | |0 | +| - p1 | | | |f1 | | 0.3209 | | 0 | +| - p2 | | | |f1 | | 0.1744 | | 0 | +| - p3 | | | |f1 | | 0.3209 | | 0 | diff --git a/csv_files/outputs/unsloth__phi-4__sl__10shot.txt b/csv_files/outputs/unsloth__phi-4__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..debd951319f9e20f02aade8491ff82efa207384f --- /dev/null +++ b/csv_files/outputs/unsloth__phi-4__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5577 | |0 | +| - p1 | | | |f1 | | 0.5586 | | 0 | 
+| - p2 | | | |f1 | | 0.5558 | | 0 | +| - p3 | | | |f1 | | 0.5586 | | 0 | +| - RE | | | |f1 | | 0.5309 | |0 | +| - p1 | | | |f1 | | 0.5117 | | 0 | +| - p2 | | | |f1 | | 0.5232 | | 0 | +| - p3 | | | |f1 | | 0.5579 | | 0 | diff --git a/csv_new/llm_scores_p1_final.xlsx b/csv_new/llm_scores_p1_final.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..923b919a1bab64c07772e60306214192f33545b8 Binary files /dev/null and b/csv_new/llm_scores_p1_final.xlsx differ diff --git a/csv_new/llm_scores_p2_final.xlsx b/csv_new/llm_scores_p2_final.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..4e6ac35e33f86ef21781bf32c1ff038811d49b85 Binary files /dev/null and b/csv_new/llm_scores_p2_final.xlsx differ diff --git a/csv_new/llm_scores_p3_final.xlsx b/csv_new/llm_scores_p3_final.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..a1f28fbada7040d1d53b7925e954ec80b5aeeb19 Binary files /dev/null and b/csv_new/llm_scores_p3_final.xlsx differ diff --git a/csv_new/output/.ipynb_checkpoints/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot-checkpoint.txt b/csv_new/output/.ipynb_checkpoints/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot-checkpoint.txt new file mode 100644 index 0000000000000000000000000000000000000000..92e6941722e5350b9a314942add661e213655787 --- /dev/null +++ b/csv_new/output/.ipynb_checkpoints/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot-checkpoint.txt @@ -0,0 +1,23 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2877 | |0 | +| - p1 | | | |f1 | | 0.1963 | | 0 | +| - p2 | | | |f1 | | 0.3459 | | 0 | +| - p3 | | | |f1 | | 0.3208 | | 0 | +| - RE | | | |f1 | | 0.4430 | |0 | +| - p1 | | | |f1 | | 0.4487 | | 0 | +| - p2 | | | |f1 | | 0.4492 | | 0 | +| - p3 | | | |f1 | | 0.4311 | | 0 | +| - RML | | | |f1 | | 
0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/Henrychur__MMed-Llama-3-8B__en__0shot.txt b/csv_new/output/Henrychur__MMed-Llama-3-8B__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e0d44698fc193727d37adeae0ccf2ccebdf34f3 --- /dev/null +++ b/csv_new/output/Henrychur__MMed-Llama-3-8B__en__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0918 | |0 | +| - p1 | | | |f1 | | 0.0629 | | 0 | +| - p2 | | | |f1 | | 0.1041 | | 0 | +| - p3 | | | |f1 | | 0.1083 | | 0 | +| - RE | | | |f1 | | 0.2604 | |0 | +| - p1 | | | |f1 | | 0.1287 | | 0 | +| - p2 | | | |f1 | | 0.3394 | | 0 | +| - p3 | | | |f1 | | 0.3131 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/Henrychur__MMed-Llama-3-8B__en__10shot.txt b/csv_new/output/Henrychur__MMed-Llama-3-8B__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d2b0d31313a12b76c1464713902f00fe033d096 --- /dev/null +++ 
b/csv_new/output/Henrychur__MMed-Llama-3-8B__en__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2142 | |0 | +| - p1 | | | |f1 | | 0.2189 | | 0 | +| - p2 | | | |f1 | | 0.2243 | | 0 | +| - p3 | | | |f1 | | 0.1994 | | 0 | +| - RE | | | |f1 | | 0.1681 | |0 | +| - p1 | | | |f1 | | 0.1189 | | 0 | +| - p2 | | | |f1 | | 0.1668 | | 0 | +| - p3 | | | |f1 | | 0.2185 | | 0 | +| - RML | | | |f1 | | 0.1779 | |0 | +| - p1 | | | |f1 | | 0.1825 | | 0 | +| - p2 | | | |f1 | | 0.1612 | | 0 | +| - p3 | | | |f1 | | 0.1900 | | 0 | +| - DIA | | | |f1 | | 0.1500 | |0 | +| - p1 | | | |f1 | | 0.2415 | | 0 | +| - p2 | | | |f1 | | 0.1416 | | 0 | +| - p3 | | | |f1 | | 0.0668 | | 0 | +| - HIS | | | |f1 | | 0.0147 | |0 | +| - p1 | | | |f1 | | 0.0178 | | 0 | +| - p2 | | | |f1 | | 0.0068 | | 0 | +| - p3 | | | |f1 | | 0.0194 | | 0 | diff --git a/csv_new/output/Henrychur__MMed-Llama-3-8B__gr__0shot.txt b/csv_new/output/Henrychur__MMed-Llama-3-8B__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..256806ae1aae91613bb15e7f61973bb2c3d373e9 --- /dev/null +++ b/csv_new/output/Henrychur__MMed-Llama-3-8B__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0611 | |0 | +| - p1 | | | |f1 | | 0.0620 | | 0 | +| - p2 | | | |f1 | | 0.0592 | | 0 | +| - p3 | | | |f1 | | 0.0620 | | 0 | +| - RE | | | |f1 | | 0.0863 | |0 | +| - p1 | | | |f1 | | 0.1017 | | 0 | +| - p2 | | | |f1 | | 0.0506 | | 0 | +| - p3 | | | |f1 | | 0.1065 | | 0 | diff --git a/csv_new/output/Henrychur__MMed-Llama-3-8B__gr__10shot.txt b/csv_new/output/Henrychur__MMed-Llama-3-8B__gr__10shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..e1968c70df7ae59de71b96c9719693f1041cc591 --- /dev/null +++ b/csv_new/output/Henrychur__MMed-Llama-3-8B__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1474 | |0 | +| - p1 | | | |f1 | | 0.1667 | | 0 | +| - p2 | | | |f1 | | 0.1089 | | 0 | +| - p3 | | | |f1 | | 0.1667 | | 0 | +| - RE | | | |f1 | | 0.0970 | |0 | +| - p1 | | | |f1 | | 0.0821 | | 0 | +| - p2 | | | |f1 | | 0.1053 | | 0 | +| - p3 | | | |f1 | | 0.1036 | | 0 | diff --git a/csv_new/output/Henrychur__MMed-Llama-3-8B__it__0shot.txt b/csv_new/output/Henrychur__MMed-Llama-3-8B__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..ddbca560d833645b835f8b9d6440e08d5992aaa9 --- /dev/null +++ b/csv_new/output/Henrychur__MMed-Llama-3-8B__it__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0416 | |0 | +| - p1 | | | |f1 | | 0.0435 | | 0 | +| - p2 | | | |f1 | | 0.0429 | | 0 | +| - p3 | | | |f1 | | 0.0384 | | 0 | +| - RE | | | |f1 | | 0.1413 | |0 | +| - p1 | | | |f1 | | 0.0672 | | 0 | +| - p2 | | | |f1 | | 0.2266 | | 0 | +| - p3 | | | |f1 | | 0.1300 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git 
a/csv_new/output/Henrychur__MMed-Llama-3-8B__it__10shot.txt b/csv_new/output/Henrychur__MMed-Llama-3-8B__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ee3bab5e26688cb910d34e5a2d273285ee07ee5 --- /dev/null +++ b/csv_new/output/Henrychur__MMed-Llama-3-8B__it__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3753 | |0 | +| - p1 | | | |f1 | | 0.3299 | | 0 | +| - p2 | | | |f1 | | 0.4023 | | 0 | +| - p3 | | | |f1 | | 0.3938 | | 0 | +| - RE | | | |f1 | | 0.1331 | |0 | +| - p1 | | | |f1 | | 0.0977 | | 0 | +| - p2 | | | |f1 | | 0.1226 | | 0 | +| - p3 | | | |f1 | | 0.1789 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.1044 | |0 | +| - p1 | | | |f1 | | 0.0821 | | 0 | +| - p2 | | | |f1 | | 0.1119 | | 0 | +| - p3 | | | |f1 | | 0.1190 | | 0 | +| - HIS | | | |f1 | | 0.0007 | |0 | +| - p1 | | | |f1 | | 0.0010 | | 0 | +| - p2 | | | |f1 | | 0.0002 | | 0 | +| - p3 | | | |f1 | | 0.0008 | | 0 | diff --git a/csv_new/output/Henrychur__MMed-Llama-3-8B__pl__0shot.txt b/csv_new/output/Henrychur__MMed-Llama-3-8B__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..922dce80469337edc75e7835aa6a600369523091 --- /dev/null +++ b/csv_new/output/Henrychur__MMed-Llama-3-8B__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0379 | |0 | +| - p1 | | | |f1 | | 0.0379 | | 0 | +| - p2 | | | |f1 | | 0.0378 | | 0 | +| - p3 | | | |f1 | | 0.0379 | | 0 | +| - RE | | | |f1 | | 0.0891 | |0 | +| - p1 | | | |f1 | | 
0.0602 | | 0 | +| - p2 | | | |f1 | | 0.1293 | | 0 | +| - p3 | | | |f1 | | 0.0778 | | 0 | diff --git a/csv_new/output/Henrychur__MMed-Llama-3-8B__pl__10shot.txt b/csv_new/output/Henrychur__MMed-Llama-3-8B__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9dee6185f81350fdc85c72cb4a61be93b071cc61 --- /dev/null +++ b/csv_new/output/Henrychur__MMed-Llama-3-8B__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3966 | |0 | +| - p1 | | | |f1 | | 0.3992 | | 0 | +| - p2 | | | |f1 | | 0.3916 | | 0 | +| - p3 | | | |f1 | | 0.3992 | | 0 | +| - RE | | | |f1 | | 0.1003 | |0 | +| - p1 | | | |f1 | | 0.0998 | | 0 | +| - p2 | | | |f1 | | 0.1055 | | 0 | +| - p3 | | | |f1 | | 0.0956 | | 0 | diff --git a/csv_new/output/Henrychur__MMed-Llama-3-8B__sk__0shot.txt b/csv_new/output/Henrychur__MMed-Llama-3-8B__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a25091bbb80e0a2681548322565c52cb0858b07 --- /dev/null +++ b/csv_new/output/Henrychur__MMed-Llama-3-8B__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0385 | |0 | +| - p1 | | | |f1 | | 0.0387 | | 0 | +| - p2 | | | |f1 | | 0.0380 | | 0 | +| - p3 | | | |f1 | | 0.0387 | | 0 | +| - RE | | | |f1 | | 0.0174 | |0 | +| - p1 | | | |f1 | | 0.0121 | | 0 | +| - p2 | | | |f1 | | 0.0280 | | 0 | +| - p3 | | | |f1 | | 0.0121 | | 0 | diff --git a/csv_new/output/Henrychur__MMed-Llama-3-8B__sk__10shot.txt b/csv_new/output/Henrychur__MMed-Llama-3-8B__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c591c7f0a88ced816e237245a16bdc6d688db83 --- /dev/null +++ 
b/csv_new/output/Henrychur__MMed-Llama-3-8B__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3507 | |0 | +| - p1 | | | |f1 | | 0.3444 | | 0 | +| - p2 | | | |f1 | | 0.3632 | | 0 | +| - p3 | | | |f1 | | 0.3444 | | 0 | +| - RE | | | |f1 | | 0.0884 | |0 | +| - p1 | | | |f1 | | 0.0734 | | 0 | +| - p2 | | | |f1 | | 0.1045 | | 0 | +| - p3 | | | |f1 | | 0.0875 | | 0 | diff --git a/csv_new/output/Henrychur__MMed-Llama-3-8B__sl__0shot.txt b/csv_new/output/Henrychur__MMed-Llama-3-8B__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..af66f9c26430a2440fce61f08cdf1c00204b2cf0 --- /dev/null +++ b/csv_new/output/Henrychur__MMed-Llama-3-8B__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0438 | |0 | +| - p1 | | | |f1 | | 0.0429 | | 0 | +| - p2 | | | |f1 | | 0.0456 | | 0 | +| - p3 | | | |f1 | | 0.0429 | | 0 | +| - RE | | | |f1 | | 0.1278 | |0 | +| - p1 | | | |f1 | | 0.0967 | | 0 | +| - p2 | | | |f1 | | 0.1900 | | 0 | +| - p3 | | | |f1 | | 0.0967 | | 0 | diff --git a/csv_new/output/Henrychur__MMed-Llama-3-8B__sl__10shot.txt b/csv_new/output/Henrychur__MMed-Llama-3-8B__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5a52d295a6f2f02b23f1a057560f2abba92d1b8 --- /dev/null +++ b/csv_new/output/Henrychur__MMed-Llama-3-8B__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Henrychur/MMed-Llama-3-8B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3720 | |0 | +| - p1 | | | |f1 | | 0.3558 | | 0 | +| 
- p2 | | | |f1 | | 0.4045 | | 0 | +| - p3 | | | |f1 | | 0.3558 | | 0 | +| - RE | | | |f1 | | 0.0762 | |0 | +| - p1 | | | |f1 | | 0.0787 | | 0 | +| - p2 | | | |f1 | | 0.0781 | | 0 | +| - p3 | | | |f1 | | 0.0719 | | 0 | diff --git a/csv_new/output/HiTZ__Medical-mT5-large__en__0shot.txt b/csv_new/output/HiTZ__Medical-mT5-large__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..f52b9ae12df87c2bcbdec0230947eb6d3debbf6b --- /dev/null +++ b/csv_new/output/HiTZ__Medical-mT5-large__en__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0578 | |0 | +| - p1 | | | |f1 | | 0.0940 | | 0 | +| - p2 | | | |f1 | | 0.0331 | | 0 | +| - p3 | | | |f1 | | 0.0464 | | 0 | +| - RE | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/HiTZ__Medical-mT5-large__en__10shot.txt b/csv_new/output/HiTZ__Medical-mT5-large__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b0f20c0b7f3db3eb73357dfd3847f11bf4f7a17 --- /dev/null +++ b/csv_new/output/HiTZ__Medical-mT5-large__en__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - 
NER | | | |f1 | | 0.1317 | |0 | +| - p1 | | | |f1 | | 0.1215 | | 0 | +| - p2 | | | |f1 | | 0.1415 | | 0 | +| - p3 | | | |f1 | | 0.1322 | | 0 | +| - RE | | | |f1 | | 0.0031 | |0 | +| - p1 | | | |f1 | | 0.0028 | | 0 | +| - p2 | | | |f1 | | 0.0016 | | 0 | +| - p3 | | | |f1 | | 0.0049 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/HiTZ__Medical-mT5-large__gr__0shot.txt b/csv_new/output/HiTZ__Medical-mT5-large__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cc7d4a784cb754ed058341765da74fe59e4950e --- /dev/null +++ b/csv_new/output/HiTZ__Medical-mT5-large__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0769 | |0 | +| - p1 | | | |f1 | | 0.0859 | | 0 | +| - p2 | | | |f1 | | 0.0591 | | 0 | +| - p3 | | | |f1 | | 0.0859 | | 0 | +| - RE | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/HiTZ__Medical-mT5-large__gr__10shot.txt b/csv_new/output/HiTZ__Medical-mT5-large__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..4edd50dc0d05a279ed9a6be3efb12660fc646344 --- /dev/null +++ b/csv_new/output/HiTZ__Medical-mT5-large__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | 
|Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1448 | |0 | +| - p1 | | | |f1 | | 0.1455 | | 0 | +| - p2 | | | |f1 | | 0.1434 | | 0 | +| - p3 | | | |f1 | | 0.1455 | | 0 | +| - RE | | | |f1 | | 0.0010 | |0 | +| - p1 | | | |f1 | | 0.0024 | | 0 | +| - p2 | | | |f1 | | 0.0007 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/HiTZ__Medical-mT5-large__it__0shot.txt b/csv_new/output/HiTZ__Medical-mT5-large__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4f0865ef187c77cfa71171ded1282690b2a85dd --- /dev/null +++ b/csv_new/output/HiTZ__Medical-mT5-large__it__0shot.txt @@ -0,0 +1,22 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0812 | |0 | +| - p1 | | | |f1 | | 0.0770 | | 0 | +| - p2 | | | |f1 | | 0.0920 | | 0 | +| - p3 | | | |f1 | | 0.0747 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - RE | | | |f1 | | 0.0000 | |0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/HiTZ__Medical-mT5-large__it__10shot.txt b/csv_new/output/HiTZ__Medical-mT5-large__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..5335a77907e5ae1c8333fe21656b2257d68b9343 --- /dev/null +++ b/csv_new/output/HiTZ__Medical-mT5-large__it__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1 +|Tasks 
|Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1694 | |0 | +| - p1 | | | |f1 | | 0.1616 | | 0 | +| - p2 | | | |f1 | | 0.1774 | | 0 | +| - p3 | | | |f1 | | 0.1690 | | 0 | +| - RE | | | |f1 | | 0.0048 | |0 | +| - p1 | | | |f1 | | 0.0035 | | 0 | +| - p2 | | | |f1 | | 0.0064 | | 0 | +| - p3 | | | |f1 | | 0.0046 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/HiTZ__Medical-mT5-large__pl__0shot.txt b/csv_new/output/HiTZ__Medical-mT5-large__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3febb68ea3e21f9230cb485500075ba859f318f --- /dev/null +++ b/csv_new/output/HiTZ__Medical-mT5-large__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0308 | |0 | +| - p1 | | | |f1 | | 0.0244 | | 0 | +| - p2 | | | |f1 | | 0.0436 | | 0 | +| - p3 | | | |f1 | | 0.0244 | | 0 | +| - RE | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/HiTZ__Medical-mT5-large__pl__10shot.txt b/csv_new/output/HiTZ__Medical-mT5-large__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c96e416317f7b151616c4982e8c0640322bb615 --- /dev/null +++ b/csv_new/output/HiTZ__Medical-mT5-large__pl__10shot.txt @@ -0,0 +1,11 @@ +hf 
(pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1516 | |0 | +| - p1 | | | |f1 | | 0.1500 | | 0 | +| - p2 | | | |f1 | | 0.1548 | | 0 | +| - p3 | | | |f1 | | 0.1500 | | 0 | +| - RE | | | |f1 | | 0.0032 | |0 | +| - p1 | | | |f1 | | 0.0040 | | 0 | +| - p2 | | | |f1 | | 0.0023 | | 0 | +| - p3 | | | |f1 | | 0.0034 | | 0 | diff --git a/csv_new/output/HiTZ__Medical-mT5-large__sk__0shot.txt b/csv_new/output/HiTZ__Medical-mT5-large__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..75cf3c4ce337fe7f13221bf8b230f9c267ae3639 --- /dev/null +++ b/csv_new/output/HiTZ__Medical-mT5-large__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0712 | |0 | +| - p1 | | | |f1 | | 0.0880 | | 0 | +| - p2 | | | |f1 | | 0.0375 | | 0 | +| - p3 | | | |f1 | | 0.0880 | | 0 | +| - RE | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/HiTZ__Medical-mT5-large__sk__10shot.txt b/csv_new/output/HiTZ__Medical-mT5-large__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce7ca5e76b585007a9dc187a6dd14ae6e22f17cc --- /dev/null +++ b/csv_new/output/HiTZ__Medical-mT5-large__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1444 | |0 | +| - p1 | | | |f1 | | 0.1485 | | 0 | +| - p2 | | | |f1 | | 0.1360 | | 0 | +| - p3 | | | |f1 | | 0.1485 | | 0 | +| - RE | | | |f1 | | 0.0027 | |0 | 
+| - p1 | | | |f1 | | 0.0038 | | 0 | +| - p2 | | | |f1 | | 0.0024 | | 0 | +| - p3 | | | |f1 | | 0.0020 | | 0 | diff --git a/csv_new/output/HiTZ__Medical-mT5-large__sl__0shot.txt b/csv_new/output/HiTZ__Medical-mT5-large__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8811248dde3d8e1e5d3e5bd0c4d11888b8adad09 --- /dev/null +++ b/csv_new/output/HiTZ__Medical-mT5-large__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0711 | |0 | +| - p1 | | | |f1 | | 0.0777 | | 0 | +| - p2 | | | |f1 | | 0.0579 | | 0 | +| - p3 | | | |f1 | | 0.0777 | | 0 | +| - RE | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/HiTZ__Medical-mT5-large__sl__10shot.txt b/csv_new/output/HiTZ__Medical-mT5-large__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..97237b461fcde9621e1b414675820a8989f1add9 --- /dev/null +++ b/csv_new/output/HiTZ__Medical-mT5-large__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=HiTZ/Medical-mT5-large ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1422 | |0 | +| - p1 | | | |f1 | | 0.1470 | | 0 | +| - p2 | | | |f1 | | 0.1325 | | 0 | +| - p3 | | | |f1 | | 0.1470 | | 0 | +| - RE | | | |f1 | | 0.0080 | |0 | +| - p1 | | | |f1 | | 0.0073 | | 0 | +| - p2 | | | |f1 | | 0.0074 | | 0 | +| - p3 | | | |f1 | | 0.0093 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__en__0shot.txt b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..86fc15d9c200aecc6911fda3d0517a3a3184138c --- /dev/null +++ 
b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__en__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2500 | |0 | +| - p1 | | | |f1 | | 0.3425 | | 0 | +| - p2 | | | |f1 | | 0.1181 | | 0 | +| - p3 | | | |f1 | | 0.2893 | | 0 | +| - RE | | | |f1 | | 0.4075 | |0 | +| - p1 | | | |f1 | | 0.4135 | | 0 | +| - p2 | | | |f1 | | 0.3917 | | 0 | +| - p3 | | | |f1 | | 0.4172 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0001 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0002 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__en__10shot.txt b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8047c5166003b6dc32ba1a21ccad0b9b41c646a9 --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__en__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5993 | |0 | +| - p1 | | | |f1 | | 0.6091 | | 0 | +| - p2 | | | |f1 | | 0.5646 | | 0 | +| - p3 | | | |f1 | | 0.6243 | | 0 | +| - RE | | | |f1 | | 0.6164 | |0 | +| - p1 | | | |f1 | | 0.6332 | | 0 | +| - p2 | | | |f1 | | 0.6025 | | 0 | +| - p3 | | | |f1 | | 0.6133 | | 0 | +| - RML | | | |f1 | | 0.2843 | |0 | +| - p1 | | | |f1 | | 0.2129 | | 0 | +| - p2 | | | |f1 | | 0.3222 | | 0 | +| - p3 | | | |f1 | | 0.3178 | | 
0 | +| - DIA | | | |f1 | | 0.1658 | |0 | +| - p1 | | | |f1 | | 0.3073 | | 0 | +| - p2 | | | |f1 | | 0.1137 | | 0 | +| - p3 | | | |f1 | | 0.0764 | | 0 | +| - HIS | | | |f1 | | 0.2370 | |0 | +| - p1 | | | |f1 | | 0.1244 | | 0 | +| - p2 | | | |f1 | | 0.4429 | | 0 | +| - p3 | | | |f1 | | 0.1437 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__gr__0shot.txt b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..32e68359dde026f73f4bdc753c7293e1d097dd76 --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1290 | |0 | +| - p1 | | | |f1 | | 0.1339 | | 0 | +| - p2 | | | |f1 | | 0.1191 | | 0 | +| - p3 | | | |f1 | | 0.1339 | | 0 | +| - RE | | | |f1 | | 0.3957 | |0 | +| - p1 | | | |f1 | | 0.3796 | | 0 | +| - p2 | | | |f1 | | 0.4266 | | 0 | +| - p3 | | | |f1 | | 0.3810 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__gr__10shot.txt b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8435fa43de5b6d649de6e305295728062df17d85 --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6028 | |0 | +| - p1 | | | |f1 | | 0.6119 | | 0 | +| - p2 | | | |f1 | | 0.5847 | | 0 | +| - p3 | | | |f1 | | 0.6119 | | 0 | +| - RE | | | |f1 | | 0.6056 | |0 | +| - p1 | | | |f1 | | 0.5962 | | 0 | +| - p2 | | | |f1 | | 0.6024 | | 0 | +| - p3 | | | |f1 | | 0.6183 | | 0 | diff --git 
a/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__it__0shot.txt b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcbfd60fb9ee78af684469935464dcf37905b09a --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__it__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2137 | |0 | +| - p1 | | | |f1 | | 0.2467 | | 0 | +| - p2 | | | |f1 | | 0.1709 | | 0 | +| - p3 | | | |f1 | | 0.2234 | | 0 | +| - RE | | | |f1 | | 0.4016 | |0 | +| - p1 | | | |f1 | | 0.4173 | | 0 | +| - p2 | | | |f1 | | 0.3770 | | 0 | +| - p3 | | | |f1 | | 0.4106 | | 0 | +| - RML | | | |f1 | | 0.0002 | |0 | +| - p1 | | | |f1 | | 0.0007 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__it__10shot.txt b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d74aa8ca4967e04a5e3873eda76473d60166904 --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__it__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6569 | |0 | +| - p1 | | | |f1 | | 0.6719 | | 0 | +| - p2 | | | |f1 | | 0.6327 | | 0 | +| - p3 | | | |f1 | | 0.6661 | | 0 | +| - RE | | | |f1 | | 0.5952 | |0 | +| 
- p1 | | | |f1 | | 0.5767 | | 0 | +| - p2 | | | |f1 | | 0.5998 | | 0 | +| - p3 | | | |f1 | | 0.6093 | | 0 | +| - RML | | | |f1 | | 0.1557 | |0 | +| - p1 | | | |f1 | | 0.1111 | | 0 | +| - p2 | | | |f1 | | 0.1599 | | 0 | +| - p3 | | | |f1 | | 0.1960 | | 0 | +| - DIA | | | |f1 | | 0.2496 | |0 | +| - p1 | | | |f1 | | 0.4407 | | 0 | +| - p2 | | | |f1 | | 0.1328 | | 0 | +| - p3 | | | |f1 | | 0.1753 | | 0 | +| - HIS | | | |f1 | | 0.2339 | |0 | +| - p1 | | | |f1 | | 0.0817 | | 0 | +| - p2 | | | |f1 | | 0.5103 | | 0 | +| - p3 | | | |f1 | | 0.1096 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__pl__0shot.txt b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..2bbaa4e441dac6b8c9ed99f717bd896a34a45e3d --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0586 | |0 | +| - p1 | | | |f1 | | 0.0697 | | 0 | +| - p2 | | | |f1 | | 0.0364 | | 0 | +| - p3 | | | |f1 | | 0.0697 | | 0 | +| - RE | | | |f1 | | 0.4022 | |0 | +| - p1 | | | |f1 | | 0.3803 | | 0 | +| - p2 | | | |f1 | | 0.4464 | | 0 | +| - p3 | | | |f1 | | 0.3800 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__pl__10shot.txt b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..465d16af61fd9338c7188c53fbf60f164ed3aac6 --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6092 | |0 | +| - p1 | | | |f1 | | 0.6226 | 
| 0 | +| - p2 | | | |f1 | | 0.5824 | | 0 | +| - p3 | | | |f1 | | 0.6226 | | 0 | +| - RE | | | |f1 | | 0.5944 | |0 | +| - p1 | | | |f1 | | 0.5991 | | 0 | +| - p2 | | | |f1 | | 0.5466 | | 0 | +| - p3 | | | |f1 | | 0.6375 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sk__0shot.txt b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8660df7e3f0f119e44cf5a67e7a942f913b8aa4d --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0955 | |0 | +| - p1 | | | |f1 | | 0.1220 | | 0 | +| - p2 | | | |f1 | | 0.0426 | | 0 | +| - p3 | | | |f1 | | 0.1220 | | 0 | +| - RE | | | |f1 | | 0.4116 | |0 | +| - p1 | | | |f1 | | 0.4027 | | 0 | +| - p2 | | | |f1 | | 0.4294 | | 0 | +| - p3 | | | |f1 | | 0.4027 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sk__10shot.txt b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..63b5158840c219e67fbf758e2ed730ca530afe7d --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6419 | |0 | +| - p1 | | | |f1 | | 0.6386 | | 0 | +| - p2 | | | |f1 | | 0.6486 | | 0 | +| - p3 | | | |f1 | | 0.6386 | | 0 | +| - RE | | | |f1 | | 0.5899 | |0 | +| - p1 | | | |f1 | | 0.5894 | | 0 | +| - p2 | | | |f1 | | 0.5845 | | 0 | +| - p3 | | | |f1 | | 0.5959 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sl__0shot.txt 
b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..52a254555d051acdd5ed2169b161e4db6559e7f6 --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3398 | |0 | +| - p1 | | | |f1 | | 0.3910 | | 0 | +| - p2 | | | |f1 | | 0.2375 | | 0 | +| - p3 | | | |f1 | | 0.3910 | | 0 | +| - RE | | | |f1 | | 0.3777 | |0 | +| - p1 | | | |f1 | | 0.3775 | | 0 | +| - p2 | | | |f1 | | 0.3783 | | 0 | +| - p3 | | | |f1 | | 0.3775 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sl__10shot.txt b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..11a5d3eb944b1de7399b5736ad5127c36767eac5 --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-14B-Instruct-1M__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-14B-Instruct-1M ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6371 | |0 | +| - p1 | | | |f1 | | 0.6467 | | 0 | +| - p2 | | | |f1 | | 0.6178 | | 0 | +| - p3 | | | |f1 | | 0.6467 | | 0 | +| - RE | | | |f1 | | 0.5837 | |0 | +| - p1 | | | |f1 | | 0.5949 | | 0 | +| - p2 | | | |f1 | | 0.5782 | | 0 | +| - p3 | | | |f1 | | 0.5781 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-32B-Instruct__en__0shot.txt b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3541b24061567ac17d18c28588fd9704681fe09 --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__en__0shot.txt @@ -0,0 +1,25 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, 
batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3279 | |0 | +| - p1 | | | |f1 | | 0.3804 | | 0 | +| - p2 | | | |f1 | | 0.3068 | | 0 | +| - p3 | | | |f1 | | 0.2964 | | 0 | +| - RE | | | |f1 | | 0.4658 | |0 | +| - p1 | | | |f1 | | 0.4734 | | 0 | +| - p2 | | | |f1 | | 0.4649 | | 0 | +| - p3 | | | |f1 | | 0.4591 | | 0 | +| - RML | | | |f1 | | 0.0015 | |0 | +| - p1 | | | |f1 | | 0.0005 | | 0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0057 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0002 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0006 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-32B-Instruct__en__10shot.txt b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..65e185d66b8bc12e9c23da0771639caa6c6472cf --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__en__10shot.txt @@ -0,0 +1,24 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5895 | |0 | +| - p1 | | | |f1 | | 0.5970 | | 0 | +| - p2 | | | |f1 | | 0.5602 | | 0 | +| - p3 | | | |f1 | | 0.6113 | | 0 | +| - RE | | | |f1 | | 0.6440 | |0 | +| - p1 | | | |f1 | | 0.6482 | | 0 | +| - p2 | | | |f1 | | 0.6469 | | 0 | +| - p3 | | | |f1 | | 0.6370 | | 0 | +| - RML | | | |f1 | | 0.0931 | |0 | +| - p1 | | | |f1 | | 0.1501 | | 0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.1383 | | 0 | +| - p3 | | | |f1 | | 0.0839 | | 0 | +| - DIA | | | |f1 | | 0.0286 | |0 | +| 
- p1 | | | |f1 | | 0.0311 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0546 | | 0 | +| - HIS | | | |f1 | | 0.0659 | |0 | +| - p1 | | | |f1 | | 0.0247 | | 0 | +| - p2 | | | |f1 | | 0.1557 | | 0 | +| - p3 | | | |f1 | | 0.0174 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-32B-Instruct__gr__0shot.txt b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d091d2dd0d08ddd7d9ae2f74d581e4787f4ebf9 --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4506 | |0 | +| - p1 | | | |f1 | | 0.5976 | | 0 | +| - p2 | | | |f1 | | 0.1568 | | 0 | +| - p3 | | | |f1 | | 0.5976 | | 0 | +| - RE | | | |f1 | | 0.4104 | |0 | +| - p1 | | | |f1 | | 0.4393 | | 0 | +| - p2 | | | |f1 | | 0.4083 | | 0 | +| - p3 | | | |f1 | | 0.3834 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-32B-Instruct__gr__10shot.txt b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa6241f9b435b69937d53ca833cc5a27fa25c2c0 --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6175 | |0 | +| - p1 | | | |f1 | | 0.6196 | | 0 | +| - p2 | | | |f1 | | 0.6131 | | 0 | +| - p3 | | | |f1 | | 0.6196 | | 0 | +| - RE | | | |f1 | | 0.5840 | |0 | +| - p1 | | | |f1 | | 0.5913 | | 0 | +| - p2 | | | |f1 | | 0.5896 | | 0 | +| - p3 | | | |f1 | | 0.5710 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-32B-Instruct__it__0shot.txt 
b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..31fade95d9724693277c788e245ae4ceaaf485ea --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__it__0shot.txt @@ -0,0 +1,24 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2734 | |0 | +| - p1 | | | |f1 | | 0.3758 | | 0 | +| - p2 | | | |f1 | | 0.1647 | | 0 | +| - p3 | | | |f1 | | 0.2796 | | 0 | +| - RE | | | |f1 | | 0.4370 | |0 | +| - p1 | | | |f1 | | 0.4505 | | 0 | +| - p2 | | | |f1 | | 0.4159 | | 0 | +| - p3 | | | |f1 | | 0.4447 | | 0 | +| - RML | | | |f1 | | 0.0004 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0017 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0003 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0008 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-32B-Instruct__it__10shot.txt b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..92d682ce2a72967223ef7140ed1475ff8208673d --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__it__10shot.txt @@ -0,0 +1,24 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.7005 | |0 | +| - p1 | | | |f1 | | 0.6934 | | 0 | +| - p2 | | | |f1 | | 0.7152 | | 0 | +| - p3 | | | |f1 | | 0.6930 | | 0 | +| - RE | | | |f1 | | 0.5641 | |0 | +| - p1 | | | |f1 | | 0.5801 | | 0 | +| - p2 | | 
| |f1 | | 0.5595 | | 0 | +| - p3 | | | |f1 | | 0.5526 | | 0 | +| - RML | | | |f1 | | 0.0762 | |0 | +| - p1 | | | |f1 | | 0.0398 | | 0 | +| - p2 | | | |f1 | | 0.0599 | | 0 | +| - p3 | | | |f1 | | 0.1025 | | 0 | +| - p3 | | | |f1 | | 0.1025 | | 0 | +| - DIA | | | |f1 | | 0.1086 | |0 | +| - p1 | | | |f1 | | 0.2322 | | 0 | +| - p2 | | | |f1 | | 0.0109 | | 0 | +| - p3 | | | |f1 | | 0.0828 | | 0 | +| - HIS | | | |f1 | | 0.0353 | |0 | +| - p1 | | | |f1 | | 0.0186 | | 0 | +| - p2 | | | |f1 | | 0.0602 | | 0 | +| - p3 | | | |f1 | | 0.0272 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-32B-Instruct__pl__0shot.txt b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4f8030e0178b97b248945b2973d52689441048e --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2428 | |0 | +| - p1 | | | |f1 | | 0.2486 | | 0 | +| - p2 | | | |f1 | | 0.2311 | | 0 | +| - p3 | | | |f1 | | 0.2486 | | 0 | +| - RE | | | |f1 | | 0.4074 | |0 | +| - p1 | | | |f1 | | 0.3865 | | 0 | +| - p2 | | | |f1 | | 0.4569 | | 0 | +| - p3 | | | |f1 | | 0.3788 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-32B-Instruct__pl__10shot.txt b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0657f5bc039e0ef6c46d0a9ab79ea5c33277f47 --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6006 | |0 | +| - p1 | | | |f1 | | 0.6008 | | 0 | +| - p2 | | | |f1 | | 
0.6004 | | 0 | +| - p3 | | | |f1 | | 0.6008 | | 0 | +| - RE | | | |f1 | | 0.5888 | |0 | +| - p1 | | | |f1 | | 0.5858 | | 0 | +| - p2 | | | |f1 | | 0.5868 | | 0 | +| - p3 | | | |f1 | | 0.5938 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-32B-Instruct__sk__0shot.txt b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c2c921f81dfd861433916d7a82eae8f0794ee40 --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3375 | |0 | +| - p1 | | | |f1 | | 0.3578 | | 0 | +| - p2 | | | |f1 | | 0.2968 | | 0 | +| - p3 | | | |f1 | | 0.3578 | | 0 | +| - RE | | | |f1 | | 0.4031 | |0 | +| - p1 | | | |f1 | | 0.3971 | | 0 | +| - p2 | | | |f1 | | 0.4152 | | 0 | +| - p3 | | | |f1 | | 0.3971 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-32B-Instruct__sk__10shot.txt b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..ccd3f8f6a3d5adfc50bb93253d2b1a2baddb48ea --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6720 | |0 | +| - p1 | | | |f1 | | 0.6743 | | 0 | +| - p2 | | | |f1 | | 0.6673 | | 0 | +| - p3 | | | |f1 | | 0.6743 | | 0 | +| - RE | | | |f1 | | 0.5643 | |0 | +| - p1 | | | |f1 | | 0.5733 | | 0 | +| - p2 | | | |f1 | | 0.5586 | | 0 | +| - p3 | | | |f1 | | 0.5609 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-32B-Instruct__sl__0shot.txt b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__sl__0shot.txt new file mode 100644 
index 0000000000000000000000000000000000000000..12d98d519252a33b36dae0af4719974c3d12e5c2 --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3183 | |0 | +| - p1 | | | |f1 | | 0.3344 | | 0 | +| - p2 | | | |f1 | | 0.2863 | | 0 | +| - p3 | | | |f1 | | 0.3344 | | 0 | +| - RE | | | |f1 | | 0.4048 | |0 | +| - p1 | | | |f1 | | 0.3979 | | 0 | +| - p2 | | | |f1 | | 0.4186 | | 0 | +| - p3 | | | |f1 | | 0.3979 | | 0 | diff --git a/csv_new/output/Qwen__Qwen2.5-32B-Instruct__sl__10shot.txt b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..45927874109b49e1ce1db253c58c78ab3ea1a926 --- /dev/null +++ b/csv_new/output/Qwen__Qwen2.5-32B-Instruct__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen2.5-32B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6373 | |0 | +| - p1 | | | |f1 | | 0.6253 | | 0 | +| - p2 | | | |f1 | | 0.6615 | | 0 | +| - p3 | | | |f1 | | 0.6253 | | 0 | +| - RE | | | |f1 | | 0.5727 | |0 | +| - p1 | | | |f1 | | 0.5992 | | 0 | +| - p2 | | | |f1 | | 0.5849 | | 0 | +| - p3 | | | |f1 | | 0.5339 | | 0 | diff --git a/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__en__0shot.txt b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..d02325d7b876b860ed5ef24dc72e0e111501ef18 --- /dev/null +++ b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__en__0shot.txt @@ -0,0 +1,22 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| 
+|-------|-------|------|------|------|----|------|---|------| +| - RE | | | |f1 | | 0.4141 | |0 | +| - p1 | | | |f1 | | 0.4394 | | 0 | +| - p2 | | | |f1 | | 0.4031 | | 0 | +| - p3 | | | |f1 | | 0.3997 | | 0 | +| - RML | | | |f1 | | 0.0001 | |0 | +| - p1 | | | |f1 | | 0.0003 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0001 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - NER | | | |f1 | | 0.4445 | |0 | +| - p2 | | | |f1 | | 0.4162 | | 0 | +| - p3 | | | |f1 | | 0.4729 | | 0 | diff --git a/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__en__10shot.txt b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..6db0705a854106d30d7f6a12c9cd0a3cd6148918 --- /dev/null +++ b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__en__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5907 | |0 | +| - p1 | | | |f1 | | 0.5986 | | 0 | +| - p2 | | | |f1 | | 0.5593 | | 0 | +| - p3 | | | |f1 | | 0.6143 | | 0 | +| - RE | | | |f1 | | 0.5259 | |0 | +| - p1 | | | |f1 | | 0.5150 | | 0 | +| - p2 | | | |f1 | | 0.5261 | | 0 | +| - p3 | | | |f1 | | 0.5364 | | 0 | +| - RML | | | |f1 | | 0.3351 | |0 | +| - p1 | | | |f1 | | 0.3206 | | 0 | +| - p2 | | | |f1 | | 0.3581 | | 0 | +| - p3 | | | |f1 | | 0.3267 | | 0 | +| - DIA | | | |f1 | | 0.3195 | |0 | +| - p1 | | | |f1 | | 0.3810 | | 0 | +| - p2 | | | |f1 | | 0.3651 | | 0 | +| - p3 | | | |f1 | | 0.2125 | | 0 | +| - HIS | | | |f1 | | 0.4256 | |0 | +| - p1 | | | |f1 | | 0.4154 | | 0 | +| - p2 | 
| | |f1 | | 0.2924 | | 0 | +| - p3 | | | |f1 | | 0.5690 | | 0 | diff --git a/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__gr__0shot.txt b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9679a42ef05e59d1976f28f381ab016e9bd01f2b --- /dev/null +++ b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4368 | |0 | +| - p1 | | | |f1 | | 0.4291 | | 0 | +| - p2 | | | |f1 | | 0.4521 | | 0 | +| - p3 | | | |f1 | | 0.4291 | | 0 | +| - RE | | | |f1 | | 0.3776 | |0 | +| - p1 | | | |f1 | | 0.3733 | | 0 | +| - p2 | | | |f1 | | 0.3799 | | 0 | +| - p3 | | | |f1 | | 0.3798 | | 0 | diff --git a/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__gr__10shot.txt b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7b5e451b837ea640578ca02095bc52a621c7ee1 --- /dev/null +++ b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5999 | |0 | +| - p1 | | | |f1 | | 0.6164 | | 0 | +| - p2 | | | |f1 | | 0.5669 | | 0 | +| - p3 | | | |f1 | | 0.6164 | | 0 | +| - RE | | | |f1 | | 0.5149 | |0 | +| - p1 | | | |f1 | | 0.5015 | | 0 | +| - p2 | | | |f1 | | 0.5209 | | 0 | +| - p3 | | | |f1 | | 0.5223 | | 0 | diff --git a/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__it__0shot.txt b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__it__0shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..6aae25b2fed715236df379e0bc39ca9aa0dd12f7 --- /dev/null +++ b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__it__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3572 | |0 | +| - p1 | | | |f1 | | 0.0885 | | 0 | +| - p2 | | | |f1 | | 0.5316 | | 0 | +| - p3 | | | |f1 | | 0.4514 | | 0 | +| - RE | | | |f1 | | 0.3959 | |0 | +| - p1 | | | |f1 | | 0.3784 | | 0 | +| - p2 | | | |f1 | | 0.4123 | | 0 | +| - p3 | | | |f1 | | 0.3972 | | 0 | +| - RML | | | |f1 | | 0.0001 | |0 | +| - p1 | | | |f1 | | 0.0002 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__it__10shot.txt b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..85cb53237a3f4b3ab9e17ad5699c66352f22ca73 --- /dev/null +++ b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__it__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6673 | |0 | +| - p1 | | | |f1 | | 0.6793 | | 0 | +| - p2 | | | |f1 | | 0.6447 | | 0 | +| - p3 | | | |f1 | | 0.6778 | | 0 | +| - RE | | | |f1 | | 0.5982 | |0 | +| - p1 | | | |f1 | | 0.6041 | | 0 | +| - p2 | | | |f1 | | 0.5838 | | 0 | +| - p3 | | | |f1 | | 0.6065 | | 0 | +| - RML | | | |f1 
| | 0.1973 | |0 | +| - p1 | | | |f1 | | 0.1620 | | 0 | +| - p2 | | | |f1 | | 0.2566 | | 0 | +| - p3 | | | |f1 | | 0.1734 | | 0 | +| - DIA | | | |f1 | | 0.4794 | |0 | +| - p1 | | | |f1 | | 0.4512 | | 0 | +| - p2 | | | |f1 | | 0.5464 | | 0 | +| - p3 | | | |f1 | | 0.4407 | | 0 | +| - HIS | | | |f1 | | 0.3069 | |0 | +| - p1 | | | |f1 | | 0.2147 | | 0 | +| - p2 | | | |f1 | | 0.5071 | | 0 | +| - p3 | | | |f1 | | 0.1988 | | 0 | diff --git a/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__pl__0shot.txt b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6358c6bb902fb22713a21ed802c947dd78e7ea6 --- /dev/null +++ b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4235 | |0 | +| - p1 | | | |f1 | | 0.4332 | | 0 | +| - p2 | | | |f1 | | 0.4043 | | 0 | +| - p3 | | | |f1 | | 0.4332 | | 0 | +| - RE | | | |f1 | | 0.4186 | |0 | +| - p1 | | | |f1 | | 0.4152 | | 0 | +| - p2 | | | |f1 | | 0.4220 | | 0 | +| - p3 | | | |f1 | | 0.4187 | | 0 | diff --git a/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__pl__10shot.txt b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f116f8c7deee3f443c689514bb8a23fdb8d305c --- /dev/null +++ b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6118 | |0 | +| - p1 | | | |f1 | | 0.6276 | | 0 | +| - p2 | | | |f1 | | 0.5803 | | 0 | +| - p3 | | | |f1 | | 0.6276 | | 0 | +| - RE | | | 
|f1 | | 0.5166 | |0 | +| - p1 | | | |f1 | | 0.5103 | | 0 | +| - p2 | | | |f1 | | 0.5200 | | 0 | +| - p3 | | | |f1 | | 0.5195 | | 0 | diff --git a/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__sk__0shot.txt b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..82f30fd004efa2674df97a1dae911f0a92ff3e26 --- /dev/null +++ b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3287 | |0 | +| - p1 | | | |f1 | | 0.3231 | | 0 | +| - p2 | | | |f1 | | 0.3398 | | 0 | +| - p3 | | | |f1 | | 0.3231 | | 0 | +| - RE | | | |f1 | | 0.3943 | |0 | +| - p1 | | | |f1 | | 0.3980 | | 0 | +| - p2 | | | |f1 | | 0.3867 | | 0 | +| - p3 | | | |f1 | | 0.3980 | | 0 | diff --git a/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__sk__10shot.txt b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..652672223f87eeb324263928437a787e75b87b20 --- /dev/null +++ b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6030 | |0 | +| - p1 | | | |f1 | | 0.6085 | | 0 | +| - p2 | | | |f1 | | 0.5919 | | 0 | +| - p3 | | | |f1 | | 0.6085 | | 0 | +| - RE | | | |f1 | | 0.5106 | |0 | +| - p1 | | | |f1 | | 0.4920 | | 0 | +| - p2 | | | |f1 | | 0.5025 | | 0 | +| - p3 | | | |f1 | | 0.5373 | | 0 | diff --git a/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__sl__0shot.txt b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__sl__0shot.txt new file mode 
100644 index 0000000000000000000000000000000000000000..126b784a0d6414a7ebb39eb6954f1444f9a726e9 --- /dev/null +++ b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4501 | |0 | +| - p1 | | | |f1 | | 0.4486 | | 0 | +| - p2 | | | |f1 | | 0.4531 | | 0 | +| - p3 | | | |f1 | | 0.4486 | | 0 | +| - RE | | | |f1 | | 0.4118 | |0 | +| - p1 | | | |f1 | | 0.4115 | | 0 | +| - p2 | | | |f1 | | 0.4126 | | 0 | +| - p3 | | | |f1 | | 0.4115 | | 0 | diff --git a/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__sl__10shot.txt b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..827b5e0d0dd790eea628cc4c77b18800829dd3d5 --- /dev/null +++ b/csv_new/output/Qwen__Qwen3-30B-A3B-Instruct-2507__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=Qwen/Qwen3-30B-A3B-Instruct-2507 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6391 | |0 | +| - p1 | | | |f1 | | 0.6615 | | 0 | +| - p2 | | | |f1 | | 0.5944 | | 0 | +| - p3 | | | |f1 | | 0.6615 | | 0 | +| - RE | | | |f1 | | 0.5356 | |0 | +| - p1 | | | |f1 | | 0.5062 | | 0 | +| - p2 | | | |f1 | | 0.5576 | | 0 | +| - p3 | | | |f1 | | 0.5429 | | 0 | diff --git a/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot.txt b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..92e6941722e5350b9a314942add661e213655787 --- /dev/null +++ b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 
0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2877 | |0 | +| - p1 | | | |f1 | | 0.1963 | | 0 | +| - p2 | | | |f1 | | 0.3459 | | 0 | +| - p3 | | | |f1 | | 0.3208 | | 0 | +| - RE | | | |f1 | | 0.4430 | |0 | +| - p1 | | | |f1 | | 0.4487 | | 0 | +| - p2 | | | |f1 | | 0.4492 | | 0 | +| - p3 | | | |f1 | | 0.4311 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__10shot.txt b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a1b8b0df9eb5480ebc575744894b7dd65a6e792 --- /dev/null +++ b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__en__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5963 | |0 | +| - p1 | | | |f1 | | 0.6024 | | 0 | +| - p2 | | | |f1 | | 0.5929 | | 0 | +| - p3 | | | |f1 | | 0.5935 | | 0 | +| - RE | | | |f1 | | 0.5221 | |0 | +| - p1 | | | |f1 | | 0.5191 | | 0 | +| - p2 | | | |f1 | | 0.5199 | | 0 | +| - p3 | | | |f1 | | 0.5273 | | 0 | +| - RML | | | |f1 | | 0.1768 | |0 | +| - p1 | | | |f1 | | 0.1169 | | 0 | +| - p2 | | | |f1 | | 0.1503 | | 0 | +| - p3 | | | |f1 | | 0.2633 | | 0 | +| - DIA | | | |f1 | | 0.2339 | |0 | +| - p1 | | | |f1 | | 0.3117 | | 0 | +| - p2 | | | 
|f1 | | 0.2416 | | 0 | +| - p3 | | | |f1 | | 0.1483 | | 0 | +| - HIS | | | |f1 | | 0.4828 | |0 | +| - p1 | | | |f1 | | 0.3922 | | 0 | +| - p2 | | | |f1 | | 0.5191 | | 0 | +| - p3 | | | |f1 | | 0.5371 | | 0 | diff --git a/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__gr__0shot.txt b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..79aa97329e86a168483edd679e8cc64109aed7a6 --- /dev/null +++ b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3421 | |0 | +| - p1 | | | |f1 | | 0.3455 | | 0 | +| - p2 | | | |f1 | | 0.3354 | | 0 | +| - p3 | | | |f1 | | 0.3455 | | 0 | +| - RE | | | |f1 | | 0.3485 | |0 | +| - p1 | | | |f1 | | 0.2406 | | 0 | +| - p2 | | | |f1 | | 0.3947 | | 0 | +| - p3 | | | |f1 | | 0.4102 | | 0 | diff --git a/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__gr__10shot.txt b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..d10be9c5334b54f3adcb1cee0c3d5a9defc21084 --- /dev/null +++ b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5884 | |0 | +| - p1 | | | |f1 | | 0.5928 | | 0 | +| - p2 | | | |f1 | | 0.5796 | | 0 | +| - p3 | | | |f1 | | 0.5928 | | 0 | +| - RE | | | |f1 | | 0.4415 | |0 | +| - p1 | | | |f1 | | 0.4467 | | 0 | +| - p2 | | | |f1 | | 0.4210 | | 0 | +| - p3 | | | |f1 | | 0.4569 | | 0 | diff --git 
a/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__it__0shot.txt b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b8bc3f3cbb4e83e24bd27ded97151cc85559b8b --- /dev/null +++ b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__it__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3220 | |0 | +| - p1 | | | |f1 | | 0.2678 | | 0 | +| - p2 | | | |f1 | | 0.3568 | | 0 | +| - p3 | | | |f1 | | 0.3414 | | 0 | +| - RE | | | |f1 | | 0.4452 | |0 | +| - p1 | | | |f1 | | 0.4519 | | 0 | +| - p2 | | | |f1 | | 0.4611 | | 0 | +| - p3 | | | |f1 | | 0.4227 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__it__10shot.txt b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..45d0d9c0d3a66e282bc97331a00d9ec040029b26 --- /dev/null +++ b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__it__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6864 | |0 | +| - p1 | | | |f1 | | 0.6982 | | 0 | +| - p2 | | | |f1 
| | 0.6679 | | 0 | +| - p3 | | | |f1 | | 0.6930 | | 0 | +| - RE | | | |f1 | | 0.5530 | |0 | +| - p1 | | | |f1 | | 0.5546 | | 0 | +| - p2 | | | |f1 | | 0.5526 | | 0 | +| - p3 | | | |f1 | | 0.5518 | | 0 | +| - RML | | | |f1 | | 0.0570 | |0 | +| - p1 | | | |f1 | | 0.0308 | | 0 | +| - p2 | | | |f1 | | 0.0174 | | 0 | +| - p3 | | | |f1 | | 0.1228 | | 0 | +| - DIA | | | |f1 | | 0.2965 | |0 | +| - p1 | | | |f1 | | 0.2795 | | 0 | +| - p2 | | | |f1 | | 0.2920 | | 0 | +| - p3 | | | |f1 | | 0.3181 | | 0 | +| - HIS | | | |f1 | | 0.2830 | |0 | +| - p1 | | | |f1 | | 0.2630 | | 0 | +| - p2 | | | |f1 | | 0.2967 | | 0 | +| - p3 | | | |f1 | | 0.2894 | | 0 | diff --git a/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__pl__0shot.txt b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..16a21b3d60d0b28e03e3f22502e906d4f9d2586d --- /dev/null +++ b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3379 | |0 | +| - p1 | | | |f1 | | 0.3204 | | 0 | +| - p2 | | | |f1 | | 0.3728 | | 0 | +| - p3 | | | |f1 | | 0.3204 | | 0 | +| - RE | | | |f1 | | 0.4131 | |0 | +| - p1 | | | |f1 | | 0.3983 | | 0 | +| - p2 | | | |f1 | | 0.4327 | | 0 | +| - p3 | | | |f1 | | 0.4083 | | 0 | diff --git a/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__pl__10shot.txt b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..7bbdde1853e3107c0e3fa26a80be768aedf20a06 --- /dev/null +++ b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 10, batch_size: 
1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6189 | |0 | +| - p1 | | | |f1 | | 0.6214 | | 0 | +| - p2 | | | |f1 | | 0.6140 | | 0 | +| - p3 | | | |f1 | | 0.6214 | | 0 | +| - RE | | | |f1 | | 0.5023 | |0 | +| - p1 | | | |f1 | | 0.4863 | | 0 | +| - p2 | | | |f1 | | 0.5129 | | 0 | +| - p3 | | | |f1 | | 0.5076 | | 0 | diff --git a/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sk__0shot.txt b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffa46c1c5e22e9d8e038069c447c1f026cfc61f6 --- /dev/null +++ b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2521 | |0 | +| - p1 | | | |f1 | | 0.2829 | | 0 | +| - p2 | | | |f1 | | 0.1905 | | 0 | +| - p3 | | | |f1 | | 0.2829 | | 0 | +| - RE | | | |f1 | | 0.3959 | |0 | +| - p1 | | | |f1 | | 0.3893 | | 0 | +| - p2 | | | |f1 | | 0.4091 | | 0 | +| - p3 | | | |f1 | | 0.3893 | | 0 | diff --git a/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sk__10shot.txt b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..6516944fdac3db3ed3380f5c97391fae7dbc061d --- /dev/null +++ b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6302 | |0 | +| - p1 | | | |f1 | | 0.6347 | | 0 | +| - p2 | | | |f1 | | 0.6211 | | 
0 | +| - p3 | | | |f1 | | 0.6347 | | 0 | +| - RE | | | |f1 | | 0.4646 | |0 | +| - p1 | | | |f1 | | 0.4799 | | 0 | +| - p2 | | | |f1 | | 0.4451 | | 0 | +| - p3 | | | |f1 | | 0.4689 | | 0 | diff --git a/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sl__0shot.txt b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0a0fe07a3970ef5a820c76cb9751944d12fdab2 --- /dev/null +++ b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2604 | |0 | +| - p1 | | | |f1 | | 0.2810 | | 0 | +| - p2 | | | |f1 | | 0.2192 | | 0 | +| - p3 | | | |f1 | | 0.2810 | | 0 | +| - RE | | | |f1 | | 0.4116 | |0 | +| - p1 | | | |f1 | | 0.4116 | | 0 | +| - p2 | | | |f1 | | 0.4115 | | 0 | +| - p3 | | | |f1 | | 0.4116 | | 0 | diff --git a/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sl__10shot.txt b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..698e2379856e9df40de4014fdbd473b61395c81b --- /dev/null +++ b/csv_new/output/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6026 | |0 | +| - p1 | | | |f1 | | 0.6015 | | 0 | +| - p2 | | | |f1 | | 0.6049 | | 0 | +| - p3 | | | |f1 | | 0.6015 | | 0 | +| - RE | | | |f1 | | 0.4911 | |0 | +| - p1 | | | |f1 | | 0.5137 | | 0 | +| - p2 | | | |f1 | | 0.4674 | | 0 | +| - p3 | | | |f1 | | 0.4923 | | 0 | diff --git 
a/csv_new/output/epfl-llm__meditron-7b__en__0shot.txt b/csv_new/output/epfl-llm__meditron-7b__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..813bfc3b784509efa44e172f6ca41a8394dba25f --- /dev/null +++ b/csv_new/output/epfl-llm__meditron-7b__en__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0612 | |0 | +| - p1 | | | |f1 | | 0.0578 | | 0 | +| - p2 | | | |f1 | | 0.0410 | | 0 | +| - p3 | | | |f1 | | 0.0848 | | 0 | +| - RE | | | |f1 | | 0.0313 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0442 | | 0 | +| - p3 | | | |f1 | | 0.0497 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/epfl-llm__meditron-7b__en__10shot.txt b/csv_new/output/epfl-llm__meditron-7b__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa0e5a5b8b2a56ea1ecd44e817fcc20657b038e0 --- /dev/null +++ b/csv_new/output/epfl-llm__meditron-7b__en__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1245 | |0 | +| - p1 | | | |f1 | | 0.0803 | | 0 | +| - p2 | | | |f1 | | 0.1479 | | 0 | +| - p3 | | | |f1 | | 0.1454 | | 0 | +| - RE | | | |f1 | | 0.0692 | |0 | +| - p1 | | | |f1 | | 0.0722 | | 0 | +| - p2 | | | |f1 | | 0.0692 | 
| 0 | +| - p3 | | | |f1 | | 0.0663 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/epfl-llm__meditron-7b__gr__0shot.txt b/csv_new/output/epfl-llm__meditron-7b__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b733c326cb013320727e13c717645ad3b4ff775e --- /dev/null +++ b/csv_new/output/epfl-llm__meditron-7b__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2426 | |0 | +| - p1 | | | |f1 | | 0.2417 | | 0 | +| - p2 | | | |f1 | | 0.2443 | | 0 | +| - p3 | | | |f1 | | 0.2417 | | 0 | +| - RE | | | |f1 | | 0.0592 | |0 | +| - p1 | | | |f1 | | 0.1556 | | 0 | +| - p2 | | | |f1 | | 0.0161 | | 0 | +| - p3 | | | |f1 | | 0.0058 | | 0 | diff --git a/csv_new/output/epfl-llm__meditron-7b__gr__10shot.txt b/csv_new/output/epfl-llm__meditron-7b__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..87b319e4253b8aba65bfcf2e4ade2615fc2ae10e --- /dev/null +++ b/csv_new/output/epfl-llm__meditron-7b__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - RE | | | |f1 | | 0.0000 | |0 | +| - p1 | | | 
|f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/epfl-llm__meditron-7b__it__0shot.txt b/csv_new/output/epfl-llm__meditron-7b__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..079f4b749f49a39a5fa21370618f59fca4c06bd5 --- /dev/null +++ b/csv_new/output/epfl-llm__meditron-7b__it__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0639 | |0 | +| - p1 | | | |f1 | | 0.0773 | | 0 | +| - p2 | | | |f1 | | 0.0612 | | 0 | +| - p3 | | | |f1 | | 0.0531 | | 0 | +| - RE | | | |f1 | | 0.1072 | |0 | +| - p1 | | | |f1 | | 0.0020 | | 0 | +| - p2 | | | |f1 | | 0.1929 | | 0 | +| - p3 | | | |f1 | | 0.1268 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/epfl-llm__meditron-7b__it__10shot.txt b/csv_new/output/epfl-llm__meditron-7b__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8ad9ef5a9a45b0f140c187cbddc77e035f7d352 --- /dev/null +++ b/csv_new/output/epfl-llm__meditron-7b__it__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3288 | |0 | +| - p1 | | | |f1 | | 0.2991 | | 0 | +| - p2 | | | |f1 | | 0.3563 | | 0 | +| - p3 | | | |f1 | | 0.3311 | | 
0 | +| - RE | | | |f1 | | 0.0896 | |0 | +| - p1 | | | |f1 | | 0.0832 | | 0 | +| - p2 | | | |f1 | | 0.0887 | | 0 | +| - p3 | | | |f1 | | 0.0968 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/epfl-llm__meditron-7b__pl__0shot.txt b/csv_new/output/epfl-llm__meditron-7b__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c763fab18fe2af421c37a99965e57159fb9f0dd --- /dev/null +++ b/csv_new/output/epfl-llm__meditron-7b__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1161 | |0 | +| - p1 | | | |f1 | | 0.1140 | | 0 | +| - p2 | | | |f1 | | 0.1203 | | 0 | +| - p3 | | | |f1 | | 0.1140 | | 0 | +| - RE | | | |f1 | | 0.0025 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0076 | | 0 | diff --git a/csv_new/output/epfl-llm__meditron-7b__pl__10shot.txt b/csv_new/output/epfl-llm__meditron-7b__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..14675a45035d0e69895142e3b0f6800ec9197583 --- /dev/null +++ b/csv_new/output/epfl-llm__meditron-7b__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3222 | |0 | +| - p1 | | | |f1 | | 0.3184 | | 0 | +| - p2 | | | 
|f1 | | 0.3297 | | 0 | +| - p3 | | | |f1 | | 0.3184 | | 0 | +| - RE | | | |f1 | | 0.0510 | |0 | +| - p1 | | | |f1 | | 0.0533 | | 0 | +| - p2 | | | |f1 | | 0.0461 | | 0 | +| - p3 | | | |f1 | | 0.0535 | | 0 | diff --git a/csv_new/output/epfl-llm__meditron-7b__sk__0shot.txt b/csv_new/output/epfl-llm__meditron-7b__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..93e3d58f1c091c0ef7928d528ba2d95cfc046831 --- /dev/null +++ b/csv_new/output/epfl-llm__meditron-7b__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0778 | |0 | +| - p1 | | | |f1 | | 0.0874 | | 0 | +| - p2 | | | |f1 | | 0.0586 | | 0 | +| - p3 | | | |f1 | | 0.0874 | | 0 | +| - RE | | | |f1 | | 0.0034 | |0 | +| - p1 | | | |f1 | | 0.0036 | | 0 | +| - p2 | | | |f1 | | 0.0031 | | 0 | +| - p3 | | | |f1 | | 0.0036 | | 0 | diff --git a/csv_new/output/epfl-llm__meditron-7b__sk__10shot.txt b/csv_new/output/epfl-llm__meditron-7b__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..bad9a6c35cda030096e0a1ffe1e020b004d5263a --- /dev/null +++ b/csv_new/output/epfl-llm__meditron-7b__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2993 | |0 | +| - p1 | | | |f1 | | 0.3004 | | 0 | +| - p2 | | | |f1 | | 0.2970 | | 0 | +| - p3 | | | |f1 | | 0.3004 | | 0 | +| - RE | | | |f1 | | 0.0404 | |0 | +| - p1 | | | |f1 | | 0.0445 | | 0 | +| - p2 | | | |f1 | | 0.0393 | | 0 | +| - p3 | | | |f1 | | 0.0375 | | 0 | diff --git a/csv_new/output/epfl-llm__meditron-7b__sl__0shot.txt b/csv_new/output/epfl-llm__meditron-7b__sl__0shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..cfb36aa25fe8478844f4e4741701e7cd84df3e6c --- /dev/null +++ b/csv_new/output/epfl-llm__meditron-7b__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0951 | |0 | +| - p1 | | | |f1 | | 0.1197 | | 0 | +| - p2 | | | |f1 | | 0.0460 | | 0 | +| - p3 | | | |f1 | | 0.1197 | | 0 | +| - RE | | | |f1 | | 0.0445 | |0 | +| - p1 | | | |f1 | | 0.0598 | | 0 | +| - p2 | | | |f1 | | 0.0137 | | 0 | +| - p3 | | | |f1 | | 0.0598 | | 0 | diff --git a/csv_new/output/epfl-llm__meditron-7b__sl__10shot.txt b/csv_new/output/epfl-llm__meditron-7b__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..eab52b0c7040bdf63e365ec759ff69b327922c10 --- /dev/null +++ b/csv_new/output/epfl-llm__meditron-7b__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=epfl-llm/meditron-7b ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3052 | |0 | +| - p1 | | | |f1 | | 0.3119 | | 0 | +| - p2 | | | |f1 | | 0.2916 | | 0 | +| - p3 | | | |f1 | | 0.3119 | | 0 | +| - RE | | | |f1 | | 0.0502 | |0 | +| - p1 | | | |f1 | | 0.0477 | | 0 | +| - p2 | | | |f1 | | 0.0501 | | 0 | +| - p3 | | | |f1 | | 0.0528 | | 0 | diff --git a/csv_new/output/google__gemma-2-9b-it__en__0shot.txt b/csv_new/output/google__gemma-2-9b-it__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..81faf740dd7a13ccae7e56e1e2eaa3521b92615c --- /dev/null +++ b/csv_new/output/google__gemma-2-9b-it__en__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 
0.4603 | |0 | +| - p1 | | | |f1 | | 0.3267 | | 0 | +| - p2 | | | |f1 | | 0.5174 | | 0 | +| - p3 | | | |f1 | | 0.5370 | | 0 | +| - RE | | | |f1 | | 0.4211 | |0 | +| - p1 | | | |f1 | | 0.4360 | | 0 | +| - p2 | | | |f1 | | 0.4205 | | 0 | +| - p3 | | | |f1 | | 0.4067 | | 0 | +| - RML | | | |f1 | | 0.0267 | |0 | +| - p1 | | | |f1 | | 0.0006 | | 0 | +| - p2 | | | |f1 | | 0.0794 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0046 | |0 | +| - p1 | | | |f1 | | 0.0013 | | 0 | +| - p2 | | | |f1 | | 0.0126 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/google__gemma-2-9b-it__en__10shot.txt b/csv_new/output/google__gemma-2-9b-it__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc6c94f35fa38ff6bd6785d97ec6d136a460733e --- /dev/null +++ b/csv_new/output/google__gemma-2-9b-it__en__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5919 | |0 | +| - p1 | | | |f1 | | 0.6200 | | 0 | +| - p2 | | | |f1 | | 0.5639 | | 0 | +| - p3 | | | |f1 | | 0.5918 | | 0 | +| - RE | | | |f1 | | 0.5303 | |0 | +| - p1 | | | |f1 | | 0.5163 | | 0 | +| - p2 | | | |f1 | | 0.5337 | | 0 | +| - p3 | | | |f1 | | 0.5409 | | 0 | +| - RML | | | |f1 | | 0.3200 | |0 | +| - p1 | | | |f1 | | 0.2951 | | 0 | +| - p2 | | | |f1 | | 0.3388 | | 0 | +| - p3 | | | |f1 | | 0.3262 | | 0 | +| - DIA | | | |f1 | | 0.2120 | |0 | +| - p1 | | | |f1 | | 0.3118 | | 0 | +| - p2 | | | |f1 | | 0.2737 | | 0 | +| - p3 | | | |f1 | | 0.0506 | | 0 | +| - HIS | | | |f1 | | 0.1624 | |0 | +| - p1 | | | |f1 | | 0.1067 | | 0 | +| - p2 | | | |f1 | | 0.2649 | | 0 | +| - p3 | | | |f1 | | 0.1158 | | 0 | diff --git 
a/csv_new/output/google__gemma-2-9b-it__gr__0shot.txt b/csv_new/output/google__gemma-2-9b-it__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..87226c5046c2278e0f2a6e57fa83aa395a61ca52 --- /dev/null +++ b/csv_new/output/google__gemma-2-9b-it__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5292 | |0 | +| - p1 | | | |f1 | | 0.5549 | | 0 | +| - p2 | | | |f1 | | 0.4777 | | 0 | +| - p3 | | | |f1 | | 0.5549 | | 0 | +| - RE | | | |f1 | | 0.4008 | |0 | +| - p1 | | | |f1 | | 0.4124 | | 0 | +| - p2 | | | |f1 | | 0.3957 | | 0 | +| - p3 | | | |f1 | | 0.3943 | | 0 | diff --git a/csv_new/output/google__gemma-2-9b-it__gr__10shot.txt b/csv_new/output/google__gemma-2-9b-it__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..739bad8c7a5639671141f53c0413696e38d96592 --- /dev/null +++ b/csv_new/output/google__gemma-2-9b-it__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5943 | |0 | +| - p1 | | | |f1 | | 0.6083 | | 0 | +| - p2 | | | |f1 | | 0.5663 | | 0 | +| - p3 | | | |f1 | | 0.6083 | | 0 | +| - RE | | | |f1 | | 0.5162 | |0 | +| - p1 | | | |f1 | | 0.5070 | | 0 | +| - p2 | | | |f1 | | 0.4971 | | 0 | +| - p3 | | | |f1 | | 0.5444 | | 0 | diff --git a/csv_new/output/google__gemma-2-9b-it__it__0shot.txt b/csv_new/output/google__gemma-2-9b-it__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..564507d8be454a27e616ac79f0bf9d60884b836a --- /dev/null +++ b/csv_new/output/google__gemma-2-9b-it__it__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 0, batch_size: 1 +|Tasks 
|Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6158 | |0 | +| - p1 | | | |f1 | | 0.5739 | | 0 | +| - p2 | | | |f1 | | 0.6524 | | 0 | +| - p3 | | | |f1 | | 0.6210 | | 0 | +| - RE | | | |f1 | | 0.4298 | |0 | +| - p1 | | | |f1 | | 0.4585 | | 0 | +| - p2 | | | |f1 | | 0.4113 | | 0 | +| - p3 | | | |f1 | | 0.4196 | | 0 | +| - RML | | | |f1 | | 0.0008 | |0 | +| - p1 | | | |f1 | | 0.0024 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/google__gemma-2-9b-it__it__10shot.txt b/csv_new/output/google__gemma-2-9b-it__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe4f0d215fe5d0d8e46c5c8ef607fecd8712723b --- /dev/null +++ b/csv_new/output/google__gemma-2-9b-it__it__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6707 | |0 | +| - p1 | | | |f1 | | 0.6910 | | 0 | +| - p2 | | | |f1 | | 0.6643 | | 0 | +| - p3 | | | |f1 | | 0.6569 | | 0 | +| - RE | | | |f1 | | 0.5209 | |0 | +| - p1 | | | |f1 | | 0.4958 | | 0 | +| - p2 | | | |f1 | | 0.5365 | | 0 | +| - p3 | | | |f1 | | 0.5305 | | 0 | +| - RML | | | |f1 | | 0.1509 | |0 | +| - p1 | | | |f1 | | 0.1790 | | 0 | +| - p2 | | | |f1 | | 0.1653 | | 0 | +| - p3 | | | |f1 | | 0.1084 | | 0 | +| - DIA | | | |f1 | | 0.2747 | |0 | +| - p1 | | | |f1 | | 0.3288 | | 0 | +| - p2 | | | |f1 | | 0.4035 | | 0 | +| - p3 | | | |f1 | | 0.0919 | | 0 | +| - HIS | | | |f1 | | 0.1412 | |0 | +| - p1 
| | | |f1 | | 0.0851 | | 0 | +| - p2 | | | |f1 | | 0.2653 | | 0 | +| - p3 | | | |f1 | | 0.0732 | | 0 | diff --git a/csv_new/output/google__gemma-2-9b-it__pl__0shot.txt b/csv_new/output/google__gemma-2-9b-it__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..f187b39ad69bd8bb0dbf697be691b129eadee340 --- /dev/null +++ b/csv_new/output/google__gemma-2-9b-it__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4092 | |0 | +| - p1 | | | |f1 | | 0.4060 | | 0 | +| - p2 | | | |f1 | | 0.4155 | | 0 | +| - p3 | | | |f1 | | 0.4060 | | 0 | +| - RE | | | |f1 | | 0.3891 | |0 | +| - p1 | | | |f1 | | 0.3674 | | 0 | +| - p2 | | | |f1 | | 0.4271 | | 0 | +| - p3 | | | |f1 | | 0.3729 | | 0 | diff --git a/csv_new/output/google__gemma-2-9b-it__pl__10shot.txt b/csv_new/output/google__gemma-2-9b-it__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ac1adba3779f7f68d4bfbf2cf88b163e3b84f4b --- /dev/null +++ b/csv_new/output/google__gemma-2-9b-it__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5893 | |0 | +| - p1 | | | |f1 | | 0.5908 | | 0 | +| - p2 | | | |f1 | | 0.5862 | | 0 | +| - p3 | | | |f1 | | 0.5908 | | 0 | +| - RE | | | |f1 | | 0.5033 | |0 | +| - p1 | | | |f1 | | 0.5168 | | 0 | +| - p2 | | | |f1 | | 0.4808 | | 0 | +| - p3 | | | |f1 | | 0.5124 | | 0 | diff --git a/csv_new/output/google__gemma-2-9b-it__sk__0shot.txt b/csv_new/output/google__gemma-2-9b-it__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f0bc370ed50b39ebc88b5e7e85f8a110f45a283 --- /dev/null +++ 
b/csv_new/output/google__gemma-2-9b-it__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4775 | |0 | +| - p1 | | | |f1 | | 0.4875 | | 0 | +| - p2 | | | |f1 | | 0.4575 | | 0 | +| - p3 | | | |f1 | | 0.4875 | | 0 | +| - RE | | | |f1 | | 0.4106 | |0 | +| - p1 | | | |f1 | | 0.3989 | | 0 | +| - p2 | | | |f1 | | 0.4340 | | 0 | +| - p3 | | | |f1 | | 0.3989 | | 0 | diff --git a/csv_new/output/google__gemma-2-9b-it__sk__10shot.txt b/csv_new/output/google__gemma-2-9b-it__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b909c3988f118020a3d985678e092a54be2f61f1 --- /dev/null +++ b/csv_new/output/google__gemma-2-9b-it__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6135 | |0 | +| - p1 | | | |f1 | | 0.6141 | | 0 | +| - p2 | | | |f1 | | 0.6122 | | 0 | +| - p3 | | | |f1 | | 0.6141 | | 0 | +| - RE | | | |f1 | | 0.5007 | |0 | +| - p1 | | | |f1 | | 0.5153 | | 0 | +| - p2 | | | |f1 | | 0.4754 | | 0 | +| - p3 | | | |f1 | | 0.5114 | | 0 | diff --git a/csv_new/output/google__gemma-2-9b-it__sl__0shot.txt b/csv_new/output/google__gemma-2-9b-it__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..98ce20cc51f351ff18c9e416c89a29a920d6bacd --- /dev/null +++ b/csv_new/output/google__gemma-2-9b-it__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4487 | |0 | +| - p1 | | | |f1 | | 0.4707 | | 0 | +| - p2 | | | |f1 | | 0.4046 | | 0 | +| - p3 | | | |f1 
| | 0.4707 | | 0 | +| - RE | | | |f1 | | 0.4058 | |0 | +| - p1 | | | |f1 | | 0.4079 | | 0 | +| - p2 | | | |f1 | | 0.4016 | | 0 | +| - p3 | | | |f1 | | 0.4079 | | 0 | diff --git a/csv_new/output/google__gemma-2-9b-it__sl__10shot.txt b/csv_new/output/google__gemma-2-9b-it__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..e956839ba21e57d9f60059817766b32fe88d80a2 --- /dev/null +++ b/csv_new/output/google__gemma-2-9b-it__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-2-9b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6156 | |0 | +| - p1 | | | |f1 | | 0.6365 | | 0 | +| - p2 | | | |f1 | | 0.5737 | | 0 | +| - p3 | | | |f1 | | 0.6365 | | 0 | +| - RE | | | |f1 | | 0.4883 | |0 | +| - p1 | | | |f1 | | 0.4801 | | 0 | +| - p2 | | | |f1 | | 0.4878 | | 0 | +| - p3 | | | |f1 | | 0.4972 | | 0 | diff --git a/csv_new/output/google__gemma-3-27b-it__en__0shot.txt b/csv_new/output/google__gemma-3-27b-it__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..1deb30f5e851aa6b7925c9feb20fe9fff8675c1b --- /dev/null +++ b/csv_new/output/google__gemma-3-27b-it__en__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5490 | |0 | +| - p1 | | | |f1 | | 0.5446 | | 0 | +| - p2 | | | |f1 | | 0.5830 | | 0 | +| - p3 | | | |f1 | | 0.5194 | | 0 | +| - RE | | | |f1 | | 0.4623 | |0 | +| - p1 | | | |f1 | | 0.4543 | | 0 | +| - p2 | | | |f1 | | 0.4582 | | 0 | +| - p3 | | | |f1 | | 0.4743 | | 0 | +| - RML | | | |f1 | | 0.0924 | |0 | +| - p1 | | | |f1 | | 0.1559 | | 0 | +| - p2 | | | |f1 | | 0.1213 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0044 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 
| +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0131 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/google__gemma-3-27b-it__en__10shot.txt b/csv_new/output/google__gemma-3-27b-it__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2289ddf299b48c07abb85a786b9488d1d720da4 --- /dev/null +++ b/csv_new/output/google__gemma-3-27b-it__en__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6187 | |0 | +| - p1 | | | |f1 | | 0.6160 | | 0 | +| - p2 | | | |f1 | | 0.6308 | | 0 | +| - p3 | | | |f1 | | 0.6094 | | 0 | +| - RE | | | |f1 | | 0.5518 | |0 | +| - p1 | | | |f1 | | 0.5191 | | 0 | +| - p2 | | | |f1 | | 0.5600 | | 0 | +| - p3 | | | |f1 | | 0.5764 | | 0 | +| - RML | | | |f1 | | 0.3305 | |0 | +| - p1 | | | |f1 | | 0.3271 | | 0 | +| - p2 | | | |f1 | | 0.3301 | | 0 | +| - p3 | | | |f1 | | 0.3342 | | 0 | +| - DIA | | | |f1 | | 0.2902 | |0 | +| - p1 | | | |f1 | | 0.4022 | | 0 | +| - p2 | | | |f1 | | 0.3858 | | 0 | +| - p3 | | | |f1 | | 0.0828 | | 0 | +| - HIS | | | |f1 | | 0.3034 | |0 | +| - p1 | | | |f1 | | 0.2449 | | 0 | +| - p2 | | | |f1 | | 0.4821 | | 0 | +| - p3 | | | |f1 | | 0.1832 | | 0 | diff --git a/csv_new/output/google__gemma-3-27b-it__gr__0shot.txt b/csv_new/output/google__gemma-3-27b-it__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..93e9d713d614c2dbaa4d17389ea32a9b3021a3cf --- /dev/null +++ b/csv_new/output/google__gemma-3-27b-it__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER 
| | | |f1 | | 0.5151 | |0 | +| - p1 | | | |f1 | | 0.4866 | | 0 | +| - p2 | | | |f1 | | 0.5721 | | 0 | +| - p3 | | | |f1 | | 0.4866 | | 0 | +| - RE | | | |f1 | | 0.4473 | |0 | +| - p1 | | | |f1 | | 0.3955 | | 0 | +| - p2 | | | |f1 | | 0.4695 | | 0 | +| - p3 | | | |f1 | | 0.4769 | | 0 | diff --git a/csv_new/output/google__gemma-3-27b-it__gr__10shot.txt b/csv_new/output/google__gemma-3-27b-it__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1b60273ff3047ec635fd913fc4fe0db8a2ca133 --- /dev/null +++ b/csv_new/output/google__gemma-3-27b-it__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6570 | |0 | +| - p1 | | | |f1 | | 0.6551 | | 0 | +| - p2 | | | |f1 | | 0.6608 | | 0 | +| - p3 | | | |f1 | | 0.6551 | | 0 | +| - RE | | | |f1 | | 0.5405 | |0 | +| - p1 | | | |f1 | | 0.5083 | | 0 | +| - p2 | | | |f1 | | 0.5550 | | 0 | +| - p3 | | | |f1 | | 0.5581 | | 0 | diff --git a/csv_new/output/google__gemma-3-27b-it__it__0shot.txt b/csv_new/output/google__gemma-3-27b-it__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..dae4b670c0c60c3fac20a66d4d9d9bfd6e268f8d --- /dev/null +++ b/csv_new/output/google__gemma-3-27b-it__it__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6065 | |0 | +| - p1 | | | |f1 | | 0.5543 | | 0 | +| - p2 | | | |f1 | | 0.6697 | | 0 | +| - p3 | | | |f1 | | 0.5954 | | 0 | +| - RE | | | |f1 | | 0.4737 | |0 | +| - p1 | | | |f1 | | 0.4390 | | 0 | +| - p2 | | | |f1 | | 0.4895 | | 0 | +| - p3 | | | |f1 | | 0.4927 | | 0 | +| - RML | | | |f1 | | 0.0615 | |0 | +| - p1 | | | |f1 | | 0.1234 | | 0 | +| - p2 | | | |f1 | | 
0.0611 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0002 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0007 | | 0 | diff --git a/csv_new/output/google__gemma-3-27b-it__it__10shot.txt b/csv_new/output/google__gemma-3-27b-it__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..56ef2094b14f7961137092ee039dacb1291554e3 --- /dev/null +++ b/csv_new/output/google__gemma-3-27b-it__it__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.7115 | |0 | +| - p1 | | | |f1 | | 0.7142 | | 0 | +| - p2 | | | |f1 | | 0.6992 | | 0 | +| - p3 | | | |f1 | | 0.7212 | | 0 | +| - RE | | | |f1 | | 0.5615 | |0 | +| - p1 | | | |f1 | | 0.5223 | | 0 | +| - p2 | | | |f1 | | 0.5837 | | 0 | +| - p3 | | | |f1 | | 0.5786 | | 0 | +| - RML | | | |f1 | | 0.2109 | |0 | +| - p1 | | | |f1 | | 0.1965 | | 0 | +| - p2 | | | |f1 | | 0.2487 | | 0 | +| - p3 | | | |f1 | | 0.1874 | | 0 | +| - DIA | | | |f1 | | 0.3773 | |0 | +| - p1 | | | |f1 | | 0.5732 | | 0 | +| - p2 | | | |f1 | | 0.3443 | | 0 | +| - p3 | | | |f1 | | 0.2144 | | 0 | +| - HIS | | | |f1 | | 0.1043 | |0 | +| - p1 | | | |f1 | | 0.1347 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.1783 | | 0 | diff --git a/csv_new/output/google__gemma-3-27b-it__pl__0shot.txt b/csv_new/output/google__gemma-3-27b-it__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9f1519cf73039b96aee8faf352b110818910761 --- /dev/null +++ b/csv_new/output/google__gemma-3-27b-it__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 0, batch_size: 1 
+|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4508 | |0 | +| - p1 | | | |f1 | | 0.4506 | | 0 | +| - p2 | | | |f1 | | 0.4511 | | 0 | +| - p3 | | | |f1 | | 0.4506 | | 0 | +| - RE | | | |f1 | | 0.4307 | |0 | +| - p1 | | | |f1 | | 0.4384 | | 0 | +| - p2 | | | |f1 | | 0.4267 | | 0 | +| - p3 | | | |f1 | | 0.4271 | | 0 | diff --git a/csv_new/output/google__gemma-3-27b-it__pl__10shot.txt b/csv_new/output/google__gemma-3-27b-it__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9eed39172f7a13688201364ca71ab665a6378bda --- /dev/null +++ b/csv_new/output/google__gemma-3-27b-it__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6618 | |0 | +| - p1 | | | |f1 | | 0.6591 | | 0 | +| - p2 | | | |f1 | | 0.6672 | | 0 | +| - p3 | | | |f1 | | 0.6591 | | 0 | +| - RE | | | |f1 | | 0.5592 | |0 | +| - p1 | | | |f1 | | 0.5795 | | 0 | +| - p2 | | | |f1 | | 0.5601 | | 0 | +| - p3 | | | |f1 | | 0.5380 | | 0 | diff --git a/csv_new/output/google__gemma-3-27b-it__sk__0shot.txt b/csv_new/output/google__gemma-3-27b-it__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..27cb80b8b7a5de7165aa52e89c5f70d0ac61dc23 --- /dev/null +++ b/csv_new/output/google__gemma-3-27b-it__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2841 | |0 | +| - p1 | | | |f1 | | 0.3183 | | 0 | +| - p2 | | | |f1 | | 0.2157 | | 0 | +| - p3 | | | |f1 | | 0.3183 | | 0 | +| - RE | | | |f1 | | 0.4369 | |0 | +| - p1 | | | |f1 | | 0.4373 | | 0 | +| - p2 | | | |f1 | | 0.4360 | | 0 | +| 
- p3 | | | |f1 | | 0.4373 | | 0 | diff --git a/csv_new/output/google__gemma-3-27b-it__sk__10shot.txt b/csv_new/output/google__gemma-3-27b-it__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f8297965f5ccbab4e4581425fdb9d9628f5cc8c --- /dev/null +++ b/csv_new/output/google__gemma-3-27b-it__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6786 | |0 | +| - p1 | | | |f1 | | 0.6737 | | 0 | +| - p2 | | | |f1 | | 0.6885 | | 0 | +| - p3 | | | |f1 | | 0.6737 | | 0 | +| - RE | | | |f1 | | 0.5095 | |0 | +| - p1 | | | |f1 | | 0.5121 | | 0 | +| - p2 | | | |f1 | | 0.5061 | | 0 | +| - p3 | | | |f1 | | 0.5103 | | 0 | diff --git a/csv_new/output/google__gemma-3-27b-it__sl__0shot.txt b/csv_new/output/google__gemma-3-27b-it__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..39bd48a7d0643c98d4e640b58b7343ee908f2d64 --- /dev/null +++ b/csv_new/output/google__gemma-3-27b-it__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/gemma-3-27b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4508 | |0 | +| - p1 | | | |f1 | | 0.4370 | | 0 | +| - p2 | | | |f1 | | 0.4783 | | 0 | +| - p3 | | | |f1 | | 0.4370 | | 0 | +| - RE | | | |f1 | | 0.4301 | |0 | +| - p1 | | | |f1 | | 0.4255 | | 0 | +| - p2 | | | |f1 | | 0.4391 | | 0 | +| - p3 | | | |f1 | | 0.4255 | | 0 | diff --git a/csv_new/output/google__gemma-3-27b-it__sl__10shot.txt b/csv_new/output/google__gemma-3-27b-it__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed1c6926d8cdf2d22c23adf3393d14f1da9cd4d9 --- /dev/null +++ b/csv_new/output/google__gemma-3-27b-it__sl__10shot.txt @@ -0,0 +1,11 @@ +hf 
(pretrained=google/gemma-3-27b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6806 | |0 | +| - p1 | | | |f1 | | 0.6750 | | 0 | +| - p2 | | | |f1 | | 0.6918 | | 0 | +| - p3 | | | |f1 | | 0.6750 | | 0 | +| - RE | | | |f1 | | 0.4999 | |0 | +| - p1 | | | |f1 | | 0.5149 | | 0 | +| - p2 | | | |f1 | | 0.4703 | | 0 | +| - p3 | | | |f1 | | 0.5145 | | 0 | diff --git a/csv_new/output/google__medgemma-27b-text-it__en__0shot.txt b/csv_new/output/google__medgemma-27b-text-it__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4f69c367cddedff7d23f6012c6e2f1ccd549c5d --- /dev/null +++ b/csv_new/output/google__medgemma-27b-text-it__en__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5011 | |0 | +| - p1 | | | |f1 | | 0.3842 | | 0 | +| - p2 | | | |f1 | | 0.6035 | | 0 | +| - p3 | | | |f1 | | 0.5156 | | 0 | +| - RE | | | |f1 | | 0.4681 | |0 | +| - p1 | | | |f1 | | 0.4836 | | 0 | +| - p2 | | | |f1 | | 0.4763 | | 0 | +| - p3 | | | |f1 | | 0.4443 | | 0 | +| - RML | | | |f1 | | 0.0317 | |0 | +| - p1 | | | |f1 | | 0.0623 | | 0 | +| - p2 | | | |f1 | | 0.0327 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0003 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0009 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/google__medgemma-27b-text-it__en__10shot.txt b/csv_new/output/google__medgemma-27b-text-it__en__10shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..6ad6f4457a604f30d69ebc96d8925759bf4ed3e3 --- /dev/null +++ b/csv_new/output/google__medgemma-27b-text-it__en__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6324 | |0 | +| - p1 | | | |f1 | | 0.6355 | | 0 | +| - p2 | | | |f1 | | 0.6161 | | 0 | +| - p3 | | | |f1 | | 0.6455 | | 0 | +| - RE | | | |f1 | | 0.5540 | |0 | +| - p1 | | | |f1 | | 0.5562 | | 0 | +| - p2 | | | |f1 | | 0.5494 | | 0 | +| - p3 | | | |f1 | | 0.5565 | | 0 | +| - RML | | | |f1 | | 0.3550 | |0 | +| - p1 | | | |f1 | | 0.3711 | | 0 | +| - p2 | | | |f1 | | 0.3582 | | 0 | +| - p3 | | | |f1 | | 0.3355 | | 0 | +| - DIA | | | |f1 | | 0.3327 | |0 | +| - p1 | | | |f1 | | 0.5480 | | 0 | +| - p2 | | | |f1 | | 0.4010 | | 0 | +| - p3 | | | |f1 | | 0.0491 | | 0 | +| - HIS | | | |f1 | | 0.4941 | |0 | +| - p1 | | | |f1 | | 0.4899 | | 0 | +| - p2 | | | |f1 | | 0.3801 | | 0 | +| - p3 | | | |f1 | | 0.6124 | | 0 | diff --git a/csv_new/output/google__medgemma-27b-text-it__gr__0shot.txt b/csv_new/output/google__medgemma-27b-text-it__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..d065c4984ef73926a5399a10a41f28371f93fc00 --- /dev/null +++ b/csv_new/output/google__medgemma-27b-text-it__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5585 | |0 | +| - p1 | | | |f1 | | 0.5314 | | 0 | +| - p2 | | | |f1 | | 0.6126 | | 0 | +| - p3 | | | |f1 | | 0.5314 | | 0 | +| - RE | | | |f1 | | 0.4199 | |0 | +| - p1 | | | |f1 | | 0.4069 | | 0 | +| - p2 | | | |f1 | | 0.4332 | | 0 | +| - p3 | | | |f1 | | 0.4197 | | 0 | diff --git 
a/csv_new/output/google__medgemma-27b-text-it__gr__10shot.txt b/csv_new/output/google__medgemma-27b-text-it__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..70f88e2528d3a1ff7cf33f95ceb671fc00a7aa14 --- /dev/null +++ b/csv_new/output/google__medgemma-27b-text-it__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6839 | |0 | +| - p1 | | | |f1 | | 0.6836 | | 0 | +| - p2 | | | |f1 | | 0.6846 | | 0 | +| - p3 | | | |f1 | | 0.6836 | | 0 | +| - RE | | | |f1 | | 0.5680 | |0 | +| - p1 | | | |f1 | | 0.5392 | | 0 | +| - p2 | | | |f1 | | 0.5867 | | 0 | +| - p3 | | | |f1 | | 0.5780 | | 0 | diff --git a/csv_new/output/google__medgemma-27b-text-it__it__0shot.txt b/csv_new/output/google__medgemma-27b-text-it__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..919fa792ca00b73012ddfda89685fb88e1768710 --- /dev/null +++ b/csv_new/output/google__medgemma-27b-text-it__it__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5351 | |0 | +| - p1 | | | |f1 | | 0.4261 | | 0 | +| - p2 | | | |f1 | | 0.6212 | | 0 | +| - p3 | | | |f1 | | 0.5582 | | 0 | +| - RE | | | |f1 | | 0.4521 | |0 | +| - p1 | | | |f1 | | 0.4042 | | 0 | +| - p2 | | | |f1 | | 0.4916 | | 0 | +| - p3 | | | |f1 | | 0.4604 | | 0 | +| - RML | | | |f1 | | 0.0180 | |0 | +| - p1 | | | |f1 | | 0.0472 | | 0 | +| - p2 | | | |f1 | | 0.0064 | | 0 | +| - p3 | | | |f1 | | 0.0003 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | 
| |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/google__medgemma-27b-text-it__it__10shot.txt b/csv_new/output/google__medgemma-27b-text-it__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..1799b14c9a4fb9bc8f3139242d6a9d639533436e --- /dev/null +++ b/csv_new/output/google__medgemma-27b-text-it__it__10shot.txt @@ -0,0 +1,22 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.7133 | |0 | +| - p1 | | | |f1 | | 0.7262 | | 0 | +| - p2 | | | |f1 | | 0.7005 | | 0 | +| - RE | | | |f1 | | 0.5960 | |0 | +| - p1 | | | |f1 | | 0.5919 | | 0 | +| - p2 | | | |f1 | | 0.6235 | | 0 | +| - p3 | | | |f1 | | 0.5726 | | 0 | +| - RML | | | |f1 | | 0.2282 | |0 | +| - p1 | | | |f1 | | 0.2314 | | 0 | +| - p2 | | | |f1 | | 0.2992 | | 0 | +| - p3 | | | |f1 | | 0.1541 | | 0 | +| - DIA | | | |f1 | | 0.4075 | |0 | +| - p1 | | | |f1 | | 0.5898 | | 0 | +| - p2 | | | |f1 | | 0.5797 | | 0 | +| - p3 | | | |f1 | | 0.0528 | | 0 | +| - HIS | | | |f1 | | 0.3517 | |0 | +| - p1 | | | |f1 | | 0.5265 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.5285 | | 0 | diff --git a/csv_new/output/google__medgemma-27b-text-it__pl__0shot.txt b/csv_new/output/google__medgemma-27b-text-it__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9eba07f4234c3cc6cc8a0f05a5cd8f0cd620ac8 --- /dev/null +++ b/csv_new/output/google__medgemma-27b-text-it__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4245 | |0 | +| - p1 | | | |f1 | | 0.4216 | | 0 | +| - p2 | | | |f1 | | 0.4303 | | 0 | +| - p3 | | 
| |f1 | | 0.4216 | | 0 | +| - RE | | | |f1 | | 0.4332 | |0 | +| - p1 | | | |f1 | | 0.4325 | | 0 | +| - p2 | | | |f1 | | 0.4424 | | 0 | +| - p3 | | | |f1 | | 0.4246 | | 0 | diff --git a/csv_new/output/google__medgemma-27b-text-it__pl__10shot.txt b/csv_new/output/google__medgemma-27b-text-it__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..0276a07fab4408898651f3f147a274d5d5df3c97 --- /dev/null +++ b/csv_new/output/google__medgemma-27b-text-it__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6791 | |0 | +| - p1 | | | |f1 | | 0.6829 | | 0 | +| - p2 | | | |f1 | | 0.6715 | | 0 | +| - p3 | | | |f1 | | 0.6829 | | 0 | +| - RE | | | |f1 | | 0.5997 | |0 | +| - p1 | | | |f1 | | 0.5940 | | 0 | +| - p2 | | | |f1 | | 0.6133 | | 0 | +| - p3 | | | |f1 | | 0.5918 | | 0 | diff --git a/csv_new/output/google__medgemma-27b-text-it__sk__0shot.txt b/csv_new/output/google__medgemma-27b-text-it__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd8b98a582ba732f34685230d5c2af7c07ed3a59 --- /dev/null +++ b/csv_new/output/google__medgemma-27b-text-it__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2336 | |0 | +| - p1 | | | |f1 | | 0.2971 | | 0 | +| - p2 | | | |f1 | | 0.1066 | | 0 | +| - p3 | | | |f1 | | 0.2971 | | 0 | +| - RE | | | |f1 | | 0.4440 | |0 | +| - p1 | | | |f1 | | 0.4395 | | 0 | +| - p2 | | | |f1 | | 0.4531 | | 0 | +| - p3 | | | |f1 | | 0.4395 | | 0 | diff --git a/csv_new/output/google__medgemma-27b-text-it__sk__10shot.txt b/csv_new/output/google__medgemma-27b-text-it__sk__10shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..003be3f4ed88a6a499b89ed958c415cc485b70c3 --- /dev/null +++ b/csv_new/output/google__medgemma-27b-text-it__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.7137 | |0 | +| - p1 | | | |f1 | | 0.7143 | | 0 | +| - p2 | | | |f1 | | 0.7127 | | 0 | +| - p3 | | | |f1 | | 0.7143 | | 0 | +| - RE | | | |f1 | | 0.5156 | |0 | +| - p1 | | | |f1 | | 0.5111 | | 0 | +| - p2 | | | |f1 | | 0.5188 | | 0 | +| - p3 | | | |f1 | | 0.5171 | | 0 | diff --git a/csv_new/output/google__medgemma-27b-text-it__sl__0shot.txt b/csv_new/output/google__medgemma-27b-text-it__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c70cd19fab747d42922e7a5ecdf7736a81004f9 --- /dev/null +++ b/csv_new/output/google__medgemma-27b-text-it__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4863 | |0 | +| - p1 | | | |f1 | | 0.4675 | | 0 | +| - p2 | | | |f1 | | 0.5238 | | 0 | +| - p3 | | | |f1 | | 0.4675 | | 0 | +| - RE | | | |f1 | | 0.4201 | |0 | +| - p1 | | | |f1 | | 0.4182 | | 0 | +| - p2 | | | |f1 | | 0.4239 | | 0 | +| - p3 | | | |f1 | | 0.4182 | | 0 | diff --git a/csv_new/output/google__medgemma-27b-text-it__sl__10shot.txt b/csv_new/output/google__medgemma-27b-text-it__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c8ad321754c222e65360614da1f6192f3387c7c --- /dev/null +++ b/csv_new/output/google__medgemma-27b-text-it__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-27b-text-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| 
+|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6887 | |0 | +| - p1 | | | |f1 | | 0.6947 | | 0 | +| - p2 | | | |f1 | | 0.6765 | | 0 | +| - p3 | | | |f1 | | 0.6947 | | 0 | +| - RE | | | |f1 | | 0.5469 | |0 | +| - p1 | | | |f1 | | 0.5323 | | 0 | +| - p2 | | | |f1 | | 0.5590 | | 0 | +| - p3 | | | |f1 | | 0.5494 | | 0 | diff --git a/csv_new/output/google__medgemma-4b-it__en__0shot.txt b/csv_new/output/google__medgemma-4b-it__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..91fd7d86851627db8904f7d9d037cbe8bbeaf315 --- /dev/null +++ b/csv_new/output/google__medgemma-4b-it__en__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2625 | |0 | +| - p1 | | | |f1 | | 0.2635 | | 0 | +| - p2 | | | |f1 | | 0.2503 | | 0 | +| - p3 | | | |f1 | | 0.2737 | | 0 | +| - RE | | | |f1 | | 0.2851 | |0 | +| - p1 | | | |f1 | | 0.2095 | | 0 | +| - p2 | | | |f1 | | 0.3257 | | 0 | +| - p3 | | | |f1 | | 0.3203 | | 0 | +| - RML | | | |f1 | | 0.0039 | |0 | +| - p1 | | | |f1 | | 0.0061 | | 0 | +| - p2 | | | |f1 | | 0.0056 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/google__medgemma-4b-it__en__10shot.txt b/csv_new/output/google__medgemma-4b-it__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7bfd5b6ef4f3622dc727197872512f2cb6fa5f --- /dev/null +++ b/csv_new/output/google__medgemma-4b-it__en__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 10, 
batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4930 | |0 | +| - p1 | | | |f1 | | 0.4833 | | 0 | +| - p2 | | | |f1 | | 0.5005 | | 0 | +| - p3 | | | |f1 | | 0.4951 | | 0 | +| - RE | | | |f1 | | 0.1198 | |0 | +| - p1 | | | |f1 | | 0.0964 | | 0 | +| - p2 | | | |f1 | | 0.1237 | | 0 | +| - p3 | | | |f1 | | 0.1391 | | 0 | +| - RML | | | |f1 | | 0.2646 | |0 | +| - p1 | | | |f1 | | 0.2659 | | 0 | +| - p2 | | | |f1 | | 0.2671 | | 0 | +| - p3 | | | |f1 | | 0.2607 | | 0 | +| - DIA | | | |f1 | | 0.2489 | |0 | +| - p1 | | | |f1 | | 0.3662 | | 0 | +| - p2 | | | |f1 | | 0.3800 | | 0 | +| - p3 | | | |f1 | | 0.0006 | | 0 | +| - HIS | | | |f1 | | 0.4228 | |0 | +| - p1 | | | |f1 | | 0.4505 | | 0 | +| - p2 | | | |f1 | | 0.3799 | | 0 | +| - p3 | | | |f1 | | 0.4378 | | 0 | diff --git a/csv_new/output/google__medgemma-4b-it__gr__0shot.txt b/csv_new/output/google__medgemma-4b-it__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0f048d8fb01e8ed8352829fa0179010381f66ca --- /dev/null +++ b/csv_new/output/google__medgemma-4b-it__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2688 | |0 | +| - p1 | | | |f1 | | 0.2705 | | 0 | +| - p2 | | | |f1 | | 0.2654 | | 0 | +| - p3 | | | |f1 | | 0.2705 | | 0 | +| - RE | | | |f1 | | 0.2053 | |0 | +| - p1 | | | |f1 | | 0.2381 | | 0 | +| - p2 | | | |f1 | | 0.3024 | | 0 | +| - p3 | | | |f1 | | 0.0754 | | 0 | diff --git a/csv_new/output/google__medgemma-4b-it__gr__10shot.txt b/csv_new/output/google__medgemma-4b-it__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..4352edec2156fd74172ffb93bfc7069ed935cce2 --- /dev/null +++ b/csv_new/output/google__medgemma-4b-it__gr__10shot.txt @@ -0,0 
+1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4953 | |0 | +| - p1 | | | |f1 | | 0.4910 | | 0 | +| - p2 | | | |f1 | | 0.5039 | | 0 | +| - p3 | | | |f1 | | 0.4910 | | 0 | +| - RE | | | |f1 | | 0.1453 | |0 | +| - p1 | | | |f1 | | 0.1204 | | 0 | +| - p2 | | | |f1 | | 0.1605 | | 0 | +| - p3 | | | |f1 | | 0.1551 | | 0 | diff --git a/csv_new/output/google__medgemma-4b-it__it__0shot.txt b/csv_new/output/google__medgemma-4b-it__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..703bbb937dd58a2fa5aaaf6925656b5c79b03b31 --- /dev/null +++ b/csv_new/output/google__medgemma-4b-it__it__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2929 | |0 | +| - p1 | | | |f1 | | 0.3157 | | 0 | +| - p2 | | | |f1 | | 0.2627 | | 0 | +| - p3 | | | |f1 | | 0.3004 | | 0 | +| - RE | | | |f1 | | 0.1767 | |0 | +| - p1 | | | |f1 | | 0.2154 | | 0 | +| - p2 | | | |f1 | | 0.2461 | | 0 | +| - p3 | | | |f1 | | 0.0688 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/google__medgemma-4b-it__it__10shot.txt b/csv_new/output/google__medgemma-4b-it__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8a3b8ab16be63315575f29ea4ab843ed471ded9 --- /dev/null +++ 
b/csv_new/output/google__medgemma-4b-it__it__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5454 | |0 | +| - p1 | | | |f1 | | 0.5633 | | 0 | +| - p2 | | | |f1 | | 0.5377 | | 0 | +| - p3 | | | |f1 | | 0.5352 | | 0 | +| - RE | | | |f1 | | 0.1753 | |0 | +| - p1 | | | |f1 | | 0.1592 | | 0 | +| - p2 | | | |f1 | | 0.1917 | | 0 | +| - p3 | | | |f1 | | 0.1751 | | 0 | +| - RML | | | |f1 | | 0.1096 | |0 | +| - p1 | | | |f1 | | 0.1072 | | 0 | +| - p2 | | | |f1 | | 0.1355 | | 0 | +| - p3 | | | |f1 | | 0.0861 | | 0 | +| - DIA | | | |f1 | | 0.3524 | |0 | +| - p1 | | | |f1 | | 0.5229 | | 0 | +| - p2 | | | |f1 | | 0.5289 | | 0 | +| - p3 | | | |f1 | | 0.0054 | | 0 | +| - HIS | | | |f1 | | 0.2891 | |0 | +| - p1 | | | |f1 | | 0.4314 | | 0 | +| - p2 | | | |f1 | | 0.0052 | | 0 | +| - p3 | | | |f1 | | 0.4306 | | 0 | diff --git a/csv_new/output/google__medgemma-4b-it__pl__0shot.txt b/csv_new/output/google__medgemma-4b-it__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..1da79eb1fdbb0c791458ff4a9cce9c33a7da6497 --- /dev/null +++ b/csv_new/output/google__medgemma-4b-it__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2231 | |0 | +| - p1 | | | |f1 | | 0.2255 | | 0 | +| - p2 | | | |f1 | | 0.2183 | | 0 | +| - p3 | | | |f1 | | 0.2255 | | 0 | +| - RE | | | |f1 | | 0.1173 | |0 | +| - p1 | | | |f1 | | 0.1150 | | 0 | +| - p2 | | | |f1 | | 0.1314 | | 0 | +| - p3 | | | |f1 | | 0.1054 | | 0 | diff --git a/csv_new/output/google__medgemma-4b-it__pl__10shot.txt b/csv_new/output/google__medgemma-4b-it__pl__10shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..79c82263014a0069b6c825385d95cf6477004a4a --- /dev/null +++ b/csv_new/output/google__medgemma-4b-it__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5193 | |0 | +| - p1 | | | |f1 | | 0.5186 | | 0 | +| - p2 | | | |f1 | | 0.5206 | | 0 | +| - p3 | | | |f1 | | 0.5186 | | 0 | +| - RE | | | |f1 | | 0.1055 | |0 | +| - p1 | | | |f1 | | 0.1171 | | 0 | +| - p2 | | | |f1 | | 0.0997 | | 0 | +| - p3 | | | |f1 | | 0.0997 | | 0 | diff --git a/csv_new/output/google__medgemma-4b-it__sk__0shot.txt b/csv_new/output/google__medgemma-4b-it__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bea1720e28d395cc9d1f6ee52968a6965a98c84 --- /dev/null +++ b/csv_new/output/google__medgemma-4b-it__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2427 | |0 | +| - p1 | | | |f1 | | 0.2447 | | 0 | +| - p2 | | | |f1 | | 0.2387 | | 0 | +| - p3 | | | |f1 | | 0.2447 | | 0 | +| - RE | | | |f1 | | 0.1212 | |0 | +| - p1 | | | |f1 | | 0.1119 | | 0 | +| - p2 | | | |f1 | | 0.1399 | | 0 | +| - p3 | | | |f1 | | 0.1119 | | 0 | diff --git a/csv_new/output/google__medgemma-4b-it__sk__10shot.txt b/csv_new/output/google__medgemma-4b-it__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..19c6346c5007538093d4b83f945e14ee4616490c --- /dev/null +++ b/csv_new/output/google__medgemma-4b-it__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | 
|f1 | | 0.4654 | |0 | +| - p1 | | | |f1 | | 0.4756 | | 0 | +| - p2 | | | |f1 | | 0.4449 | | 0 | +| - p3 | | | |f1 | | 0.4756 | | 0 | +| - RE | | | |f1 | | 0.1035 | |0 | +| - p1 | | | |f1 | | 0.1095 | | 0 | +| - p2 | | | |f1 | | 0.1009 | | 0 | +| - p3 | | | |f1 | | 0.1000 | | 0 | diff --git a/csv_new/output/google__medgemma-4b-it__sl__0shot.txt b/csv_new/output/google__medgemma-4b-it__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..56dbab87e4f7fed3a562f04e90d5511d788bdc34 --- /dev/null +++ b/csv_new/output/google__medgemma-4b-it__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2569 | |0 | +| - p1 | | | |f1 | | 0.2574 | | 0 | +| - p2 | | | |f1 | | 0.2558 | | 0 | +| - p3 | | | |f1 | | 0.2574 | | 0 | +| - RE | | | |f1 | | 0.1012 | |0 | +| - p1 | | | |f1 | | 0.0973 | | 0 | +| - p2 | | | |f1 | | 0.1089 | | 0 | +| - p3 | | | |f1 | | 0.0973 | | 0 | diff --git a/csv_new/output/google__medgemma-4b-it__sl__10shot.txt b/csv_new/output/google__medgemma-4b-it__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc424f90dde2288be7dda70e93c0e761287409da --- /dev/null +++ b/csv_new/output/google__medgemma-4b-it__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=google/medgemma-4b-it ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5063 | |0 | +| - p1 | | | |f1 | | 0.5117 | | 0 | +| - p2 | | | |f1 | | 0.4955 | | 0 | +| - p3 | | | |f1 | | 0.5117 | | 0 | +| - RE | | | |f1 | | 0.1260 | |0 | +| - p1 | | | |f1 | | 0.1178 | | 0 | +| - p2 | | | |f1 | | 0.1101 | | 0 | +| - p3 | | | |f1 | | 0.1501 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Clinical__en__0shot.txt 
b/csv_new/output/microsoft__MediPhi-Clinical__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..36836266cb450f999e59cc1c8ded4286dd2f967a --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Clinical__en__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2786 | |0 | +| - p1 | | | |f1 | | 0.2502 | | 0 | +| - p2 | | | |f1 | | 0.3089 | | 0 | +| - p3 | | | |f1 | | 0.2768 | | 0 | +| - RE | | | |f1 | | 0.3248 | |0 | +| - p1 | | | |f1 | | 0.2274 | | 0 | +| - p2 | | | |f1 | | 0.3929 | | 0 | +| - p3 | | | |f1 | | 0.3542 | | 0 | +| - RML | | | |f1 | | 0.0001 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0003 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0001 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0003 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Clinical__en__10shot.txt b/csv_new/output/microsoft__MediPhi-Clinical__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdbfc7235dd70690e8fedb9cfe08ee7d8ee39e35 --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Clinical__en__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5008 | |0 | +| - p1 | | | |f1 | | 0.5009 | | 0 | +| - p2 | | | |f1 | | 0.4966 | | 0 | +| - p3 | | | |f1 | | 0.5049 | | 0 | +| - RE | | | |f1 | | 0.1125 | |0 | +| - p1 | | | |f1 | | 0.1175 | | 0 | +| - p2 | | | |f1 | | 0.1095 | | 0 | +| - 
p3 | | | |f1 | | 0.1107 | | 0 | +| - RML | | | |f1 | | 0.3189 | |0 | +| - p1 | | | |f1 | | 0.3052 | | 0 | +| - p2 | | | |f1 | | 0.3307 | | 0 | +| - p3 | | | |f1 | | 0.3208 | | 0 | +| - DIA | | | |f1 | | 0.2879 | |0 | +| - p1 | | | |f1 | | 0.1833 | | 0 | +| - p2 | | | |f1 | | 0.2803 | | 0 | +| - p3 | | | |f1 | | 0.4002 | | 0 | +| - HIS | | | |f1 | | 0.3722 | |0 | +| - p1 | | | |f1 | | 0.3528 | | 0 | +| - p2 | | | |f1 | | 0.2818 | | 0 | +| - p3 | | | |f1 | | 0.4820 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Clinical__gr__0shot.txt b/csv_new/output/microsoft__MediPhi-Clinical__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0c5c0a5a00e2cdf36cc9f66ad99c89cd41760ac --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Clinical__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1717 | |0 | +| - p1 | | | |f1 | | 0.1641 | | 0 | +| - p2 | | | |f1 | | 0.1869 | | 0 | +| - p3 | | | |f1 | | 0.1641 | | 0 | +| - RE | | | |f1 | | 0.0977 | |0 | +| - p1 | | | |f1 | | 0.0736 | | 0 | +| - p2 | | | |f1 | | 0.0778 | | 0 | +| - p3 | | | |f1 | | 0.1418 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Clinical__gr__10shot.txt b/csv_new/output/microsoft__MediPhi-Clinical__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..eba564920c2c92d7800080552c6a59b8def8c9b7 --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Clinical__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3384 | |0 | +| - p1 | | | |f1 | | 0.3375 | | 0 | +| - p2 | | | |f1 | | 0.3403 | | 0 | +| - p3 | | | |f1 | | 0.3375 | | 0 | +| - RE | | 
| |f1 | | 0.0606 | |0 | +| - p1 | | | |f1 | | 0.0427 | | 0 | +| - p2 | | | |f1 | | 0.0681 | | 0 | +| - p3 | | | |f1 | | 0.0711 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Clinical__it__0shot.txt b/csv_new/output/microsoft__MediPhi-Clinical__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a96f78aa4cc8265c0f08244dc2c89d552f40dd2b --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Clinical__it__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3307 | |0 | +| - p1 | | | |f1 | | 0.3397 | | 0 | +| - p2 | | | |f1 | | 0.3300 | | 0 | +| - p3 | | | |f1 | | 0.3226 | | 0 | +| - RE | | | |f1 | | 0.0792 | |0 | +| - p1 | | | |f1 | | 0.1489 | | 0 | +| - p2 | | | |f1 | | 0.0736 | | 0 | +| - p3 | | | |f1 | | 0.0149 | | 0 | +| - RML | | | |f1 | | 0.0021 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0064 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Clinical__it__10shot.txt b/csv_new/output/microsoft__MediPhi-Clinical__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2d3f133b2005537efcbebceed1f048beff58a25 --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Clinical__it__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5257 | |0 | +| - p1 | | | |f1 | 
| 0.5195 | | 0 | +| - p2 | | | |f1 | | 0.5301 | | 0 | +| - p3 | | | |f1 | | 0.5275 | | 0 | +| - RE | | | |f1 | | 0.1499 | |0 | +| - p1 | | | |f1 | | 0.2114 | | 0 | +| - p2 | | | |f1 | | 0.0961 | | 0 | +| - p3 | | | |f1 | | 0.1422 | | 0 | +| - RML | | | |f1 | | 0.1299 | |0 | +| - p1 | | | |f1 | | 0.1422 | | 0 | +| - p2 | | | |f1 | | 0.1646 | | 0 | +| - p3 | | | |f1 | | 0.0829 | | 0 | +| - DIA | | | |f1 | | 0.4128 | |0 | +| - p1 | | | |f1 | | 0.3221 | | 0 | +| - p2 | | | |f1 | | 0.3798 | | 0 | +| - p3 | | | |f1 | | 0.5365 | | 0 | +| - HIS | | | |f1 | | 0.2053 | |0 | +| - p1 | | | |f1 | | 0.1169 | | 0 | +| - p2 | | | |f1 | | 0.3103 | | 0 | +| - p3 | | | |f1 | | 0.1885 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Clinical__pl__0shot.txt b/csv_new/output/microsoft__MediPhi-Clinical__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..398a394b401f757945eec334c5bd3723685aa80b --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Clinical__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2831 | |0 | +| - p1 | | | |f1 | | 0.2815 | | 0 | +| - p2 | | | |f1 | | 0.2861 | | 0 | +| - p3 | | | |f1 | | 0.2815 | | 0 | +| - RE | | | |f1 | | 0.2693 | |0 | +| - p1 | | | |f1 | | 0.2109 | | 0 | +| - p2 | | | |f1 | | 0.2908 | | 0 | +| - p3 | | | |f1 | | 0.3061 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Clinical__pl__10shot.txt b/csv_new/output/microsoft__MediPhi-Clinical__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fc3f8a0bc1e4a0d38f2f3729779f31828e7b70b --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Clinical__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| 
+|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3986 | |0 | +| - p1 | | | |f1 | | 0.3913 | | 0 | +| - p2 | | | |f1 | | 0.4132 | | 0 | +| - p3 | | | |f1 | | 0.3913 | | 0 | +| - RE | | | |f1 | | 0.1366 | |0 | +| - p1 | | | |f1 | | 0.1255 | | 0 | +| - p2 | | | |f1 | | 0.1207 | | 0 | +| - p3 | | | |f1 | | 0.1636 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Clinical__sk__0shot.txt b/csv_new/output/microsoft__MediPhi-Clinical__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9aaca4c790d98eb11b6ab172e1b2f9ba297b0a56 --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Clinical__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2710 | |0 | +| - p1 | | | |f1 | | 0.2571 | | 0 | +| - p2 | | | |f1 | | 0.2987 | | 0 | +| - p3 | | | |f1 | | 0.2571 | | 0 | +| - RE | | | |f1 | | 0.1062 | |0 | +| - p1 | | | |f1 | | 0.1554 | | 0 | +| - p2 | | | |f1 | | 0.0077 | | 0 | +| - p3 | | | |f1 | | 0.1554 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Clinical__sk__10shot.txt b/csv_new/output/microsoft__MediPhi-Clinical__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b031c9c0bd0f5fe6669a53563e3681aa1a74d890 --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Clinical__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4025 | |0 | +| - p1 | | | |f1 | | 0.4106 | | 0 | +| - p2 | | | |f1 | | 0.3861 | | 0 | +| - p3 | | | |f1 | | 0.4106 | | 0 | +| - RE | | | |f1 | | 0.0613 | |0 | +| - p1 | | | |f1 | | 0.0509 | | 0 | +| - p2 | | | |f1 | | 0.0606 | | 0 | +| - p3 | | | |f1 | 
| 0.0724 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Clinical__sl__0shot.txt b/csv_new/output/microsoft__MediPhi-Clinical__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd2d143234ccb78abac42a387aa128d3314e802f --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Clinical__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2892 | |0 | +| - p1 | | | |f1 | | 0.2998 | | 0 | +| - p2 | | | |f1 | | 0.2680 | | 0 | +| - p3 | | | |f1 | | 0.2998 | | 0 | +| - RE | | | |f1 | | 0.0304 | |0 | +| - p1 | | | |f1 | | 0.0395 | | 0 | +| - p2 | | | |f1 | | 0.0121 | | 0 | +| - p3 | | | |f1 | | 0.0395 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Clinical__sl__10shot.txt b/csv_new/output/microsoft__MediPhi-Clinical__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..891a18854ac34e549e2ead223a4f5c50fa589fb3 --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Clinical__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Clinical ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4021 | |0 | +| - p1 | | | |f1 | | 0.4036 | | 0 | +| - p2 | | | |f1 | | 0.3990 | | 0 | +| - p3 | | | |f1 | | 0.4036 | | 0 | +| - RE | | | |f1 | | 0.0748 | |0 | +| - p1 | | | |f1 | | 0.0829 | | 0 | +| - p2 | | | |f1 | | 0.0674 | | 0 | +| - p3 | | | |f1 | | 0.0742 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Instruct__en__0shot.txt b/csv_new/output/microsoft__MediPhi-Instruct__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..3006762bb6d519f644dfef9a8d9d38d675ad4a64 --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Instruct__en__0shot.txt @@ 
-0,0 +1,23 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1598 | |0 | +| - p1 | | | |f1 | | 0.0761 | | 0 | +| - p2 | | | |f1 | | 0.2410 | | 0 | +| - p3 | | | |f1 | | 0.1625 | | 0 | +| - RE | | | |f1 | | 0.2982 | |0 | +| - p1 | | | |f1 | | 0.1135 | | 0 | +| - p2 | | | |f1 | | 0.4006 | | 0 | +| - p3 | | | |f1 | | 0.3804 | | 0 | +| - RML | | | |f1 | | 0.0015 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0045 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0004 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0012 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Instruct__en__10shot.txt b/csv_new/output/microsoft__MediPhi-Instruct__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b6057e20590af0e6a0c136089c6200b588e3e02 --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Instruct__en__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5216 | |0 | +| - p1 | | | |f1 | | 0.5357 | | 0 | +| - p2 | | | |f1 | | 0.5227 | | 0 | +| - p3 | | | |f1 | | 0.5063 | | 0 | +| - RE | | | |f1 | | 0.1719 | |0 | +| - p1 | | | |f1 | | 0.1432 | | 0 | +| - p2 | | | |f1 | | 0.1888 | | 0 | +| - p3 | | | |f1 | | 0.1836 | | 0 | +| - RML | | | |f1 | | 0.2856 | |0 | +| - p1 | | | |f1 | | 0.2742 | | 0 | +| - p2 | | | |f1 | | 0.3438 | | 0 | +| - p3 | | | |f1 | | 0.2387 | | 0 | +| - DIA | | | |f1 | | 0.3436 | |0 | +| - p1 | | | |f1 | | 0.2162 | | 0 
| +| - p2 | | | |f1 | | 0.3002 | | 0 | +| - p3 | | | |f1 | | 0.5144 | | 0 | +| - HIS | | | |f1 | | 0.4173 | |0 | +| - p1 | | | |f1 | | 0.4543 | | 0 | +| - p2 | | | |f1 | | 0.3176 | | 0 | +| - p3 | | | |f1 | | 0.4801 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Instruct__gr__0shot.txt b/csv_new/output/microsoft__MediPhi-Instruct__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..033006f71175ad0e7ba4e4f9b2b91bd4b604c058 --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Instruct__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1159 | |0 | +| - p1 | | | |f1 | | 0.1294 | | 0 | +| - p2 | | | |f1 | | 0.0890 | | 0 | +| - p3 | | | |f1 | | 0.1294 | | 0 | +| - RE | | | |f1 | | 0.1184 | |0 | +| - p1 | | | |f1 | | 0.0962 | | 0 | +| - p2 | | | |f1 | | 0.0673 | | 0 | +| - p3 | | | |f1 | | 0.1916 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Instruct__gr__10shot.txt b/csv_new/output/microsoft__MediPhi-Instruct__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..116bb08deaa20d0ae0a961c362d4802b12d2add2 --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Instruct__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2881 | |0 | +| - p1 | | | |f1 | | 0.2822 | | 0 | +| - p2 | | | |f1 | | 0.2999 | | 0 | +| - p3 | | | |f1 | | 0.2822 | | 0 | +| - RE | | | |f1 | | 0.0675 | |0 | +| - p1 | | | |f1 | | 0.0576 | | 0 | +| - p2 | | | |f1 | | 0.0674 | | 0 | +| - p3 | | | |f1 | | 0.0777 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Instruct__it__0shot.txt 
b/csv_new/output/microsoft__MediPhi-Instruct__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..20300e67c6f1830ad29329449c805bf4c30291ed --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Instruct__it__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2023 | |0 | +| - p1 | | | |f1 | | 0.0867 | | 0 | +| - p2 | | | |f1 | | 0.2484 | | 0 | +| - p3 | | | |f1 | | 0.2717 | | 0 | +| - RE | | | |f1 | | 0.2623 | |0 | +| - p1 | | | |f1 | | 0.1712 | | 0 | +| - p2 | | | |f1 | | 0.2896 | | 0 | +| - p3 | | | |f1 | | 0.3261 | | 0 | +| - RML | | | |f1 | | 0.0013 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0038 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Instruct__it__10shot.txt b/csv_new/output/microsoft__MediPhi-Instruct__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ee5e41acdcd926d55b6f5cea9a2ac30593bf8ef --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Instruct__it__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5715 | |0 | +| - p1 | | | |f1 | | 0.5729 | | 0 | +| - p2 | | | |f1 | | 0.5627 | | 0 | +| - p3 | | | |f1 | | 0.5790 | | 0 | +| - RE | | | |f1 | | 0.2679 | |0 | +| - p1 | | | |f1 | | 0.2873 | | 0 | +| - p2 | | | |f1 | | 0.2307 | | 0 | +| - 
p3 | | | |f1 | | 0.2858 | | 0 | +| - RML | | | |f1 | | 0.1176 | |0 | +| - p1 | | | |f1 | | 0.1545 | | 0 | +| - p2 | | | |f1 | | 0.1508 | | 0 | +| - p3 | | | |f1 | | 0.0475 | | 0 | +| - DIA | | | |f1 | | 0.4763 | |0 | +| - p1 | | | |f1 | | 0.5617 | | 0 | +| - p2 | | | |f1 | | 0.3270 | | 0 | +| - p3 | | | |f1 | | 0.5404 | | 0 | +| - HIS | | | |f1 | | 0.3735 | |0 | +| - p1 | | | |f1 | | 0.3291 | | 0 | +| - p2 | | | |f1 | | 0.4029 | | 0 | +| - p3 | | | |f1 | | 0.3884 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Instruct__pl__0shot.txt b/csv_new/output/microsoft__MediPhi-Instruct__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c6bc50efa744a67b9f472d4fdd237432f562068 --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Instruct__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1567 | |0 | +| - p1 | | | |f1 | | 0.1510 | | 0 | +| - p2 | | | |f1 | | 0.1680 | | 0 | +| - p3 | | | |f1 | | 0.1510 | | 0 | +| - RE | | | |f1 | | 0.2881 | |0 | +| - p1 | | | |f1 | | 0.2683 | | 0 | +| - p2 | | | |f1 | | 0.3126 | | 0 | +| - p3 | | | |f1 | | 0.2832 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Instruct__pl__10shot.txt b/csv_new/output/microsoft__MediPhi-Instruct__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..72e10fb742c76990d605ee3a2c3c4ef35b670091 --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Instruct__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4447 | |0 | +| - p1 | | | |f1 | | 0.4417 | | 0 | +| - p2 | | | |f1 | | 0.4506 | | 0 | +| - p3 | | | |f1 | | 0.4417 | | 0 | +| - RE | | 
| |f1 | | 0.2291 | |0 | +| - p1 | | | |f1 | | 0.1525 | | 0 | +| - p2 | | | |f1 | | 0.2686 | | 0 | +| - p3 | | | |f1 | | 0.2662 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Instruct__sk__0shot.txt b/csv_new/output/microsoft__MediPhi-Instruct__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3c42b94084e72cbff224a863d3f392fb1b26463 --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Instruct__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1788 | |0 | +| - p1 | | | |f1 | | 0.1641 | | 0 | +| - p2 | | | |f1 | | 0.2081 | | 0 | +| - p3 | | | |f1 | | 0.1641 | | 0 | +| - RE | | | |f1 | | 0.1221 | |0 | +| - p1 | | | |f1 | | 0.1776 | | 0 | +| - p2 | | | |f1 | | 0.0112 | | 0 | +| - p3 | | | |f1 | | 0.1776 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Instruct__sk__10shot.txt b/csv_new/output/microsoft__MediPhi-Instruct__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d69cd7d7928f4a6c643567b723646b0ea9b62cc --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Instruct__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4226 | |0 | +| - p1 | | | |f1 | | 0.4327 | | 0 | +| - p2 | | | |f1 | | 0.4023 | | 0 | +| - p3 | | | |f1 | | 0.4327 | | 0 | +| - RE | | | |f1 | | 0.1313 | |0 | +| - p1 | | | |f1 | | 0.1070 | | 0 | +| - p2 | | | |f1 | | 0.1395 | | 0 | +| - p3 | | | |f1 | | 0.1473 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Instruct__sl__0shot.txt b/csv_new/output/microsoft__MediPhi-Instruct__sl__0shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..7d1efab70337f7f4131da606243432c236e0a589 --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Instruct__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1792 | |0 | +| - p1 | | | |f1 | | 0.1758 | | 0 | +| - p2 | | | |f1 | | 0.1860 | | 0 | +| - p3 | | | |f1 | | 0.1758 | | 0 | +| - RE | | | |f1 | | 0.1325 | |0 | +| - p1 | | | |f1 | | 0.1446 | | 0 | +| - p2 | | | |f1 | | 0.1084 | | 0 | +| - p3 | | | |f1 | | 0.1446 | | 0 | diff --git a/csv_new/output/microsoft__MediPhi-Instruct__sl__10shot.txt b/csv_new/output/microsoft__MediPhi-Instruct__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..679149a7bff8efd0e260ce4a2032de17a86487c6 --- /dev/null +++ b/csv_new/output/microsoft__MediPhi-Instruct__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=microsoft/MediPhi-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3837 | |0 | +| - p1 | | | |f1 | | 0.3973 | | 0 | +| - p2 | | | |f1 | | 0.3564 | | 0 | +| - p3 | | | |f1 | | 0.3973 | | 0 | +| - RE | | | |f1 | | 0.1550 | |0 | +| - p1 | | | |f1 | | 0.1155 | | 0 | +| - p2 | | | |f1 | | 0.1468 | | 0 | +| - p3 | | | |f1 | | 0.2027 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__en__0shot.txt b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..355a40ec5717374a0494d89524ed0311cbeb1e87 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__en__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | 
|Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2278 | |0 | +| - p1 | | | |f1 | | 0.2529 | | 0 | +| - p2 | | | |f1 | | 0.2144 | | 0 | +| - p3 | | | |f1 | | 0.2162 | | 0 | +| - RE | | | |f1 | | 0.3007 | |0 | +| - p1 | | | |f1 | | 0.3688 | | 0 | +| - p2 | | | |f1 | | 0.3642 | | 0 | +| - p3 | | | |f1 | | 0.1693 | | 0 | +| - RML | | | |f1 | | 0.0001 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0002 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__en__10shot.txt b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..aebef1ac7d25fdce9740914ef074f918c87812a7 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__en__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4753 | |0 | +| - p1 | | | |f1 | | 0.4725 | | 0 | +| - p2 | | | |f1 | | 0.4730 | | 0 | +| - p3 | | | |f1 | | 0.4805 | | 0 | +| - RE | | | |f1 | | 0.3592 | |0 | +| - p1 | | | |f1 | | 0.2593 | | 0 | +| - p2 | | | |f1 | | 0.4034 | | 0 | +| - p3 | | | |f1 | | 0.4148 | | 0 | +| - RML | | | |f1 | | 0.1222 | |0 | +| - p1 | | | |f1 | | 0.0099 | | 0 | +| - p2 | | | |f1 | | 0.1388 | | 0 | +| - p3 | | | |f1 | | 0.2178 | | 0 | +| - DIA | | | |f1 | | 0.0640 | |0 | +| - p1 | | | |f1 | | 0.0137 | | 0 | +| - p2 | | | |f1 | | 0.0240 | | 0 | +| - p3 | | | |f1 | | 0.1543 | | 0 | +| - HIS | | | |f1 | | 0.1376 
| |0 | +| - p1 | | | |f1 | | 0.0143 | | 0 | +| - p2 | | | |f1 | | 0.3929 | | 0 | +| - p3 | | | |f1 | | 0.0058 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__gr__0shot.txt b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..0017f0d9afa0d196c05a399d60bf6b69bf801441 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1705 | |0 | +| - p1 | | | |f1 | | 0.1603 | | 0 | +| - p2 | | | |f1 | | 0.1909 | | 0 | +| - p3 | | | |f1 | | 0.1603 | | 0 | +| - RE | | | |f1 | | 0.0592 | |0 | +| - p1 | | | |f1 | | 0.0432 | | 0 | +| - p2 | | | |f1 | | 0.0348 | | 0 | +| - p3 | | | |f1 | | 0.0994 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__gr__10shot.txt b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7051f2deb4230647fb3c9bbe0580a2fe84de6d8 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3548 | |0 | +| - p1 | | | |f1 | | 0.3498 | | 0 | +| - p2 | | | |f1 | | 0.3648 | | 0 | +| - p3 | | | |f1 | | 0.3498 | | 0 | +| - RE | | | |f1 | | 0.1862 | |0 | +| - p1 | | | |f1 | | 0.1055 | | 0 | +| - p2 | | | |f1 | | 0.2343 | | 0 | +| - p3 | | | |f1 | | 0.2189 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__it__0shot.txt b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__it__0shot.txt new file 
mode 100644 index 0000000000000000000000000000000000000000..286e5a1cdf69131a9fbaed275d245cfdd5eddaa7 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__it__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2433 | |0 | +| - p1 | | | |f1 | | 0.2788 | | 0 | +| - p2 | | | |f1 | | 0.2030 | | 0 | +| - p3 | | | |f1 | | 0.2481 | | 0 | +| - RE | | | |f1 | | 0.0561 | |0 | +| - p1 | | | |f1 | | 0.1382 | | 0 | +| - p2 | | | |f1 | | 0.0163 | | 0 | +| - p3 | | | |f1 | | 0.0140 | | 0 | +| - RML | | | |f1 | | 0.0001 | |0 | +| - p1 | | | |f1 | | 0.0002 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__it__10shot.txt b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb98c1bf90005a71030c1b7421225ad849521246 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__it__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5176 | |0 | +| - p1 | | | |f1 | | 0.5147 | | 0 | +| - p2 | | | |f1 | | 0.5232 | | 0 | +| - p3 | | | |f1 | | 0.5149 | | 0 | +| - RE | | | |f1 | | 0.3958 | |0 | +| - p1 | | | |f1 | | 0.3092 | | 0 | +| - p2 | | | |f1 | | 0.4530 | | 0 | +| - p3 | | | |f1 | | 
0.4252 | | 0 | +| - RML | | | |f1 | | 0.0917 | |0 | +| - p1 | | | |f1 | | 0.0154 | | 0 | +| - p2 | | | |f1 | | 0.1434 | | 0 | +| - p3 | | | |f1 | | 0.1162 | | 0 | +| - DIA | | | |f1 | | 0.2360 | |0 | +| - p1 | | | |f1 | | 0.0163 | | 0 | +| - p2 | | | |f1 | | 0.5695 | | 0 | +| - p3 | | | |f1 | | 0.1222 | | 0 | +| - HIS | | | |f1 | | 0.1182 | |0 | +| - p1 | | | |f1 | | 0.0749 | | 0 | +| - p2 | | | |f1 | | 0.2141 | | 0 | +| - p3 | | | |f1 | | 0.0655 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__pl__0shot.txt b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..77b2c7212ed034a17baff6431293cdb59c42592c --- /dev/null +++ b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2953 | |0 | +| - p1 | | | |f1 | | 0.3024 | | 0 | +| - p2 | | | |f1 | | 0.2811 | | 0 | +| - p3 | | | |f1 | | 0.3024 | | 0 | +| - RE | | | |f1 | | 0.1006 | |0 | +| - p1 | | | |f1 | | 0.0863 | | 0 | +| - p2 | | | |f1 | | 0.1292 | | 0 | +| - p3 | | | |f1 | | 0.0863 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__pl__10shot.txt b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a825a34d1e4073d2ae5da7e22e86582b980912c --- /dev/null +++ b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4956 | |0 | +| - p1 | | | |f1 | | 0.4911 | | 0 | +| - p2 | | | |f1 | | 0.5046 | | 0 | +| 
- p3 | | | |f1 | | 0.4911 | | 0 | +| - RE | | | |f1 | | 0.3296 | |0 | +| - p1 | | | |f1 | | 0.3895 | | 0 | +| - p2 | | | |f1 | | 0.3311 | | 0 | +| - p3 | | | |f1 | | 0.2683 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__sk__0shot.txt b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..6eeaac48bff7f356aa2168ad7755b879c69be13a --- /dev/null +++ b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2144 | |0 | +| - p1 | | | |f1 | | 0.2143 | | 0 | +| - p2 | | | |f1 | | 0.2146 | | 0 | +| - p3 | | | |f1 | | 0.2143 | | 0 | +| - RE | | | |f1 | | 0.0782 | |0 | +| - p1 | | | |f1 | | 0.0756 | | 0 | +| - p2 | | | |f1 | | 0.0835 | | 0 | +| - p3 | | | |f1 | | 0.0756 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__sk__10shot.txt b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..3952afe3c0317b08c9e06f3caff5ae01eb9aa4e2 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3951 | |0 | +| - p1 | | | |f1 | | 0.4029 | | 0 | +| - p2 | | | |f1 | | 0.3794 | | 0 | +| - p3 | | | |f1 | | 0.4029 | | 0 | +| - RE | | | |f1 | | 0.2132 | |0 | +| - p1 | | | |f1 | | 0.2155 | | 0 | +| - p2 | | | |f1 | | 0.1948 | | 0 | +| - p3 | | | |f1 | | 0.2293 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__sl__0shot.txt 
b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0158432acf19d26d277f1b57deb076edd05514a --- /dev/null +++ b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1826 | |0 | +| - p1 | | | |f1 | | 0.1766 | | 0 | +| - p2 | | | |f1 | | 0.1947 | | 0 | +| - p3 | | | |f1 | | 0.1766 | | 0 | +| - RE | | | |f1 | | 0.1076 | |0 | +| - p1 | | | |f1 | | 0.0766 | | 0 | +| - p2 | | | |f1 | | 0.1695 | | 0 | +| - p3 | | | |f1 | | 0.0766 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__sl__10shot.txt b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0a21e1f1e602067347fb6c7ae3af7c47c220eb9 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-7B-Instruct-v0.2__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-7B-Instruct-v0.2 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4194 | |0 | +| - p1 | | | |f1 | | 0.4204 | | 0 | +| - p2 | | | |f1 | | 0.4174 | | 0 | +| - p3 | | | |f1 | | 0.4204 | | 0 | +| - RE | | | |f1 | | 0.2018 | |0 | +| - p1 | | | |f1 | | 0.1990 | | 0 | +| - p2 | | | |f1 | | 0.1950 | | 0 | +| - p3 | | | |f1 | | 0.2115 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__en__0shot.txt b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1ebf3a368f471a2301474f0203334d11a2941ea --- /dev/null +++ b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__en__0shot.txt @@ -0,0 
+1,23 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2271 | |0 | +| - p1 | | | |f1 | | 0.2767 | | 0 | +| - p2 | | | |f1 | | 0.2299 | | 0 | +| - p3 | | | |f1 | | 0.1748 | | 0 | +| - RE | | | |f1 | | 0.3472 | |0 | +| - p1 | | | |f1 | | 0.3694 | | 0 | +| - p2 | | | |f1 | | 0.3482 | | 0 | +| - p3 | | | |f1 | | 0.3241 | | 0 | +| - RML | | | |f1 | | 0.0129 | |0 | +| - p1 | | | |f1 | | 0.0385 | | 0 | +| - p2 | | | |f1 | | 0.0003 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0008 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0024 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__en__10shot.txt b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8bc04b0bceabfdca64b67b86f539f97c755754e --- /dev/null +++ b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__en__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5762 | |0 | +| - p1 | | | |f1 | | 0.5777 | | 0 | +| - p2 | | | |f1 | | 0.5841 | | 0 | +| - p3 | | | |f1 | | 0.5668 | | 0 | +| - RE | | | |f1 | | 0.4313 | |0 | +| - p1 | | | |f1 | | 0.3482 | | 0 | +| - p2 | | | |f1 | | 0.5008 | | 0 | +| - p3 | | | |f1 | | 0.4449 | | 0 | +| - RML | | | |f1 | | 0.2524 | |0 | +| - p1 | | | |f1 | | 0.2499 | | 0 | +| - p2 | | | |f1 | | 0.2718 | | 0 | +| - p3 | | | |f1 | | 0.2356 | | 0 | +| - DIA | | | |f1 | | 
0.4329 | |0 | +| - p1 | | | |f1 | | 0.3034 | | 0 | +| - p2 | | | |f1 | | 0.4176 | | 0 | +| - p3 | | | |f1 | | 0.5778 | | 0 | +| - HIS | | | |f1 | | 0.2251 | |0 | +| - p1 | | | |f1 | | 0.0905 | | 0 | +| - p2 | | | |f1 | | 0.4043 | | 0 | +| - p3 | | | |f1 | | 0.1804 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__gr__0shot.txt b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6f9827c9bfca0e3f2fe140aa5bb7f63e64551b9 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0717 | |0 | +| - p1 | | | |f1 | | 0.0732 | | 0 | +| - p2 | | | |f1 | | 0.0687 | | 0 | +| - p3 | | | |f1 | | 0.0732 | | 0 | +| - RE | | | |f1 | | 0.2326 | |0 | +| - p1 | | | |f1 | | 0.1575 | | 0 | +| - p2 | | | |f1 | | 0.2117 | | 0 | +| - p3 | | | |f1 | | 0.3287 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__gr__10shot.txt b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c23dd02177855715602246da1ab145d4750a511 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5050 | |0 | +| - p1 | | | |f1 | | 0.5081 | | 0 | +| - p2 | | | |f1 | | 0.4988 | | 0 | +| - p3 | | | |f1 | | 0.5081 | | 0 | +| - RE | | | |f1 | | 0.2549 | |0 | +| - p1 | | | |f1 | | 0.2029 | | 0 | +| - p2 | | | |f1 | | 0.2296 | | 0 | +| - p3 | | | |f1 | | 
0.3323 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__it__0shot.txt b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..0677e3e7254c92e03660f5c8741abe105b516256 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__it__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1960 | |0 | +| - p1 | | | |f1 | | 0.2792 | | 0 | +| - p2 | | | |f1 | | 0.1772 | | 0 | +| - p3 | | | |f1 | | 0.1316 | | 0 | +| - RE | | | |f1 | | 0.2365 | |0 | +| - p1 | | | |f1 | | 0.2849 | | 0 | +| - p2 | | | |f1 | | 0.2384 | | 0 | +| - p3 | | | |f1 | | 0.1861 | | 0 | +| - RML | | | |f1 | | 0.0010 | |0 | +| - p1 | | | |f1 | | 0.0029 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__it__10shot.txt b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e10ec1ce7e0e98bf9f8ccba81394cd7caa10c82 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__it__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6441 | |0 | +| - p1 | | | |f1 | | 0.6430 | | 0 | +| - p2 | | | |f1 | | 
0.6437 | | 0 | +| - p3 | | | |f1 | | 0.6457 | | 0 | +| - RE | | | |f1 | | 0.3556 | |0 | +| - p1 | | | |f1 | | 0.2708 | | 0 | +| - p2 | | | |f1 | | 0.4099 | | 0 | +| - p3 | | | |f1 | | 0.3860 | | 0 | +| - RML | | | |f1 | | 0.1269 | |0 | +| - p1 | | | |f1 | | 0.1219 | | 0 | +| - p2 | | | |f1 | | 0.1299 | | 0 | +| - p3 | | | |f1 | | 0.1287 | | 0 | +| - DIA | | | |f1 | | 0.4612 | |0 | +| - p1 | | | |f1 | | 0.4449 | | 0 | +| - p2 | | | |f1 | | 0.5659 | | 0 | +| - p3 | | | |f1 | | 0.3728 | | 0 | +| - HIS | | | |f1 | | 0.1763 | |0 | +| - p1 | | | |f1 | | 0.0690 | | 0 | +| - p2 | | | |f1 | | 0.3846 | | 0 | +| - p3 | | | |f1 | | 0.0753 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__pl__0shot.txt b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a7d8e28e086f27eb9637e6ad992ba36f0de390a --- /dev/null +++ b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0468 | |0 | +| - p1 | | | |f1 | | 0.0483 | | 0 | +| - p2 | | | |f1 | | 0.0439 | | 0 | +| - p3 | | | |f1 | | 0.0483 | | 0 | +| - RE | | | |f1 | | 0.1823 | |0 | +| - p1 | | | |f1 | | 0.2123 | | 0 | +| - p2 | | | |f1 | | 0.1686 | | 0 | +| - p3 | | | |f1 | | 0.1661 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__pl__10shot.txt b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..824dea14ef26e4fd07539f5cdc57cb0d72d7a869 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 10, batch_size: 1 +|Tasks 
|Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5375 | |0 | +| - p1 | | | |f1 | | 0.5352 | | 0 | +| - p2 | | | |f1 | | 0.5421 | | 0 | +| - p3 | | | |f1 | | 0.5352 | | 0 | +| - RE | | | |f1 | | 0.1906 | |0 | +| - p1 | | | |f1 | | 0.1863 | | 0 | +| - p2 | | | |f1 | | 0.1855 | | 0 | +| - p3 | | | |f1 | | 0.2001 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__sk__0shot.txt b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c38472f1e9cb2c7693144b17179e9dcfe88f159 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0738 | |0 | +| - p1 | | | |f1 | | 0.0685 | | 0 | +| - p2 | | | |f1 | | 0.0844 | | 0 | +| - p3 | | | |f1 | | 0.0685 | | 0 | +| - RE | | | |f1 | | 0.1596 | |0 | +| - p1 | | | |f1 | | 0.1696 | | 0 | +| - p2 | | | |f1 | | 0.1396 | | 0 | +| - p3 | | | |f1 | | 0.1696 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__sk__10shot.txt b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e3e1b2ef67ae845db75704f414dd97a01bc4d8a --- /dev/null +++ b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5030 | |0 | +| - p1 | | | |f1 | | 0.5025 | | 0 | +| - p2 | | | |f1 | | 0.5040 | | 0 | +| - p3 | | | |f1 | | 0.5025 | | 0 | 
+| - RE | | | |f1 | | 0.1832 | |0 | +| - p1 | | | |f1 | | 0.1237 | | 0 | +| - p2 | | | |f1 | | 0.2166 | | 0 | +| - p3 | | | |f1 | | 0.2094 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__sl__0shot.txt b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..832ca83ee26ae570b1c4e4d781100383be94e147 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0842 | |0 | +| - p1 | | | |f1 | | 0.0861 | | 0 | +| - p2 | | | |f1 | | 0.0805 | | 0 | +| - p3 | | | |f1 | | 0.0861 | | 0 | +| - RE | | | |f1 | | 0.1905 | |0 | +| - p1 | | | |f1 | | 0.2309 | | 0 | +| - p2 | | | |f1 | | 0.1096 | | 0 | +| - p3 | | | |f1 | | 0.2309 | | 0 | diff --git a/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__sl__10shot.txt b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2b96898a18579d2b16376fa5e4d1159ed4fc544 --- /dev/null +++ b/csv_new/output/mistralai__Mistral-Nemo-Instruct-2407__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=mistralai/Mistral-Nemo-Instruct-2407 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5327 | |0 | +| - p1 | | | |f1 | | 0.5323 | | 0 | +| - p2 | | | |f1 | | 0.5335 | | 0 | +| - p3 | | | |f1 | | 0.5323 | | 0 | +| - RE | | | |f1 | | 0.1725 | |0 | +| - p1 | | | |f1 | | 0.1390 | | 0 | +| - p2 | | | |f1 | | 0.2057 | | 0 | +| - p3 | | | |f1 | | 0.1727 | | 0 | diff --git a/csv_new/output/tiiuae__Falcon3-10B-Instruct__en__0shot.txt 
b/csv_new/output/tiiuae__Falcon3-10B-Instruct__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4860d37479d7974231c73818241e4a7e152fb8d --- /dev/null +++ b/csv_new/output/tiiuae__Falcon3-10B-Instruct__en__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2658 | |0 | +| - p1 | | | |f1 | | 0.2270 | | 0 | +| - p2 | | | |f1 | | 0.2709 | | 0 | +| - p3 | | | |f1 | | 0.2996 | | 0 | +| - RE | | | |f1 | | 0.3280 | |0 | +| - p1 | | | |f1 | | 0.2157 | | 0 | +| - p2 | | | |f1 | | 0.3835 | | 0 | +| - p3 | | | |f1 | | 0.3848 | | 0 | +| - RML | | | |f1 | | 0.0018 | |0 | +| - p1 | | | |f1 | | 0.0055 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/tiiuae__Falcon3-10B-Instruct__en__10shot.txt b/csv_new/output/tiiuae__Falcon3-10B-Instruct__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..f33e7a0b6986bcfa6bd99ca0830d837019911409 --- /dev/null +++ b/csv_new/output/tiiuae__Falcon3-10B-Instruct__en__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5730 | |0 | +| - p1 | | | |f1 | | 0.5840 | | 0 | +| - p2 | | | |f1 | | 0.5421 | | 0 | +| - p3 | | | |f1 | | 0.5928 | | 0 | +| - RE | | | |f1 | | 0.5145 | |0 | +| - p1 | | | |f1 | | 0.4335 | | 0 | +| - p2 | | | |f1 | | 0.5586 | | 0 
| +| - p3 | | | |f1 | | 0.5515 | | 0 | +| - RML | | | |f1 | | 0.1652 | |0 | +| - p1 | | | |f1 | | 0.2792 | | 0 | +| - p2 | | | |f1 | | 0.1816 | | 0 | +| - p3 | | | |f1 | | 0.0350 | | 0 | +| - DIA | | | |f1 | | 0.1081 | |0 | +| - p1 | | | |f1 | | 0.0708 | | 0 | +| - p2 | | | |f1 | | 0.1658 | | 0 | +| - p3 | | | |f1 | | 0.0877 | | 0 | +| - HIS | | | |f1 | | 0.1121 | |0 | +| - p1 | | | |f1 | | 0.0211 | | 0 | +| - p2 | | | |f1 | | 0.2678 | | 0 | +| - p3 | | | |f1 | | 0.0474 | | 0 | diff --git a/csv_new/output/tiiuae__Falcon3-10B-Instruct__gr__0shot.txt b/csv_new/output/tiiuae__Falcon3-10B-Instruct__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7516c1517104e5be8c21dca55222faa85473fdc --- /dev/null +++ b/csv_new/output/tiiuae__Falcon3-10B-Instruct__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1585 | |0 | +| - p1 | | | |f1 | | 0.2130 | | 0 | +| - p2 | | | |f1 | | 0.0495 | | 0 | +| - p3 | | | |f1 | | 0.2130 | | 0 | +| - RE | | | |f1 | | 0.0506 | |0 | +| - p1 | | | |f1 | | 0.0401 | | 0 | +| - p2 | | | |f1 | | 0.0250 | | 0 | +| - p3 | | | |f1 | | 0.0867 | | 0 | diff --git a/csv_new/output/tiiuae__Falcon3-10B-Instruct__gr__10shot.txt b/csv_new/output/tiiuae__Falcon3-10B-Instruct__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba5c002a9264a1d56a51c72b4dc642ee87b8c605 --- /dev/null +++ b/csv_new/output/tiiuae__Falcon3-10B-Instruct__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.3448 | |0 | +| - p1 | | | |f1 | | 0.3345 | | 0 | +| - p2 | | | |f1 | | 0.3655 | | 0 | +| - p3 | | | |f1 | | 0.3345 | | 0 
| +| - RE | | | |f1 | | 0.3591 | |0 | +| - p1 | | | |f1 | | 0.3749 | | 0 | +| - p2 | | | |f1 | | 0.3755 | | 0 | +| - p3 | | | |f1 | | 0.3268 | | 0 | diff --git a/csv_new/output/tiiuae__Falcon3-10B-Instruct__it__0shot.txt b/csv_new/output/tiiuae__Falcon3-10B-Instruct__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a95413e5d32046ccb504362812df25c2eccc14db --- /dev/null +++ b/csv_new/output/tiiuae__Falcon3-10B-Instruct__it__0shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2011 | |0 | +| - p1 | | | |f1 | | 0.1261 | | 0 | +| - p2 | | | |f1 | | 0.2327 | | 0 | +| - p3 | | | |f1 | | 0.2444 | | 0 | +| - RE | | | |f1 | | 0.1865 | |0 | +| - p1 | | | |f1 | | 0.2404 | | 0 | +| - p2 | | | |f1 | | 0.1699 | | 0 | +| - p3 | | | |f1 | | 0.1492 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/tiiuae__Falcon3-10B-Instruct__it__10shot.txt b/csv_new/output/tiiuae__Falcon3-10B-Instruct__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca2948f33281baa195f8a52124b6be256e2a66a1 --- /dev/null +++ b/csv_new/output/tiiuae__Falcon3-10B-Instruct__it__10shot.txt @@ -0,0 +1,23 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5625 | |0 
| +| - p1 | | | |f1 | | 0.5821 | | 0 | +| - p2 | | | |f1 | | 0.5432 | | 0 | +| - p3 | | | |f1 | | 0.5622 | | 0 | +| - RE | | | |f1 | | 0.5226 | |0 | +| - p1 | | | |f1 | | 0.4622 | | 0 | +| - p2 | | | |f1 | | 0.5458 | | 0 | +| - p3 | | | |f1 | | 0.5597 | | 0 | +| - RML | | | |f1 | | 0.0406 | |0 | +| - p1 | | | |f1 | | 0.0721 | | 0 | +| - p2 | | | |f1 | | 0.0340 | | 0 | +| - p3 | | | |f1 | | 0.0157 | | 0 | +| - DIA | | | |f1 | | 0.0543 | |0 | +| - p1 | | | |f1 | | 0.0186 | | 0 | +| - p2 | | | |f1 | | 0.0668 | | 0 | +| - p3 | | | |f1 | | 0.0774 | | 0 | +| - HIS | | | |f1 | | 0.0074 | |0 | +| - p1 | | | |f1 | | 0.0105 | | 0 | +| - p2 | | | |f1 | | 0.0044 | | 0 | +| - p3 | | | |f1 | | 0.0073 | | 0 | diff --git a/csv_new/output/tiiuae__Falcon3-10B-Instruct__pl__0shot.txt b/csv_new/output/tiiuae__Falcon3-10B-Instruct__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..edaf86b247e1c20e0f7c4138f96ec19d5a571ae4 --- /dev/null +++ b/csv_new/output/tiiuae__Falcon3-10B-Instruct__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2414 | |0 | +| - p1 | | | |f1 | | 0.2452 | | 0 | +| - p2 | | | |f1 | | 0.2338 | | 0 | +| - p3 | | | |f1 | | 0.2452 | | 0 | +| - RE | | | |f1 | | 0.0963 | |0 | +| - p1 | | | |f1 | | 0.1501 | | 0 | +| - p2 | | | |f1 | | 0.0123 | | 0 | +| - p3 | | | |f1 | | 0.1264 | | 0 | diff --git a/csv_new/output/tiiuae__Falcon3-10B-Instruct__pl__10shot.txt b/csv_new/output/tiiuae__Falcon3-10B-Instruct__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..068f9654a4427b28cd68c4493756660bf40e63a0 --- /dev/null +++ b/csv_new/output/tiiuae__Falcon3-10B-Instruct__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| 
|Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4244 | |0 | +| - p1 | | | |f1 | | 0.4304 | | 0 | +| - p2 | | | |f1 | | 0.4123 | | 0 | +| - p3 | | | |f1 | | 0.4304 | | 0 | +| - RE | | | |f1 | | 0.5396 | |0 | +| - p1 | | | |f1 | | 0.5129 | | 0 | +| - p2 | | | |f1 | | 0.5571 | | 0 | +| - p3 | | | |f1 | | 0.5489 | | 0 | diff --git a/csv_new/output/tiiuae__Falcon3-10B-Instruct__sk__0shot.txt b/csv_new/output/tiiuae__Falcon3-10B-Instruct__sk__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..6caf6d188e37dd8d852231914da9bca9053abf92 --- /dev/null +++ b/csv_new/output/tiiuae__Falcon3-10B-Instruct__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2871 | |0 | +| - p1 | | | |f1 | | 0.2717 | | 0 | +| - p2 | | | |f1 | | 0.3178 | | 0 | +| - p3 | | | |f1 | | 0.2717 | | 0 | +| - RE | | | |f1 | | 0.0182 | |0 | +| - p1 | | | |f1 | | 0.0143 | | 0 | +| - p2 | | | |f1 | | 0.0260 | | 0 | +| - p3 | | | |f1 | | 0.0143 | | 0 | diff --git a/csv_new/output/tiiuae__Falcon3-10B-Instruct__sk__10shot.txt b/csv_new/output/tiiuae__Falcon3-10B-Instruct__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc5594c73257786e6edd1f8c852ad343d66e7f30 --- /dev/null +++ b/csv_new/output/tiiuae__Falcon3-10B-Instruct__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4402 | |0 | +| - p1 | | | |f1 | | 0.4545 | | 0 | +| - p2 | | | |f1 | | 0.4116 | | 0 | +| - p3 | | | |f1 | | 0.4545 | | 0 | +| - RE | | | |f1 | | 0.4261 | |0 | +| - p1 | | | |f1 | | 0.3750 | | 0 | +| - p2 | | | |f1 | | 0.4695 | 
| 0 | +| - p3 | | | |f1 | | 0.4338 | | 0 | diff --git a/csv_new/output/tiiuae__Falcon3-10B-Instruct__sl__0shot.txt b/csv_new/output/tiiuae__Falcon3-10B-Instruct__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d586cd4e31f21369f0d3c8873dba4eb0ce073b5 --- /dev/null +++ b/csv_new/output/tiiuae__Falcon3-10B-Instruct__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2297 | |0 | +| - p1 | | | |f1 | | 0.2519 | | 0 | +| - p2 | | | |f1 | | 0.1853 | | 0 | +| - p3 | | | |f1 | | 0.2519 | | 0 | +| - RE | | | |f1 | | 0.0050 | |0 | +| - p1 | | | |f1 | | 0.0047 | | 0 | +| - p2 | | | |f1 | | 0.0058 | | 0 | +| - p3 | | | |f1 | | 0.0047 | | 0 | diff --git a/csv_new/output/tiiuae__Falcon3-10B-Instruct__sl__10shot.txt b/csv_new/output/tiiuae__Falcon3-10B-Instruct__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..999a14510d1ea855adb9835bc9235c19f1a60783 --- /dev/null +++ b/csv_new/output/tiiuae__Falcon3-10B-Instruct__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=tiiuae/Falcon3-10B-Instruct ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.4050 | |0 | +| - p1 | | | |f1 | | 0.4121 | | 0 | +| - p2 | | | |f1 | | 0.3909 | | 0 | +| - p3 | | | |f1 | | 0.4121 | | 0 | +| - RE | | | |f1 | | 0.3133 | |0 | +| - p1 | | | |f1 | | 0.2323 | | 0 | +| - p2 | | | |f1 | | 0.3012 | | 0 | +| - p3 | | | |f1 | | 0.4063 | | 0 | diff --git a/csv_new/output/unsloth__phi-4__en__0shot.txt b/csv_new/output/unsloth__phi-4__en__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b34e9e9d2a5ce204521f78108da9842620e59111 --- /dev/null +++ b/csv_new/output/unsloth__phi-4__en__0shot.txt @@ -0,0 
+1,23 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0275 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0252 | | 0 | +| - p3 | | | |f1 | | 0.0572 | | 0 | +| - RE | | | |f1 | | 0.4090 | |0 | +| - p1 | | | |f1 | | 0.4022 | | 0 | +| - p2 | | | |f1 | | 0.4219 | | 0 | +| - p3 | | | |f1 | | 0.4030 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/unsloth__phi-4__en__10shot.txt b/csv_new/output/unsloth__phi-4__en__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab7e4739278848d7b322fb7d27c1531b7ac36392 --- /dev/null +++ b/csv_new/output/unsloth__phi-4__en__10shot.txt @@ -0,0 +1,24 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5984 | |0 | +| - p1 | | | |f1 | | 0.6098 | | 0 | +| - p2 | | | |f1 | | 0.5711 | | 0 | +| - p3 | | | |f1 | | 0.6141 | | 0 | +| - RE | | | |f1 | | 0.5364 | |0 | +| - p1 | | | |f1 | | 0.4912 | | 0 | +| - p2 | | | |f1 | | 0.5626 | | 0 | +| - p3 | | | |f1 | | 0.5554 | | 0 | +| - RML | | | |f1 | | 0.2878 | |0 | +| - p1 | | | |f1 | | 0.3841 | | 0 | +| - p2 | | | |f1 | | 0.3289 | | 0 | +| - p3 | | | |f1 | | 0.2191 | | 0 | +| - p3 | | | |f1 | | 0.2191 | | 0 | +| - DIA | | | |f1 | | 0.4715 | |0 | +| - p1 | | | |f1 | | 0.4262 | | 0 | +| - p2 | | | |f1 | | 0.5630 | 
| 0 | +| - p3 | | | |f1 | | 0.4254 | | 0 | +| - HIS | | | |f1 | | 0.4138 | |0 | +| - p1 | | | |f1 | | 0.3736 | | 0 | +| - p2 | | | |f1 | | 0.5020 | | 0 | +| - p3 | | | |f1 | | 0.3658 | | 0 | diff --git a/csv_new/output/unsloth__phi-4__gr__0shot.txt b/csv_new/output/unsloth__phi-4__gr__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..a17e8d575c703b7ccebc72cd8ff6aeca0397f1cc --- /dev/null +++ b/csv_new/output/unsloth__phi-4__gr__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - RE | | | |f1 | | 0.2011 | |0 | +| - p1 | | | |f1 | | 0.2901 | | 0 | +| - p2 | | | |f1 | | 0.2208 | | 0 | +| - p3 | | | |f1 | | 0.0925 | | 0 | diff --git a/csv_new/output/unsloth__phi-4__gr__10shot.txt b/csv_new/output/unsloth__phi-4__gr__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f48b7d4235602d80223abd071f8764d2a1a5bfc --- /dev/null +++ b/csv_new/output/unsloth__phi-4__gr__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5682 | |0 | +| - p1 | | | |f1 | | 0.5717 | | 0 | +| - p2 | | | |f1 | | 0.5611 | | 0 | +| - p3 | | | |f1 | | 0.5717 | | 0 | +| - RE | | | |f1 | | 0.5291 | |0 | +| - p1 | | | |f1 | | 0.4935 | | 0 | +| - p2 | | | |f1 | | 0.5261 | | 0 | +| - p3 | | | |f1 | | 0.5678 | | 0 | diff --git a/csv_new/output/unsloth__phi-4__it__0shot.txt b/csv_new/output/unsloth__phi-4__it__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..b20b4e4c6aae9c122b54207688196bec201d8b6c --- /dev/null +++ 
b/csv_new/output/unsloth__phi-4__it__0shot.txt @@ -0,0 +1,24 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.1717 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.1724 | | 0 | +| - p3 | | | |f1 | | 0.3428 | | 0 | +| - RE | | | |f1 | | 0.3589 | |0 | +| - p1 | | | |f1 | | 0.3354 | | 0 | +| - p2 | | | |f1 | | 0.3737 | | 0 | +| - p3 | | | |f1 | | 0.3677 | | 0 | +| - RML | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - DIA | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | +| - HIS | | | |f1 | | 0.0000 | |0 | +| - p1 | | | |f1 | | 0.0000 | | 0 | +| - p2 | | | |f1 | | 0.0000 | | 0 | +| - p3 | | | |f1 | | 0.0000 | | 0 | diff --git a/csv_new/output/unsloth__phi-4__it__10shot.txt b/csv_new/output/unsloth__phi-4__it__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..85053f2ab2acf2c1693a65d4e6852d38d1c1b6a3 --- /dev/null +++ b/csv_new/output/unsloth__phi-4__it__10shot.txt @@ -0,0 +1,24 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.6759 | |0 | +| - p1 | | | |f1 | | 0.6647 | | 0 | +| - p2 | | | |f1 | | 0.6732 | | 0 | +| - p3 | | | |f1 | | 0.6897 | | 0 | +| - RE | | | |f1 | | 0.5705 | |0 | +| - p1 | | | |f1 | | 0.5608 | | 0 | +| - p2 | | | |f1 | | 0.5820 | | 0 | +| - p3 | | | |f1 | | 0.5688 | | 0 | +| - RML | | | |f1 | | 0.1263 | |0 | +| - p1 | | | |f1 | | 0.1759 | | 0 | +| - p2 | | | |f1 | | 0.1675 | | 0 | +| - p3 | | | |f1 | | 0.0810 | | 0 | +| - p3 | | | |f1 | | 0.0810 | | 0 | +| - DIA | | 
| |f1 | | 0.5691 | |0 | +| - p1 | | | |f1 | | 0.5835 | | 0 | +| - p2 | | | |f1 | | 0.5676 | | 0 | +| - p3 | | | |f1 | | 0.5564 | | 0 | +| - HIS | | | |f1 | | 0.4656 | |0 | +| - p1 | | | |f1 | | 0.5102 | | 0 | +| - p2 | | | |f1 | | 0.5006 | | 0 | +| - p3 | | | |f1 | | 0.3859 | | 0 | diff --git a/csv_new/output/unsloth__phi-4__pl__0shot.txt b/csv_new/output/unsloth__phi-4__pl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..50d734915f57e7a4713da8e3d4cb6ae9a653a9a1 --- /dev/null +++ b/csv_new/output/unsloth__phi-4__pl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0279 | |0 | +| - p1 | | | |f1 | | 0.0236 | | 0 | +| - p2 | | | |f1 | | 0.0366 | | 0 | +| - p3 | | | |f1 | | 0.0236 | | 0 | +| - RE | | | |f1 | | 0.3814 | |0 | +| - p1 | | | |f1 | | 0.3799 | | 0 | +| - p2 | | | |f1 | | 0.3829 | | 0 | +| - p3 | | | |f1 | | 0.3813 | | 0 | diff --git a/csv_new/output/unsloth__phi-4__pl__10shot.txt b/csv_new/output/unsloth__phi-4__pl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..13c70462fcbbc4333d7e40ab047995e60782311c --- /dev/null +++ b/csv_new/output/unsloth__phi-4__pl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5474 | |0 | +| - p1 | | | |f1 | | 0.5549 | | 0 | +| - p2 | | | |f1 | | 0.5324 | | 0 | +| - p3 | | | |f1 | | 0.5549 | | 0 | +| - RE | | | |f1 | | 0.5718 | |0 | +| - p1 | | | |f1 | | 0.5423 | | 0 | +| - p2 | | | |f1 | | 0.5760 | | 0 | +| - p3 | | | |f1 | | 0.5972 | | 0 | diff --git a/csv_new/output/unsloth__phi-4__sk__0shot.txt b/csv_new/output/unsloth__phi-4__sk__0shot.txt new file mode 100644 index 
0000000000000000000000000000000000000000..609bfee5abd16055de50dbbc8a5b5e54bf628dde --- /dev/null +++ b/csv_new/output/unsloth__phi-4__sk__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.0567 | |0 | +| - p1 | | | |f1 | | 0.0316 | | 0 | +| - p2 | | | |f1 | | 0.1070 | | 0 | +| - p3 | | | |f1 | | 0.0316 | | 0 | +| - RE | | | |f1 | | 0.3277 | |0 | +| - p1 | | | |f1 | | 0.3252 | | 0 | +| - p2 | | | |f1 | | 0.3326 | | 0 | +| - p3 | | | |f1 | | 0.3252 | | 0 | diff --git a/csv_new/output/unsloth__phi-4__sk__10shot.txt b/csv_new/output/unsloth__phi-4__sk__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..e55439f603a7ee43ebc4fb2b6489d94a69f17b05 --- /dev/null +++ b/csv_new/output/unsloth__phi-4__sk__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5524 | |0 | +| - p1 | | | |f1 | | 0.5561 | | 0 | +| - p2 | | | |f1 | | 0.5449 | | 0 | +| - p3 | | | |f1 | | 0.5561 | | 0 | +| - RE | | | |f1 | | 0.5214 | |0 | +| - p1 | | | |f1 | | 0.5106 | | 0 | +| - p2 | | | |f1 | | 0.4994 | | 0 | +| - p3 | | | |f1 | | 0.5541 | | 0 | diff --git a/csv_new/output/unsloth__phi-4__sl__0shot.txt b/csv_new/output/unsloth__phi-4__sl__0shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..29578b4d5063f990ad13a10dcac7d69a04c24725 --- /dev/null +++ b/csv_new/output/unsloth__phi-4__sl__0shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 0, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.2241 | |0 | +| - p1 | | | |f1 | | 0.2870 | | 0 | +| - p2 | | | |f1 | | 
0.0981 | | 0 | +| - p3 | | | |f1 | | 0.2870 | | 0 | +| - RE | | | |f1 | | 0.2721 | |0 | +| - p1 | | | |f1 | | 0.3209 | | 0 | +| - p2 | | | |f1 | | 0.1744 | | 0 | +| - p3 | | | |f1 | | 0.3209 | | 0 | diff --git a/csv_new/output/unsloth__phi-4__sl__10shot.txt b/csv_new/output/unsloth__phi-4__sl__10shot.txt new file mode 100644 index 0000000000000000000000000000000000000000..debd951319f9e20f02aade8491ff82efa207384f --- /dev/null +++ b/csv_new/output/unsloth__phi-4__sl__10shot.txt @@ -0,0 +1,11 @@ +hf (pretrained=unsloth/phi-4 ), num_fewshot: 10, batch_size: 1 +|Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------|-------|------|------|------|----|------|---|------| +| - NER | | | |f1 | | 0.5577 | |0 | +| - p1 | | | |f1 | | 0.5586 | | 0 | +| - p2 | | | |f1 | | 0.5558 | | 0 | +| - p3 | | | |f1 | | 0.5586 | | 0 | +| - RE | | | |f1 | | 0.5309 | |0 | +| - p1 | | | |f1 | | 0.5117 | | 0 | +| - p2 | | | |f1 | | 0.5232 | | 0 | +| - p3 | | | |f1 | | 0.5579 | | 0 | diff --git a/e3c_llm_requests/.gitattributes b/e3c_llm_requests/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..1ef325f1b111266a6b26e0196871bd78baa8c2f3 --- /dev/null +++ b/e3c_llm_requests/.gitattributes @@ -0,0 +1,59 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.lz4 filter=lfs diff=lfs merge=lfs -text +*.mds filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx 
filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +# Audio files - uncompressed +*.pcm filter=lfs diff=lfs merge=lfs -text +*.sam filter=lfs diff=lfs merge=lfs -text +*.raw filter=lfs diff=lfs merge=lfs -text +# Audio files - compressed +*.aac filter=lfs diff=lfs merge=lfs -text +*.flac filter=lfs diff=lfs merge=lfs -text +*.mp3 filter=lfs diff=lfs merge=lfs -text +*.ogg filter=lfs diff=lfs merge=lfs -text +*.wav filter=lfs diff=lfs merge=lfs -text +# Image files - uncompressed +*.bmp filter=lfs diff=lfs merge=lfs -text +*.gif filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.tiff filter=lfs diff=lfs merge=lfs -text +# Image files - compressed +*.jpg filter=lfs diff=lfs merge=lfs -text +*.jpeg filter=lfs diff=lfs merge=lfs -text +*.webp filter=lfs diff=lfs merge=lfs -text +# Video files - compressed +*.mp4 filter=lfs diff=lfs merge=lfs -text +*.webm filter=lfs diff=lfs merge=lfs -text diff --git a/e3c_llm_requests/Henrychur/MMed-Llama-3-8B.json b/e3c_llm_requests/Henrychur/MMed-Llama-3-8B.json new file mode 100644 index 0000000000000000000000000000000000000000..86b0019fb62092325cdc79c7eb8218aed3bad09f --- 
/dev/null +++ b/e3c_llm_requests/Henrychur/MMed-Llama-3-8B.json @@ -0,0 +1,8 @@ +{ + "model": "Henrychur/MMed-Llama-3-8B", + "base_model": "LlamaForCausalLM", + "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943", + "submitted_time": "2024-05-22 09:17:24+00:00", + "num_params_billion": null, + "language": "en_zh_ja_fr_ru_es" +} \ No newline at end of file diff --git a/e3c_llm_requests/HiTZ/Medical-mT5-large.json b/e3c_llm_requests/HiTZ/Medical-mT5-large.json new file mode 100644 index 0000000000000000000000000000000000000000..42da6bb5dbb7d478648d28988f549c4c3e885a7c --- /dev/null +++ b/e3c_llm_requests/HiTZ/Medical-mT5-large.json @@ -0,0 +1,8 @@ +{ + "model": "HiTZ/Medical-mT5-large", + "base_model": "MT5ForConditionalGeneration", + "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47", + "submitted_time": "2023-10-31 15:15:15+00:00", + "num_params_billion": null, + "language": "en_es_fr_it" +} \ No newline at end of file diff --git a/e3c_llm_requests/Qwen/Qwen2.5-14B-Instruct-1M.json b/e3c_llm_requests/Qwen/Qwen2.5-14B-Instruct-1M.json new file mode 100644 index 0000000000000000000000000000000000000000..e4460926779e971e3317af33665cf9278980c10d --- /dev/null +++ b/e3c_llm_requests/Qwen/Qwen2.5-14B-Instruct-1M.json @@ -0,0 +1,8 @@ +{ + "model": "Qwen/Qwen2.5-14B-Instruct-1M", + "base_model": "Qwen2ForCausalLM", + "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438", + "submitted_time": "2025-01-23 13:23:24+00:00", + "num_params_billion": 14.770033664, + "language": "en" +} \ No newline at end of file diff --git a/e3c_llm_requests/Qwen/Qwen2.5-32B-Instruct.json b/e3c_llm_requests/Qwen/Qwen2.5-32B-Instruct.json new file mode 100644 index 0000000000000000000000000000000000000000..3bfb3ff4c28797c9aad9070719f798119c2784e3 --- /dev/null +++ b/e3c_llm_requests/Qwen/Qwen2.5-32B-Instruct.json @@ -0,0 +1,8 @@ +{ + "model": "Qwen/Qwen2.5-32B-Instruct", + "base_model": "Qwen2ForCausalLM", + "revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd", + "submitted_time": 
"2024-09-17 04:17:55+00:00", + "num_params_billion": 32.763876352, + "language": "en" +} \ No newline at end of file diff --git a/e3c_llm_requests/Qwen/Qwen3-30B-A3B-Instruct-2507.json b/e3c_llm_requests/Qwen/Qwen3-30B-A3B-Instruct-2507.json new file mode 100644 index 0000000000000000000000000000000000000000..8c19206d89e4c8cc448322bcf562cdeea4f686ba --- /dev/null +++ b/e3c_llm_requests/Qwen/Qwen3-30B-A3B-Instruct-2507.json @@ -0,0 +1,8 @@ +{ + "model": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "base_model": "Qwen3MoeForCausalLM", + "revision": "61082d4deaa4785f64943b443cbc2b5de7524fad", + "submitted_time": "2025-07-28 07:31:27+00:00", + "num_params_billion": 30.532122624, + "language": "" +} \ No newline at end of file diff --git a/e3c_llm_requests/deepseek-ai/.ipynb_checkpoints/DeepSeek-R1-Distill-Qwen-32B-checkpoint.json b/e3c_llm_requests/deepseek-ai/.ipynb_checkpoints/DeepSeek-R1-Distill-Qwen-32B-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..3a574a6dcb81e49041fa0f592ee86930e1b43847 --- /dev/null +++ b/e3c_llm_requests/deepseek-ai/.ipynb_checkpoints/DeepSeek-R1-Distill-Qwen-32B-checkpoint.json @@ -0,0 +1,8 @@ +{ + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "base_model": "Qwen2ForCausalLM", + "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "submitted_time": "2025-01-20 09:19:00+00:00", + "num_params_billion": 32.763876352, + "language": "" +} \ No newline at end of file diff --git a/e3c_llm_requests/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B.json b/e3c_llm_requests/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B.json new file mode 100644 index 0000000000000000000000000000000000000000..3a574a6dcb81e49041fa0f592ee86930e1b43847 --- /dev/null +++ b/e3c_llm_requests/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B.json @@ -0,0 +1,8 @@ +{ + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "base_model": "Qwen2ForCausalLM", + "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "submitted_time": "2025-01-20 
09:19:00+00:00", + "num_params_billion": 32.763876352, + "language": "" +} \ No newline at end of file diff --git a/e3c_llm_requests/epfl-llm/meditron-7b.json b/e3c_llm_requests/epfl-llm/meditron-7b.json new file mode 100644 index 0000000000000000000000000000000000000000..773ee8fbfdfa8623fdc11b71a79ef5122ea682b7 --- /dev/null +++ b/e3c_llm_requests/epfl-llm/meditron-7b.json @@ -0,0 +1,8 @@ +{ + "model": "epfl-llm/meditron-7b", + "base_model": "LlamaForCausalLM", + "revision": "d7d0a5ed929384a6b059ac74198cf1d71f44ba76", + "submitted_time": "2023-11-08 16:03:23+00:00", + "num_params_billion": 6.73855488, + "language": "en" +} \ No newline at end of file diff --git a/e3c_llm_requests/google/gemma-2-9b-it.json b/e3c_llm_requests/google/gemma-2-9b-it.json new file mode 100644 index 0000000000000000000000000000000000000000..bc6b860d3bd567948054e01f7630043dd7220af4 --- /dev/null +++ b/e3c_llm_requests/google/gemma-2-9b-it.json @@ -0,0 +1,8 @@ +{ + "model": "google/gemma-2-9b-it", + "base_model": "Gemma2ForCausalLM", + "revision": "11c9b309abf73637e4b6f9a3fa1e92e615547819", + "submitted_time": "2024-06-24 08:05:41+00:00", + "num_params_billion": 9.241705984, + "language": "" +} \ No newline at end of file diff --git a/e3c_llm_requests/google/gemma-3-27b-it.json b/e3c_llm_requests/google/gemma-3-27b-it.json new file mode 100644 index 0000000000000000000000000000000000000000..0cbbf8fd214db1aa63941c6685829be3c1ef47a6 --- /dev/null +++ b/e3c_llm_requests/google/gemma-3-27b-it.json @@ -0,0 +1,8 @@ +{ + "model": "google/gemma-3-27b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "005ad3404e59d6023443cb575daa05336842228a", + "submitted_time": "2025-03-01 19:10:19+00:00", + "num_params_billion": 27.43240664, + "language": "" +} \ No newline at end of file diff --git a/e3c_llm_requests/google/medgemma-27b-text-it.json b/e3c_llm_requests/google/medgemma-27b-text-it.json new file mode 100644 index 
0000000000000000000000000000000000000000..0eadf57aa2ee6563dba193b1a1fd16ebc0362a65 --- /dev/null +++ b/e3c_llm_requests/google/medgemma-27b-text-it.json @@ -0,0 +1,8 @@ +{ + "model": "google/medgemma-27b-text-it", + "base_model": "Gemma3ForCausalLM", + "revision": "6b08c481126ff65a9b8fa5ab4d691b152b8edb5d", + "submitted_time": "2025-05-19 20:53:04+00:00", + "num_params_billion": 27.00900224, + "language": "" +} \ No newline at end of file diff --git a/e3c_llm_requests/google/medgemma-4b-it.json b/e3c_llm_requests/google/medgemma-4b-it.json new file mode 100644 index 0000000000000000000000000000000000000000..7c6a467cac7dee24972df120b27cdd729a1e75fd --- /dev/null +++ b/e3c_llm_requests/google/medgemma-4b-it.json @@ -0,0 +1,8 @@ +{ + "model": "google/medgemma-4b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "efe6cc02361759b6bd501c654ddb7c9d25ec509d", + "submitted_time": "2025-05-19 20:52:44+00:00", + "num_params_billion": 4.300079472, + "language": "" +} \ No newline at end of file diff --git a/e3c_llm_requests/meta-llama/.ipynb_checkpoints/Llama-3.2-1B-Instruct-checkpoint.json b/e3c_llm_requests/meta-llama/.ipynb_checkpoints/Llama-3.2-1B-Instruct-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..dd0dab56bad582a995b770ac81c21b4ad4954553 --- /dev/null +++ b/e3c_llm_requests/meta-llama/.ipynb_checkpoints/Llama-3.2-1B-Instruct-checkpoint.json @@ -0,0 +1,8 @@ +{ + "model": "meta-llama/Llama-3.2-1B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "9213176726f574b556790deb65791e0c5aa438b6", + "submitted_time": "2024-09-18 15:12:47+00:00", + "num_params_billion": 1.2358144, + "language": "en_de_fr_it_pt_hi_es_th" +} \ No newline at end of file diff --git a/e3c_llm_requests/meta-llama/Llama-3.2-1B-Instruct.json b/e3c_llm_requests/meta-llama/Llama-3.2-1B-Instruct.json new file mode 100644 index 0000000000000000000000000000000000000000..dd0dab56bad582a995b770ac81c21b4ad4954553 --- /dev/null +++ 
b/e3c_llm_requests/meta-llama/Llama-3.2-1B-Instruct.json @@ -0,0 +1,8 @@ +{ + "model": "meta-llama/Llama-3.2-1B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "9213176726f574b556790deb65791e0c5aa438b6", + "submitted_time": "2024-09-18 15:12:47+00:00", + "num_params_billion": 1.2358144, + "language": "en_de_fr_it_pt_hi_es_th" +} \ No newline at end of file diff --git a/e3c_llm_requests/microsoft/MediPhi-Clinical.json b/e3c_llm_requests/microsoft/MediPhi-Clinical.json new file mode 100644 index 0000000000000000000000000000000000000000..24031b2427e47fc919f0fda9c2570e2c55afafa6 --- /dev/null +++ b/e3c_llm_requests/microsoft/MediPhi-Clinical.json @@ -0,0 +1,8 @@ +{ + "model": "microsoft/MediPhi-Clinical", + "base_model": "Phi3ForCausalLM", + "revision": "0906e64d321a9c4b058137b34fb3ed6e257e05a0", + "submitted_time": "2025-05-29 20:40:05+00:00", + "num_params_billion": 3.821079552, + "language": "en" +} \ No newline at end of file diff --git a/e3c_llm_requests/microsoft/MediPhi-Instruct.json b/e3c_llm_requests/microsoft/MediPhi-Instruct.json new file mode 100644 index 0000000000000000000000000000000000000000..fd36894ce7cc6bbb938d3639a927ebb3c277254d --- /dev/null +++ b/e3c_llm_requests/microsoft/MediPhi-Instruct.json @@ -0,0 +1,8 @@ +{ + "model": "microsoft/MediPhi-Instruct", + "base_model": "Phi3ForCausalLM", + "revision": "a94ac478e7c246103d55665a0804684042f3b973", + "submitted_time": "2025-07-11 19:28:15+00:00", + "num_params_billion": 3.821079552, + "language": "en" +} \ No newline at end of file diff --git a/e3c_llm_requests/mistralai/Mistral-7B-Instruct-v0.2.json b/e3c_llm_requests/mistralai/Mistral-7B-Instruct-v0.2.json new file mode 100644 index 0000000000000000000000000000000000000000..b36579fb429f3b744a46c6a84fed781411b85cc7 --- /dev/null +++ b/e3c_llm_requests/mistralai/Mistral-7B-Instruct-v0.2.json @@ -0,0 +1,8 @@ +{ + "model": "mistralai/Mistral-7B-Instruct-v0.2", + "base_model": "MistralForCausalLM", + "revision": 
"63a8b081895390a26e140280378bc85ec8bce07a", + "submitted_time": "2023-12-11 13:18:44+00:00", + "num_params_billion": 7.241732096, + "language": "" +} \ No newline at end of file diff --git a/e3c_llm_requests/mistralai/Mistral-Nemo-Instruct-2407.json b/e3c_llm_requests/mistralai/Mistral-Nemo-Instruct-2407.json new file mode 100644 index 0000000000000000000000000000000000000000..c46a7da06e59d841593280fb14969e4182c18d95 --- /dev/null +++ b/e3c_llm_requests/mistralai/Mistral-Nemo-Instruct-2407.json @@ -0,0 +1,8 @@ +{ + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "base_model": "MistralForCausalLM", + "revision": "04d8a90549d23fc6bd7f642064003592df51e9b3", + "submitted_time": "2024-07-17 17:26:49+00:00", + "num_params_billion": 12.2477824, + "language": "en_fr_de_es_it_pt_ru_zh_ja" +} \ No newline at end of file diff --git a/e3c_llm_requests/tiiuae/Falcon3-10B-Instruct.json b/e3c_llm_requests/tiiuae/Falcon3-10B-Instruct.json new file mode 100644 index 0000000000000000000000000000000000000000..6fb21c1511b3e96748b3e779985cbcbbfca44186 --- /dev/null +++ b/e3c_llm_requests/tiiuae/Falcon3-10B-Instruct.json @@ -0,0 +1,8 @@ +{ + "model": "tiiuae/Falcon3-10B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "8799bc6aec0152757221dc6b272d824642db6202", + "submitted_time": "2024-12-14 05:17:25+00:00", + "num_params_billion": 10.30565376, + "language": "" +} \ No newline at end of file diff --git a/e3c_llm_requests/unsloth/phi-4.json b/e3c_llm_requests/unsloth/phi-4.json new file mode 100644 index 0000000000000000000000000000000000000000..1aacd86f19c5a378deb37004884fd52f579b6daf --- /dev/null +++ b/e3c_llm_requests/unsloth/phi-4.json @@ -0,0 +1,8 @@ +{ + "model": "unsloth/phi-4", + "base_model": "LlamaForCausalLM", + "revision": "c6220bde10fff762dbd72c3331894aa4cade249d", + "submitted_time": "2025-01-08 21:56:16+00:00", + "num_params_billion": 14.6595072, + "language": "en" +} \ No newline at end of file diff --git a/e3c_llm_results/.gitattributes 
b/e3c_llm_results/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..1ef325f1b111266a6b26e0196871bd78baa8c2f3 --- /dev/null +++ b/e3c_llm_results/.gitattributes @@ -0,0 +1,59 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.lz4 filter=lfs diff=lfs merge=lfs -text +*.mds filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +# Audio files - uncompressed +*.pcm filter=lfs diff=lfs merge=lfs -text +*.sam filter=lfs diff=lfs merge=lfs -text +*.raw filter=lfs diff=lfs 
merge=lfs -text +# Audio files - compressed +*.aac filter=lfs diff=lfs merge=lfs -text +*.flac filter=lfs diff=lfs merge=lfs -text +*.mp3 filter=lfs diff=lfs merge=lfs -text +*.ogg filter=lfs diff=lfs merge=lfs -text +*.wav filter=lfs diff=lfs merge=lfs -text +# Image files - uncompressed +*.bmp filter=lfs diff=lfs merge=lfs -text +*.gif filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.tiff filter=lfs diff=lfs merge=lfs -text +# Image files - compressed +*.jpg filter=lfs diff=lfs merge=lfs -text +*.jpeg filter=lfs diff=lfs merge=lfs -text +*.webp filter=lfs diff=lfs merge=lfs -text +# Video files - compressed +*.mp4 filter=lfs diff=lfs merge=lfs -text +*.webm filter=lfs diff=lfs merge=lfs -text diff --git a/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_EN.json b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..fb662fe1d53a56922d8a772b9baee55d2a7de480 --- /dev/null +++ b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 8.3819368, + "config": { + "model_name": "Henrychur/MMed-Llama-3-8B", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "Henrychur/MMed-Llama-3-8B", + "base_model": "LlamaForCausalLM", + "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943", + "submitted_time": "2024-05-22 09:17:24+00:00", + "num_params_billion": null, + "language": "en_zh_ja_fr_ru_es" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 6.29, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 10.41, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 10.83, + "stderr": 0.0 + } + ], + "average_accuracy": 9.176666666666668, + "best_prompt": 10.83, + "prompt_id": "p3", + "CPS": 10.650944, + "is_dummy": false, + "std_accuracy": 2.5087314191306596 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 12.870000000000001, 
+ "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 33.94, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 31.31, + "stderr": 0.0 + } + ], + "average_accuracy": 26.040000000000003, + "best_prompt": 33.94, + "prompt_id": "p2", + "CPS": 31.25874, + "is_dummy": false, + "std_accuracy": 11.481110573459345 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_GR.json b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..491615432b2fe12f3d16cd1efedc04c27b0aef0e --- /dev/null +++ b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 8.314364166666667, + "config": { + "model_name": "Henrychur/MMed-Llama-3-8B", + "num_fewshot": 
"0", + "batch_size": 1, + "LANG": "GR", + "model": "Henrychur/MMed-Llama-3-8B", + "base_model": "LlamaForCausalLM", + "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943", + "submitted_time": "2024-05-22 09:17:24+00:00", + "num_params_billion": null, + "language": "en_zh_ja_fr_ru_es" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 6.2, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 5.92, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 6.2, + "stderr": 0.0 + } + ], + "average_accuracy": 6.1066666666666665, + "best_prompt": 6.2, + "prompt_id": "p1", + "CPS": 6.194213333333334, + "is_dummy": false, + "std_accuracy": 0.16165807537309534 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 10.17, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 5.06, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 10.65, + "stderr": 0.0 + } + ], + "average_accuracy": 8.626666666666667, + "best_prompt": 10.65, + "prompt_id": "p3", + "CPS": 10.434515000000001, + "is_dummy": false, + "std_accuracy": 3.098133846904187 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git 
a/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_IT.json b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..06cfdebdb9527ec4c4ed146ef72adfae1df4bfed --- /dev/null +++ b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 5.013616333333333, + "config": { + "model_name": "Henrychur/MMed-Llama-3-8B", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "Henrychur/MMed-Llama-3-8B", + "base_model": "LlamaForCausalLM", + "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943", + "submitted_time": "2024-05-22 09:17:24+00:00", + "num_params_billion": null, + "language": "en_zh_ja_fr_ru_es" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 4.35, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 4.29, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 3.84, + "stderr": 0.0 + } + ], + "average_accuracy": 4.16, + "best_prompt": 4.35, + "prompt_id": "p1", + "CPS": 4.341735, + "is_dummy": false, + "std_accuracy": 0.27874719729532704 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 6.72, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 22.66, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 13.0, + "stderr": 0.0 + } + ], + "average_accuracy": 14.126666666666665, + "best_prompt": 22.66, + "prompt_id": "p2", + "CPS": 20.726346666666664, + "is_dummy": false, + "std_accuracy": 8.02950392822205 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + 
"prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_PL.json b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..b5e867f0f3e1ee3e826d798481bb1d32a6350762 --- /dev/null +++ b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 8.100043833333334, + "config": { + "model_name": "Henrychur/MMed-Llama-3-8B", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "Henrychur/MMed-Llama-3-8B", + "base_model": "LlamaForCausalLM", + "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943", + "submitted_time": "2024-05-22 09:17:24+00:00", + "num_params_billion": null, + "language": "en_zh_ja_fr_ru_es" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 3.7900000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 3.7800000000000002, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 3.7900000000000005, + "stderr": 0.0 + } + ], + "average_accuracy": 3.786666666666667, + "best_prompt": 3.7900000000000005, + "prompt_id": "p1", + "CPS": 3.7898736666666673, + "is_dummy": 
false, + "std_accuracy": 0.005773502691896391 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 6.02, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 12.93, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.779999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 8.909999999999998, + "best_prompt": 12.93, + "prompt_id": "p2", + "CPS": 12.410214, + "is_dummy": false, + "std_accuracy": 3.590919102402615 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_SK.json b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..0ce0634530fb6e442ffd9a9b74d168b53503e39b --- /dev/null +++ b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 3.3197085, + "config": { + "model_name": "Henrychur/MMed-Llama-3-8B", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "Henrychur/MMed-Llama-3-8B", + "base_model": "LlamaForCausalLM", + "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943", + "submitted_time": "2024-05-22 09:17:24+00:00", + "num_params_billion": null, + "language": "en_zh_ja_fr_ru_es" + }, + "tasks": { + "NER": { + 
"prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 3.8699999999999997, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 3.8, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 3.8699999999999997, + "stderr": 0.0 + } + ], + "average_accuracy": 3.8466666666666662, + "best_prompt": 3.8699999999999997, + "prompt_id": "p1", + "CPS": 3.8690969999999996, + "is_dummy": false, + "std_accuracy": 0.04041451884327371 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 1.21, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 2.8000000000000003, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 1.21, + "stderr": 0.0 + } + ], + "average_accuracy": 1.74, + "best_prompt": 2.8000000000000003, + "prompt_id": "p2", + "CPS": 2.77032, + "is_dummy": false, + "std_accuracy": 0.9179869280115052 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_SL.json b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..3a241f4dce9440551f68e18e8d3c78b6d08218b2 --- /dev/null +++ b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_0_SL.json @@ -0,0 +1,121 @@ +{ + 
"average_CPS": 11.184996000000002, + "config": { + "model_name": "Henrychur/MMed-Llama-3-8B", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "Henrychur/MMed-Llama-3-8B", + "base_model": "LlamaForCausalLM", + "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943", + "submitted_time": "2024-05-22 09:17:24+00:00", + "num_params_billion": null, + "language": "en_zh_ja_fr_ru_es" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 4.29, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 4.5600000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 4.29, + "stderr": 0.0 + } + ], + "average_accuracy": 4.38, + "best_prompt": 4.5600000000000005, + "prompt_id": "p2", + "CPS": 4.551792000000001, + "is_dummy": false, + "std_accuracy": 0.1558845726811992 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 9.67, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 19.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 9.67, + "stderr": 0.0 + } + ], + "average_accuracy": 12.780000000000001, + "best_prompt": 19.0, + "prompt_id": "p2", + "CPS": 17.8182, + "is_dummy": false, + "std_accuracy": 5.3866780115392086 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": 
null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_EN.json b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..9f1e3664b98c96628385befb95c9fa806fce64c4 --- /dev/null +++ b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 17.1183752, + "config": { + "model_name": "Henrychur/MMed-Llama-3-8B", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "Henrychur/MMed-Llama-3-8B", + "base_model": "LlamaForCausalLM", + "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943", + "submitted_time": "2024-05-22 09:17:24+00:00", + "num_params_billion": null, + "language": "en_zh_ja_fr_ru_es" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 21.89, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 22.43, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 19.939999999999998, + "stderr": 0.0 + } + ], + "average_accuracy": 21.419999999999998, + "best_prompt": 22.43, + "prompt_id": "p2", + "CPS": 22.203457, + "is_dummy": false, + "std_accuracy": 1.309847319346802 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 11.89, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 16.68, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 21.85, + "stderr": 0.0 + } + ], + "average_accuracy": 16.80666666666667, + "best_prompt": 21.85, + "prompt_id": "p3", + "CPS": 20.74803166666667, + "is_dummy": false, + "std_accuracy": 4.981208019480148 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 18.25, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 16.12, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 19.0, + "stderr": 0.0 + } + ], + "average_accuracy": 17.790000000000003, 
+ "best_prompt": 19.0, + "prompt_id": "p3", + "CPS": 18.7701, + "is_dummy": false, + "std_accuracy": 1.4940883508012497 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 24.15, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 14.16, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 6.68, + "stderr": 0.0 + } + ], + "average_accuracy": 14.996666666666668, + "best_prompt": 24.15, + "prompt_id": "p1", + "CPS": 21.93947, + "is_dummy": false, + "std_accuracy": 8.765000475375533 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 1.78, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.6799999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 1.94, + "stderr": 0.0 + } + ], + "average_accuracy": 1.4666666666666668, + "best_prompt": 1.94, + "prompt_id": "p3", + "CPS": 1.9308173333333332, + "is_dummy": false, + "std_accuracy": 0.6859543230662909 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_GR.json b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..318e4c45e7293126c7d154cba320ba1e37fd24e0 --- /dev/null +++ b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 13.395712833333334, + "config": { + "model_name": "Henrychur/MMed-Llama-3-8B", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + "model": "Henrychur/MMed-Llama-3-8B", + "base_model": "LlamaForCausalLM", + "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943", + "submitted_time": "2024-05-22 09:17:24+00:00", + "num_params_billion": null, + "language": "en_zh_ja_fr_ru_es" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 16.669999999999998, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 10.89, + "stderr": 0.0 + }, + { + 
"prompt": "p3", + "metric": "f1", + "value": 16.669999999999998, + "stderr": 0.0 + } + ], + "average_accuracy": 14.743333333333332, + "best_prompt": 16.669999999999998, + "prompt_id": "p1", + "CPS": 16.348824666666665, + "is_dummy": false, + "std_accuracy": 3.337084555916036 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 8.21, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 10.530000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 10.36, + "stderr": 0.0 + } + ], + "average_accuracy": 9.700000000000001, + "best_prompt": 10.530000000000001, + "prompt_id": "p2", + "CPS": 10.442601000000002, + "is_dummy": false, + "std_accuracy": 1.2931743888586718 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_IT.json b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..b961e82268fc6daaae9860c15c42c9d8cf3b1a25 --- /dev/null +++ b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 13.608121199999996, + "config": { + "model_name": "Henrychur/MMed-Llama-3-8B", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": 
"Henrychur/MMed-Llama-3-8B", + "base_model": "LlamaForCausalLM", + "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943", + "submitted_time": "2024-05-22 09:17:24+00:00", + "num_params_billion": null, + "language": "en_zh_ja_fr_ru_es" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 32.99, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 40.23, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 39.379999999999995, + "stderr": 0.0 + } + ], + "average_accuracy": 37.53333333333333, + "best_prompt": 40.23, + "prompt_id": "p2", + "CPS": 39.145131, + "is_dummy": false, + "std_accuracy": 3.9575286901465803 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 9.77, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 12.26, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 17.89, + "stderr": 0.0 + } + ], + "average_accuracy": 13.306666666666667, + "best_prompt": 17.89, + "prompt_id": "p3", + "CPS": 17.070041666666665, + "is_dummy": false, + "std_accuracy": 4.159955929253739 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 8.21, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 11.19, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 11.899999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 10.433333333333332, + "best_prompt": 11.899999999999999, + "prompt_id": "p3", + "CPS": 11.725466666666664, + "is_dummy": false, + "std_accuracy": 
1.9579155582744954 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.1, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.02, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.08, + "stderr": 0.0 + } + ], + "average_accuracy": 0.06666666666666667, + "best_prompt": 0.1, + "prompt_id": "p1", + "CPS": 0.09996666666666668, + "is_dummy": false, + "std_accuracy": 0.041633319989322654 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_PL.json b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f07b96eaaa37e2f66017b9ae4f8885302efa01 --- /dev/null +++ b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 25.15700466666667, + "config": { + "model_name": "Henrychur/MMed-Llama-3-8B", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "PL", + "model": "Henrychur/MMed-Llama-3-8B", + "base_model": "LlamaForCausalLM", + "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943", + "submitted_time": "2024-05-22 09:17:24+00:00", + "num_params_billion": null, + "language": "en_zh_ja_fr_ru_es" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 39.92, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 39.160000000000004, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 39.92, + "stderr": 0.0 + } + ], + "average_accuracy": 39.66666666666667, + "best_prompt": 39.92, + "prompt_id": "p1", + "CPS": 39.81886933333334, + "is_dummy": false, + "std_accuracy": 0.43878620458411444 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 9.98, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 10.549999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 9.56, + "stderr": 0.0 + } + ], + 
"average_accuracy": 10.030000000000001, + "best_prompt": 10.549999999999999, + "prompt_id": "p2", + "CPS": 10.49514, + "is_dummy": false, + "std_accuracy": 0.49689032995219296 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_SK.json b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..701059657ff8182b34ce2e870f3372bb13d56e32 --- /dev/null +++ b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 23.0736205, + "config": { + "model_name": "Henrychur/MMed-Llama-3-8B", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "Henrychur/MMed-Llama-3-8B", + "base_model": "LlamaForCausalLM", + "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943", + "submitted_time": "2024-05-22 09:17:24+00:00", + "num_params_billion": null, + "language": "en_zh_ja_fr_ru_es" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 34.44, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 36.32, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 34.44, + "stderr": 0.0 + } + ], + "average_accuracy": 35.06666666666666, + 
"best_prompt": 36.32, + "prompt_id": "p2", + "CPS": 35.864789333333334, + "is_dummy": false, + "std_accuracy": 1.085418506076498 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 7.340000000000001, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 10.45, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.75, + "stderr": 0.0 + } + ], + "average_accuracy": 8.846666666666666, + "best_prompt": 10.45, + "prompt_id": "p2", + "CPS": 10.282451666666665, + "is_dummy": false, + "std_accuracy": 1.557251852891282 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_SL.json b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..af0a9c3c5710908052425d2c7cf6292fa1065084 --- /dev/null +++ b/e3c_llm_results/Henrychur/MMed-Llama-3-8B_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 23.493655333333336, + "config": { + "model_name": "Henrychur/MMed-Llama-3-8B", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "Henrychur/MMed-Llama-3-8B", + "base_model": "LlamaForCausalLM", + "revision": "6c3057bb49ac499970eb2891daaef9b5c14f6943", + "submitted_time": "2024-05-22 
09:17:24+00:00", + "num_params_billion": null, + "language": "en_zh_ja_fr_ru_es" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 35.58, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 40.45, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 35.58, + "stderr": 0.0 + } + ], + "average_accuracy": 37.20333333333333, + "best_prompt": 40.45, + "prompt_id": "p2", + "CPS": 39.136723333333336, + "is_dummy": false, + "std_accuracy": 2.81169581095348 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 7.870000000000001, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 7.8100000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.19, + "stderr": 0.0 + } + ], + "average_accuracy": 7.623333333333334, + "best_prompt": 7.870000000000001, + "prompt_id": "p1", + "CPS": 7.850587333333335, + "is_dummy": false, + "std_accuracy": 0.3764748774265469 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/HiTZ/Medical-mT5-large_0_EN.json b/e3c_llm_results/HiTZ/Medical-mT5-large_0_EN.json new file mode 100644 index 
0000000000000000000000000000000000000000..4f1bfb1013bf5d413d06d6315c1d7fc213da2cdd --- /dev/null +++ b/e3c_llm_results/HiTZ/Medical-mT5-large_0_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 1.8120066666666665, + "config": { + "model_name": "HiTZ/Medical-mT5-large", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "HiTZ/Medical-mT5-large", + "base_model": "MT5ForConditionalGeneration", + "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47", + "submitted_time": "2023-10-31 15:15:15+00:00", + "num_params_billion": null, + "language": "en_es_fr_it" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 9.4, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 3.3099999999999996, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 4.64, + "stderr": 0.0 + } + ], + "average_accuracy": 5.783333333333334, + "best_prompt": 9.4, + "prompt_id": "p1", + "CPS": 9.060033333333333, + "is_dummy": false, + "std_accuracy": 3.2019421189854973 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 
0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/HiTZ/Medical-mT5-large_0_GR.json b/e3c_llm_results/HiTZ/Medical-mT5-large_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..d57e9ad3ed157cd7579b7c6862d4ad4077c8b5f9 --- /dev/null +++ b/e3c_llm_results/HiTZ/Medical-mT5-large_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 4.256631333333333, + "config": { + "model_name": "HiTZ/Medical-mT5-large", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "HiTZ/Medical-mT5-large", + "base_model": "MT5ForConditionalGeneration", + "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47", + "submitted_time": "2023-10-31 15:15:15+00:00", + "num_params_billion": null, + "language": "en_es_fr_it" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 8.59, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 5.91, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.59, + "stderr": 0.0 + } + ], + "average_accuracy": 7.696666666666666, + "best_prompt": 8.59, + "prompt_id": "p1", + "CPS": 8.513262666666666, + "is_dummy": false, + "std_accuracy": 1.547298721428197 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { 
+ "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/HiTZ/Medical-mT5-large_0_IT.json b/e3c_llm_results/HiTZ/Medical-mT5-large_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..36793b360e4d99bcd93ffff1cb624f77261c185f --- /dev/null +++ b/e3c_llm_results/HiTZ/Medical-mT5-large_0_IT.json @@ -0,0 +1,151 @@ +{ + "average_CPS": 1.820189333333333, + "config": { + "model_name": "HiTZ/Medical-mT5-large", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "HiTZ/Medical-mT5-large", + "base_model": "MT5ForConditionalGeneration", + "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47", + "submitted_time": "2023-10-31 15:15:15+00:00", + "num_params_billion": null, + "language": "en_es_fr_it" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 7.7, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 9.2, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.470000000000001, + "stderr": 0.0 + } + ], + "average_accuracy": 
8.123333333333333, + "best_prompt": 9.2, + "prompt_id": "p2", + "CPS": 9.100946666666665, + "is_dummy": false, + "std_accuracy": 0.9394856748952227 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "RE": { + "prompts": [ + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p2", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/HiTZ/Medical-mT5-large_0_PL.json b/e3c_llm_results/HiTZ/Medical-mT5-large_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..23ab1afe25cc67c6d5d3da4ad7e79858ef67b9cf --- /dev/null +++ b/e3c_llm_results/HiTZ/Medical-mT5-large_0_PL.json @@ -0,0 +1,121 @@ +{ + 
"average_CPS": 2.1520960000000002, + "config": { + "model_name": "HiTZ/Medical-mT5-large", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "HiTZ/Medical-mT5-large", + "base_model": "MT5ForConditionalGeneration", + "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47", + "submitted_time": "2023-10-31 15:15:15+00:00", + "num_params_billion": null, + "language": "en_es_fr_it" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 2.44, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 4.36, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 2.44, + "stderr": 0.0 + } + ], + "average_accuracy": 3.08, + "best_prompt": 4.36, + "prompt_id": "p2", + "CPS": 4.3041920000000005, + "is_dummy": false, + "std_accuracy": 1.1085125168440817 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff 
--git a/e3c_llm_results/HiTZ/Medical-mT5-large_0_SK.json b/e3c_llm_results/HiTZ/Medical-mT5-large_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..b8709249ee55ecec1e413d17c8935c718ea1900e --- /dev/null +++ b/e3c_llm_results/HiTZ/Medical-mT5-large_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 4.3259333333333325, + "config": { + "model_name": "HiTZ/Medical-mT5-large", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "HiTZ/Medical-mT5-large", + "base_model": "MT5ForConditionalGeneration", + "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47", + "submitted_time": "2023-10-31 15:15:15+00:00", + "num_params_billion": null, + "language": "en_es_fr_it" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 8.799999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 3.75, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.799999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 7.116666666666666, + "best_prompt": 8.799999999999999, + "prompt_id": "p1", + "CPS": 8.651866666666665, + "is_dummy": false, + "std_accuracy": 2.9156188594076093 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + 
"best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/HiTZ/Medical-mT5-large_0_SL.json b/e3c_llm_results/HiTZ/Medical-mT5-large_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..72e6cb4c2d5067d737a04008a797598d0a329f51 --- /dev/null +++ b/e3c_llm_results/HiTZ/Medical-mT5-large_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 3.859359, + "config": { + "model_name": "HiTZ/Medical-mT5-large", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "HiTZ/Medical-mT5-large", + "base_model": "MT5ForConditionalGeneration", + "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47", + "submitted_time": "2023-10-31 15:15:15+00:00", + "num_params_billion": null, + "language": "en_es_fr_it" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 7.7700000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 5.79, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.7700000000000005, + "stderr": 0.0 + } + ], + "average_accuracy": 7.11, + "best_prompt": 7.7700000000000005, + "prompt_id": "p1", + "CPS": 7.718718, + "is_dummy": false, + "std_accuracy": 1.1431535329954592 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + 
"value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/HiTZ/Medical-mT5-large_10_EN.json b/e3c_llm_results/HiTZ/Medical-mT5-large_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..cbc46eb7beebd4583e1e8d0b8014ea79446bd90d --- /dev/null +++ b/e3c_llm_results/HiTZ/Medical-mT5-large_10_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 2.900183933333333, + "config": { + "model_name": "HiTZ/Medical-mT5-large", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "HiTZ/Medical-mT5-large", + "base_model": "MT5ForConditionalGeneration", + "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47", + "submitted_time": "2023-10-31 15:15:15+00:00", + "num_params_billion": null, + "language": "en_es_fr_it" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 12.15, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 14.149999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 13.22, + "stderr": 0.0 + } + ], + "average_accuracy": 13.173333333333332, + "best_prompt": 14.149999999999999, + "prompt_id": "p2", + "CPS": 14.011801666666665, + "is_dummy": false, + "std_accuracy": 1.0008163334665015 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.27999999999999997, + 
"stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.16, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.49, + "stderr": 0.0 + } + ], + "average_accuracy": 0.31, + "best_prompt": 0.49, + "prompt_id": "p3", + "CPS": 0.489118, + "is_dummy": false, + "std_accuracy": 0.16703293088490065 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/HiTZ/Medical-mT5-large_10_GR.json b/e3c_llm_results/HiTZ/Medical-mT5-large_10_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..23f2906a587c4ccdd8963e2fcbb3cec004da6db8 --- /dev/null +++ b/e3c_llm_results/HiTZ/Medical-mT5-large_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 7.3897435, + "config": { + "model_name": "HiTZ/Medical-mT5-large", + "num_fewshot": "10", + "batch_size": 1, + "LANG": 
"GR", + "model": "HiTZ/Medical-mT5-large", + "base_model": "MT5ForConditionalGeneration", + "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47", + "submitted_time": "2023-10-31 15:15:15+00:00", + "num_params_billion": null, + "language": "en_es_fr_it" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 14.549999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 14.34, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 14.549999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 14.479999999999999, + "best_prompt": 14.549999999999999, + "prompt_id": "p1", + "CPS": 14.539814999999999, + "is_dummy": false, + "std_accuracy": 0.12124355652982088 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.24, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.06999999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.10333333333333333, + "best_prompt": 0.24, + "prompt_id": "p1", + "CPS": 0.239672, + "is_dummy": false, + "std_accuracy": 0.12342339054382412 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git 
a/e3c_llm_results/HiTZ/Medical-mT5-large_10_IT.json b/e3c_llm_results/HiTZ/Medical-mT5-large_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..bb39b345327beca90e2e94c31f66b7d1c084ac61 --- /dev/null +++ b/e3c_llm_results/HiTZ/Medical-mT5-large_10_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 3.6471789333333335, + "config": { + "model_name": "HiTZ/Medical-mT5-large", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "HiTZ/Medical-mT5-large", + "base_model": "MT5ForConditionalGeneration", + "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47", + "submitted_time": "2023-10-31 15:15:15+00:00", + "num_params_billion": null, + "language": "en_es_fr_it" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 16.16, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 17.740000000000002, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 16.900000000000002, + "stderr": 0.0 + } + ], + "average_accuracy": 16.933333333333337, + "best_prompt": 17.740000000000002, + "prompt_id": "p2", + "CPS": 17.596897333333335, + "is_dummy": false, + "std_accuracy": 0.7905272502155348 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.35000000000000003, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.64, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.45999999999999996, + "stderr": 0.0 + } + ], + "average_accuracy": 0.48333333333333334, + "best_prompt": 0.64, + "prompt_id": "p2", + "CPS": 0.6389973333333333, + "is_dummy": false, + "std_accuracy": 0.14640127503998498 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + 
"prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/HiTZ/Medical-mT5-large_10_PL.json b/e3c_llm_results/HiTZ/Medical-mT5-large_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..bcadc3afeb7e410f3f540b229a5d97ed1ceeddd1 --- /dev/null +++ b/e3c_llm_results/HiTZ/Medical-mT5-large_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 7.915078666666666, + "config": { + "model_name": "HiTZ/Medical-mT5-large", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "PL", + "model": "HiTZ/Medical-mT5-large", + "base_model": "MT5ForConditionalGeneration", + "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47", + "submitted_time": "2023-10-31 15:15:15+00:00", + "num_params_billion": null, + "language": "en_es_fr_it" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 15.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 15.479999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 15.0, + "stderr": 0.0 + } + ], + "average_accuracy": 15.159999999999998, + "best_prompt": 15.479999999999999, + 
"prompt_id": "p2", + "CPS": 15.430463999999999, + "is_dummy": false, + "std_accuracy": 0.2771281292110196 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.4, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.22999999999999998, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.33999999999999997, + "stderr": 0.0 + } + ], + "average_accuracy": 0.3233333333333333, + "best_prompt": 0.4, + "prompt_id": "p1", + "CPS": 0.39969333333333334, + "is_dummy": false, + "std_accuracy": 0.08621678104251711 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/HiTZ/Medical-mT5-large_10_SK.json b/e3c_llm_results/HiTZ/Medical-mT5-large_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..624fa817e41e538d20238d2ac58d4a7ee9d264fe --- /dev/null +++ b/e3c_llm_results/HiTZ/Medical-mT5-large_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 7.5838598333333325, + "config": { + "model_name": "HiTZ/Medical-mT5-large", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "HiTZ/Medical-mT5-large", + "base_model": "MT5ForConditionalGeneration", + "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47", + "submitted_time": "2023-10-31 15:15:15+00:00", 
+ "num_params_billion": null, + "language": "en_es_fr_it" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 14.85, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 13.600000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 14.85, + "stderr": 0.0 + } + ], + "average_accuracy": 14.433333333333335, + "best_prompt": 14.85, + "prompt_id": "p1", + "CPS": 14.788124999999999, + "is_dummy": false, + "std_accuracy": 0.7216878364870312 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.38, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.24, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.2, + "stderr": 0.0 + } + ], + "average_accuracy": 0.2733333333333334, + "best_prompt": 0.38, + "prompt_id": "p1", + "CPS": 0.3795946666666667, + "is_dummy": false, + "std_accuracy": 0.09451631252505216 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/HiTZ/Medical-mT5-large_10_SL.json b/e3c_llm_results/HiTZ/Medical-mT5-large_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..4ab112253b8cabcc3002db3f6700835cdeeb1fc0 --- /dev/null +++ 
b/e3c_llm_results/HiTZ/Medical-mT5-large_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 7.7788705, + "config": { + "model_name": "HiTZ/Medical-mT5-large", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "HiTZ/Medical-mT5-large", + "base_model": "MT5ForConditionalGeneration", + "revision": "e8ae7101f0ab1ed5b8add8846e44a2d39f6e2c47", + "submitted_time": "2023-10-31 15:15:15+00:00", + "num_params_billion": null, + "language": "en_es_fr_it" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 14.7, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 13.25, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 14.7, + "stderr": 0.0 + } + ], + "average_accuracy": 14.216666666666667, + "best_prompt": 14.7, + "prompt_id": "p1", + "CPS": 14.62895, + "is_dummy": false, + "std_accuracy": 0.8371578903249569 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.73, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.74, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.9299999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 0.7999999999999999, + "best_prompt": 0.9299999999999999, + "prompt_id": "p3", + "CPS": 0.9287909999999999, + "is_dummy": false, + "std_accuracy": 0.11269427669584642 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 
null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_EN.json b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..47893dbc04c87db06236301977b64f06d4b96188 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 14.4829312, + "config": { + "model_name": "Qwen/Qwen2.5-14B-Instruct-1M", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "Qwen/Qwen2.5-14B-Instruct-1M", + "base_model": "Qwen2ForCausalLM", + "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438", + "submitted_time": "2025-01-23 13:23:24+00:00", + "num_params_billion": 14.770033664, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 34.25, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 11.81, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 28.93, + "stderr": 0.0 + } + ], + "average_accuracy": 24.99666666666667, + "best_prompt": 34.25, + "prompt_id": "p1", + "CPS": 31.08073333333333, + "is_dummy": false, + "std_accuracy": 11.725686902409313 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 41.349999999999994, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 39.17, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 41.72, + "stderr": 0.0 + } + ], + "average_accuracy": 40.74666666666666, + "best_prompt": 41.72, + "prompt_id": "p3", + "CPS": 41.31392533333333, + "is_dummy": false, + "std_accuracy": 1.3779090439260953 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + 
"metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.02, + "stderr": 0.0 + } + ], + "average_accuracy": 0.006666666666666667, + "best_prompt": 0.02, + "prompt_id": "p3", + "CPS": 0.019997333333333336, + "is_dummy": false, + "std_accuracy": 0.011547005383792516 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_GR.json b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..5860dc4bf439f917359fd7140fe23fc615698fa1 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 27.333585333333332, + "config": { + "model_name": "Qwen/Qwen2.5-14B-Instruct-1M", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "Qwen/Qwen2.5-14B-Instruct-1M", + "base_model": "Qwen2ForCausalLM", + "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438", + "submitted_time": "2025-01-23 13:23:24+00:00", + "num_params_billion": 14.770033664, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 13.389999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 11.91, + "stderr": 
0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 13.389999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 12.896666666666667, + "best_prompt": 13.389999999999999, + "prompt_id": "p1", + "CPS": 13.323942666666666, + "is_dummy": false, + "std_accuracy": 0.8544783984006453 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 37.96, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 42.66, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 38.1, + "stderr": 0.0 + } + ], + "average_accuracy": 39.57333333333333, + "best_prompt": 42.66, + "prompt_id": "p2", + "CPS": 41.343227999999996, + "is_dummy": false, + "std_accuracy": 2.6740481172434647 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_IT.json b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..eeb003c099aa607d64e75c1b2a9138b41ca4667f --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 13.000253, + "config": { + "model_name": "Qwen/Qwen2.5-14B-Instruct-1M", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": 
"Qwen/Qwen2.5-14B-Instruct-1M", + "base_model": "Qwen2ForCausalLM", + "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438", + "submitted_time": "2025-01-23 13:23:24+00:00", + "num_params_billion": 14.770033664, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 24.67, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 17.09, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 22.34, + "stderr": 0.0 + } + ], + "average_accuracy": 21.36666666666667, + "best_prompt": 24.67, + "prompt_id": "p1", + "CPS": 23.855067666666667, + "is_dummy": false, + "std_accuracy": 3.8826065128124094 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 41.730000000000004, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 37.7, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 41.06, + "stderr": 0.0 + } + ], + "average_accuracy": 40.163333333333334, + "best_prompt": 41.730000000000004, + "prompt_id": "p1", + "CPS": 41.07623, + "is_dummy": false, + "std_accuracy": 2.1594520910020982 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.06999999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.02333333333333333, + "best_prompt": 0.06999999999999999, + "prompt_id": "p1", + "CPS": 0.06996733333333333, + "is_dummy": false, + "std_accuracy": 0.0404145188432738 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + 
"std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_PL.json b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..6eddabba6de6b14d519a9ed4660583d3bfc38d52 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 24.780516499999997, + "config": { + "model_name": "Qwen/Qwen2.5-14B-Instruct-1M", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "Qwen/Qwen2.5-14B-Instruct-1M", + "base_model": "Qwen2ForCausalLM", + "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438", + "submitted_time": "2025-01-23 13:23:24+00:00", + "num_params_billion": 14.770033664, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 6.97, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 3.64, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 6.97, + "stderr": 0.0 + } + ], + "average_accuracy": 5.859999999999999, + "best_prompt": 6.97, + "prompt_id": "p1", + "CPS": 6.892633, + "is_dummy": false, + "std_accuracy": 1.9225763964014535 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 38.03, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 44.64, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 38.0, + "stderr": 0.0 + } + ], + "average_accuracy": 40.223333333333336, + "best_prompt": 44.64, + "prompt_id": "p2", + "CPS": 
42.6684, + "is_dummy": false, + "std_accuracy": 3.8249749454517126 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_SK.json b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..7a3ad59220039bb908bd9284b34d7c0cce9aa1b1 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 27.026387333333332, + "config": { + "model_name": "Qwen/Qwen2.5-14B-Instruct-1M", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "Qwen/Qwen2.5-14B-Instruct-1M", + "base_model": "Qwen2ForCausalLM", + "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438", + "submitted_time": "2025-01-23 13:23:24+00:00", + "num_params_billion": 14.770033664, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 12.2, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 4.26, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 12.2, + "stderr": 0.0 + } + ], + "average_accuracy": 9.553333333333333, + "best_prompt": 12.2, + "prompt_id": "p1", + "CPS": 11.877106666666666, + "is_dummy": false, + "std_accuracy": 
4.584161137365628 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 40.27, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 42.94, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 40.27, + "stderr": 0.0 + } + ], + "average_accuracy": 41.160000000000004, + "best_prompt": 42.94, + "prompt_id": "p2", + "CPS": 42.175668, + "is_dummy": false, + "std_accuracy": 1.5415252187362976 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_SL.json b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..55f1eea8c1a13da17cf4d2e14ff0d7d6629d2664 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 37.45460366666667, + "config": { + "model_name": "Qwen/Qwen2.5-14B-Instruct-1M", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "Qwen/Qwen2.5-14B-Instruct-1M", + "base_model": "Qwen2ForCausalLM", + "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438", + "submitted_time": "2025-01-23 13:23:24+00:00", + "num_params_billion": 14.770033664, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + 
"prompt": "p1", + "metric": "f1", + "value": 39.1, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 23.75, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 39.1, + "stderr": 0.0 + } + ], + "average_accuracy": 33.983333333333334, + "best_prompt": 39.1, + "prompt_id": "p1", + "CPS": 37.099383333333336, + "is_dummy": false, + "std_accuracy": 8.862326632060757 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 37.75, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 37.830000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 37.75, + "stderr": 0.0 + } + ], + "average_accuracy": 37.77666666666667, + "best_prompt": 37.830000000000005, + "prompt_id": "p2", + "CPS": 37.809824000000006, + "is_dummy": false, + "std_accuracy": 0.04618802153517318 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_EN.json b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..30c36e51a607150b40384df22d1709e46e28567e --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 
43.13491793333334, + "config": { + "model_name": "Qwen/Qwen2.5-14B-Instruct-1M", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "Qwen/Qwen2.5-14B-Instruct-1M", + "base_model": "Qwen2ForCausalLM", + "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438", + "submitted_time": "2025-01-23 13:23:24+00:00", + "num_params_billion": 14.770033664, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 60.91, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 56.46, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 62.43, + "stderr": 0.0 + } + ], + "average_accuracy": 59.93333333333334, + "best_prompt": 62.43, + "prompt_id": "p3", + "CPS": 60.871331000000005, + "is_dummy": false, + "std_accuracy": 3.1025204807274562 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 63.32, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 60.25, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 61.33, + "stderr": 0.0 + } + ], + "average_accuracy": 61.633333333333326, + "best_prompt": 63.32, + "prompt_id": "p1", + "CPS": 62.25200266666666, + "is_dummy": false, + "std_accuracy": 1.5573160672558843 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 21.29, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 32.22, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 31.78, + "stderr": 0.0 + } + ], + "average_accuracy": 28.429999999999996, + "best_prompt": 32.22, + "prompt_id": "p2", + "CPS": 30.998862, + "is_dummy": false, + "std_accuracy": 6.1873338361526935 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 30.73, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 11.37, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.64, + "stderr": 0.0 + } + ], + 
"average_accuracy": 16.580000000000002, + "best_prompt": 30.73, + "prompt_id": "p1", + "CPS": 26.381705, + "is_dummy": false, + "std_accuracy": 12.395366069624568 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 12.44, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 44.29, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 14.37, + "stderr": 0.0 + } + ], + "average_accuracy": 23.7, + "best_prompt": 44.29, + "prompt_id": "p2", + "CPS": 35.170689, + "is_dummy": false, + "std_accuracy": 17.857555823796268 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_GR.json b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..d8e1a2407294e5d3f02619c1c78fccf58a3929e9 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 60.84101533333333, + "config": { + "model_name": "Qwen/Qwen2.5-14B-Instruct-1M", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + "model": "Qwen/Qwen2.5-14B-Instruct-1M", + "base_model": "Qwen2ForCausalLM", + "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438", + "submitted_time": "2025-01-23 13:23:24+00:00", + "num_params_billion": 14.770033664, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 61.19, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 58.47, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 61.19, + "stderr": 0.0 + } + ], + "average_accuracy": 60.28333333333333, + "best_prompt": 61.19, + "prompt_id": "p1", + "CPS": 60.635210666666666, + "is_dummy": false, + "std_accuracy": 1.5703927321957813 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 59.62, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 60.24, + "stderr": 0.0 
+ }, + { + "prompt": "p3", + "metric": "f1", + "value": 61.83, + "stderr": 0.0 + } + ], + "average_accuracy": 60.56333333333333, + "best_prompt": 61.83, + "prompt_id": "p3", + "CPS": 61.046820000000004, + "is_dummy": false, + "std_accuracy": 1.1399268982409938 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_IT.json b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..34c3abc839b897074e72cf3f86bf8f3bf88426ac --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 43.527679133333336, + "config": { + "model_name": "Qwen/Qwen2.5-14B-Instruct-1M", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "Qwen/Qwen2.5-14B-Instruct-1M", + "base_model": "Qwen2ForCausalLM", + "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438", + "submitted_time": "2025-01-23 13:23:24+00:00", + "num_params_billion": 14.770033664, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 67.19000000000001, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 63.27, + "stderr": 0.0 + }, + { + "prompt": "p3", + 
"metric": "f1", + "value": 66.61, + "stderr": 0.0 + } + ], + "average_accuracy": 65.69, + "best_prompt": 67.19000000000001, + "prompt_id": "p1", + "CPS": 66.18215000000001, + "is_dummy": false, + "std_accuracy": 2.1157504578754107 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 57.67, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 59.98, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 60.92999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 59.526666666666664, + "best_prompt": 60.92999999999999, + "prompt_id": "p3", + "CPS": 60.07494899999999, + "is_dummy": false, + "std_accuracy": 1.6766136505865978 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 11.110000000000001, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 15.989999999999998, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 19.6, + "stderr": 0.0 + } + ], + "average_accuracy": 15.566666666666668, + "best_prompt": 19.6, + "prompt_id": "p3", + "CPS": 18.80946666666667, + "is_dummy": false, + "std_accuracy": 4.260801958943097 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 44.07, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 13.28, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 17.53, + "stderr": 0.0 + } + ], + "average_accuracy": 24.959999999999997, + "best_prompt": 44.07, + "prompt_id": "p1", + "CPS": 35.648223, + "is_dummy": false, + "std_accuracy": 16.68561356378602 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 8.17, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 51.03, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 10.96, + "stderr": 0.0 + } + ], + "average_accuracy": 23.386666666666667, + "best_prompt": 51.03, + "prompt_id": "p2", + "CPS": 36.923607000000004, + 
"is_dummy": false, + "std_accuracy": 23.980438555900797 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_PL.json b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..452fdffcf83aa97c10140473ffc00f18a70b6776 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 61.2140455, + "config": { + "model_name": "Qwen/Qwen2.5-14B-Instruct-1M", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "PL", + "model": "Qwen/Qwen2.5-14B-Instruct-1M", + "base_model": "Qwen2ForCausalLM", + "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438", + "submitted_time": "2025-01-23 13:23:24+00:00", + "num_params_billion": 14.770033664, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 62.260000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 58.24, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 62.260000000000005, + "stderr": 0.0 + } + ], + "average_accuracy": 60.919999999999995, + "best_prompt": 62.260000000000005, + "prompt_id": "p1", + "CPS": 61.425716, + "is_dummy": false, + "std_accuracy": 2.3209480821422974 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 59.91, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 54.66, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 63.74999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 59.44, + "best_prompt": 63.74999999999999, + "prompt_id": "p3", + "CPS": 61.002375, + "is_dummy": false, + "std_accuracy": 4.563189673901358 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + 
}, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_SK.json b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..6d6780459a984f222fd866c1030aefc721a41c5e --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 61.83102316666667, + "config": { + "model_name": "Qwen/Qwen2.5-14B-Instruct-1M", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "Qwen/Qwen2.5-14B-Instruct-1M", + "base_model": "Qwen2ForCausalLM", + "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438", + "submitted_time": "2025-01-23 13:23:24+00:00", + "num_params_billion": 14.770033664, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 63.85999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 64.86, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 63.85999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 64.19333333333333, + "best_prompt": 64.86, + "prompt_id": "p2", + "CPS": 64.4276, + "is_dummy": false, + "std_accuracy": 0.5773502691896298 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 58.940000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 58.45, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 59.589999999999996, + 
"stderr": 0.0 + } + ], + "average_accuracy": 58.99333333333334, + "best_prompt": 59.589999999999996, + "prompt_id": "p3", + "CPS": 59.23444633333334, + "is_dummy": false, + "std_accuracy": 0.57186828320281 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_SL.json b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..b6c3a4a3f4b52ed11463c39827448d9f6ad1334b --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-14B-Instruct-1M_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 61.436353666666676, + "config": { + "model_name": "Qwen/Qwen2.5-14B-Instruct-1M", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "Qwen/Qwen2.5-14B-Instruct-1M", + "base_model": "Qwen2ForCausalLM", + "revision": "620fad32de7bdd2293b3d99b39eba2fe63e97438", + "submitted_time": "2025-01-23 13:23:24+00:00", + "num_params_billion": 14.770033664, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 64.67, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 61.78, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 64.67, + "stderr": 0.0 + } + ], + 
"average_accuracy": 63.70666666666667, + "best_prompt": 64.67, + "prompt_id": "p1", + "CPS": 64.04701233333334, + "is_dummy": false, + "std_accuracy": 1.6685422779580188 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 59.489999999999995, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 57.82000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 57.809999999999995, + "stderr": 0.0 + } + ], + "average_accuracy": 58.373333333333335, + "best_prompt": 59.489999999999995, + "prompt_id": "p1", + "CPS": 58.825695, + "is_dummy": false, + "std_accuracy": 0.967074626558533 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_EN.json b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..71eaffd7b145ad29dc56910676fe78d6d716255e --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_EN.json @@ -0,0 +1,169 @@ +{ + "average_CPS": 16.729891099999996, + "config": { + "model_name": "Qwen/Qwen2.5-32B-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "Qwen/Qwen2.5-32B-Instruct", + "base_model": "Qwen2ForCausalLM", + "revision": 
"5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd", + "submitted_time": "2024-09-17 04:17:55+00:00", + "num_params_billion": 32.763876352, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 38.04, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 30.680000000000003, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 29.64, + "stderr": 0.0 + } + ], + "average_accuracy": 32.78666666666667, + "best_prompt": 38.04, + "prompt_id": "p1", + "CPS": 36.041632, + "is_dummy": false, + "std_accuracy": 4.5791411130618505 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 47.339999999999996, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 46.489999999999995, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 45.910000000000004, + "stderr": 0.0 + } + ], + "average_accuracy": 46.57999999999999, + "best_prompt": 47.339999999999996, + "prompt_id": "p1", + "CPS": 46.98021599999999, + "is_dummy": false, + "std_accuracy": 0.7192357054540571 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.05, + "stderr": 0.0 + }, + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.5700000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.15500000000000003, + "best_prompt": 0.5700000000000001, + "prompt_id": "p2", + "CPS": 0.5676345, + "is_dummy": false, + "std_accuracy": 0.27766886753829645 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.06, + "stderr": 0.0 + } + ], + 
"average_accuracy": 0.015, + "best_prompt": 0.06, + "prompt_id": "p3", + "CPS": 0.059973, + "is_dummy": false, + "std_accuracy": 0.03 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_GR.json b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..2a6c06c69e77cfaee82a2299b5651dee82e91bf4 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 46.818379166666666, + "config": { + "model_name": "Qwen/Qwen2.5-32B-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "Qwen/Qwen2.5-32B-Instruct", + "base_model": "Qwen2ForCausalLM", + "revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd", + "submitted_time": "2024-09-17 04:17:55+00:00", + "num_params_billion": 32.763876352, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 59.760000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 15.68, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 59.760000000000005, + "stderr": 0.0 + } + ], + "average_accuracy": 45.06666666666666, + "best_prompt": 59.760000000000005, + "prompt_id": "p1", + "CPS": 50.979264, + "is_dummy": false, + "std_accuracy": 25.449599865878707 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 43.93, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 40.83, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + 
"value": 38.34, + "stderr": 0.0 + } + ], + "average_accuracy": 41.03333333333333, + "best_prompt": 43.93, + "prompt_id": "p1", + "CPS": 42.65749433333333, + "is_dummy": false, + "std_accuracy": 2.800541614283445 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_IT.json b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..6991f293641c30fff627bd6bd487d26d2215fdbc --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_IT.json @@ -0,0 +1,163 @@ +{ + "average_CPS": 15.684724516666668, + "config": { + "model_name": "Qwen/Qwen2.5-32B-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "Qwen/Qwen2.5-32B-Instruct", + "base_model": "Qwen2ForCausalLM", + "revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd", + "submitted_time": "2024-09-17 04:17:55+00:00", + "num_params_billion": 32.763876352, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 37.580000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 16.470000000000002, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 27.96, + "stderr": 0.0 + } + 
], + "average_accuracy": 27.336666666666673, + "best_prompt": 37.580000000000005, + "prompt_id": "p1", + "CPS": 33.730555333333335, + "is_dummy": false, + "std_accuracy": 10.568795264046578 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 45.050000000000004, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 41.589999999999996, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 44.47, + "stderr": 0.0 + } + ], + "average_accuracy": 43.70333333333334, + "best_prompt": 45.050000000000004, + "prompt_id": "p1", + "CPS": 44.44332666666667, + "is_dummy": false, + "std_accuracy": 1.8530335488957954 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.16999999999999998, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.042499999999999996, + "best_prompt": 0.16999999999999998, + "prompt_id": "p2", + "CPS": 0.16978324999999997, + "is_dummy": false, + "std_accuracy": 0.08499999999999999 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.08, + "stderr": 0.0 + } + ], + "average_accuracy": 0.02666666666666667, + "best_prompt": 0.08, + "prompt_id": "p3", + "CPS": 0.07995733333333332, 
+ "is_dummy": false, + "std_accuracy": 0.046188021535170064 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_PL.json b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..12c6b090f2660ecdb2863ff322e5d5d532625e08 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 34.071664166666665, + "config": { + "model_name": "Qwen/Qwen2.5-32B-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "Qwen/Qwen2.5-32B-Instruct", + "base_model": "Qwen2ForCausalLM", + "revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd", + "submitted_time": "2024-09-17 04:17:55+00:00", + "num_params_billion": 32.763876352, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 24.86, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 23.11, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 24.86, + "stderr": 0.0 + } + ], + "average_accuracy": 24.276666666666667, + "best_prompt": 24.86, + "prompt_id": "p1", + "CPS": 24.714983333333333, + "is_dummy": false, + "std_accuracy": 1.0103629710818451 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 38.65, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 45.69, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 37.88, + "stderr": 0.0 + } + ], + "average_accuracy": 40.74, + "best_prompt": 45.69, + "prompt_id": "p2", + "CPS": 43.428345, + "is_dummy": false, + "std_accuracy": 4.304079460233045 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + 
"metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_SK.json b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..b0b8f4bc1d6a597d917e6128ab16aceaf8d51777 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 38.03573266666667, + "config": { + "model_name": "Qwen/Qwen2.5-32B-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "Qwen/Qwen2.5-32B-Instruct", + "base_model": "Qwen2ForCausalLM", + "revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd", + "submitted_time": "2024-09-17 04:17:55+00:00", + "num_params_billion": 32.763876352, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 35.78, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 29.68, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 35.78, + "stderr": 0.0 + } + ], + "average_accuracy": 33.74666666666667, + "best_prompt": 35.78, + "prompt_id": "p1", + "CPS": 35.05247333333333, + "is_dummy": false, + "std_accuracy": 3.521836642056718 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 39.71, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 41.52, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 39.71, + "stderr": 0.0 + } + ], + "average_accuracy": 40.31333333333333, + "best_prompt": 41.52, + "prompt_id": "p2", + 
"CPS": 41.018992000000004, + "is_dummy": false, + "std_accuracy": 1.045003987233224 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_SL.json b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..2245c08641bb15fd75ae546afbc67b47f4e51aa5 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 37.09308866666666, + "config": { + "model_name": "Qwen/Qwen2.5-32B-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "Qwen/Qwen2.5-32B-Instruct", + "base_model": "Qwen2ForCausalLM", + "revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd", + "submitted_time": "2024-09-17 04:17:55+00:00", + "num_params_billion": 32.763876352, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 33.44, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 28.63, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 33.44, + "stderr": 0.0 + } + ], + "average_accuracy": 31.836666666666662, + "best_prompt": 33.44, + "prompt_id": "p1", + "CPS": 32.90384533333333, + "is_dummy": false, + 
"std_accuracy": 2.777054794802099 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 39.79, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 41.86, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 39.79, + "stderr": 0.0 + } + ], + "average_accuracy": 40.48, + "best_prompt": 41.86, + "prompt_id": "p2", + "CPS": 41.282332, + "is_dummy": false, + "std_accuracy": 1.1951150572225255 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_EN.json b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..a82c4a806ac20ea198f1bf7053ebbb2ace711189 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_EN.json @@ -0,0 +1,163 @@ +{ + "average_CPS": 31.59830568333333, + "config": { + "model_name": "Qwen/Qwen2.5-32B-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "Qwen/Qwen2.5-32B-Instruct", + "base_model": "Qwen2ForCausalLM", + "revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd", + "submitted_time": "2024-09-17 04:17:55+00:00", + "num_params_billion": 32.763876352, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": 
"p1", + "metric": "f1", + "value": 59.699999999999996, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 56.02, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 61.129999999999995, + "stderr": 0.0 + } + ], + "average_accuracy": 58.949999999999996, + "best_prompt": 61.129999999999995, + "prompt_id": "p3", + "CPS": 59.79736599999999, + "is_dummy": false, + "std_accuracy": 2.63626629914354 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 64.82, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 64.69, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 63.7, + "stderr": 0.0 + } + ], + "average_accuracy": 64.40333333333332, + "best_prompt": 64.82, + "prompt_id": "p1", + "CPS": 64.54991666666665, + "is_dummy": false, + "std_accuracy": 0.6125629219380878 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 15.010000000000002, + "stderr": 0.0 + }, + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 13.83, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.39, + "stderr": 0.0 + } + ], + "average_accuracy": 9.307500000000001, + "best_prompt": 15.010000000000002, + "prompt_id": "p1", + "CPS": 14.154054750000002, + "is_dummy": false, + "std_accuracy": 6.842097022599626 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 3.11, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 5.46, + "stderr": 0.0 + } + ], + "average_accuracy": 2.856666666666667, + "best_prompt": 5.46, + "prompt_id": "p3", + "CPS": 5.317858, + "is_dummy": false, + "std_accuracy": 2.738801441020019 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 2.4699999999999998, + "stderr": 0.0 + }, + { + "prompt": "p2", + 
"metric": "f1", + "value": 15.57, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 1.7399999999999998, + "stderr": 0.0 + } + ], + "average_accuracy": 6.593333333333333, + "best_prompt": 15.57, + "prompt_id": "p2", + "CPS": 14.172333, + "is_dummy": false, + "std_accuracy": 7.782585260267525 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_GR.json b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..459d29262498f3a5b8e0c9623d609d58691f53fc --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 60.26106666666667, + "config": { + "model_name": "Qwen/Qwen2.5-32B-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + "model": "Qwen/Qwen2.5-32B-Instruct", + "base_model": "Qwen2ForCausalLM", + "revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd", + "submitted_time": "2024-09-17 04:17:55+00:00", + "num_params_billion": 32.763876352, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 61.96, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 61.309999999999995, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 61.96, + "stderr": 0.0 + } + ], + "average_accuracy": 61.74333333333333, + "best_prompt": 61.96, + "prompt_id": "p1", + "CPS": 61.82575333333334, + "is_dummy": false, + "std_accuracy": 0.37527767497326003 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 59.13, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 58.96, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 57.099999999999994, + "stderr": 0.0 + } + ], + "average_accuracy": 58.39666666666667, + "best_prompt": 59.13, + "prompt_id": "p1", + "CPS": 58.696380000000005, + "is_dummy": false, + "std_accuracy": 1.1261586625930393 
+ }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_IT.json b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..974d3edcf50e656124c16ace38699f0496c2f5f2 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_IT.json @@ -0,0 +1,163 @@ +{ + "average_CPS": 32.75043995, + "config": { + "model_name": "Qwen/Qwen2.5-32B-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "Qwen/Qwen2.5-32B-Instruct", + "base_model": "Qwen2ForCausalLM", + "revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd", + "submitted_time": "2024-09-17 04:17:55+00:00", + "num_params_billion": 32.763876352, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 69.34, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 71.52, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 69.3, + "stderr": 0.0 + } + ], + "average_accuracy": 70.05333333333334, + "best_prompt": 71.52, + "prompt_id": "p2", + "CPS": 70.47104, + "is_dummy": false, + "std_accuracy": 1.2703280416228429 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + 
"value": 58.01, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 55.95, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 55.26, + "stderr": 0.0 + } + ], + "average_accuracy": 56.406666666666666, + "best_prompt": 58.01, + "prompt_id": "p1", + "CPS": 57.079906333333334, + "is_dummy": false, + "std_accuracy": 1.4307457263026617 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 3.9800000000000004, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 5.99, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 10.25, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 10.25, + "stderr": 0.0 + } + ], + "average_accuracy": 7.6175, + "best_prompt": 10.25, + "prompt_id": "p3", + "CPS": 9.980168749999999, + "is_dummy": false, + "std_accuracy": 3.1485591942982425 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 23.22, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 1.09, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.28, + "stderr": 0.0 + } + ], + "average_accuracy": 10.863333333333332, + "best_prompt": 23.22, + "prompt_id": "p1", + "CPS": 20.350782, + "is_dummy": false, + "std_accuracy": 11.288907534980225 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 1.8599999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 6.02, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 2.7199999999999998, + "stderr": 0.0 + } + ], + "average_accuracy": 3.5333333333333328, + "best_prompt": 6.02, + "prompt_id": "p2", + "CPS": 5.8703026666666664, + "is_dummy": false, + "std_accuracy": 2.196026715077331 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_PL.json b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_PL.json new file mode 100644 index 
0000000000000000000000000000000000000000..cc4032de29f21671f35367bd4411de6a4d219ff9 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 59.57754466666667, + "config": { + "model_name": "Qwen/Qwen2.5-32B-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "PL", + "model": "Qwen/Qwen2.5-32B-Instruct", + "base_model": "Qwen2ForCausalLM", + "revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd", + "submitted_time": "2024-09-17 04:17:55+00:00", + "num_params_billion": 32.763876352, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 60.08, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 60.040000000000006, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 60.08, + "stderr": 0.0 + } + ], + "average_accuracy": 60.06666666666666, + "best_prompt": 60.08, + "prompt_id": "p1", + "CPS": 60.071989333333335, + "is_dummy": false, + "std_accuracy": 0.023094010767580435 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 58.58, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 58.68, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 59.38, + "stderr": 0.0 + } + ], + "average_accuracy": 58.879999999999995, + "best_prompt": 59.38, + "prompt_id": "p3", + "CPS": 59.083099999999995, + "is_dummy": false, + "std_accuracy": 0.43588989435406944 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + 
"prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_SK.json b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..765514bf924769352adb6adf057443658d28ecae --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 62.042391166666675, + "config": { + "model_name": "Qwen/Qwen2.5-32B-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "Qwen/Qwen2.5-32B-Instruct", + "base_model": "Qwen2ForCausalLM", + "revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd", + "submitted_time": "2024-09-17 04:17:55+00:00", + "num_params_billion": 32.763876352, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 67.43, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 66.73, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 67.43, + "stderr": 0.0 + } + ], + "average_accuracy": 67.19666666666667, + "best_prompt": 67.43, + "prompt_id": "p1", + "CPS": 67.27266333333334, + "is_dummy": false, + "std_accuracy": 0.40414518843273967 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 57.330000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 55.86, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 56.089999999999996, + "stderr": 0.0 + } + ], + "average_accuracy": 56.42666666666667, + "best_prompt": 57.330000000000005, + "prompt_id": "p1", + "CPS": 56.812119, + "is_dummy": false, + "std_accuracy": 0.7907169742286678 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 
0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_SL.json b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..c80c0447f76d126e739e337f179be7f2071ffcb6 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen2.5-32B-Instruct_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 61.44185133333333, + "config": { + "model_name": "Qwen/Qwen2.5-32B-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "Qwen/Qwen2.5-32B-Instruct", + "base_model": "Qwen2ForCausalLM", + "revision": "5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd", + "submitted_time": "2024-09-17 04:17:55+00:00", + "num_params_billion": 32.763876352, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 62.529999999999994, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 66.14999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 62.529999999999994, + "stderr": 0.0 + } + ], + "average_accuracy": 63.73666666666666, + "best_prompt": 66.14999999999999, + "prompt_id": "p2", + "CPS": 64.55358, + "is_dummy": false, + "std_accuracy": 2.090007974466444 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 59.919999999999995, 
+ "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 58.489999999999995, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 53.39, + "stderr": 0.0 + } + ], + "average_accuracy": 57.26666666666667, + "best_prompt": 59.919999999999995, + "prompt_id": "p1", + "CPS": 58.33012266666667, + "is_dummy": false, + "std_accuracy": 3.4325840606361426 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_EN.json b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..b5caa94b2f5b818eba6053c28c2b5d9462460a2b --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_EN.json @@ -0,0 +1,151 @@ +{ + "average_CPS": 17.763235033333334, + "config": { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "base_model": "Qwen3MoeForCausalLM", + "revision": "61082d4deaa4785f64943b443cbc2b5de7524fad", + "submitted_time": "2025-07-28 07:31:27+00:00", + "num_params_billion": 30.532122624, + "language": "" + }, + "tasks": { + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 43.94, + 
"stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 40.31, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 39.97, + "stderr": 0.0 + } + ], + "average_accuracy": 41.406666666666666, + "best_prompt": 43.94, + "prompt_id": "p1", + "CPS": 42.82685333333333, + "is_dummy": false, + "std_accuracy": 2.200507517218091 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.03, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.01, + "best_prompt": 0.03, + "prompt_id": "p1", + "CPS": 0.029994, + "is_dummy": false, + "std_accuracy": 0.017320508075688773 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.01, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0033333333333333335, + "best_prompt": 0.01, + "prompt_id": "p3", + "CPS": 0.009999333333333334, + "is_dummy": false, + "std_accuracy": 0.005773502691896258 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "NER": { + "prompts": [ + { + "prompt": "p2", + "metric": "f1", + "value": 41.620000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 47.29, + "stderr": 0.0 + } + ], + "average_accuracy": 44.455, + "best_prompt": 47.29, + "prompt_id": "p3", + "CPS": 45.9493285, + "is_dummy": false, + "std_accuracy": 4.009295449327721 + } + } +} \ No newline at 
end of file diff --git a/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_GR.json b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..bde9845bf6583ae27fa3d3224befac1f265ea118 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 41.21096783333334, + "config": { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "base_model": "Qwen3MoeForCausalLM", + "revision": "61082d4deaa4785f64943b443cbc2b5de7524fad", + "submitted_time": "2025-07-28 07:31:27+00:00", + "num_params_billion": 30.532122624, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 42.91, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 45.21, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 42.91, + "stderr": 0.0 + } + ], + "average_accuracy": 43.67666666666667, + "best_prompt": 45.21, + "prompt_id": "p2", + "CPS": 44.516780000000004, + "is_dummy": false, + "std_accuracy": 1.3279056191361418 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 37.330000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 37.99, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 37.980000000000004, + "stderr": 0.0 + } + ], + "average_accuracy": 37.76666666666667, + "best_prompt": 37.99, + "prompt_id": "p2", + "CPS": 37.90515566666667, + "is_dummy": false, + "std_accuracy": 0.37819747927945296 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": 
"f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_IT.json b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..352e97dde73f14c661bba1eb59d2bfecb444ec27 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 16.892739600000002, + "config": { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "base_model": "Qwen3MoeForCausalLM", + "revision": "61082d4deaa4785f64943b443cbc2b5de7524fad", + "submitted_time": "2025-07-28 07:31:27+00:00", + "num_params_billion": 30.532122624, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 8.85, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 53.16, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 45.14, + "stderr": 0.0 + } + ], + "average_accuracy": 35.71666666666667, + "best_prompt": 53.16, + "prompt_id": "p2", + "CPS": 43.887124, + "is_dummy": false, + "std_accuracy": 23.610240010074723 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 37.84, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 41.23, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 39.72, + "stderr": 0.0 + } + ], + "average_accuracy": 39.596666666666664, + "best_prompt": 41.23, + 
"prompt_id": "p2", + "CPS": 40.556576666666665, + "is_dummy": false, + "std_accuracy": 1.6983619559249796 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.02, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.006666666666666667, + "best_prompt": 0.02, + "prompt_id": "p1", + "CPS": 0.019997333333333336, + "is_dummy": false, + "std_accuracy": 0.011547005383792516 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_PL.json b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..7670404e0b2aaeb6b0bcca1dc84d540c1baef9ac --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 42.480305333333334, + "config": { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "base_model": "Qwen3MoeForCausalLM", + "revision": 
"61082d4deaa4785f64943b443cbc2b5de7524fad", + "submitted_time": "2025-07-28 07:31:27+00:00", + "num_params_billion": 30.532122624, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 43.32, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 40.43, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 43.32, + "stderr": 0.0 + } + ], + "average_accuracy": 42.35666666666666, + "best_prompt": 43.32, + "prompt_id": "p1", + "CPS": 42.902684, + "is_dummy": false, + "std_accuracy": 1.6685422779580188 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 41.52, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 42.199999999999996, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 41.870000000000005, + "stderr": 0.0 + } + ], + "average_accuracy": 41.86333333333334, + "best_prompt": 42.199999999999996, + "prompt_id": "p2", + "CPS": 42.05792666666667, + "is_dummy": false, + "std_accuracy": 0.3400490160746401 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_SK.json b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_SK.json new file mode 100644 
index 0000000000000000000000000000000000000000..acf4da7fc3aca030be17f5ec8041c74b3c00b91c --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 36.625888, + "config": { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "base_model": "Qwen3MoeForCausalLM", + "revision": "61082d4deaa4785f64943b443cbc2b5de7524fad", + "submitted_time": "2025-07-28 07:31:27+00:00", + "num_params_billion": 30.532122624, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 32.31, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 33.98, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 32.31, + "stderr": 0.0 + } + ], + "average_accuracy": 32.86666666666667, + "best_prompt": 33.98, + "prompt_id": "p2", + "CPS": 33.60168933333333, + "is_dummy": false, + "std_accuracy": 0.9641749495466719 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 39.800000000000004, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 38.67, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 39.800000000000004, + "stderr": 0.0 + } + ], + "average_accuracy": 39.42333333333334, + "best_prompt": 39.800000000000004, + "prompt_id": "p1", + "CPS": 39.65008666666667, + "is_dummy": false, + "std_accuracy": 0.6524058041842786 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + 
"is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_SL.json b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..3707b703de4ba52063672af91385b56679d01035 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 43.20190633333334, + "config": { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "base_model": "Qwen3MoeForCausalLM", + "revision": "61082d4deaa4785f64943b443cbc2b5de7524fad", + "submitted_time": "2025-07-28 07:31:27+00:00", + "num_params_billion": 30.532122624, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 44.86, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 45.31, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 44.86, + "stderr": 0.0 + } + ], + "average_accuracy": 45.01, + "best_prompt": 45.31, + "prompt_id": "p2", + "CPS": 45.17407, + "is_dummy": false, + "std_accuracy": 0.25980762113533323 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 41.15, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 41.260000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 41.15, + "stderr": 0.0 + } + ], + "average_accuracy": 41.18666666666667, + "best_prompt": 41.260000000000005, + "prompt_id": "p2", + "CPS": 41.229742666666674, + "is_dummy": false, + "std_accuracy": 0.0635085296108626 + }, + "HIS": { + "prompts": [ + { + 
"prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_EN.json b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..50158d879b0a2af51f1436b3ec0e09a244d303b0 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 46.50834546666666, + "config": { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "base_model": "Qwen3MoeForCausalLM", + "revision": "61082d4deaa4785f64943b443cbc2b5de7524fad", + "submitted_time": "2025-07-28 07:31:27+00:00", + "num_params_billion": 30.532122624, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 59.86, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 55.93, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 61.42999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 59.07333333333332, + "best_prompt": 61.42999999999999, + "prompt_id": "p3", + "CPS": 59.982299666666655, + "is_dummy": false, + "std_accuracy": 2.833131365350593 + }, + "RE": { + "prompts": [ + { + 
"prompt": "p1", + "metric": "f1", + "value": 51.5, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 52.61, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 53.64, + "stderr": 0.0 + } + ], + "average_accuracy": 52.583333333333336, + "best_prompt": 53.64, + "prompt_id": "p3", + "CPS": 53.073204000000004, + "is_dummy": false, + "std_accuracy": 1.0702491921666346 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 32.06, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 35.809999999999995, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 32.67, + "stderr": 0.0 + } + ], + "average_accuracy": 33.513333333333335, + "best_prompt": 35.809999999999995, + "prompt_id": "p2", + "CPS": 34.98756366666667, + "is_dummy": false, + "std_accuracy": 2.0122209951526986 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 38.1, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 36.51, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 21.25, + "stderr": 0.0 + } + ], + "average_accuracy": 31.953333333333333, + "best_prompt": 38.1, + "prompt_id": "p1", + "CPS": 35.75812, + "is_dummy": false, + "std_accuracy": 9.303388271663897 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 41.54, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 29.24, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 56.89999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 42.559999999999995, + "best_prompt": 56.89999999999999, + "prompt_id": "p3", + "CPS": 48.740539999999996, + "is_dummy": false, + "std_accuracy": 13.858181698909851 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_GR.json b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_GR.json new file mode 100644 index 
0000000000000000000000000000000000000000..571d9a3122c649dd3e13d19400ba065411e15ea4 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 56.23321899999999, + "config": { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + "model": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "base_model": "Qwen3MoeForCausalLM", + "revision": "61082d4deaa4785f64943b443cbc2b5de7524fad", + "submitted_time": "2025-07-28 07:31:27+00:00", + "num_params_billion": 30.532122624, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 61.63999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 56.69, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 61.63999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 59.98999999999999, + "best_prompt": 61.63999999999999, + "prompt_id": "p1", + "CPS": 60.62293999999999, + "is_dummy": false, + "std_accuracy": 2.8578838324886453 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 50.14999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 52.09, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 52.23, + "stderr": 0.0 + } + ], + "average_accuracy": 51.49, + "best_prompt": 52.23, + "prompt_id": "p3", + "CPS": 51.843498, + "is_dummy": false, + "std_accuracy": 1.1625833303466944 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": 
true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_IT.json b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..547a8bb626bd9855aa58ad261a49bb5db89ed810 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 48.58629273333334, + "config": { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "base_model": "Qwen3MoeForCausalLM", + "revision": "61082d4deaa4785f64943b443cbc2b5de7524fad", + "submitted_time": "2025-07-28 07:31:27+00:00", + "num_params_billion": 30.532122624, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 67.93, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 64.47, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 67.78, + "stderr": 0.0 + } + ], + "average_accuracy": 66.72666666666667, + "best_prompt": 67.93, + "prompt_id": "p1", + "CPS": 67.11257566666667, + "is_dummy": false, + "std_accuracy": 1.955769243375441 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 60.41, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 58.379999999999995, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 60.650000000000006, + "stderr": 0.0 + } + ], + "average_accuracy": 59.81333333333333, + "best_prompt": 60.650000000000006, + "prompt_id": "p3", + "CPS": 60.142561666666666, + "is_dummy": false, + "std_accuracy": 1.2470899459675484 + }, + "RML": { + 
"prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 16.2, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 25.66, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 17.34, + "stderr": 0.0 + } + ], + "average_accuracy": 19.733333333333334, + "best_prompt": 25.66, + "prompt_id": "p2", + "CPS": 24.139217333333335, + "is_dummy": false, + "std_accuracy": 5.164197259335989 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 45.12, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 54.64, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 44.07, + "stderr": 0.0 + } + ], + "average_accuracy": 47.94333333333333, + "best_prompt": 54.64, + "prompt_id": "p2", + "CPS": 50.980941333333334, + "is_dummy": false, + "std_accuracy": 5.82319786142746 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 21.47, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 50.71, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 19.88, + "stderr": 0.0 + } + ], + "average_accuracy": 30.686666666666667, + "best_prompt": 50.71, + "prompt_id": "p2", + "CPS": 40.55616766666667, + "is_dummy": false, + "std_accuracy": 17.358929498483867 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_PL.json b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..a24f42db114e235ce3ac2c5028fa17b0c3d44a5b --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 56.796842, + "config": { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "PL", + "model": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "base_model": "Qwen3MoeForCausalLM", + "revision": "61082d4deaa4785f64943b443cbc2b5de7524fad", + 
"submitted_time": "2025-07-28 07:31:27+00:00", + "num_params_billion": 30.532122624, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 62.760000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 58.03, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 62.760000000000005, + "stderr": 0.0 + } + ], + "average_accuracy": 61.18333333333334, + "best_prompt": 62.760000000000005, + "prompt_id": "p1", + "CPS": 61.770484, + "is_dummy": false, + "std_accuracy": 2.730866773266932 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.03, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 52.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 51.949999999999996, + "stderr": 0.0 + } + ], + "average_accuracy": 51.66, + "best_prompt": 52.0, + "prompt_id": "p2", + "CPS": 51.8232, + "is_dummy": false, + "std_accuracy": 0.5461684721768532 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_SK.json b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_SK.json new file mode 100644 index 
0000000000000000000000000000000000000000..941a107c7b210a7f5e342887ce0ed52078283535 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 56.40435283333333, + "config": { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "base_model": "Qwen3MoeForCausalLM", + "revision": "61082d4deaa4785f64943b443cbc2b5de7524fad", + "submitted_time": "2025-07-28 07:31:27+00:00", + "num_params_billion": 30.532122624, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 60.85, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 59.19, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 60.85, + "stderr": 0.0 + } + ], + "average_accuracy": 60.29666666666666, + "best_prompt": 60.85, + "prompt_id": "p1", + "CPS": 60.51329666666666, + "is_dummy": false, + "std_accuracy": 0.9584014468547809 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 49.2, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 50.24999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 53.73, + "stderr": 0.0 + } + ], + "average_accuracy": 51.059999999999995, + "best_prompt": 53.73, + "prompt_id": "p3", + "CPS": 52.29540899999999, + "is_dummy": false, + "std_accuracy": 2.371138966825857 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + 
"RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_SL.json b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..2ed21a928d51a4f5feeaca60ee48601fcee48595 --- /dev/null +++ b/e3c_llm_results/Qwen/Qwen3-30B-A3B-Instruct-2507_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 59.600933166666664, + "config": { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "base_model": "Qwen3MoeForCausalLM", + "revision": "61082d4deaa4785f64943b443cbc2b5de7524fad", + "submitted_time": "2025-07-28 07:31:27+00:00", + "num_params_billion": 30.532122624, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 66.14999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 59.440000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 66.14999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 63.913333333333334, + "best_prompt": 66.14999999999999, + "prompt_id": "p1", + "CPS": 64.670445, + "is_dummy": false, + "std_accuracy": 3.874020306262381 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 50.62, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 55.76, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 54.290000000000006, + "stderr": 0.0 + } + ], + "average_accuracy": 53.55666666666667, + "best_prompt": 55.76, + "prompt_id": "p2", + "CPS": 54.531421333333334, + "is_dummy": false, + "std_accuracy": 2.647306807556189 + }, + "HIS": { 
+ "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/deepseek-ai/.ipynb_checkpoints/DeepSeek-R1-Distill-Qwen-32B_0_EN-checkpoint.json b/e3c_llm_results/deepseek-ai/.ipynb_checkpoints/DeepSeek-R1-Distill-Qwen-32B_0_EN-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..dd04460110df910bce59a8f73b3c97d40801845c --- /dev/null +++ b/e3c_llm_results/deepseek-ai/.ipynb_checkpoints/DeepSeek-R1-Distill-Qwen-32B_0_EN-checkpoint.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 15.443440999999998, + "config": { + "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "base_model": "Qwen2ForCausalLM", + "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "submitted_time": "2025-01-20 09:19:00+00:00", + "num_params_billion": 32.763876352, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 19.63, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 34.589999999999996, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 32.08, + "stderr": 0.0 + } + ], + "average_accuracy": 28.766666666666666, + "best_prompt": 
34.589999999999996, + "prompt_id": "p2", + "CPS": 32.575708999999996, + "is_dummy": false, + "std_accuracy": 8.01149382658024 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 44.87, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 44.92, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 43.11, + "stderr": 0.0 + } + ], + "average_accuracy": 44.29999999999999, + "best_prompt": 44.92, + "prompt_id": "p2", + "CPS": 44.641496, + "is_dummy": false, + "std_accuracy": 1.030873416089483 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_EN.json b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_EN.json new file mode 100644 index 
0000000000000000000000000000000000000000..dd04460110df910bce59a8f73b3c97d40801845c --- /dev/null +++ b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 15.443440999999998, + "config": { + "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "base_model": "Qwen2ForCausalLM", + "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "submitted_time": "2025-01-20 09:19:00+00:00", + "num_params_billion": 32.763876352, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 19.63, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 34.589999999999996, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 32.08, + "stderr": 0.0 + } + ], + "average_accuracy": 28.766666666666666, + "best_prompt": 34.589999999999996, + "prompt_id": "p2", + "CPS": 32.575708999999996, + "is_dummy": false, + "std_accuracy": 8.01149382658024 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 44.87, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 44.92, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 43.11, + "stderr": 0.0 + } + ], + "average_accuracy": 44.29999999999999, + "best_prompt": 44.92, + "prompt_id": "p2", + "CPS": 44.641496, + "is_dummy": false, + "std_accuracy": 1.030873416089483 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + 
"value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_GR.json b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..25cf2fb05e184ba378fb7302f6d7a140af01235c --- /dev/null +++ b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 36.46137383333333, + "config": { + "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "base_model": "Qwen2ForCausalLM", + "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "submitted_time": "2025-01-20 09:19:00+00:00", + "num_params_billion": 32.763876352, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 34.55, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 33.54, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 34.55, + "stderr": 0.0 + } + ], + "average_accuracy": 34.21333333333333, + "best_prompt": 34.55, + "prompt_id": "p1", + "CPS": 34.433681666666665, + "is_dummy": false, + "std_accuracy": 0.5831237718815209 + 
}, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 24.060000000000002, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 39.47, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 41.02, + "stderr": 0.0 + } + ], + "average_accuracy": 34.85, + "best_prompt": 41.02, + "prompt_id": "p3", + "CPS": 38.489066, + "is_dummy": false, + "std_accuracy": 9.376497213778714 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_IT.json b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..de4b5cc7c3dcd289f74d0459f3eddfc020e1000c --- /dev/null +++ b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 15.963344799999998, + "config": { + "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "base_model": "Qwen2ForCausalLM", + "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "submitted_time": "2025-01-20 09:19:00+00:00", + "num_params_billion": 32.763876352, + "language": "" + }, + 
"tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 26.779999999999998, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 35.68, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 34.14, + "stderr": 0.0 + } + ], + "average_accuracy": 32.199999999999996, + "best_prompt": 35.68, + "prompt_id": "p2", + "CPS": 34.438336, + "is_dummy": false, + "std_accuracy": 4.756595421096902 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 45.190000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 46.11, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 42.27, + "stderr": 0.0 + } + ], + "average_accuracy": 44.52333333333334, + "best_prompt": 46.11, + "prompt_id": "p2", + "CPS": 45.378388, + "is_dummy": false, + "std_accuracy": 2.0049272638510676 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 
0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_PL.json b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..c3380d143512bf188b6b051e6eb3143b9b708e99 --- /dev/null +++ b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 39.199796666666664, + "config": { + "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "base_model": "Qwen2ForCausalLM", + "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "submitted_time": "2025-01-20 09:19:00+00:00", + "num_params_billion": 32.763876352, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 32.04, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 37.28, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 32.04, + "stderr": 0.0 + } + ], + "average_accuracy": 33.78666666666666, + "best_prompt": 37.28, + "prompt_id": "p2", + "CPS": 35.97768533333333, + "is_dummy": false, + "std_accuracy": 3.02531541055364 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 39.83, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 43.269999999999996, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 40.83, + "stderr": 0.0 + } + ], + "average_accuracy": 41.309999999999995, + "best_prompt": 43.269999999999996, + "prompt_id": "p2", + "CPS": 42.421907999999995, + "is_dummy": false, + "std_accuracy": 1.769519708847572 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, 
+ "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_SK.json b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..91392100d164b54aff38fbd3b4932bdf3ba9bc29 --- /dev/null +++ b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 33.894328, + "config": { + "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "base_model": "Qwen2ForCausalLM", + "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "submitted_time": "2025-01-20 09:19:00+00:00", + "num_params_billion": 32.763876352, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 28.29, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 19.05, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 28.29, + "stderr": 0.0 + } + ], + "average_accuracy": 25.209999999999997, + "best_prompt": 28.29, + "prompt_id": "p1", + "CPS": 27.418667999999997, + "is_dummy": false, + "std_accuracy": 5.334716487312141 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 38.93, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 
40.910000000000004, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 38.93, + "stderr": 0.0 + } + ], + "average_accuracy": 39.59, + "best_prompt": 40.910000000000004, + "prompt_id": "p2", + "CPS": 40.369988000000006, + "is_dummy": false, + "std_accuracy": 1.1431535329954614 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_SL.json b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..9e6bb1d1f11b95fdf5901fc0810dd74e60c7d0e5 --- /dev/null +++ b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 34.339884000000005, + "config": { + "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "base_model": "Qwen2ForCausalLM", + "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "submitted_time": "2025-01-20 09:19:00+00:00", + "num_params_billion": 32.763876352, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 28.1, + "stderr": 0.0 + }, + { + "prompt": "p2", + 
"metric": "f1", + "value": 21.92, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 28.1, + "stderr": 0.0 + } + ], + "average_accuracy": 26.040000000000003, + "best_prompt": 28.1, + "prompt_id": "p1", + "CPS": 27.521140000000003, + "is_dummy": false, + "std_accuracy": 3.568024663591887 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 41.160000000000004, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 41.15, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 41.160000000000004, + "stderr": 0.0 + } + ], + "average_accuracy": 41.156666666666666, + "best_prompt": 41.160000000000004, + "prompt_id": "p1", + "CPS": 41.15862800000001, + "is_dummy": false, + "std_accuracy": 0.005773502691899211 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_EN.json b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..936b95d0c5b71f86f5615bc1dd8ed1828dbc0594 --- /dev/null +++ b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 43.183429333333336, + "config": { + "model_name": 
"deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "base_model": "Qwen2ForCausalLM", + "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "submitted_time": "2025-01-20 09:19:00+00:00", + "num_params_billion": 32.763876352, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 60.24, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 59.29, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 59.35, + "stderr": 0.0 + } + ], + "average_accuracy": 59.626666666666665, + "best_prompt": 60.24, + "prompt_id": "p1", + "CPS": 59.870528, + "is_dummy": false, + "std_accuracy": 0.5320087718575085 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.910000000000004, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 51.99, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 52.73, + "stderr": 0.0 + } + ], + "average_accuracy": 52.21, + "best_prompt": 52.73, + "prompt_id": "p3", + "CPS": 52.455804, + "is_dummy": false, + "std_accuracy": 0.45210618221828913 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 11.690000000000001, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 15.03, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 26.33, + "stderr": 0.0 + } + ], + "average_accuracy": 17.683333333333334, + "best_prompt": 26.33, + "prompt_id": "p3", + "CPS": 24.053332666666666, + "is_dummy": false, + "std_accuracy": 7.672192211704117 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 31.169999999999998, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 24.16, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 14.829999999999998, + "stderr": 0.0 + } + ], + 
"average_accuracy": 23.386666666666667, + "best_prompt": 31.169999999999998, + "prompt_id": "p1", + "CPS": 28.743935, + "is_dummy": false, + "std_accuracy": 8.197404060636106 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 39.22, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 51.910000000000004, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 53.71, + "stderr": 0.0 + } + ], + "average_accuracy": 48.28, + "best_prompt": 53.71, + "prompt_id": "p3", + "CPS": 50.793547, + "is_dummy": false, + "std_accuracy": 7.897638887667632 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_GR.json b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..e9a1b10c2d587985c1d8fe4e6d8bd1de98a23aa8 --- /dev/null +++ b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 52.0035325, + "config": { + "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "base_model": "Qwen2ForCausalLM", + "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "submitted_time": "2025-01-20 09:19:00+00:00", + "num_params_billion": 32.763876352, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 59.28, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 57.96, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 59.28, + "stderr": 0.0 + } + ], + "average_accuracy": 58.84, + "best_prompt": 59.28, + "prompt_id": "p1", + "CPS": 59.019168, + "is_dummy": false, + "std_accuracy": 0.7621023553303061 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 44.67, + "stderr": 0.0 + }, + { + "prompt": 
"p2", + "metric": "f1", + "value": 42.1, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 45.69, + "stderr": 0.0 + } + ], + "average_accuracy": 44.153333333333336, + "best_prompt": 45.69, + "prompt_id": "p3", + "CPS": 44.987897, + "is_dummy": false, + "std_accuracy": 1.8499279265239843 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_IT.json b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..4a1146fa77865ec03196283d24cc77823d10eae3 --- /dev/null +++ b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 39.245102200000005, + "config": { + "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "base_model": "Qwen2ForCausalLM", + "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "submitted_time": "2025-01-20 09:19:00+00:00", + "num_params_billion": 32.763876352, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 69.82000000000001, + "stderr": 0.0 + 
}, + { + "prompt": "p2", + "metric": "f1", + "value": 66.79, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 69.3, + "stderr": 0.0 + } + ], + "average_accuracy": 68.63666666666667, + "best_prompt": 69.82000000000001, + "prompt_id": "p1", + "CPS": 68.99379666666667, + "is_dummy": false, + "std_accuracy": 1.620257181231834 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 55.46, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 55.26, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 55.17999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 55.29999999999999, + "best_prompt": 55.46, + "prompt_id": "p1", + "CPS": 55.37126399999999, + "is_dummy": false, + "std_accuracy": 0.1442220510185634 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 3.08, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 1.7399999999999998, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 12.280000000000001, + "stderr": 0.0 + } + ], + "average_accuracy": 5.7, + "best_prompt": 12.280000000000001, + "prompt_id": "p3", + "CPS": 11.471976000000002, + "is_dummy": false, + "std_accuracy": 5.7376998875856176 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 27.950000000000003, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 29.2, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 31.81, + "stderr": 0.0 + } + ], + "average_accuracy": 29.653333333333336, + "best_prompt": 31.81, + "prompt_id": "p3", + "CPS": 31.123964333333333, + "is_dummy": false, + "std_accuracy": 1.9695261697508175 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 26.3, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 29.67, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 28.939999999999998, + "stderr": 
0.0 + } + ], + "average_accuracy": 28.30333333333333, + "best_prompt": 29.67, + "prompt_id": "p2", + "CPS": 29.26451, + "is_dummy": false, + "std_accuracy": 1.7729166177046605 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_PL.json b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..6cfece54a11e960b9587ebb01ba6abd137f95bff --- /dev/null +++ b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 56.36566883333333, + "config": { + "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "PL", + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "base_model": "Qwen2ForCausalLM", + "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "submitted_time": "2025-01-20 09:19:00+00:00", + "num_params_billion": 32.763876352, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 62.13999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 61.4, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 62.13999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 61.893333333333324, + "best_prompt": 62.13999999999999, + "prompt_id": "p1", + "CPS": 61.98672133333332, + "is_dummy": false, + "std_accuracy": 0.4272391992003201 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 48.63, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 51.29, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 50.760000000000005, + "stderr": 0.0 + } + ], + "average_accuracy": 50.22666666666667, + "best_prompt": 51.29, + "prompt_id": "p2", + "CPS": 50.74461633333333, + "is_dummy": false, + "std_accuracy": 1.4079180847383597 + }, + "HIS": { + "prompts": [ + { + 
"prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_SK.json b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..d870eae3dfe93c6e3a37518ac5516b4b8939f743 --- /dev/null +++ b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 55.21981100000001, + "config": { + "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "base_model": "Qwen2ForCausalLM", + "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "submitted_time": "2025-01-20 09:19:00+00:00", + "num_params_billion": 32.763876352, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 63.470000000000006, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 62.11, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 63.470000000000006, + "stderr": 0.0 + } + ], + "average_accuracy": 63.01666666666667, + "best_prompt": 63.470000000000006, + "prompt_id": "p1", + "CPS": 63.182269333333345, + "is_dummy": false, + "std_accuracy": 
0.7851963660978948 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 47.99, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 44.51, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 46.89, + "stderr": 0.0 + } + ], + "average_accuracy": 46.46333333333333, + "best_prompt": 47.99, + "prompt_id": "p1", + "CPS": 47.25735266666666, + "is_dummy": false, + "std_accuracy": 1.7788010943704022 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_SL.json b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..c5ae7fd6a6dea4eff0ea12a15fd6ef4b7731915e --- /dev/null +++ b/e3c_llm_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 55.28181983333334, + "config": { + "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "base_model": "Qwen2ForCausalLM", + "revision": "711ad2ea6aa40cfca18895e8aca02ab92df1a746", + "submitted_time": "2025-01-20 09:19:00+00:00", + "num_params_billion": 
32.763876352, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 60.150000000000006, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 60.49, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 60.150000000000006, + "stderr": 0.0 + } + ], + "average_accuracy": 60.26333333333334, + "best_prompt": 60.49, + "prompt_id": "p2", + "CPS": 60.35288933333334, + "is_dummy": false, + "std_accuracy": 0.19629909152447061 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.370000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 46.739999999999995, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 49.230000000000004, + "stderr": 0.0 + } + ], + "average_accuracy": 49.11333333333334, + "best_prompt": 51.370000000000005, + "prompt_id": "p1", + "CPS": 50.21075033333334, + "is_dummy": false, + "std_accuracy": 2.31720377466751 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/epfl-llm/meditron-7b_0_EN.json b/e3c_llm_results/epfl-llm/meditron-7b_0_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..018a81f3289f9a28a0c0a980f65758d256cba978 --- 
/dev/null +++ b/e3c_llm_results/epfl-llm/meditron-7b_0_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 2.6316848, + "config": { + "model_name": "epfl-llm/meditron-7b", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "epfl-llm/meditron-7b", + "base_model": "LlamaForCausalLM", + "revision": "d7d0a5ed929384a6b059ac74198cf1d71f44ba76", + "submitted_time": "2023-11-08 16:03:23+00:00", + "num_params_billion": 6.73855488, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 5.779999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 4.1000000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.48, + "stderr": 0.0 + } + ], + "average_accuracy": 6.12, + "best_prompt": 8.48, + "prompt_id": "p3", + "CPS": 8.279872000000001, + "is_dummy": false, + "std_accuracy": 2.209705862779026 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 4.42, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 4.97, + "stderr": 0.0 + } + ], + "average_accuracy": 3.1300000000000003, + "best_prompt": 4.97, + "prompt_id": "p3", + "CPS": 4.878552, + "is_dummy": false, + "std_accuracy": 2.7245733610971095 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 
0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/epfl-llm/meditron-7b_0_GR.json b/e3c_llm_results/epfl-llm/meditron-7b_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..194e574bef7a459af27ebd8d155bcbe19e26b253 --- /dev/null +++ b/e3c_llm_results/epfl-llm/meditron-7b_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 19.223575999999998, + "config": { + "model_name": "epfl-llm/meditron-7b", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "epfl-llm/meditron-7b", + "base_model": "LlamaForCausalLM", + "revision": "d7d0a5ed929384a6b059ac74198cf1d71f44ba76", + "submitted_time": "2023-11-08 16:03:23+00:00", + "num_params_billion": 6.73855488, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 24.169999999999998, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 24.43, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 24.169999999999998, + "stderr": 0.0 + } + ], + "average_accuracy": 24.256666666666664, + "best_prompt": 24.43, + "prompt_id": "p2", + "CPS": 24.387654666666666, + "is_dummy": false, + "std_accuracy": 0.1501110699893036 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 15.559999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 1.6099999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", 
+ "value": 0.58, + "stderr": 0.0 + } + ], + "average_accuracy": 5.916666666666665, + "best_prompt": 15.559999999999999, + "prompt_id": "p1", + "CPS": 14.059497333333331, + "is_dummy": false, + "std_accuracy": 8.367235704420745 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/epfl-llm/meditron-7b_0_IT.json b/e3c_llm_results/epfl-llm/meditron-7b_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..38c4e448e165f577c82fc85cbdebae2679fd0970 --- /dev/null +++ b/e3c_llm_results/epfl-llm/meditron-7b_0_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 5.052730066666666, + "config": { + "model_name": "epfl-llm/meditron-7b", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "epfl-llm/meditron-7b", + "base_model": "LlamaForCausalLM", + "revision": "d7d0a5ed929384a6b059ac74198cf1d71f44ba76", + "submitted_time": "2023-11-08 16:03:23+00:00", + "num_params_billion": 6.73855488, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 7.7299999999999995, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 6.12, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 5.3100000000000005, + "stderr": 0.0 + } + ], + 
"average_accuracy": 6.386666666666667, + "best_prompt": 7.7299999999999995, + "prompt_id": "p1", + "CPS": 7.626160333333333, + "is_dummy": false, + "std_accuracy": 1.2318414400130124 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.2, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 19.29, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 12.68, + "stderr": 0.0 + } + ], + "average_accuracy": 10.723333333333334, + "best_prompt": 19.29, + "prompt_id": "p2", + "CPS": 17.63749, + "is_dummy": false, + "std_accuracy": 9.694247435120136 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/epfl-llm/meditron-7b_0_PL.json b/e3c_llm_results/epfl-llm/meditron-7b_0_PL.json new file mode 100644 index 
0000000000000000000000000000000000000000..d8fa46e41e6c78724b244eba4ac7fe27fd493609 --- /dev/null +++ b/e3c_llm_results/epfl-llm/meditron-7b_0_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 6.367811666666667, + "config": { + "model_name": "epfl-llm/meditron-7b", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "epfl-llm/meditron-7b", + "base_model": "LlamaForCausalLM", + "revision": "d7d0a5ed929384a6b059ac74198cf1d71f44ba76", + "submitted_time": "2023-11-08 16:03:23+00:00", + "num_params_billion": 6.73855488, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 11.4, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 12.030000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 11.4, + "stderr": 0.0 + } + ], + "average_accuracy": 11.61, + "best_prompt": 12.030000000000001, + "prompt_id": "p2", + "CPS": 11.979474000000002, + "is_dummy": false, + "std_accuracy": 0.3637306695894647 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.76, + "stderr": 0.0 + } + ], + "average_accuracy": 0.25333333333333335, + "best_prompt": 0.76, + "prompt_id": "p3", + "CPS": 0.7561493333333333, + "is_dummy": false, + "std_accuracy": 0.4387862045841156 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + 
"metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/epfl-llm/meditron-7b_0_SK.json b/e3c_llm_results/epfl-llm/meditron-7b_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..2be265b8f56d0cf4853f7945c234a0b3edf38821 --- /dev/null +++ b/e3c_llm_results/epfl-llm/meditron-7b_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 4.508018000000001, + "config": { + "model_name": "epfl-llm/meditron-7b", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "epfl-llm/meditron-7b", + "base_model": "LlamaForCausalLM", + "revision": "d7d0a5ed929384a6b059ac74198cf1d71f44ba76", + "submitted_time": "2023-11-08 16:03:23+00:00", + "num_params_billion": 6.73855488, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 8.74, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 5.86, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.74, + "stderr": 0.0 + } + ], + "average_accuracy": 7.780000000000001, + "best_prompt": 8.74, + "prompt_id": "p1", + "CPS": 8.656096000000002, + "is_dummy": false, + "std_accuracy": 1.6627687752661222 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.36, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.31, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.36, + "stderr": 0.0 + } + ], + "average_accuracy": 0.34333333333333327, + "best_prompt": 0.36, + "prompt_id": "p1", + "CPS": 0.35994, + "is_dummy": false, + "std_accuracy": 0.02886751345948128 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": 
null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/epfl-llm/meditron-7b_0_SL.json b/e3c_llm_results/epfl-llm/meditron-7b_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..fc240fc571811ec62943a25b58191a669fa86a0b --- /dev/null +++ b/e3c_llm_results/epfl-llm/meditron-7b_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 8.782022166666668, + "config": { + "model_name": "epfl-llm/meditron-7b", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "epfl-llm/meditron-7b", + "base_model": "LlamaForCausalLM", + "revision": "d7d0a5ed929384a6b059ac74198cf1d71f44ba76", + "submitted_time": "2023-11-08 16:03:23+00:00", + "num_params_billion": 6.73855488, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 11.97, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 4.6, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 11.97, + "stderr": 0.0 + } + ], + "average_accuracy": 9.513333333333334, + "best_prompt": 11.97, + "prompt_id": "p1", + "CPS": 11.675937000000001, + "is_dummy": false, + "std_accuracy": 4.2550714839275425 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 5.9799999999999995, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 1.37, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 5.9799999999999995, + "stderr": 0.0 + } + 
], + "average_accuracy": 4.4433333333333325, + "best_prompt": 5.9799999999999995, + "prompt_id": "p1", + "CPS": 5.888107333333333, + "is_dummy": false, + "std_accuracy": 2.6615847409641744 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/epfl-llm/meditron-7b_10_EN.json b/e3c_llm_results/epfl-llm/meditron-7b_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..ae1d76af70eafafedd20c46cd853652733afa6ba --- /dev/null +++ b/e3c_llm_results/epfl-llm/meditron-7b_10_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 4.328597533333334, + "config": { + "model_name": "epfl-llm/meditron-7b", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "epfl-llm/meditron-7b", + "base_model": "LlamaForCausalLM", + "revision": "d7d0a5ed929384a6b059ac74198cf1d71f44ba76", + "submitted_time": "2023-11-08 16:03:23+00:00", + "num_params_billion": 6.73855488, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 8.03, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 14.790000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 14.540000000000001, + "stderr": 0.0 + } + ], + "average_accuracy": 12.453333333333333, + 
"best_prompt": 14.790000000000001, + "prompt_id": "p2", + "CPS": 14.444407000000002, + "is_dummy": false, + "std_accuracy": 3.8327579278286468 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 7.22, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 6.92, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 6.63, + "stderr": 0.0 + } + ], + "average_accuracy": 6.923333333333333, + "best_prompt": 7.22, + "prompt_id": "p1", + "CPS": 7.1985806666666665, + "is_dummy": false, + "std_accuracy": 0.29501412395567317 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/epfl-llm/meditron-7b_10_GR.json b/e3c_llm_results/epfl-llm/meditron-7b_10_GR.json new file mode 100644 index 
0000000000000000000000000000000000000000..f6eb5f9a3ff99a0bfa6d9e32342db44792911c81 --- /dev/null +++ b/e3c_llm_results/epfl-llm/meditron-7b_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 0.0, + "config": { + "model_name": "epfl-llm/meditron-7b", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + "model": "epfl-llm/meditron-7b", + "base_model": "LlamaForCausalLM", + "revision": "d7d0a5ed929384a6b059ac74198cf1d71f44ba76", + "submitted_time": "2023-11-08 16:03:23+00:00", + "num_params_billion": 6.73855488, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + 
"best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/epfl-llm/meditron-7b_10_IT.json b/e3c_llm_results/epfl-llm/meditron-7b_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..6d58714176f8b36fa7af44fb3e9a44ef54edc8e4 --- /dev/null +++ b/e3c_llm_results/epfl-llm/meditron-7b_10_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 8.8522688, + "config": { + "model_name": "epfl-llm/meditron-7b", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "epfl-llm/meditron-7b", + "base_model": "LlamaForCausalLM", + "revision": "d7d0a5ed929384a6b059ac74198cf1d71f44ba76", + "submitted_time": "2023-11-08 16:03:23+00:00", + "num_params_billion": 6.73855488, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 29.909999999999997, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 35.63, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 33.11, + "stderr": 0.0 + } + ], + "average_accuracy": 32.88333333333333, + "best_prompt": 35.63, + "prompt_id": "p2", + "CPS": 34.651362666666664, + "is_dummy": false, + "std_accuracy": 2.8667286814997595 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 8.32, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 8.870000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 9.68, + "stderr": 0.0 + } + ], + "average_accuracy": 8.956666666666667, + "best_prompt": 9.68, + "prompt_id": "p3", + "CPS": 9.609981333333334, + "is_dummy": false, + "std_accuracy": 0.6841296173484472 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + 
"average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/epfl-llm/meditron-7b_10_PL.json b/e3c_llm_results/epfl-llm/meditron-7b_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..8336b4911a828762b7c245545f66e08f250a2dd8 --- /dev/null +++ b/e3c_llm_results/epfl-llm/meditron-7b_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 19.029036333333334, + "config": { + "model_name": "epfl-llm/meditron-7b", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "PL", + "model": "epfl-llm/meditron-7b", + "base_model": "LlamaForCausalLM", + "revision": "d7d0a5ed929384a6b059ac74198cf1d71f44ba76", + "submitted_time": "2023-11-08 16:03:23+00:00", + "num_params_billion": 6.73855488, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 31.840000000000003, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 32.97, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 31.840000000000003, + "stderr": 0.0 + } + ], + "average_accuracy": 32.21666666666667, + 
"best_prompt": 32.97, + "prompt_id": "p2", + "CPS": 32.721626, + "is_dummy": false, + "std_accuracy": 0.6524058041842745 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 5.33, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 4.61, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 5.35, + "stderr": 0.0 + } + ], + "average_accuracy": 5.096666666666667, + "best_prompt": 5.35, + "prompt_id": "p3", + "CPS": 5.336446666666666, + "is_dummy": false, + "std_accuracy": 0.4215843134336631 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/epfl-llm/meditron-7b_10_SK.json b/e3c_llm_results/epfl-llm/meditron-7b_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..1886c27e669de562ad32dc3485a3f09609b48b52 --- /dev/null +++ b/e3c_llm_results/epfl-llm/meditron-7b_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 17.218929, + "config": { + "model_name": "epfl-llm/meditron-7b", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "epfl-llm/meditron-7b", + "base_model": "LlamaForCausalLM", + "revision": "d7d0a5ed929384a6b059ac74198cf1d71f44ba76", + "submitted_time": "2023-11-08 16:03:23+00:00", + "num_params_billion": 6.73855488, + 
"language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 30.04, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 29.7, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 30.04, + "stderr": 0.0 + } + ], + "average_accuracy": 29.926666666666666, + "best_prompt": 30.04, + "prompt_id": "p1", + "CPS": 30.005954666666668, + "is_dummy": false, + "std_accuracy": 0.19629909152447267 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 4.45, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 3.93, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 3.75, + "stderr": 0.0 + } + ], + "average_accuracy": 4.043333333333334, + "best_prompt": 4.45, + "prompt_id": "p1", + "CPS": 4.4319033333333335, + "is_dummy": false, + "std_accuracy": 0.36350149013908234 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/epfl-llm/meditron-7b_10_SL.json b/e3c_llm_results/epfl-llm/meditron-7b_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..eccf555e2e2f848343b45cbd8f7740a5095c91ec --- /dev/null +++ b/e3c_llm_results/epfl-llm/meditron-7b_10_SL.json @@ -0,0 +1,121 @@ +{ + 
"average_CPS": 18.122609833333332, + "config": { + "model_name": "epfl-llm/meditron-7b", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "epfl-llm/meditron-7b", + "base_model": "LlamaForCausalLM", + "revision": "d7d0a5ed929384a6b059ac74198cf1d71f44ba76", + "submitted_time": "2023-11-08 16:03:23+00:00", + "num_params_billion": 6.73855488, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 31.19, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 29.160000000000004, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 31.19, + "stderr": 0.0 + } + ], + "average_accuracy": 30.513333333333335, + "best_prompt": 31.19, + "prompt_id": "p1", + "CPS": 30.978947666666667, + "is_dummy": false, + "std_accuracy": 1.172021046454939 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 4.77, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 5.01, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 5.28, + "stderr": 0.0 + } + ], + "average_accuracy": 5.02, + "best_prompt": 5.28, + "prompt_id": "p3", + "CPS": 5.266272, + "is_dummy": false, + "std_accuracy": 0.2551470164434618 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + 
} +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-2-9b-it_0_EN.json b/e3c_llm_results/google/gemma-2-9b-it_0_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..ca744fc0dbc71cbb66a0e945c34fcdb4a13f2c60 --- /dev/null +++ b/e3c_llm_results/google/gemma-2-9b-it_0_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 20.2609912, + "config": { + "model_name": "google/gemma-2-9b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "google/gemma-2-9b-it", + "base_model": "Gemma2ForCausalLM", + "revision": "11c9b309abf73637e4b6f9a3fa1e92e615547819", + "submitted_time": "2024-06-24 08:05:41+00:00", + "num_params_billion": 9.241705984, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 32.67, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 51.739999999999995, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 53.7, + "stderr": 0.0 + } + ], + "average_accuracy": 46.03666666666667, + "best_prompt": 53.7, + "prompt_id": "p3", + "CPS": 49.584790000000005, + "is_dummy": false, + "std_accuracy": 11.617281667125631 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 43.6, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 42.05, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 40.67, + "stderr": 0.0 + } + ], + "average_accuracy": 42.10666666666667, + "best_prompt": 43.6, + "prompt_id": "p1", + "CPS": 42.948906666666666, + "is_dummy": false, + "std_accuracy": 1.4658217263137197 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.06, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 7.9399999999999995, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 2.6666666666666665, + "best_prompt": 7.9399999999999995, + "prompt_id": 
"p2", + "CPS": 7.521297333333333, + "is_dummy": false, + "std_accuracy": 4.566939164619267 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.13, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 1.26, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.4633333333333334, + "best_prompt": 1.26, + "prompt_id": "p2", + "CPS": 1.249962, + "is_dummy": false, + "std_accuracy": 0.6929886963965093 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-2-9b-it_0_GR.json b/e3c_llm_results/google/gemma-2-9b-it_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..3c3c8c09ca78762e8e2930ec0efeca6d88ea98e4 --- /dev/null +++ b/e3c_llm_results/google/gemma-2-9b-it_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 47.411836666666666, + "config": { + "model_name": "google/gemma-2-9b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "google/gemma-2-9b-it", + "base_model": "Gemma2ForCausalLM", + "revision": "11c9b309abf73637e4b6f9a3fa1e92e615547819", + "submitted_time": "2024-06-24 08:05:41+00:00", + "num_params_billion": 9.241705984, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 55.489999999999995, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 47.77, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 55.489999999999995, + "stderr": 0.0 + } + ], + "average_accuracy": 52.916666666666664, + 
"best_prompt": 55.489999999999995, + "prompt_id": "p1", + "CPS": 54.06205733333333, + "is_dummy": false, + "std_accuracy": 4.457144078143906 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 41.24, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 39.57, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 39.43, + "stderr": 0.0 + } + ], + "average_accuracy": 40.080000000000005, + "best_prompt": 41.24, + "prompt_id": "p1", + "CPS": 40.761616000000004, + "is_dummy": false, + "std_accuracy": 1.0070253224224317 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-2-9b-it_0_IT.json b/e3c_llm_results/google/gemma-2-9b-it_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..1582693b99165fb210f40f52cf2ca3056ab2a3c1 --- /dev/null +++ b/e3c_llm_results/google/gemma-2-9b-it_0_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 21.524752466666662, + "config": { + "model_name": "google/gemma-2-9b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "google/gemma-2-9b-it", + "base_model": "Gemma2ForCausalLM", + "revision": "11c9b309abf73637e4b6f9a3fa1e92e615547819", + "submitted_time": "2024-06-24 08:05:41+00:00", + 
"num_params_billion": 9.241705984, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 57.38999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 65.24, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 62.1, + "stderr": 0.0 + } + ], + "average_accuracy": 61.57666666666666, + "best_prompt": 65.24, + "prompt_id": "p2", + "CPS": 62.85004133333333, + "is_dummy": false, + "std_accuracy": 3.9510800211250268 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 45.85, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 41.13, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 41.959999999999994, + "stderr": 0.0 + } + ], + "average_accuracy": 42.98, + "best_prompt": 45.85, + "prompt_id": "p1", + "CPS": 44.534105, + "is_dummy": false, + "std_accuracy": 2.51990079169796 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.24, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.08, + "best_prompt": 0.24, + "prompt_id": "p1", + "CPS": 0.23961599999999997, + "is_dummy": false, + "std_accuracy": 0.13856406460551018 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", 
+ "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-2-9b-it_0_PL.json b/e3c_llm_results/google/gemma-2-9b-it_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..5e6d2589bc155ee5a52d512fe4a78b3900afef8d --- /dev/null +++ b/e3c_llm_results/google/gemma-2-9b-it_0_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 41.18764683333333, + "config": { + "model_name": "google/gemma-2-9b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "google/gemma-2-9b-it", + "base_model": "Gemma2ForCausalLM", + "revision": "11c9b309abf73637e4b6f9a3fa1e92e615547819", + "submitted_time": "2024-06-24 08:05:41+00:00", + "num_params_billion": 9.241705984, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 40.6, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 41.55, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 40.6, + "stderr": 0.0 + } + ], + "average_accuracy": 40.916666666666664, + "best_prompt": 41.55, + "prompt_id": "p2", + "CPS": 41.28685, + "is_dummy": false, + "std_accuracy": 0.548482755730142 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 36.74, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 42.71, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 37.29, + "stderr": 0.0 + } + ], + "average_accuracy": 38.913333333333334, + "best_prompt": 42.71, + "prompt_id": "p2", + "CPS": 41.08844366666666, + "is_dummy": false, + "std_accuracy": 3.2994898595591007 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + 
"CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-2-9b-it_0_SK.json b/e3c_llm_results/google/gemma-2-9b-it_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..27b21560efcc055b5b36ef06708ed28d4cb7365a --- /dev/null +++ b/e3c_llm_results/google/gemma-2-9b-it_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 45.32347, + "config": { + "model_name": "google/gemma-2-9b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "google/gemma-2-9b-it", + "base_model": "Gemma2ForCausalLM", + "revision": "11c9b309abf73637e4b6f9a3fa1e92e615547819", + "submitted_time": "2024-06-24 08:05:41+00:00", + "num_params_billion": 9.241705984, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 48.75, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 45.75, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 48.75, + "stderr": 0.0 + } + ], + "average_accuracy": 47.75, + "best_prompt": 48.75, + "prompt_id": "p1", + "CPS": 48.2625, + "is_dummy": false, + "std_accuracy": 1.7320508075688772 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 39.89, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 43.4, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 39.89, + "stderr": 0.0 + } + ], + "average_accuracy": 41.059999999999995, + "best_prompt": 
43.4, + "prompt_id": "p2", + "CPS": 42.38444, + "is_dummy": false, + "std_accuracy": 2.0264994448555855 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-2-9b-it_0_SL.json b/e3c_llm_results/google/gemma-2-9b-it_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..6d9bb91731ce43227d4af908c2fe375610387da6 --- /dev/null +++ b/e3c_llm_results/google/gemma-2-9b-it_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 43.368616, + "config": { + "model_name": "google/gemma-2-9b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "google/gemma-2-9b-it", + "base_model": "Gemma2ForCausalLM", + "revision": "11c9b309abf73637e4b6f9a3fa1e92e615547819", + "submitted_time": "2024-06-24 08:05:41+00:00", + "num_params_billion": 9.241705984, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 47.07, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 40.46, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 47.07, + "stderr": 0.0 + } + ], + "average_accuracy": 44.86666666666667, + "best_prompt": 47.07, + "prompt_id": "p1", + "CPS": 46.032891, + "is_dummy": false, + "std_accuracy": 3.816285279343426 + 
}, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 40.79, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 40.160000000000004, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 40.79, + "stderr": 0.0 + } + ], + "average_accuracy": 40.580000000000005, + "best_prompt": 40.79, + "prompt_id": "p1", + "CPS": 40.704341, + "is_dummy": false, + "std_accuracy": 0.3637306695894616 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-2-9b-it_10_EN.json b/e3c_llm_results/google/gemma-2-9b-it_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..f38a35dab7386200cb87d83d2674aaea7b3c5916 --- /dev/null +++ b/e3c_llm_results/google/gemma-2-9b-it_10_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 39.772889533333334, + "config": { + "model_name": "google/gemma-2-9b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "google/gemma-2-9b-it", + "base_model": "Gemma2ForCausalLM", + "revision": "11c9b309abf73637e4b6f9a3fa1e92e615547819", + "submitted_time": "2024-06-24 08:05:41+00:00", + "num_params_billion": 9.241705984, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 
62.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 56.38999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 59.18, + "stderr": 0.0 + } + ], + "average_accuracy": 59.19, + "best_prompt": 62.0, + "prompt_id": "p1", + "CPS": 60.257799999999996, + "is_dummy": false, + "std_accuracy": 2.8050133689521015 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.629999999999995, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 53.37, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 54.09, + "stderr": 0.0 + } + ], + "average_accuracy": 53.03, + "best_prompt": 54.09, + "prompt_id": "p3", + "CPS": 53.516646, + "is_dummy": false, + "std_accuracy": 1.2647529403009938 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 29.509999999999998, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 33.879999999999995, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 32.62, + "stderr": 0.0 + } + ], + "average_accuracy": 32.00333333333333, + "best_prompt": 33.879999999999995, + "prompt_id": "p2", + "CPS": 33.244185333333334, + "is_dummy": false, + "std_accuracy": 2.2493184152834673 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 31.180000000000003, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 27.37, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 5.06, + "stderr": 0.0 + } + ], + "average_accuracy": 21.203333333333337, + "best_prompt": 31.180000000000003, + "prompt_id": "p1", + "CPS": 28.069275333333337, + "is_dummy": false, + "std_accuracy": 14.109728322449493 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 10.67, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 26.490000000000002, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + 
"value": 11.58, + "stderr": 0.0 + } + ], + "average_accuracy": 16.246666666666666, + "best_prompt": 26.490000000000002, + "prompt_id": "p2", + "CPS": 23.776541, + "is_dummy": false, + "std_accuracy": 8.882647878495092 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-2-9b-it_10_GR.json b/e3c_llm_results/google/gemma-2-9b-it_10_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..c26a8a55cdd19623c0ff0f5e712de60c7e6ad80b --- /dev/null +++ b/e3c_llm_results/google/gemma-2-9b-it_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 56.44067866666667, + "config": { + "model_name": "google/gemma-2-9b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + "model": "google/gemma-2-9b-it", + "base_model": "Gemma2ForCausalLM", + "revision": "11c9b309abf73637e4b6f9a3fa1e92e615547819", + "submitted_time": "2024-06-24 08:05:41+00:00", + "num_params_billion": 9.241705984, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 60.83, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 56.63, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 60.83, + "stderr": 0.0 + } + ], + "average_accuracy": 59.43000000000001, + "best_prompt": 60.83, + "prompt_id": "p1", + "CPS": 59.97838, + "is_dummy": false, + "std_accuracy": 2.4248711305964257 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 50.7, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 49.71, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 54.44, + "stderr": 0.0 + } + ], + "average_accuracy": 51.61666666666667, + "best_prompt": 54.44, + "prompt_id": "p3", + "CPS": 52.90297733333333, + "is_dummy": false, + "std_accuracy": 2.494681008332192 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": 
null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-2-9b-it_10_IT.json b/e3c_llm_results/google/gemma-2-9b-it_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..44ec606d3bfdffeaff629bc4b3693a101b991da2 --- /dev/null +++ b/e3c_llm_results/google/gemma-2-9b-it_10_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 39.260664733333336, + "config": { + "model_name": "google/gemma-2-9b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "google/gemma-2-9b-it", + "base_model": "Gemma2ForCausalLM", + "revision": "11c9b309abf73637e4b6f9a3fa1e92e615547819", + "submitted_time": "2024-06-24 08:05:41+00:00", + "num_params_billion": 9.241705984, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 69.1, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 66.43, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 65.69, + "stderr": 0.0 + } + ], + "average_accuracy": 67.07333333333334, + "best_prompt": 69.1, + "prompt_id": "p1", + "CPS": 67.69957333333333, + "is_dummy": false, + "std_accuracy": 1.7937205282131663 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 49.58, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 53.65, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 53.05, 
+ "stderr": 0.0 + } + ], + "average_accuracy": 52.09333333333333, + "best_prompt": 53.65, + "prompt_id": "p2", + "CPS": 52.81484833333333, + "is_dummy": false, + "std_accuracy": 2.197187596299718 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 17.9, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 16.53, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 10.84, + "stderr": 0.0 + } + ], + "average_accuracy": 15.089999999999998, + "best_prompt": 17.9, + "prompt_id": "p1", + "CPS": 17.397009999999998, + "is_dummy": false, + "std_accuracy": 3.743808221584006 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 32.879999999999995, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 40.35, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 9.19, + "stderr": 0.0 + } + ], + "average_accuracy": 27.47333333333333, + "best_prompt": 40.35, + "prompt_id": "p2", + "CPS": 35.154265, + "is_dummy": false, + "std_accuracy": 16.268387545584638 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 8.51, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 26.529999999999998, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.32, + "stderr": 0.0 + } + ], + "average_accuracy": 14.12, + "best_prompt": 26.529999999999998, + "prompt_id": "p2", + "CPS": 23.237627, + "is_dummy": false, + "std_accuracy": 10.763832960428175 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-2-9b-it_10_PL.json b/e3c_llm_results/google/gemma-2-9b-it_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..19e3421c8fad4a9a3a529ae9e17b4edd75aa67cc --- /dev/null +++ b/e3c_llm_results/google/gemma-2-9b-it_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 54.98672666666667, + "config": { + "model_name": "google/gemma-2-9b-it", + "num_fewshot": "10", + 
"batch_size": 1, + "LANG": "PL", + "model": "google/gemma-2-9b-it", + "base_model": "Gemma2ForCausalLM", + "revision": "11c9b309abf73637e4b6f9a3fa1e92e615547819", + "submitted_time": "2024-06-24 08:05:41+00:00", + "num_params_billion": 9.241705984, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 59.08, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 58.620000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 59.08, + "stderr": 0.0 + } + ], + "average_accuracy": 58.92666666666667, + "best_prompt": 59.08, + "prompt_id": "p1", + "CPS": 58.98941066666667, + "is_dummy": false, + "std_accuracy": 0.26558112382722426 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.68000000000001, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 48.08, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 51.239999999999995, + "stderr": 0.0 + } + ], + "average_accuracy": 50.333333333333336, + "best_prompt": 51.68000000000001, + "prompt_id": "p1", + "CPS": 50.98404266666667, + "is_dummy": false, + "std_accuracy": 1.9638058288266032 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git 
a/e3c_llm_results/google/gemma-2-9b-it_10_SK.json b/e3c_llm_results/google/gemma-2-9b-it_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..246c36ffcc5a012bf8167c5bc372e6ba6b8a1a06 --- /dev/null +++ b/e3c_llm_results/google/gemma-2-9b-it_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 56.074384499999994, + "config": { + "model_name": "google/gemma-2-9b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "google/gemma-2-9b-it", + "base_model": "Gemma2ForCausalLM", + "revision": "11c9b309abf73637e4b6f9a3fa1e92e615547819", + "submitted_time": "2024-06-24 08:05:41+00:00", + "num_params_billion": 9.241705984, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 61.41, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 61.22, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 61.41, + "stderr": 0.0 + } + ], + "average_accuracy": 61.346666666666664, + "best_prompt": 61.41, + "prompt_id": "p1", + "CPS": 61.371106999999995, + "is_dummy": false, + "std_accuracy": 0.10969655114602758 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.53, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 47.54, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 51.13999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 50.06999999999999, + "best_prompt": 51.53, + "prompt_id": "p1", + "CPS": 50.777662, + "is_dummy": false, + "std_accuracy": 2.199704525612473 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + 
"best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-2-9b-it_10_SL.json b/e3c_llm_results/google/gemma-2-9b-it_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..2c09f9ec64339ddf7b3b9651d9148fa1453ebda0 --- /dev/null +++ b/e3c_llm_results/google/gemma-2-9b-it_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 55.7992, + "config": { + "model_name": "google/gemma-2-9b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "google/gemma-2-9b-it", + "base_model": "Gemma2ForCausalLM", + "revision": "11c9b309abf73637e4b6f9a3fa1e92e615547819", + "submitted_time": "2024-06-24 08:05:41+00:00", + "num_params_billion": 9.241705984, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 63.65, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 57.37, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 63.65, + "stderr": 0.0 + } + ], + "average_accuracy": 61.556666666666665, + "best_prompt": 63.65, + "prompt_id": "p1", + "CPS": 62.31759333333333, + "is_dummy": false, + "std_accuracy": 3.6257596905108507 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 48.010000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 48.78, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 49.72, + "stderr": 0.0 + } + ], + "average_accuracy": 48.836666666666666, + "best_prompt": 49.72, + "prompt_id": "p3", + "CPS": 49.28080666666666, + "is_dummy": false, + "std_accuracy": 0.8564072240081397 + }, + "HIS": { + "prompts": [ + { + "prompt": 
"p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-3-27b-it_0_EN.json b/e3c_llm_results/google/gemma-3-27b-it_0_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..b61e5f4cf20e438a1a1220080d2dbdbdc8fac0f9 --- /dev/null +++ b/e3c_llm_results/google/gemma-3-27b-it_0_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 23.81513066666667, + "config": { + "model_name": "google/gemma-3-27b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "google/gemma-3-27b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "005ad3404e59d6023443cb575daa05336842228a", + "submitted_time": "2025-03-01 19:10:19+00:00", + "num_params_billion": 27.43240664, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 54.459999999999994, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 58.3, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 51.94, + "stderr": 0.0 + } + ], + "average_accuracy": 54.9, + "best_prompt": 58.3, + "prompt_id": "p2", + "CPS": 56.3178, + "is_dummy": false, + "std_accuracy": 3.202748819373758 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 45.43, + "stderr": 0.0 + }, + { + "prompt": "p2", + 
"metric": "f1", + "value": 45.82, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 47.43, + "stderr": 0.0 + } + ], + "average_accuracy": 46.22666666666667, + "best_prompt": 47.43, + "prompt_id": "p3", + "CPS": 46.859259, + "is_dummy": false, + "std_accuracy": 1.0602043828117922 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 15.590000000000002, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 12.13, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 9.24, + "best_prompt": 15.590000000000002, + "prompt_id": "p1", + "CPS": 14.600035000000002, + "is_dummy": false, + "std_accuracy": 8.186946927884657 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 1.31, + "stderr": 0.0 + } + ], + "average_accuracy": 0.4366666666666667, + "best_prompt": 1.31, + "prompt_id": "p3", + "CPS": 1.2985593333333334, + "is_dummy": false, + "std_accuracy": 0.7563288526384098 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-3-27b-it_0_GR.json b/e3c_llm_results/google/gemma-3-27b-it_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..108468c36515deb23bfc1b1c6e5e2eb67bd70a35 --- /dev/null +++ b/e3c_llm_results/google/gemma-3-27b-it_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 50.113703, + "config": { + "model_name": 
"google/gemma-3-27b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "google/gemma-3-27b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "005ad3404e59d6023443cb575daa05336842228a", + "submitted_time": "2025-03-01 19:10:19+00:00", + "num_params_billion": 27.43240664, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 48.66, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 57.21000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 48.66, + "stderr": 0.0 + } + ], + "average_accuracy": 51.51, + "best_prompt": 57.21000000000001, + "prompt_id": "p2", + "CPS": 53.94903000000001, + "is_dummy": false, + "std_accuracy": 4.936344801571307 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 39.550000000000004, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 46.949999999999996, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 47.69, + "stderr": 0.0 + } + ], + "average_accuracy": 44.73, + "best_prompt": 47.69, + "prompt_id": "p3", + "CPS": 46.278376, + "is_dummy": false, + "std_accuracy": 4.501244272420679 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at 
end of file diff --git a/e3c_llm_results/google/gemma-3-27b-it_0_IT.json b/e3c_llm_results/google/gemma-3-27b-it_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..1addb51066625c0689aa893038bd259be21472a6 --- /dev/null +++ b/e3c_llm_results/google/gemma-3-27b-it_0_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 24.543379466666668, + "config": { + "model_name": "google/gemma-3-27b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "google/gemma-3-27b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "005ad3404e59d6023443cb575daa05336842228a", + "submitted_time": "2025-03-01 19:10:19+00:00", + "num_params_billion": 27.43240664, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 55.43, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 66.97, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 59.540000000000006, + "stderr": 0.0 + } + ], + "average_accuracy": 60.64666666666667, + "best_prompt": 66.97, + "prompt_id": "p2", + "CPS": 62.73526366666667, + "is_dummy": false, + "std_accuracy": 5.849054054574408 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 43.9, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 48.949999999999996, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 49.27, + "stderr": 0.0 + } + ], + "average_accuracy": 47.373333333333335, + "best_prompt": 49.27, + "prompt_id": "p3", + "CPS": 48.335512333333334, + "is_dummy": false, + "std_accuracy": 3.0122472231431034 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 12.34, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 6.11, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 6.1499999999999995, + "best_prompt": 12.34, + "prompt_id": "p1", 
+ "CPS": 11.576154, + "is_dummy": false, + "std_accuracy": 6.170097243966256 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.06999999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 0.02333333333333333, + "best_prompt": 0.06999999999999999, + "prompt_id": "p3", + "CPS": 0.06996733333333333, + "is_dummy": false, + "std_accuracy": 0.0404145188432738 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-3-27b-it_0_PL.json b/e3c_llm_results/google/gemma-3-27b-it_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..394c056ed0e89819ffe5b326d2a6ea06595b9a44 --- /dev/null +++ b/e3c_llm_results/google/gemma-3-27b-it_0_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 44.29942833333334, + "config": { + "model_name": "google/gemma-3-27b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "google/gemma-3-27b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "005ad3404e59d6023443cb575daa05336842228a", + "submitted_time": "2025-03-01 19:10:19+00:00", + "num_params_billion": 27.43240664, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 45.06, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 45.11, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 45.06, + "stderr": 0.0 + } + ], + "average_accuracy": 
45.076666666666675, + "best_prompt": 45.11, + "prompt_id": "p2", + "CPS": 45.09496333333333, + "is_dummy": false, + "std_accuracy": 0.028867513459479646 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 43.84, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 42.67, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 42.71, + "stderr": 0.0 + } + ], + "average_accuracy": 43.07333333333333, + "best_prompt": 43.84, + "prompt_id": "p1", + "CPS": 43.50389333333334, + "is_dummy": false, + "std_accuracy": 0.6642539674953661 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-3-27b-it_0_SK.json b/e3c_llm_results/google/gemma-3-27b-it_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..47d69384014d165501a8895f4c2647723f737797 --- /dev/null +++ b/e3c_llm_results/google/gemma-3-27b-it_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 37.22623216666667, + "config": { + "model_name": "google/gemma-3-27b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "google/gemma-3-27b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "005ad3404e59d6023443cb575daa05336842228a", + "submitted_time": "2025-03-01 
19:10:19+00:00", + "num_params_billion": 27.43240664, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 31.830000000000002, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 21.57, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 31.830000000000002, + "stderr": 0.0 + } + ], + "average_accuracy": 28.41, + "best_prompt": 31.830000000000002, + "prompt_id": "p1", + "CPS": 30.741414000000002, + "is_dummy": false, + "std_accuracy": 5.923613761885561 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 43.730000000000004, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 43.6, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 43.730000000000004, + "stderr": 0.0 + } + ], + "average_accuracy": 43.68666666666667, + "best_prompt": 43.730000000000004, + "prompt_id": "p1", + "CPS": 43.71105033333333, + "is_dummy": false, + "std_accuracy": 0.07505553499465283 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-3-27b-it_0_SL.json b/e3c_llm_results/google/gemma-3-27b-it_0_SL.json new file mode 100644 index 
0000000000000000000000000000000000000000..37b56e19ea639bdcaed35eb2d06922b7629a9d0a --- /dev/null +++ b/e3c_llm_results/google/gemma-3-27b-it_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 45.01248166666667, + "config": { + "model_name": "google/gemma-3-27b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "google/gemma-3-27b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "005ad3404e59d6023443cb575daa05336842228a", + "submitted_time": "2025-03-01 19:10:19+00:00", + "num_params_billion": 27.43240664, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 43.7, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 47.83, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 43.7, + "stderr": 0.0 + } + ], + "average_accuracy": 45.076666666666675, + "best_prompt": 47.83, + "prompt_id": "p2", + "CPS": 46.513080666666674, + "is_dummy": false, + "std_accuracy": 2.384456611753152 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 42.55, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 43.91, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 42.55, + "stderr": 0.0 + } + ], + "average_accuracy": 43.00333333333333, + "best_prompt": 43.91, + "prompt_id": "p2", + "CPS": 43.511882666666665, + "is_dummy": false, + "std_accuracy": 0.7851963660978907 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": 
"p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-3-27b-it_10_EN.json b/e3c_llm_results/google/gemma-3-27b-it_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..9171719ae6e1e33c6212b6f7e158092c14b22229 --- /dev/null +++ b/e3c_llm_results/google/gemma-3-27b-it_10_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 45.43019166666667, + "config": { + "model_name": "google/gemma-3-27b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "google/gemma-3-27b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "005ad3404e59d6023443cb575daa05336842228a", + "submitted_time": "2025-03-01 19:10:19+00:00", + "num_params_billion": 27.43240664, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 61.6, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 63.080000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 60.940000000000005, + "stderr": 0.0 + } + ], + "average_accuracy": 61.873333333333335, + "best_prompt": 63.080000000000005, + "prompt_id": "p2", + "CPS": 62.318834666666675, + "is_dummy": false, + "std_accuracy": 1.0958710386415615 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.910000000000004, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 56.00000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 57.64, + "stderr": 0.0 + } + ], + "average_accuracy": 55.18333333333334, + "best_prompt": 57.64, + "prompt_id": "p3", + "CPS": 56.22397733333334, + "is_dummy": false, + "std_accuracy": 2.9510054783638284 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 32.71, + 
"stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 33.01, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 33.42, + "stderr": 0.0 + } + ], + "average_accuracy": 33.04666666666667, + "best_prompt": 33.42, + "prompt_id": "p3", + "CPS": 33.295232, + "is_dummy": false, + "std_accuracy": 0.3564173583501984 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 40.22, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 38.58, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.28, + "stderr": 0.0 + } + ], + "average_accuracy": 29.026666666666667, + "best_prompt": 40.22, + "prompt_id": "p1", + "CPS": 35.71804133333333, + "is_dummy": false, + "std_accuracy": 17.985842580578016 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 24.490000000000002, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 48.209999999999994, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 18.32, + "stderr": 0.0 + } + ], + "average_accuracy": 30.339999999999993, + "best_prompt": 48.209999999999994, + "prompt_id": "p2", + "CPS": 39.594873, + "is_dummy": false, + "std_accuracy": 15.78036438109082 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-3-27b-it_10_GR.json b/e3c_llm_results/google/gemma-3-27b-it_10_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..41270bebf28ef0e6e8f33f0d0e737baca795023d --- /dev/null +++ b/e3c_llm_results/google/gemma-3-27b-it_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 60.327389833333335, + "config": { + "model_name": "google/gemma-3-27b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + "model": "google/gemma-3-27b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "005ad3404e59d6023443cb575daa05336842228a", + "submitted_time": "2025-03-01 19:10:19+00:00", + "num_params_billion": 27.43240664, + 
"language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 65.51, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 66.08000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 65.51, + "stderr": 0.0 + } + ], + "average_accuracy": 65.7, + "best_prompt": 66.08000000000001, + "prompt_id": "p2", + "CPS": 65.828896, + "is_dummy": false, + "std_accuracy": 0.32908965343809093 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 50.83, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 55.50000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 55.81, + "stderr": 0.0 + } + ], + "average_accuracy": 54.046666666666674, + "best_prompt": 55.81, + "prompt_id": "p3", + "CPS": 54.82588366666667, + "is_dummy": false, + "std_accuracy": 2.7900238947602856 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-3-27b-it_10_IT.json b/e3c_llm_results/google/gemma-3-27b-it_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..70a2804c92f0a83e3005afb3f9db21e907f4043d --- /dev/null +++ b/e3c_llm_results/google/gemma-3-27b-it_10_IT.json @@ -0,0 
+1,157 @@ +{ + "average_CPS": 43.0060486, + "config": { + "model_name": "google/gemma-3-27b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "google/gemma-3-27b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "005ad3404e59d6023443cb575daa05336842228a", + "submitted_time": "2025-03-01 19:10:19+00:00", + "num_params_billion": 27.43240664, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 71.41999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 69.92, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 72.11999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 71.15333333333332, + "best_prompt": 72.11999999999999, + "prompt_id": "p3", + "CPS": 71.42284, + "is_dummy": false, + "std_accuracy": 1.1239810200058178 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 52.23, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 58.37, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 57.86, + "stderr": 0.0 + } + ], + "average_accuracy": 56.15333333333333, + "best_prompt": 58.37, + "prompt_id": "p2", + "CPS": 57.07613166666666, + "is_dummy": false, + "std_accuracy": 3.40726185276878 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 19.650000000000002, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 24.87, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 18.740000000000002, + "stderr": 0.0 + } + ], + "average_accuracy": 21.08666666666667, + "best_prompt": 24.87, + "prompt_id": "p2", + "CPS": 23.929085000000004, + "is_dummy": false, + "std_accuracy": 3.307904674160567 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 57.32000000000001, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 34.43, + "stderr": 0.0 + }, + 
{ + "prompt": "p3", + "metric": "f1", + "value": 21.44, + "stderr": 0.0 + } + ], + "average_accuracy": 37.73, + "best_prompt": 57.32000000000001, + "prompt_id": "p1", + "CPS": 46.091012, + "is_dummy": false, + "std_accuracy": 18.166207639460694 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 13.469999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 17.83, + "stderr": 0.0 + } + ], + "average_accuracy": 10.433333333333332, + "best_prompt": 17.83, + "prompt_id": "p3", + "CPS": 16.511174333333333, + "is_dummy": false, + "std_accuracy": 9.294796035058182 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-3-27b-it_10_PL.json b/e3c_llm_results/google/gemma-3-27b-it_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..bd7b8c17d93e86263091569c6493ab58ca0ed609 --- /dev/null +++ b/e3c_llm_results/google/gemma-3-27b-it_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 61.5666635, + "config": { + "model_name": "google/gemma-3-27b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "PL", + "model": "google/gemma-3-27b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "005ad3404e59d6023443cb575daa05336842228a", + "submitted_time": "2025-03-01 19:10:19+00:00", + "num_params_billion": 27.43240664, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 65.91, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 66.72, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 65.91, + "stderr": 0.0 + } + ], + "average_accuracy": 66.17999999999999, + "best_prompt": 66.72, + "prompt_id": "p2", + "CPS": 66.35971199999999, + "is_dummy": false, + "std_accuracy": 0.4676537180435982 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 57.95, + "stderr": 
0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 56.010000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 53.800000000000004, + "stderr": 0.0 + } + ], + "average_accuracy": 55.92000000000001, + "best_prompt": 57.95, + "prompt_id": "p1", + "CPS": 56.77361500000001, + "is_dummy": false, + "std_accuracy": 2.0764633394307728 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-3-27b-it_10_SK.json b/e3c_llm_results/google/gemma-3-27b-it_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..f3f69756b16bd7a976c5a541a45af486df4bc2b0 --- /dev/null +++ b/e3c_llm_results/google/gemma-3-27b-it_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 59.623766999999994, + "config": { + "model_name": "google/gemma-3-27b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "google/gemma-3-27b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "005ad3404e59d6023443cb575daa05336842228a", + "submitted_time": "2025-03-01 19:10:19+00:00", + "num_params_billion": 27.43240664, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 67.36999999999999, + "stderr": 0.0 + }, + { + "prompt": 
"p2", + "metric": "f1", + "value": 68.85, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 67.36999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 67.86333333333333, + "best_prompt": 68.85, + "prompt_id": "p2", + "CPS": 68.17067999999999, + "is_dummy": false, + "std_accuracy": 0.8544783984006484 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.21, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 50.61, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 51.03, + "stderr": 0.0 + } + ], + "average_accuracy": 50.949999999999996, + "best_prompt": 51.21, + "prompt_id": "p1", + "CPS": 51.076854, + "is_dummy": false, + "std_accuracy": 0.30789608636681387 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/gemma-3-27b-it_10_SL.json b/e3c_llm_results/google/gemma-3-27b-it_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..bd7c4e6c8176c167ca6687ba039808406d62be64 --- /dev/null +++ b/e3c_llm_results/google/gemma-3-27b-it_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 59.561417000000006, + "config": { + "model_name": "google/gemma-3-27b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + 
"model": "google/gemma-3-27b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "005ad3404e59d6023443cb575daa05336842228a", + "submitted_time": "2025-03-01 19:10:19+00:00", + "num_params_billion": 27.43240664, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 67.5, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 69.17999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 67.5, + "stderr": 0.0 + } + ], + "average_accuracy": 68.06, + "best_prompt": 69.17999999999999, + "prompt_id": "p2", + "CPS": 68.405184, + "is_dummy": false, + "std_accuracy": 0.9699484522385671 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.49, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 47.03, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 51.449999999999996, + "stderr": 0.0 + } + ], + "average_accuracy": 49.99, + "best_prompt": 51.49, + "prompt_id": "p1", + "CPS": 50.71765, + "is_dummy": false, + "std_accuracy": 2.5635132143213135 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-27b-text-it_0_EN.json 
b/e3c_llm_results/google/medgemma-27b-text-it_0_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..4f7b638b4f5148f10d1ac8a6fcc8ac73b0120493 --- /dev/null +++ b/e3c_llm_results/google/medgemma-27b-text-it_0_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 21.581613666666666, + "config": { + "model_name": "google/medgemma-27b-text-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "google/medgemma-27b-text-it", + "base_model": "Gemma3ForCausalLM", + "revision": "6b08c481126ff65a9b8fa5ab4d691b152b8edb5d", + "submitted_time": "2025-05-19 20:53:04+00:00", + "num_params_billion": 27.00900224, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 38.42, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 60.35, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 51.559999999999995, + "stderr": 0.0 + } + ], + "average_accuracy": 50.11000000000001, + "best_prompt": 60.35, + "prompt_id": "p2", + "CPS": 54.17016, + "is_dummy": false, + "std_accuracy": 11.036670693646702 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 48.36, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 47.63, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 44.43, + "stderr": 0.0 + } + ], + "average_accuracy": 46.80666666666667, + "best_prompt": 48.36, + "prompt_id": "p1", + "CPS": 47.608808, + "is_dummy": false, + "std_accuracy": 2.0903667939702197 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 6.23, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 3.27, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 3.1666666666666665, + "best_prompt": 6.23, + "prompt_id": "p1", + "CPS": 6.039154333333334, + "is_dummy": false, + "std_accuracy": 3.116285181643897 + }, + 
"DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.09, + "stderr": 0.0 + } + ], + "average_accuracy": 0.03, + "best_prompt": 0.09, + "prompt_id": "p3", + "CPS": 0.089946, + "is_dummy": false, + "std_accuracy": 0.05196152422706632 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-27b-text-it_0_GR.json b/e3c_llm_results/google/medgemma-27b-text-it_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..1ae8f09585013a20de7ea4216e6178d6ba673b9e --- /dev/null +++ b/e3c_llm_results/google/medgemma-27b-text-it_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 50.34454, + "config": { + "model_name": "google/medgemma-27b-text-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "google/medgemma-27b-text-it", + "base_model": "Gemma3ForCausalLM", + "revision": "6b08c481126ff65a9b8fa5ab4d691b152b8edb5d", + "submitted_time": "2025-05-19 20:53:04+00:00", + "num_params_billion": 27.00900224, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 53.14, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 61.260000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 53.14, + "stderr": 0.0 + } + ], + "average_accuracy": 55.84666666666667, + "best_prompt": 61.260000000000005, + "prompt_id": "p2", + "CPS": 57.943792, + "is_dummy": false, + 
"std_accuracy": 4.688084185819764 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 40.69, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 43.32, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 41.97, + "stderr": 0.0 + } + ], + "average_accuracy": 41.99333333333333, + "best_prompt": 43.32, + "prompt_id": "p2", + "CPS": 42.745288, + "is_dummy": false, + "std_accuracy": 1.3151552506580113 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-27b-text-it_0_IT.json b/e3c_llm_results/google/medgemma-27b-text-it_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..a94b99de846b0b13135cb48d3a85113d66d96f44 --- /dev/null +++ b/e3c_llm_results/google/medgemma-27b-text-it_0_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 21.714833866666666, + "config": { + "model_name": "google/medgemma-27b-text-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "google/medgemma-27b-text-it", + "base_model": "Gemma3ForCausalLM", + "revision": "6b08c481126ff65a9b8fa5ab4d691b152b8edb5d", + "submitted_time": "2025-05-19 20:53:04+00:00", + "num_params_billion": 27.00900224, + "language": "" + }, + "tasks": { + "NER": { + "prompts": 
[ + { + "prompt": "p1", + "metric": "f1", + "value": 42.61, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 62.12, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 55.82, + "stderr": 0.0 + } + ], + "average_accuracy": 53.51666666666666, + "best_prompt": 62.12, + "prompt_id": "p2", + "CPS": 56.77560933333333, + "is_dummy": false, + "std_accuracy": 9.9568586076801 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 40.42, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 49.16, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 46.04, + "stderr": 0.0 + } + ], + "average_accuracy": 45.20666666666667, + "best_prompt": 49.16, + "prompt_id": "p2", + "CPS": 47.21654133333333, + "is_dummy": false, + "std_accuracy": 4.429191047283162 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 4.72, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.64, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.03, + "stderr": 0.0 + } + ], + "average_accuracy": 1.7966666666666666, + "best_prompt": 4.72, + "prompt_id": "p1", + "CPS": 4.5820186666666665, + "is_dummy": false, + "std_accuracy": 2.54998692807107 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 
0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-27b-text-it_0_PL.json b/e3c_llm_results/google/medgemma-27b-text-it_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..85973c072b147631fc1c74a559b0559f9b693e4a --- /dev/null +++ b/e3c_llm_results/google/medgemma-27b-text-it_0_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 43.305971666666665, + "config": { + "model_name": "google/medgemma-27b-text-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "google/medgemma-27b-text-it", + "base_model": "Gemma3ForCausalLM", + "revision": "6b08c481126ff65a9b8fa5ab4d691b152b8edb5d", + "submitted_time": "2025-05-19 20:53:04+00:00", + "num_params_billion": 27.00900224, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 42.16, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 43.03, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 42.16, + "stderr": 0.0 + } + ], + "average_accuracy": 42.449999999999996, + "best_prompt": 43.03, + "prompt_id": "p2", + "CPS": 42.780426, + "is_dummy": false, + "std_accuracy": 0.502294734194977 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 43.25, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 44.24, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 42.46, + "stderr": 0.0 + } + ], + "average_accuracy": 43.31666666666667, + "best_prompt": 44.24, + "prompt_id": "p2", + "CPS": 43.83151733333334, + "is_dummy": false, + "std_accuracy": 0.8918706931687655 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + 
"prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-27b-text-it_0_SK.json b/e3c_llm_results/google/medgemma-27b-text-it_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..2fd3f810e8e542c5cac10a148bdc23ba143ceec5 --- /dev/null +++ b/e3c_llm_results/google/medgemma-27b-text-it_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 36.36130216666666, + "config": { + "model_name": "google/medgemma-27b-text-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "google/medgemma-27b-text-it", + "base_model": "Gemma3ForCausalLM", + "revision": "6b08c481126ff65a9b8fa5ab4d691b152b8edb5d", + "submitted_time": "2025-05-19 20:53:04+00:00", + "num_params_billion": 27.00900224, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 29.709999999999997, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 10.66, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 29.709999999999997, + "stderr": 0.0 + } + ], + "average_accuracy": 23.36, + "best_prompt": 29.709999999999997, + "prompt_id": "p1", + "CPS": 27.823414999999997, + "is_dummy": false, + "std_accuracy": 10.99852262806237 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 43.95, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 45.31, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 43.95, + "stderr": 0.0 + } + ], + 
"average_accuracy": 44.403333333333336, + "best_prompt": 45.31, + "prompt_id": "p2", + "CPS": 44.89918933333333, + "is_dummy": false, + "std_accuracy": 0.7851963660978907 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-27b-text-it_0_SL.json b/e3c_llm_results/google/medgemma-27b-text-it_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..2c55dad3d6016df0e351a987d060e5ae23fbba6b --- /dev/null +++ b/e3c_llm_results/google/medgemma-27b-text-it_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 46.321461, + "config": { + "model_name": "google/medgemma-27b-text-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "google/medgemma-27b-text-it", + "base_model": "Gemma3ForCausalLM", + "revision": "6b08c481126ff65a9b8fa5ab4d691b152b8edb5d", + "submitted_time": "2025-05-19 20:53:04+00:00", + "num_params_billion": 27.00900224, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 46.75, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 52.38, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 46.75, + "stderr": 0.0 + } + ], + "average_accuracy": 48.626666666666665, + "best_prompt": 
52.38, + "prompt_id": "p2", + "CPS": 50.414004, + "is_dummy": false, + "std_accuracy": 3.2504820155375946 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 41.82, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 42.39, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 41.82, + "stderr": 0.0 + } + ], + "average_accuracy": 42.01, + "best_prompt": 42.39, + "prompt_id": "p2", + "CPS": 42.228918, + "is_dummy": false, + "std_accuracy": 0.3290896534380868 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-27b-text-it_10_EN.json b/e3c_llm_results/google/medgemma-27b-text-it_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..a7b9e6bfcab853a76d30a8636e03c8eba85e45ab --- /dev/null +++ b/e3c_llm_results/google/medgemma-27b-text-it_10_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 50.544787533333334, + "config": { + "model_name": "google/medgemma-27b-text-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "google/medgemma-27b-text-it", + "base_model": "Gemma3ForCausalLM", + "revision": "6b08c481126ff65a9b8fa5ab4d691b152b8edb5d", + "submitted_time": "2025-05-19 20:53:04+00:00", + "num_params_billion": 
27.00900224, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 63.55, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 61.61, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 64.55, + "stderr": 0.0 + } + ], + "average_accuracy": 63.23666666666666, + "best_prompt": 64.55, + "prompt_id": "p3", + "CPS": 63.70224333333332, + "is_dummy": false, + "std_accuracy": 1.4948355539434195 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 55.620000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 54.94, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 55.65, + "stderr": 0.0 + } + ], + "average_accuracy": 55.403333333333336, + "best_prompt": 55.65, + "prompt_id": "p3", + "CPS": 55.512730000000005, + "is_dummy": false, + "std_accuracy": 0.4015387071420824 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 37.11, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 35.82, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 33.550000000000004, + "stderr": 0.0 + } + ], + "average_accuracy": 35.49333333333334, + "best_prompt": 37.11, + "prompt_id": "p1", + "CPS": 36.510055, + "is_dummy": false, + "std_accuracy": 1.8023410702010108 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 54.800000000000004, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 40.1, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 4.91, + "stderr": 0.0 + } + ], + "average_accuracy": 33.27, + "best_prompt": 54.800000000000004, + "prompt_id": "p1", + "CPS": 43.00156, + "is_dummy": false, + "std_accuracy": 25.63668660338149 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 48.99, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + 
"value": 38.01, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 61.24000000000001, + "stderr": 0.0 + } + ], + "average_accuracy": 49.413333333333334, + "best_prompt": 61.24000000000001, + "prompt_id": "p3", + "CPS": 53.99734933333334, + "is_dummy": false, + "std_accuracy": 11.620784540354126 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-27b-text-it_10_GR.json b/e3c_llm_results/google/medgemma-27b-text-it_10_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..a45217ffcb35082fa5eb968b82d5a3430383c72a --- /dev/null +++ b/e3c_llm_results/google/medgemma-27b-text-it_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 62.99263766666667, + "config": { + "model_name": "google/medgemma-27b-text-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + "model": "google/medgemma-27b-text-it", + "base_model": "Gemma3ForCausalLM", + "revision": "6b08c481126ff65a9b8fa5ab4d691b152b8edb5d", + "submitted_time": "2025-05-19 20:53:04+00:00", + "num_params_billion": 27.00900224, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 68.36, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 68.46, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 68.36, + "stderr": 0.0 + } + ], + "average_accuracy": 68.39333333333333, + "best_prompt": 68.46, + "prompt_id": "p2", + "CPS": 68.41436, + "is_dummy": false, + "std_accuracy": 0.05773502691895929 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 53.92, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 58.67, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 57.8, + "stderr": 0.0 + } + ], + "average_accuracy": 56.79666666666666, + "best_prompt": 58.67, + "prompt_id": "p2", + "CPS": 57.57091533333333, + "is_dummy": false, + "std_accuracy": 2.528958942595417 + }, + "HIS": { + 
"prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-27b-text-it_10_IT.json b/e3c_llm_results/google/medgemma-27b-text-it_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..fa0c4c7ef1eb0f4da3d0d96d993ffc4c2883e03f --- /dev/null +++ b/e3c_llm_results/google/medgemma-27b-text-it_10_IT.json @@ -0,0 +1,151 @@ +{ + "average_CPS": 50.36945153333333, + "config": { + "model_name": "google/medgemma-27b-text-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "google/medgemma-27b-text-it", + "base_model": "Gemma3ForCausalLM", + "revision": "6b08c481126ff65a9b8fa5ab4d691b152b8edb5d", + "submitted_time": "2025-05-19 20:53:04+00:00", + "num_params_billion": 27.00900224, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 72.61999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 70.05, + "stderr": 0.0 + } + ], + "average_accuracy": 71.335, + "best_prompt": 72.61999999999999, + "prompt_id": "p1", + "CPS": 71.686833, + "is_dummy": false, + "std_accuracy": 1.8172644276494223 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 59.19, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": 
"f1", + "value": 62.35000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 57.26, + "stderr": 0.0 + } + ], + "average_accuracy": 59.6, + "best_prompt": 62.35000000000001, + "prompt_id": "p2", + "CPS": 60.635375, + "is_dummy": false, + "std_accuracy": 2.569649781585037 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 23.14, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 29.92, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 15.409999999999998, + "stderr": 0.0 + } + ], + "average_accuracy": 22.823333333333334, + "best_prompt": 29.92, + "prompt_id": "p2", + "CPS": 27.796677333333335, + "is_dummy": false, + "std_accuracy": 7.260181356779826 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 58.98, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 57.97, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 5.28, + "stderr": 0.0 + } + ], + "average_accuracy": 40.74333333333333, + "best_prompt": 58.98, + "prompt_id": "p1", + "CPS": 48.224014, + "is_dummy": false, + "std_accuracy": 30.71629914773805 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 52.65, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 52.849999999999994, + "stderr": 0.0 + } + ], + "average_accuracy": 35.166666666666664, + "best_prompt": 52.849999999999994, + "prompt_id": "p3", + "CPS": 43.50435833333333, + "is_dummy": false, + "std_accuracy": 30.455390874742246 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-27b-text-it_10_PL.json b/e3c_llm_results/google/medgemma-27b-text-it_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..58e3a004980b086e1f5fe175d7884784ff6b0a94 --- /dev/null +++ 
b/e3c_llm_results/google/medgemma-27b-text-it_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 64.263205, + "config": { + "model_name": "google/medgemma-27b-text-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "PL", + "model": "google/medgemma-27b-text-it", + "base_model": "Gemma3ForCausalLM", + "revision": "6b08c481126ff65a9b8fa5ab4d691b152b8edb5d", + "submitted_time": "2025-05-19 20:53:04+00:00", + "num_params_billion": 27.00900224, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 68.28999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 67.15, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 68.28999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 67.91, + "best_prompt": 68.28999999999999, + "prompt_id": "p1", + "CPS": 68.030498, + "is_dummy": false, + "std_accuracy": 0.6581793068761655 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 59.4, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 61.33, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 59.18, + "stderr": 0.0 + } + ], + "average_accuracy": 59.97, + "best_prompt": 61.33, + "prompt_id": "p2", + "CPS": 60.495912000000004, + "is_dummy": false, + "std_accuracy": 1.1829201156460223 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + 
"std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-27b-text-it_10_SK.json b/e3c_llm_results/google/medgemma-27b-text-it_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..addddd61e05b8976e4a1d3fcae6adf1772776b69 --- /dev/null +++ b/e3c_llm_results/google/medgemma-27b-text-it_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 61.554673333333334, + "config": { + "model_name": "google/medgemma-27b-text-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "google/medgemma-27b-text-it", + "base_model": "Gemma3ForCausalLM", + "revision": "6b08c481126ff65a9b8fa5ab4d691b152b8edb5d", + "submitted_time": "2025-05-19 20:53:04+00:00", + "num_params_billion": 27.00900224, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 71.43, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 71.27, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 71.43, + "stderr": 0.0 + } + ], + "average_accuracy": 71.37666666666667, + "best_prompt": 71.43, + "prompt_id": "p1", + "CPS": 71.391904, + "is_dummy": false, + "std_accuracy": 0.09237604307034636 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.11, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 51.88, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 51.71, + "stderr": 0.0 + } + ], + "average_accuracy": 51.56666666666667, + "best_prompt": 51.88, + "prompt_id": "p2", + "CPS": 51.71744266666667, + "is_dummy": false, + "std_accuracy": 0.40451617190581457 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": 
true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-27b-text-it_10_SL.json b/e3c_llm_results/google/medgemma-27b-text-it_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..d15804eb624203d39811e58e97dd13af0a908992 --- /dev/null +++ b/e3c_llm_results/google/medgemma-27b-text-it_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 62.13607933333333, + "config": { + "model_name": "google/medgemma-27b-text-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "google/medgemma-27b-text-it", + "base_model": "Gemma3ForCausalLM", + "revision": "6b08c481126ff65a9b8fa5ab4d691b152b8edb5d", + "submitted_time": "2025-05-19 20:53:04+00:00", + "num_params_billion": 27.00900224, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 69.47, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 67.65, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 69.47, + "stderr": 0.0 + } + ], + "average_accuracy": 68.86333333333333, + "best_prompt": 69.47, + "prompt_id": "p1", + "CPS": 69.04854866666666, + "is_dummy": false, + "std_accuracy": 1.050777489925115 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 53.23, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 55.900000000000006, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 54.94, + "stderr": 0.0 + } + ], + 
"average_accuracy": 54.69, + "best_prompt": 55.900000000000006, + "prompt_id": "p2", + "CPS": 55.22361, + "is_dummy": false, + "std_accuracy": 1.352442235365345 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-4b-it_0_EN.json b/e3c_llm_results/google/medgemma-4b-it_0_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..6edf00bce2f17fe607f9aba70e967c27036c0fab --- /dev/null +++ b/e3c_llm_results/google/medgemma-4b-it_0_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 11.784388666666667, + "config": { + "model_name": "google/medgemma-4b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "google/medgemma-4b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "efe6cc02361759b6bd501c654ddb7c9d25ec509d", + "submitted_time": "2025-05-19 20:52:44+00:00", + "num_params_billion": 4.300079472, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 26.35, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 25.03, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 27.37, + "stderr": 0.0 + } + ], + "average_accuracy": 26.25, + "best_prompt": 27.37, + "prompt_id": "p3", + "CPS": 
27.063456000000002, + "is_dummy": false, + "std_accuracy": 1.173200750084997 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 20.95, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 32.57, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 32.029999999999994, + "stderr": 0.0 + } + ], + "average_accuracy": 28.516666666666662, + "best_prompt": 32.57, + "prompt_id": "p2", + "CPS": 31.24982933333333, + "is_dummy": false, + "std_accuracy": 6.558485597554768 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.61, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.5599999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.38999999999999996, + "best_prompt": 0.61, + "prompt_id": "p1", + "CPS": 0.608658, + "is_dummy": false, + "std_accuracy": 0.33867388443752194 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-4b-it_0_GR.json b/e3c_llm_results/google/medgemma-4b-it_0_GR.json new file mode 100644 index 
0000000000000000000000000000000000000000..f9a4742960aa0eafa76003daeca3bd32305b543d --- /dev/null +++ b/e3c_llm_results/google/medgemma-4b-it_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 27.1538555, + "config": { + "model_name": "google/medgemma-4b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "google/medgemma-4b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "efe6cc02361759b6bd501c654ddb7c9d25ec509d", + "submitted_time": "2025-05-19 20:52:44+00:00", + "num_params_billion": 4.300079472, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 27.05, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 26.540000000000003, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 27.05, + "stderr": 0.0 + } + ], + "average_accuracy": 26.88, + "best_prompt": 27.05, + "prompt_id": "p1", + "CPS": 27.004015, + "is_dummy": false, + "std_accuracy": 0.294448637286708 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 23.810000000000002, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 30.240000000000002, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.539999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 20.53, + "best_prompt": 30.240000000000002, + "prompt_id": "p2", + "CPS": 27.303696000000002, + "is_dummy": false, + "std_accuracy": 11.70005555542366 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + 
"prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-4b-it_0_IT.json b/e3c_llm_results/google/medgemma-4b-it_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..38072c731ca8a9c900bca0a8685958a7ed43acf5 --- /dev/null +++ b/e3c_llm_results/google/medgemma-4b-it_0_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 10.7509926, + "config": { + "model_name": "google/medgemma-4b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "google/medgemma-4b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "efe6cc02361759b6bd501c654ddb7c9d25ec509d", + "submitted_time": "2025-05-19 20:52:44+00:00", + "num_params_billion": 4.300079472, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 31.569999999999997, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 26.27, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 30.04, + "stderr": 0.0 + } + ], + "average_accuracy": 29.293333333333333, + "best_prompt": 31.569999999999997, + "prompt_id": "p1", + "CPS": 30.851256333333332, + "is_dummy": false, + "std_accuracy": 2.727752432559327 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 21.54, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 24.610000000000003, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 6.88, + "stderr": 0.0 + } + ], + "average_accuracy": 17.67666666666667, + "best_prompt": 24.610000000000003, + "prompt_id": "p2", + "CPS": 22.903706666666668, + "is_dummy": false, + "std_accuracy": 9.47534871829704 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, 
+ "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-4b-it_0_PL.json b/e3c_llm_results/google/medgemma-4b-it_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..56ba3c111cec9f9a021e0fab94a16735dfc6be30 --- /dev/null +++ b/e3c_llm_results/google/medgemma-4b-it_0_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 17.725084, + "config": { + "model_name": "google/medgemma-4b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "google/medgemma-4b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "efe6cc02361759b6bd501c654ddb7c9d25ec509d", + "submitted_time": "2025-05-19 20:52:44+00:00", + "num_params_billion": 4.300079472, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 22.55, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + 
"value": 21.83, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 22.55, + "stderr": 0.0 + } + ], + "average_accuracy": 22.31, + "best_prompt": 22.55, + "prompt_id": "p1", + "CPS": 22.49588, + "is_dummy": false, + "std_accuracy": 0.41569219381653194 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 11.5, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 13.139999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 10.54, + "stderr": 0.0 + } + ], + "average_accuracy": 11.726666666666667, + "best_prompt": 13.139999999999999, + "prompt_id": "p2", + "CPS": 12.954287999999998, + "is_dummy": false, + "std_accuracy": 1.3147369825685031 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-4b-it_0_SK.json b/e3c_llm_results/google/medgemma-4b-it_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..e7f23d3cdad5a7506bb4d3d0d7d4e57cc1afbb0a --- /dev/null +++ b/e3c_llm_results/google/medgemma-4b-it_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 19.074956666666665, + "config": { + "model_name": "google/medgemma-4b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": 
"google/medgemma-4b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "efe6cc02361759b6bd501c654ddb7c9d25ec509d", + "submitted_time": "2025-05-19 20:52:44+00:00", + "num_params_billion": 4.300079472, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 24.47, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 23.87, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 24.47, + "stderr": 0.0 + } + ], + "average_accuracy": 24.27, + "best_prompt": 24.47, + "prompt_id": "p1", + "CPS": 24.421059999999997, + "is_dummy": false, + "std_accuracy": 0.34641016151377424 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 11.19, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 13.99, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 11.19, + "stderr": 0.0 + } + ], + "average_accuracy": 12.123333333333333, + "best_prompt": 13.99, + "prompt_id": "p2", + "CPS": 13.728853333333333, + "is_dummy": false, + "std_accuracy": 1.6165807537309524 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-4b-it_0_SL.json 
b/e3c_llm_results/google/medgemma-4b-it_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..4e2b67f04b2f31f440338c2cec8b0ab5c9103308 --- /dev/null +++ b/e3c_llm_results/google/medgemma-4b-it_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 18.266028000000002, + "config": { + "model_name": "google/medgemma-4b-it", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "google/medgemma-4b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "efe6cc02361759b6bd501c654ddb7c9d25ec509d", + "submitted_time": "2025-05-19 20:52:44+00:00", + "num_params_billion": 4.300079472, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 25.740000000000002, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 25.580000000000002, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 25.740000000000002, + "stderr": 0.0 + } + ], + "average_accuracy": 25.686666666666667, + "best_prompt": 25.740000000000002, + "prompt_id": "p1", + "CPS": 25.726272, + "is_dummy": false, + "std_accuracy": 0.09237604307034021 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 9.73, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 10.89, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 9.73, + "stderr": 0.0 + } + ], + "average_accuracy": 10.116666666666667, + "best_prompt": 10.89, + "prompt_id": "p2", + "CPS": 10.805784000000001, + "is_dummy": false, + "std_accuracy": 0.6697263122599659 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + 
"best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-4b-it_10_EN.json b/e3c_llm_results/google/medgemma-4b-it_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..70626aa417b34b7d2b2ccdddb35a20015f968ed9 --- /dev/null +++ b/e3c_llm_results/google/medgemma-4b-it_10_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 33.354895866666666, + "config": { + "model_name": "google/medgemma-4b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "google/medgemma-4b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "efe6cc02361759b6bd501c654ddb7c9d25ec509d", + "submitted_time": "2025-05-19 20:52:44+00:00", + "num_params_billion": 4.300079472, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 48.33, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 50.05, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 49.51, + "stderr": 0.0 + } + ], + "average_accuracy": 49.29666666666666, + "best_prompt": 50.05, + "prompt_id": "p2", + "CPS": 49.672956666666664, + "is_dummy": false, + "std_accuracy": 0.8796211305632285 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 9.64, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 12.370000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 13.91, + "stderr": 0.0 + } + ], + "average_accuracy": 11.973333333333334, + "best_prompt": 13.91, + "prompt_id": "p3", + "CPS": 13.640609666666666, + "is_dummy": false, + "std_accuracy": 2.16246001889823 + }, + "RML": { + 
"prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 26.590000000000003, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 26.71, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 26.07, + "stderr": 0.0 + } + ], + "average_accuracy": 26.456666666666667, + "best_prompt": 26.71, + "prompt_id": "p2", + "CPS": 26.642334666666667, + "is_dummy": false, + "std_accuracy": 0.3401960219246161 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 36.620000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 38.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.06, + "stderr": 0.0 + } + ], + "average_accuracy": 24.893333333333334, + "best_prompt": 38.0, + "prompt_id": "p2", + "CPS": 33.019466666666666, + "is_dummy": false, + "std_accuracy": 21.517363531188792 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 45.050000000000004, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 37.99, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 43.78, + "stderr": 0.0 + } + ], + "average_accuracy": 42.27333333333333, + "best_prompt": 45.050000000000004, + "prompt_id": "p1", + "CPS": 43.79911166666667, + "is_dummy": false, + "std_accuracy": 3.763433715815032 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-4b-it_10_GR.json b/e3c_llm_results/google/medgemma-4b-it_10_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..753d835f990f9f0fa75fe03181b4e867bda9d7db --- /dev/null +++ b/e3c_llm_results/google/medgemma-4b-it_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 32.8816105, + "config": { + "model_name": "google/medgemma-4b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + "model": "google/medgemma-4b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": 
"efe6cc02361759b6bd501c654ddb7c9d25ec509d", + "submitted_time": "2025-05-19 20:52:44+00:00", + "num_params_billion": 4.300079472, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 49.1, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 50.39, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 49.1, + "stderr": 0.0 + } + ], + "average_accuracy": 49.53, + "best_prompt": 50.39, + "prompt_id": "p2", + "CPS": 49.956646000000006, + "is_dummy": false, + "std_accuracy": 0.7447818472546168 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 12.04, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 16.05, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 15.509999999999998, + "stderr": 0.0 + } + ], + "average_accuracy": 14.533333333333331, + "best_prompt": 16.05, + "prompt_id": "p2", + "CPS": 15.806575, + "is_dummy": false, + "std_accuracy": 2.1761050832469775 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-4b-it_10_IT.json b/e3c_llm_results/google/medgemma-4b-it_10_IT.json new file mode 100644 index 
0000000000000000000000000000000000000000..8f8f5f9fe3968307cf6519e71d901d58727e7b1e --- /dev/null +++ b/e3c_llm_results/google/medgemma-4b-it_10_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 33.5863308, + "config": { + "model_name": "google/medgemma-4b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "google/medgemma-4b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "efe6cc02361759b6bd501c654ddb7c9d25ec509d", + "submitted_time": "2025-05-19 20:52:44+00:00", + "num_params_billion": 4.300079472, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 56.330000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 53.769999999999996, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 53.52, + "stderr": 0.0 + } + ], + "average_accuracy": 54.54, + "best_prompt": 56.330000000000005, + "prompt_id": "p1", + "CPS": 55.321693, + "is_dummy": false, + "std_accuracy": 1.5552170266557686 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 15.920000000000002, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 19.17, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 17.51, + "stderr": 0.0 + } + ], + "average_accuracy": 17.533333333333335, + "best_prompt": 19.17, + "prompt_id": "p2", + "CPS": 18.856251000000004, + "is_dummy": false, + "std_accuracy": 1.6251256361688882 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 10.72, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 13.55, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.61, + "stderr": 0.0 + } + ], + "average_accuracy": 10.96, + "best_prompt": 13.55, + "prompt_id": "p2", + "CPS": 13.199055, + "is_dummy": false, + "std_accuracy": 2.4787295132789304 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + 
"value": 52.290000000000006, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 52.89, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.54, + "stderr": 0.0 + } + ], + "average_accuracy": 35.24, + "best_prompt": 52.89, + "prompt_id": "p2", + "CPS": 43.554915, + "is_dummy": false, + "std_accuracy": 30.0525789242787 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 43.14, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.52, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 43.059999999999995, + "stderr": 0.0 + } + ], + "average_accuracy": 28.906666666666666, + "best_prompt": 43.14, + "prompt_id": "p1", + "CPS": 36.99974, + "is_dummy": false, + "std_accuracy": 24.583607004126414 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-4b-it_10_PL.json b/e3c_llm_results/google/medgemma-4b-it_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..d94e371ca25ef75fddd5c86c38c8c5dc2cbc56ba --- /dev/null +++ b/e3c_llm_results/google/medgemma-4b-it_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 31.782375333333327, + "config": { + "model_name": "google/medgemma-4b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "PL", + "model": "google/medgemma-4b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "efe6cc02361759b6bd501c654ddb7c9d25ec509d", + "submitted_time": "2025-05-19 20:52:44+00:00", + "num_params_billion": 4.300079472, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.85999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 52.059999999999995, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 51.85999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 51.926666666666655, + "best_prompt": 52.059999999999995, + "prompt_id": "p2", + "CPS": 
51.99058666666666, + "is_dummy": false, + "std_accuracy": 0.1154700538379268 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 11.709999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 9.969999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 9.969999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 10.549999999999999, + "best_prompt": 11.709999999999999, + "prompt_id": "p1", + "CPS": 11.574163999999998, + "is_dummy": false, + "std_accuracy": 1.004589468389949 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-4b-it_10_SK.json b/e3c_llm_results/google/medgemma-4b-it_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..dbaba2b960b60e7987164956beffb9d1bcb5a809 --- /dev/null +++ b/e3c_llm_results/google/medgemma-4b-it_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 28.978618833333336, + "config": { + "model_name": "google/medgemma-4b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "google/medgemma-4b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "efe6cc02361759b6bd501c654ddb7c9d25ec509d", + "submitted_time": "2025-05-19 20:52:44+00:00", + 
"num_params_billion": 4.300079472, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 47.56, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 44.49, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 47.56, + "stderr": 0.0 + } + ], + "average_accuracy": 46.53666666666667, + "best_prompt": 47.56, + "prompt_id": "p1", + "CPS": 47.07330266666667, + "is_dummy": false, + "std_accuracy": 1.7724653264121513 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 10.95, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 10.09, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 10.0, + "stderr": 0.0 + } + ], + "average_accuracy": 10.346666666666666, + "best_prompt": 10.95, + "prompt_id": "p1", + "CPS": 10.883935, + "is_dummy": false, + "std_accuracy": 0.5244362052083484 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/google/medgemma-4b-it_10_SL.json b/e3c_llm_results/google/medgemma-4b-it_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..fa1a8dbc033fe881961a9e854dabae4c4a91ee7f --- /dev/null +++ 
b/e3c_llm_results/google/medgemma-4b-it_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 32.7709705, + "config": { + "model_name": "google/medgemma-4b-it", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "google/medgemma-4b-it", + "base_model": "Gemma3ForConditionalGeneration", + "revision": "efe6cc02361759b6bd501c654ddb7c9d25ec509d", + "submitted_time": "2025-05-19 20:52:44+00:00", + "num_params_billion": 4.300079472, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.17, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 49.55, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 51.17, + "stderr": 0.0 + } + ], + "average_accuracy": 50.629999999999995, + "best_prompt": 51.17, + "prompt_id": "p1", + "CPS": 50.893682, + "is_dummy": false, + "std_accuracy": 0.9353074360871964 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 11.78, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 11.01, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 15.010000000000002, + "stderr": 0.0 + } + ], + "average_accuracy": 12.6, + "best_prompt": 15.010000000000002, + "prompt_id": "p3", + "CPS": 14.648259000000001, + "is_dummy": false, + "std_accuracy": 2.1223336212763546 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + 
"std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/meta-llama/.DS_Store b/e3c_llm_results/meta-llama/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..446a5c6c78528e5ea6ca14aabc826a97cfbc6bcc Binary files /dev/null and b/e3c_llm_results/meta-llama/.DS_Store differ diff --git a/e3c_llm_results/meta-llama/Llama-3.2-1B-Instruct_5.json b/e3c_llm_results/meta-llama/Llama-3.2-1B-Instruct_5.json new file mode 100644 index 0000000000000000000000000000000000000000..57f929e967ef4193e87800b4c8543700b611a406 --- /dev/null +++ b/e3c_llm_results/meta-llama/Llama-3.2-1B-Instruct_5.json @@ -0,0 +1,24 @@ +{ + "average_CPS": 12.479999999999999, + "config": { + "model_name": "meta-llama/Llama-3.2-1B-Instruct", + "num_fewshot": "5", + "batch_size": 8 + }, + "tasks": { + "RE": { + "prompts": [ + { + "prompt": "prompt-1", + "metric": "f1", + "value": 12.479999999999999, + "stderr": null + } + ], + "average_accuracy": 12.479999999999999, + "best_prompt": 12.479999999999999, + "prompt_id": "prompt-1", + "CPS": 12.479999999999999 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Clinical_0_EN.json b/e3c_llm_results/microsoft/MediPhi-Clinical_0_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..4db4a012e0e3da7b6c380f98c64ad248850bde75 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Clinical_0_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 13.326142266666668, + "config": { + "model_name": "microsoft/MediPhi-Clinical", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "microsoft/MediPhi-Clinical", + "base_model": "Phi3ForCausalLM", + "revision": "0906e64d321a9c4b058137b34fb3ed6e257e05a0", + "submitted_time": "2025-05-29 20:40:05+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + 
"metric": "f1", + "value": 25.019999999999996, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 30.89, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 27.68, + "stderr": 0.0 + } + ], + "average_accuracy": 27.863333333333333, + "best_prompt": 30.89, + "prompt_id": "p2", + "CPS": 29.955062666666667, + "is_dummy": false, + "std_accuracy": 2.939291297801793 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 22.74, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 39.290000000000006, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 35.42, + "stderr": 0.0 + } + ], + "average_accuracy": 32.483333333333334, + "best_prompt": 39.290000000000006, + "prompt_id": "p2", + "CPS": 36.61566066666667, + "is_dummy": false, + "std_accuracy": 8.65699909514454 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.03, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.01, + "best_prompt": 0.03, + "prompt_id": "p2", + "CPS": 0.029994, + "is_dummy": false, + "std_accuracy": 0.017320508075688773 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.03, + "stderr": 0.0 + } + ], + "average_accuracy": 0.01, + "best_prompt": 0.03, + "prompt_id": "p3", + "CPS": 0.029994, + "is_dummy": false, + "std_accuracy": 0.017320508075688773 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, 
+ "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Clinical_0_GR.json b/e3c_llm_results/microsoft/MediPhi-Clinical_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..5439d8bb6753816be58ecc8557b98adbf7f56db0 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Clinical_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 15.980523333333334, + "config": { + "model_name": "microsoft/MediPhi-Clinical", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "microsoft/MediPhi-Clinical", + "base_model": "Phi3ForCausalLM", + "revision": "0906e64d321a9c4b058137b34fb3ed6e257e05a0", + "submitted_time": "2025-05-29 20:40:05+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 16.41, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 18.69, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 16.41, + "stderr": 0.0 + } + ], + "average_accuracy": 17.17, + "best_prompt": 18.69, + "prompt_id": "p2", + "CPS": 18.405912, + "is_dummy": false, + "std_accuracy": 1.3163586137523473 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 7.359999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 7.779999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 14.180000000000001, + "stderr": 0.0 + } + ], + "average_accuracy": 9.773333333333333, + "best_prompt": 14.180000000000001, + "prompt_id": "p3", + "CPS": 13.555134666666667, + "is_dummy": false, + "std_accuracy": 3.822058782035324 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + 
"CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Clinical_0_IT.json b/e3c_llm_results/microsoft/MediPhi-Clinical_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..722d002e6e2ebd62d2b02df97491535731029afe --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Clinical_0_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 9.630995666666667, + "config": { + "model_name": "microsoft/MediPhi-Clinical", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "microsoft/MediPhi-Clinical", + "base_model": "Phi3ForCausalLM", + "revision": "0906e64d321a9c4b058137b34fb3ed6e257e05a0", + "submitted_time": "2025-05-29 20:40:05+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 33.97, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 33.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 32.26, + "stderr": 0.0 + } + ], + "average_accuracy": 33.07666666666666, + "best_prompt": 33.97, + "prompt_id": "p1", + "CPS": 33.666534666666664, + "is_dummy": false, + "std_accuracy": 0.8575740978675451 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 14.89, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 7.359999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 1.49, + "stderr": 
0.0 + } + ], + "average_accuracy": 7.913333333333333, + "best_prompt": 14.89, + "prompt_id": "p1", + "CPS": 13.851174333333335, + "is_dummy": false, + "std_accuracy": 6.7171149560904 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.64, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.21333333333333335, + "best_prompt": 0.64, + "prompt_id": "p2", + "CPS": 0.6372693333333334, + "is_dummy": false, + "std_accuracy": 0.3695041722813605 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Clinical_0_PL.json b/e3c_llm_results/microsoft/MediPhi-Clinical_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..07493d6ba926b87d132a4592225d9e5242419c11 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Clinical_0_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 29.002397166666665, + "config": { + "model_name": "microsoft/MediPhi-Clinical", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "microsoft/MediPhi-Clinical", + "base_model": 
"Phi3ForCausalLM", + "revision": "0906e64d321a9c4b058137b34fb3ed6e257e05a0", + "submitted_time": "2025-05-29 20:40:05+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 28.15, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 28.610000000000003, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 28.15, + "stderr": 0.0 + } + ], + "average_accuracy": 28.30333333333333, + "best_prompt": 28.610000000000003, + "prompt_id": "p2", + "CPS": 28.522262666666666, + "is_dummy": false, + "std_accuracy": 0.26558112382723037 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 21.09, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 29.080000000000002, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 30.61, + "stderr": 0.0 + } + ], + "average_accuracy": 26.926666666666666, + "best_prompt": 30.61, + "prompt_id": "p3", + "CPS": 29.482531666666663, + "is_dummy": false, + "std_accuracy": 5.112263034443096 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Clinical_0_SK.json 
b/e3c_llm_results/microsoft/MediPhi-Clinical_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..06957df0b599a226b72424d6d11c3537fa551e27 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Clinical_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 21.908259666666666, + "config": { + "model_name": "microsoft/MediPhi-Clinical", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "microsoft/MediPhi-Clinical", + "base_model": "Phi3ForCausalLM", + "revision": "0906e64d321a9c4b058137b34fb3ed6e257e05a0", + "submitted_time": "2025-05-29 20:40:05+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 25.71, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 29.87, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 25.71, + "stderr": 0.0 + } + ], + "average_accuracy": 27.096666666666664, + "best_prompt": 29.87, + "prompt_id": "p2", + "CPS": 29.041605333333333, + "is_dummy": false, + "std_accuracy": 2.4017771198288433 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 15.540000000000001, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.77, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 15.540000000000001, + "stderr": 0.0 + } + ], + "average_accuracy": 10.616666666666667, + "best_prompt": 15.540000000000001, + "prompt_id": "p1", + "CPS": 14.774914, + "is_dummy": false, + "std_accuracy": 8.527463475930773 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + 
"best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Clinical_0_SL.json b/e3c_llm_results/microsoft/MediPhi-Clinical_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..ddd541d06daf7f909e3feb39637342751a49b26b --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Clinical_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 16.78806766666667, + "config": { + "model_name": "microsoft/MediPhi-Clinical", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "microsoft/MediPhi-Clinical", + "base_model": "Phi3ForCausalLM", + "revision": "0906e64d321a9c4b058137b34fb3ed6e257e05a0", + "submitted_time": "2025-05-29 20:40:05+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 29.98, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 26.8, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 29.98, + "stderr": 0.0 + } + ], + "average_accuracy": 28.92, + "best_prompt": 29.98, + "prompt_id": "p1", + "CPS": 29.662212000000004, + "is_dummy": false, + "std_accuracy": 1.8359738560230099 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 3.95, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 1.21, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 3.95, + "stderr": 0.0 + } + ], + "average_accuracy": 3.0366666666666666, + "best_prompt": 3.95, + "prompt_id": "p1", + "CPS": 3.9139233333333334, + "is_dummy": false, + "std_accuracy": 1.5819397375795747 + }, + "HIS": { + "prompts": [ + { + 
"prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Clinical_10_EN.json b/e3c_llm_results/microsoft/MediPhi-Clinical_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..1696c5bc0fb57f5dd15f5e26d54b2f6936a552b9 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Clinical_10_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 34.61790526666666, + "config": { + "model_name": "microsoft/MediPhi-Clinical", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "microsoft/MediPhi-Clinical", + "base_model": "Phi3ForCausalLM", + "revision": "0906e64d321a9c4b058137b34fb3ed6e257e05a0", + "submitted_time": "2025-05-29 20:40:05+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 50.09, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 49.66, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 50.49, + "stderr": 0.0 + } + ], + "average_accuracy": 50.080000000000005, + "best_prompt": 50.49, + "prompt_id": "p3", + "CPS": 50.282991, + "is_dummy": false, + "std_accuracy": 0.41509035161034796 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 11.75, + "stderr": 
0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 10.95, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 11.07, + "stderr": 0.0 + } + ], + "average_accuracy": 11.256666666666666, + "best_prompt": 11.75, + "prompt_id": "p1", + "CPS": 11.692033333333333, + "is_dummy": false, + "std_accuracy": 0.4314317249963585 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 30.520000000000003, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 33.07, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 32.08, + "stderr": 0.0 + } + ], + "average_accuracy": 31.89, + "best_prompt": 33.07, + "prompt_id": "p2", + "CPS": 32.679774, + "is_dummy": false, + "std_accuracy": 1.285573801848807 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 18.33, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 28.03, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 40.02, + "stderr": 0.0 + } + ], + "average_accuracy": 28.793333333333333, + "best_prompt": 40.02, + "prompt_id": "p3", + "CPS": 35.527088, + "is_dummy": false, + "std_accuracy": 10.86512923684451 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 35.28, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 28.18, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 48.199999999999996, + "stderr": 0.0 + } + ], + "average_accuracy": 37.22, + "best_prompt": 48.199999999999996, + "prompt_id": "p3", + "CPS": 42.907639999999994, + "is_dummy": false, + "std_accuracy": 10.150014778314363 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Clinical_10_GR.json b/e3c_llm_results/microsoft/MediPhi-Clinical_10_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..f5d4cbb3022141fb6c72b1cd6aede980b698a613 --- /dev/null +++ 
b/e3c_llm_results/microsoft/MediPhi-Clinical_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 20.501029666666668, + "config": { + "model_name": "microsoft/MediPhi-Clinical", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + "model": "microsoft/MediPhi-Clinical", + "base_model": "Phi3ForCausalLM", + "revision": "0906e64d321a9c4b058137b34fb3ed6e257e05a0", + "submitted_time": "2025-05-29 20:40:05+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 33.75, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 34.03, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 33.75, + "stderr": 0.0 + } + ], + "average_accuracy": 33.843333333333334, + "best_prompt": 34.03, + "prompt_id": "p2", + "CPS": 33.96647733333334, + "is_dummy": false, + "std_accuracy": 0.16165807537309587 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 4.2700000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 6.81, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.109999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 6.063333333333333, + "best_prompt": 7.109999999999999, + "prompt_id": "p3", + "CPS": 7.035582, + "is_dummy": false, + "std_accuracy": 1.560299116622621 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + 
"average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Clinical_10_IT.json b/e3c_llm_results/microsoft/MediPhi-Clinical_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..2fbe45b0f55683348d2da5b042e05d3ea0817b35 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Clinical_10_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 32.65775206666667, + "config": { + "model_name": "microsoft/MediPhi-Clinical", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "microsoft/MediPhi-Clinical", + "base_model": "Phi3ForCausalLM", + "revision": "0906e64d321a9c4b058137b34fb3ed6e257e05a0", + "submitted_time": "2025-05-29 20:40:05+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.949999999999996, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 53.010000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 52.75, + "stderr": 0.0 + } + ], + "average_accuracy": 52.57, + "best_prompt": 53.010000000000005, + "prompt_id": "p2", + "CPS": 52.776756, + "is_dummy": false, + "std_accuracy": 0.5524490926773298 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 21.14, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 9.610000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 14.219999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 14.99, + "best_prompt": 21.14, + "prompt_id": "p1", + "CPS": 19.83989, + "is_dummy": false, + "std_accuracy": 5.803438635843408 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 14.219999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 16.46, + 
"stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.290000000000001, + "stderr": 0.0 + } + ], + "average_accuracy": 12.99, + "best_prompt": 16.46, + "prompt_id": "p2", + "CPS": 15.888838000000002, + "is_dummy": false, + "std_accuracy": 4.221599223043324 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 32.21, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 37.980000000000004, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 53.65, + "stderr": 0.0 + } + ], + "average_accuracy": 41.28, + "best_prompt": 53.65, + "prompt_id": "p3", + "CPS": 47.013495000000006, + "is_dummy": false, + "std_accuracy": 11.094408501583127 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 11.690000000000001, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 31.03, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 18.85, + "stderr": 0.0 + } + ], + "average_accuracy": 20.523333333333333, + "best_prompt": 31.03, + "prompt_id": "p2", + "CPS": 27.769781333333334, + "is_dummy": false, + "std_accuracy": 9.777982068572909 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Clinical_10_PL.json b/e3c_llm_results/microsoft/MediPhi-Clinical_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..c0a4491e5fc31cf8df808a8d15d2d0212e34d7bd --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Clinical_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 28.317504, + "config": { + "model_name": "microsoft/MediPhi-Clinical", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "PL", + "model": "microsoft/MediPhi-Clinical", + "base_model": "Phi3ForCausalLM", + "revision": "0906e64d321a9c4b058137b34fb3ed6e257e05a0", + "submitted_time": "2025-05-29 20:40:05+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + 
"metric": "f1", + "value": 39.129999999999995, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 41.32, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 39.129999999999995, + "stderr": 0.0 + } + ], + "average_accuracy": 39.85999999999999, + "best_prompt": 41.32, + "prompt_id": "p2", + "CPS": 40.716727999999996, + "is_dummy": false, + "std_accuracy": 1.2643970895252832 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 12.55, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 12.07, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 16.36, + "stderr": 0.0 + } + ], + "average_accuracy": 13.660000000000002, + "best_prompt": 16.36, + "prompt_id": "p3", + "CPS": 15.91828, + "is_dummy": false, + "std_accuracy": 2.35055312639387 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Clinical_10_SK.json b/e3c_llm_results/microsoft/MediPhi-Clinical_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..b7776b39560d44bb293b6bde77b03a5af6909e73 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Clinical_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 23.942156333333333, + "config": { + 
"model_name": "microsoft/MediPhi-Clinical", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "microsoft/MediPhi-Clinical", + "base_model": "Phi3ForCausalLM", + "revision": "0906e64d321a9c4b058137b34fb3ed6e257e05a0", + "submitted_time": "2025-05-29 20:40:05+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 41.06, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 38.61, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 41.06, + "stderr": 0.0 + } + ], + "average_accuracy": 40.24333333333333, + "best_prompt": 41.06, + "prompt_id": "p1", + "CPS": 40.72467666666667, + "is_dummy": false, + "std_accuracy": 1.4145081595145848 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 5.09, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 6.0600000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.24, + "stderr": 0.0 + } + ], + "average_accuracy": 6.13, + "best_prompt": 7.24, + "prompt_id": "p3", + "CPS": 7.159636, + "is_dummy": false, + "std_accuracy": 1.0767079455451234 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff 
--git a/e3c_llm_results/microsoft/MediPhi-Clinical_10_SL.json b/e3c_llm_results/microsoft/MediPhi-Clinical_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..bb21bc4adcf5b49db6ab02eda0be2e1968689f12 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Clinical_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 24.260621, + "config": { + "model_name": "microsoft/MediPhi-Clinical", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "microsoft/MediPhi-Clinical", + "base_model": "Phi3ForCausalLM", + "revision": "0906e64d321a9c4b058137b34fb3ed6e257e05a0", + "submitted_time": "2025-05-29 20:40:05+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 40.36, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 39.900000000000006, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 40.36, + "stderr": 0.0 + } + ], + "average_accuracy": 40.20666666666667, + "best_prompt": 40.36, + "prompt_id": "p1", + "CPS": 40.29811466666667, + "is_dummy": false, + "std_accuracy": 0.26558112382722426 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 8.290000000000001, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 6.74, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.42, + "stderr": 0.0 + } + ], + "average_accuracy": 7.483333333333334, + "best_prompt": 8.290000000000001, + "prompt_id": "p1", + "CPS": 8.223127333333334, + "is_dummy": false, + "std_accuracy": 0.7769384359994902 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + 
"average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Instruct_0_EN.json b/e3c_llm_results/microsoft/MediPhi-Instruct_0_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..af7076f74a0b9ae21fef440fd356fd556f331e38 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Instruct_0_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 11.733952266666666, + "config": { + "model_name": "microsoft/MediPhi-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "microsoft/MediPhi-Instruct", + "base_model": "Phi3ForCausalLM", + "revision": "a94ac478e7c246103d55665a0804684042f3b973", + "submitted_time": "2025-07-11 19:28:15+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 7.61, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 24.099999999999998, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 16.25, + "stderr": 0.0 + } + ], + "average_accuracy": 15.986666666666665, + "best_prompt": 24.099999999999998, + "prompt_id": "p2", + "CPS": 22.144686666666665, + "is_dummy": false, + "std_accuracy": 8.24815332867505 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 11.35, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 40.06, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 38.04, + "stderr": 0.0 + } + ], + "average_accuracy": 29.816666666666666, + "best_prompt": 40.06, + "prompt_id": "p2", + "CPS": 35.95652066666667, + 
"is_dummy": false, + "std_accuracy": 16.024463589566214 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.44999999999999996, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.15, + "best_prompt": 0.44999999999999996, + "prompt_id": "p2", + "CPS": 0.44864999999999994, + "is_dummy": false, + "std_accuracy": 0.25980762113533157 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.12, + "stderr": 0.0 + } + ], + "average_accuracy": 0.04, + "best_prompt": 0.12, + "prompt_id": "p3", + "CPS": 0.119904, + "is_dummy": false, + "std_accuracy": 0.06928203230275509 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Instruct_0_GR.json b/e3c_llm_results/microsoft/MediPhi-Instruct_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..0deadb52317a0d4b4df567d7a0aa11aee92ad91f --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Instruct_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 15.261295333333333, + "config": { + "model_name": "microsoft/MediPhi-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "microsoft/MediPhi-Instruct", + "base_model": "Phi3ForCausalLM", + "revision": "a94ac478e7c246103d55665a0804684042f3b973", + "submitted_time": 
"2025-07-11 19:28:15+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 12.94, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 8.9, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 12.94, + "stderr": 0.0 + } + ], + "average_accuracy": 11.593333333333334, + "best_prompt": 12.94, + "prompt_id": "p1", + "CPS": 12.765741333333333, + "is_dummy": false, + "std_accuracy": 2.3324950875260875 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 9.62, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 6.7299999999999995, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 19.16, + "stderr": 0.0 + } + ], + "average_accuracy": 11.836666666666666, + "best_prompt": 19.16, + "prompt_id": "p3", + "CPS": 17.75684933333333, + "is_dummy": false, + "std_accuracy": 6.504723924451624 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Instruct_0_IT.json b/e3c_llm_results/microsoft/MediPhi-Instruct_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..110c043ee56415495c01f264c6fafcc47b72a79d --- 
/dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Instruct_0_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 11.238403133333334, + "config": { + "model_name": "microsoft/MediPhi-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "microsoft/MediPhi-Instruct", + "base_model": "Phi3ForCausalLM", + "revision": "a94ac478e7c246103d55665a0804684042f3b973", + "submitted_time": "2025-07-11 19:28:15+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 8.67, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 24.84, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 27.169999999999998, + "stderr": 0.0 + } + ], + "average_accuracy": 20.226666666666663, + "best_prompt": 27.169999999999998, + "prompt_id": "p3", + "CPS": 25.283496333333332, + "is_dummy": false, + "std_accuracy": 10.075943297445322 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 17.119999999999997, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 28.96, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 32.61, + "stderr": 0.0 + } + ], + "average_accuracy": 26.23, + "best_prompt": 32.61, + "prompt_id": "p3", + "CPS": 30.529482, + "is_dummy": false, + "std_accuracy": 8.097820694483179 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.38, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.12666666666666668, + "best_prompt": 0.38, + "prompt_id": "p2", + "CPS": 0.37903733333333334, + "is_dummy": false, + "std_accuracy": 0.2193931022920578 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + 
"value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Instruct_0_PL.json b/e3c_llm_results/microsoft/MediPhi-Instruct_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..7951d19547ec0c62157290970c79c1bf1ac4dd10 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Instruct_0_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 23.550823, + "config": { + "model_name": "microsoft/MediPhi-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "microsoft/MediPhi-Instruct", + "base_model": "Phi3ForCausalLM", + "revision": "a94ac478e7c246103d55665a0804684042f3b973", + "submitted_time": "2025-07-11 19:28:15+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 15.1, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 16.8, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 15.1, + "stderr": 0.0 + } + ], + "average_accuracy": 15.666666666666666, + "best_prompt": 16.8, + "prompt_id": "p2", + "CPS": 16.6096, + "is_dummy": false, + "std_accuracy": 0.9814954576223645 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 26.83, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 
31.259999999999998, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 28.32, + "stderr": 0.0 + } + ], + "average_accuracy": 28.80333333333333, + "best_prompt": 31.259999999999998, + "prompt_id": "p2", + "CPS": 30.492046, + "is_dummy": false, + "std_accuracy": 2.2542034809070213 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Instruct_0_SK.json b/e3c_llm_results/microsoft/MediPhi-Instruct_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..ca96039fbd7a4d7f1c3c742a43ec6649c3813335 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Instruct_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 18.487242666666667, + "config": { + "model_name": "microsoft/MediPhi-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "microsoft/MediPhi-Instruct", + "base_model": "Phi3ForCausalLM", + "revision": "a94ac478e7c246103d55665a0804684042f3b973", + "submitted_time": "2025-07-11 19:28:15+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 16.41, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 20.810000000000002, + "stderr": 0.0 + }, 
+ { + "prompt": "p3", + "metric": "f1", + "value": 16.41, + "stderr": 0.0 + } + ], + "average_accuracy": 17.876666666666665, + "best_prompt": 20.810000000000002, + "prompt_id": "p2", + "CPS": 20.199573333333337, + "is_dummy": false, + "std_accuracy": 2.5403411844343546 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 17.76, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 1.1199999999999999, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 17.76, + "stderr": 0.0 + } + ], + "average_accuracy": 12.213333333333333, + "best_prompt": 17.76, + "prompt_id": "p1", + "CPS": 16.774912, + "is_dummy": false, + "std_accuracy": 9.607108479315373 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Instruct_0_SL.json b/e3c_llm_results/microsoft/MediPhi-Instruct_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..6faae21aaca5273dff9afdea9cc3e3059ac8ed49 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Instruct_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 16.379518, + "config": { + "model_name": "microsoft/MediPhi-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "microsoft/MediPhi-Instruct", + 
"base_model": "Phi3ForCausalLM", + "revision": "a94ac478e7c246103d55665a0804684042f3b973", + "submitted_time": "2025-07-11 19:28:15+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 17.580000000000002, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 18.6, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 17.580000000000002, + "stderr": 0.0 + } + ], + "average_accuracy": 17.92, + "best_prompt": 18.6, + "prompt_id": "p2", + "CPS": 18.47352, + "is_dummy": false, + "std_accuracy": 0.588897274573418 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 14.46, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 10.84, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 14.46, + "stderr": 0.0 + } + ], + "average_accuracy": 13.253333333333336, + "best_prompt": 14.46, + "prompt_id": "p1", + "CPS": 14.285516000000001, + "is_dummy": false, + "std_accuracy": 2.0900079744664457 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Instruct_10_EN.json b/e3c_llm_results/microsoft/MediPhi-Instruct_10_EN.json new file 
mode 100644 index 0000000000000000000000000000000000000000..e4a7e0da538646f44356efa40899230fad028abf --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Instruct_10_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 38.280346866666676, + "config": { + "model_name": "microsoft/MediPhi-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "microsoft/MediPhi-Instruct", + "base_model": "Phi3ForCausalLM", + "revision": "a94ac478e7c246103d55665a0804684042f3b973", + "submitted_time": "2025-07-11 19:28:15+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 53.56999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 52.27, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 50.629999999999995, + "stderr": 0.0 + } + ], + "average_accuracy": 52.156666666666666, + "best_prompt": 53.56999999999999, + "prompt_id": "p1", + "CPS": 52.81287733333333, + "is_dummy": false, + "std_accuracy": 1.4732730002729741 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 14.32, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 18.88, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 18.360000000000003, + "stderr": 0.0 + } + ], + "average_accuracy": 17.186666666666667, + "best_prompt": 18.88, + "prompt_id": "p2", + "CPS": 18.560298666666665, + "is_dummy": false, + "std_accuracy": 2.496183753919838 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 27.42, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 34.38, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 23.87, + "stderr": 0.0 + } + ], + "average_accuracy": 28.55666666666667, + "best_prompt": 34.38, + "prompt_id": "p2", + "CPS": 32.377938, + "is_dummy": false, + "std_accuracy": 5.346403775748081 + }, + "DIA": { + 
"prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 21.62, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 30.020000000000003, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 51.44, + "stderr": 0.0 + } + ], + "average_accuracy": 34.36, + "best_prompt": 51.44, + "prompt_id": "p3", + "CPS": 42.654048, + "is_dummy": false, + "std_accuracy": 15.376436518257407 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 45.43, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 31.759999999999998, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 48.010000000000005, + "stderr": 0.0 + } + ], + "average_accuracy": 41.733333333333334, + "best_prompt": 48.010000000000005, + "prompt_id": "p3", + "CPS": 44.99657233333333, + "is_dummy": false, + "std_accuracy": 8.73296246031857 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Instruct_10_GR.json b/e3c_llm_results/microsoft/MediPhi-Instruct_10_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..650ea7348936978f27e05aaa8a3c638806fda2f5 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Instruct_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 18.663691, + "config": { + "model_name": "microsoft/MediPhi-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + "model": "microsoft/MediPhi-Instruct", + "base_model": "Phi3ForCausalLM", + "revision": "a94ac478e7c246103d55665a0804684042f3b973", + "submitted_time": "2025-07-11 19:28:15+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 28.22, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 29.99, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 28.22, + "stderr": 0.0 + } + ], + "average_accuracy": 28.81, + "best_prompt": 29.99, + 
"prompt_id": "p2", + "CPS": 29.636117999999996, + "is_dummy": false, + "std_accuracy": 1.0219099764656374 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 5.76, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 6.74, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.7700000000000005, + "stderr": 0.0 + } + ], + "average_accuracy": 6.756666666666667, + "best_prompt": 7.7700000000000005, + "prompt_id": "p3", + "CPS": 7.691264, + "is_dummy": false, + "std_accuracy": 1.0051036430803213 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Instruct_10_IT.json b/e3c_llm_results/microsoft/MediPhi-Instruct_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..b93cb657d586266ae5ae595a9d695451b5f99c86 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Instruct_10_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 38.20042566666667, + "config": { + "model_name": "microsoft/MediPhi-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "microsoft/MediPhi-Instruct", + "base_model": "Phi3ForCausalLM", + "revision": "a94ac478e7c246103d55665a0804684042f3b973", + "submitted_time": "2025-07-11 19:28:15+00:00", + 
"num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 57.29, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 56.269999999999996, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 57.9, + "stderr": 0.0 + } + ], + "average_accuracy": 57.153333333333336, + "best_prompt": 57.9, + "prompt_id": "p3", + "CPS": 57.46768, + "is_dummy": false, + "std_accuracy": 0.8235492294534285 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 28.73, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 23.07, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 28.58, + "stderr": 0.0 + } + ], + "average_accuracy": 26.793333333333333, + "best_prompt": 28.73, + "prompt_id": "p1", + "CPS": 28.173595666666667, + "is_dummy": false, + "std_accuracy": 3.2253733634004806 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 15.45, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 15.079999999999998, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 4.75, + "stderr": 0.0 + } + ], + "average_accuracy": 11.76, + "best_prompt": 15.45, + "prompt_id": "p1", + "CPS": 14.879895, + "is_dummy": false, + "std_accuracy": 6.073656229982069 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 56.169999999999995, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 32.7, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 54.04, + "stderr": 0.0 + } + ], + "average_accuracy": 47.63666666666666, + "best_prompt": 56.169999999999995, + "prompt_id": "p1", + "CPS": 51.376826666666666, + "is_dummy": false, + "std_accuracy": 12.979300186579138 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 32.910000000000004, + "stderr": 0.0 + }, + { + 
"prompt": "p2", + "metric": "f1", + "value": 40.29, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 38.84, + "stderr": 0.0 + } + ], + "average_accuracy": 37.34666666666667, + "best_prompt": 40.29, + "prompt_id": "p2", + "CPS": 39.104131, + "is_dummy": false, + "std_accuracy": 3.910068200598721 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Instruct_10_PL.json b/e3c_llm_results/microsoft/MediPhi-Instruct_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..b66eabed175d08abdde52f8af38292bb62a04113 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Instruct_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 35.295837, + "config": { + "model_name": "microsoft/MediPhi-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "PL", + "model": "microsoft/MediPhi-Instruct", + "base_model": "Phi3ForCausalLM", + "revision": "a94ac478e7c246103d55665a0804684042f3b973", + "submitted_time": "2025-07-11 19:28:15+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 44.17, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 45.06, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 44.17, + "stderr": 0.0 + } + ], + "average_accuracy": 44.46666666666667, + "best_prompt": 45.06, + "prompt_id": "p2", + "CPS": 44.792644, + "is_dummy": false, + "std_accuracy": 0.5138417395787672 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 15.25, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 26.86, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 26.619999999999997, + "stderr": 0.0 + } + ], + "average_accuracy": 22.909999999999997, + "best_prompt": 26.86, + "prompt_id": "p2", + "CPS": 25.79903, + "is_dummy": false, + "std_accuracy": 6.634839862423206 + }, + "HIS": { + "prompts": [ + 
{ + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Instruct_10_SK.json b/e3c_llm_results/microsoft/MediPhi-Instruct_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..061b50d9a6a36a837980da53f31b65f6e4a4a4db --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Instruct_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 28.662679833333332, + "config": { + "model_name": "microsoft/MediPhi-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "microsoft/MediPhi-Instruct", + "base_model": "Phi3ForCausalLM", + "revision": "a94ac478e7c246103d55665a0804684042f3b973", + "submitted_time": "2025-07-11 19:28:15+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 43.269999999999996, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 40.23, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 43.269999999999996, + "stderr": 0.0 + } + ], + "average_accuracy": 42.25666666666667, + "best_prompt": 43.269999999999996, + "prompt_id": "p1", + "CPS": 42.831530666666666, + "is_dummy": false, + "std_accuracy": 1.7551448183364617 + }, + "RE": { + "prompts": [ + { + "prompt": 
"p1", + "metric": "f1", + "value": 10.7, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 13.950000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 14.729999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 13.126666666666665, + "best_prompt": 14.729999999999999, + "prompt_id": "p3", + "CPS": 14.493828999999998, + "is_dummy": false, + "std_accuracy": 2.1374361588906776 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/microsoft/MediPhi-Instruct_10_SL.json b/e3c_llm_results/microsoft/MediPhi-Instruct_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..aef87aa5170b16e3a61df26401fb96d046eca734 --- /dev/null +++ b/e3c_llm_results/microsoft/MediPhi-Instruct_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 29.24573433333333, + "config": { + "model_name": "microsoft/MediPhi-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "microsoft/MediPhi-Instruct", + "base_model": "Phi3ForCausalLM", + "revision": "a94ac478e7c246103d55665a0804684042f3b973", + "submitted_time": "2025-07-11 19:28:15+00:00", + "num_params_billion": 3.821079552, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + 
"metric": "f1", + "value": 39.73, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 35.64, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 39.73, + "stderr": 0.0 + } + ], + "average_accuracy": 38.36666666666667, + "best_prompt": 39.73, + "prompt_id": "p1", + "CPS": 39.188347666666665, + "is_dummy": false, + "std_accuracy": 2.3613626009855673 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 11.55, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 14.680000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 20.27, + "stderr": 0.0 + } + ], + "average_accuracy": 15.5, + "best_prompt": 20.27, + "prompt_id": "p3", + "CPS": 19.303121, + "is_dummy": false, + "std_accuracy": 4.417454017870474 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_EN.json b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..aadc2cb582024aa75c6c9d0b9a19fdb6cec678fb --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 11.809398400000003, + "config": { + 
"model_name": "mistralai/Mistral-7B-Instruct-v0.2", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "mistralai/Mistral-7B-Instruct-v0.2", + "base_model": "MistralForCausalLM", + "revision": "63a8b081895390a26e140280378bc85ec8bce07a", + "submitted_time": "2023-12-11 13:18:44+00:00", + "num_params_billion": 7.241732096, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 25.290000000000003, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 21.44, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 21.62, + "stderr": 0.0 + } + ], + "average_accuracy": 22.783333333333335, + "best_prompt": 25.290000000000003, + "prompt_id": "p1", + "CPS": 24.656064, + "is_dummy": false, + "std_accuracy": 2.172701850998737 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 36.88, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 36.42, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 16.93, + "stderr": 0.0 + } + ], + "average_accuracy": 30.07666666666667, + "best_prompt": 36.88, + "prompt_id": "p1", + "CPS": 34.37093066666667, + "is_dummy": false, + "std_accuracy": 11.387670232902487 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.02, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.006666666666666667, + "best_prompt": 0.02, + "prompt_id": "p2", + "CPS": 0.019997333333333336, + "is_dummy": false, + "std_accuracy": 0.011547005383792516 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + 
"average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_GR.json b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..cb74cbd399ee3933ce898ab6828e2693b10521e4 --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 14.120156666666666, + "config": { + "model_name": "mistralai/Mistral-7B-Instruct-v0.2", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "mistralai/Mistral-7B-Instruct-v0.2", + "base_model": "MistralForCausalLM", + "revision": "63a8b081895390a26e140280378bc85ec8bce07a", + "submitted_time": "2023-12-11 13:18:44+00:00", + "num_params_billion": 7.241732096, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 16.03, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 19.09, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 16.03, + "stderr": 0.0 + } + ], + "average_accuracy": 17.05, + "best_prompt": 19.09, + "prompt_id": "p2", + "CPS": 18.700564, + "is_dummy": false, + "std_accuracy": 1.766691823720254 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 4.32, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 3.4799999999999995, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + 
"value": 9.94, + "stderr": 0.0 + } + ], + "average_accuracy": 5.913333333333333, + "best_prompt": 9.94, + "prompt_id": "p3", + "CPS": 9.539749333333333, + "is_dummy": false, + "std_accuracy": 3.5123970922054544 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_IT.json b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..9a9beab9a845c0ffefb5e20bcd50f075cd21968e --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 7.919311333333335, + "config": { + "model_name": "mistralai/Mistral-7B-Instruct-v0.2", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "mistralai/Mistral-7B-Instruct-v0.2", + "base_model": "MistralForCausalLM", + "revision": "63a8b081895390a26e140280378bc85ec8bce07a", + "submitted_time": "2023-12-11 13:18:44+00:00", + "num_params_billion": 7.241732096, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 27.88, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 20.3, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 24.81, + 
"stderr": 0.0 + } + ], + "average_accuracy": 24.33, + "best_prompt": 27.88, + "prompt_id": "p1", + "CPS": 26.89026, + "is_dummy": false, + "std_accuracy": 3.8127286816661887 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 13.819999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 1.63, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 1.4000000000000001, + "stderr": 0.0 + } + ], + "average_accuracy": 5.616666666666666, + "best_prompt": 13.819999999999999, + "prompt_id": "p1", + "CPS": 12.686299333333332, + "is_dummy": false, + "std_accuracy": 7.105225776379898 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.02, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.006666666666666667, + "best_prompt": 0.02, + "prompt_id": "p1", + "CPS": 0.019997333333333336, + "is_dummy": false, + "std_accuracy": 0.011547005383792516 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_PL.json 
b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..731f4e2146d96568835ab97ca613fd768326b73f --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 21.287892, + "config": { + "model_name": "mistralai/Mistral-7B-Instruct-v0.2", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "mistralai/Mistral-7B-Instruct-v0.2", + "base_model": "MistralForCausalLM", + "revision": "63a8b081895390a26e140280378bc85ec8bce07a", + "submitted_time": "2023-12-11 13:18:44+00:00", + "num_params_billion": 7.241732096, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 30.240000000000002, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 28.110000000000003, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 30.240000000000002, + "stderr": 0.0 + } + ], + "average_accuracy": 29.53, + "best_prompt": 30.240000000000002, + "prompt_id": "p1", + "CPS": 30.025296, + "is_dummy": false, + "std_accuracy": 1.2297560733739024 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 8.63, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 12.920000000000002, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.63, + "stderr": 0.0 + } + ], + "average_accuracy": 10.060000000000002, + "best_prompt": 12.920000000000002, + "prompt_id": "p2", + "CPS": 12.550488000000001, + "is_dummy": false, + "std_accuracy": 2.476832654823495 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + 
"average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_SK.json b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..63b3049082e6bb3bd8346a2643161658df5aa232 --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 14.880865666666669, + "config": { + "model_name": "mistralai/Mistral-7B-Instruct-v0.2", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "mistralai/Mistral-7B-Instruct-v0.2", + "base_model": "MistralForCausalLM", + "revision": "63a8b081895390a26e140280378bc85ec8bce07a", + "submitted_time": "2023-12-11 13:18:44+00:00", + "num_params_billion": 7.241732096, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 21.43, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 21.46, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 21.43, + "stderr": 0.0 + } + ], + "average_accuracy": 21.439999999999998, + "best_prompt": 21.46, + "prompt_id": "p2", + "CPS": 21.455708, + "is_dummy": false, + "std_accuracy": 0.01732050807568943 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 7.5600000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 8.35, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.5600000000000005, + "stderr": 0.0 + } + ], + "average_accuracy": 7.823333333333333, + "best_prompt": 8.35, + "prompt_id": "p2", 
+ "CPS": 8.306023333333334, + "is_dummy": false, + "std_accuracy": 0.45610671265980385 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_SL.json b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..ec8d55691ba462f4f9f19d6fbcb79d46d3a9aa6f --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 17.567646000000003, + "config": { + "model_name": "mistralai/Mistral-7B-Instruct-v0.2", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "mistralai/Mistral-7B-Instruct-v0.2", + "base_model": "MistralForCausalLM", + "revision": "63a8b081895390a26e140280378bc85ec8bce07a", + "submitted_time": "2023-12-11 13:18:44+00:00", + "num_params_billion": 7.241732096, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 17.66, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 19.470000000000002, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 17.66, + "stderr": 0.0 + } + ], + "average_accuracy": 18.263333333333335, + "best_prompt": 19.470000000000002, + 
"prompt_id": "p2", + "CPS": 19.235062000000003, + "is_dummy": false, + "std_accuracy": 1.045003987233224 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 7.66, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 16.950000000000003, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.66, + "stderr": 0.0 + } + ], + "average_accuracy": 10.756666666666668, + "best_prompt": 16.950000000000003, + "prompt_id": "p2", + "CPS": 15.900230000000002, + "is_dummy": false, + "std_accuracy": 5.363584000771625 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_EN.json b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..b59774c4de6ace8ec3e1efd922b4f6a655936fe6 --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 29.993946066666666, + "config": { + "model_name": "mistralai/Mistral-7B-Instruct-v0.2", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "mistralai/Mistral-7B-Instruct-v0.2", + "base_model": "MistralForCausalLM", + "revision": "63a8b081895390a26e140280378bc85ec8bce07a", + 
"submitted_time": "2023-12-11 13:18:44+00:00", + "num_params_billion": 7.241732096, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 47.25, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 47.3, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 48.05, + "stderr": 0.0 + } + ], + "average_accuracy": 47.53333333333333, + "best_prompt": 48.05, + "prompt_id": "p3", + "CPS": 47.801741666666665, + "is_dummy": false, + "std_accuracy": 0.4481443219916242 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 25.929999999999996, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 40.339999999999996, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 41.48, + "stderr": 0.0 + } + ], + "average_accuracy": 35.916666666666664, + "best_prompt": 41.48, + "prompt_id": "p3", + "CPS": 39.17232933333333, + "is_dummy": false, + "std_accuracy": 8.667469834578794 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.9900000000000001, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 13.88, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 21.78, + "stderr": 0.0 + } + ], + "average_accuracy": 12.216666666666669, + "best_prompt": 21.78, + "prompt_id": "p3", + "CPS": 19.697106, + "is_dummy": false, + "std_accuracy": 10.494333391565819 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 1.37, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 2.4, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 15.43, + "stderr": 0.0 + } + ], + "average_accuracy": 6.3999999999999995, + "best_prompt": 15.43, + "prompt_id": "p3", + "CPS": 14.036670999999998, + "is_dummy": false, + "std_accuracy": 7.837148716210507 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + 
"value": 1.43, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 39.290000000000006, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.58, + "stderr": 0.0 + } + ], + "average_accuracy": 13.766666666666667, + "best_prompt": 39.290000000000006, + "prompt_id": "p2", + "CPS": 29.261882333333336, + "is_dummy": false, + "std_accuracy": 22.107940504111493 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_GR.json b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..5f74d913bb00dc2c9e4bf88d2e4dbf9fe34675a8 --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 29.209499, + "config": { + "model_name": "mistralai/Mistral-7B-Instruct-v0.2", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + "model": "mistralai/Mistral-7B-Instruct-v0.2", + "base_model": "MistralForCausalLM", + "revision": "63a8b081895390a26e140280378bc85ec8bce07a", + "submitted_time": "2023-12-11 13:18:44+00:00", + "num_params_billion": 7.241732096, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 34.98, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 36.480000000000004, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 34.98, + "stderr": 0.0 + } + ], + "average_accuracy": 35.48, + "best_prompt": 36.480000000000004, + "prompt_id": "p2", + "CPS": 36.1152, + "is_dummy": false, + "std_accuracy": 0.8660254037844427 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 10.549999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 23.43, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 21.89, + "stderr": 0.0 + } + ], + "average_accuracy": 18.62333333333333, + "best_prompt": 23.43, 
+ "prompt_id": "p2", + "CPS": 22.303798, + "is_dummy": false, + "std_accuracy": 7.03398417209858 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_IT.json b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..868408c2c0e8dddfde632c93328879776a7a19c2 --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 33.12944686666667, + "config": { + "model_name": "mistralai/Mistral-7B-Instruct-v0.2", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "mistralai/Mistral-7B-Instruct-v0.2", + "base_model": "MistralForCausalLM", + "revision": "63a8b081895390a26e140280378bc85ec8bce07a", + "submitted_time": "2023-12-11 13:18:44+00:00", + "num_params_billion": 7.241732096, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.470000000000006, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 52.32, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 51.49, + "stderr": 0.0 + } + ], + "average_accuracy": 51.76, + "best_prompt": 52.32, + "prompt_id": "p2", + 
"CPS": 52.027007999999995, + "is_dummy": false, + "std_accuracy": 0.48507731342539395 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 30.919999999999998, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 45.300000000000004, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 42.52, + "stderr": 0.0 + } + ], + "average_accuracy": 39.580000000000005, + "best_prompt": 45.300000000000004, + "prompt_id": "p2", + "CPS": 42.70884, + "is_dummy": false, + "std_accuracy": 7.627502867911624 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 1.54, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 14.34, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 11.62, + "stderr": 0.0 + } + ], + "average_accuracy": 9.166666666666666, + "best_prompt": 14.34, + "prompt_id": "p2", + "CPS": 13.598144, + "is_dummy": false, + "std_accuracy": 6.743451144134829 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 1.63, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 56.95, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 12.22, + "stderr": 0.0 + } + ], + "average_accuracy": 23.600000000000005, + "best_prompt": 56.95, + "prompt_id": "p2", + "CPS": 37.95717500000001, + "is_dummy": false, + "std_accuracy": 29.36330873726597 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 7.489999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 21.41, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 6.550000000000001, + "stderr": 0.0 + } + ], + "average_accuracy": 11.816666666666668, + "best_prompt": 21.41, + "prompt_id": "p2", + "CPS": 19.356067333333336, + "is_dummy": false, + "std_accuracy": 8.321354056482233 + } + } +} \ No newline at end of file diff --git 
a/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_PL.json b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..13a89c8e4b08849d588f153fdcb8066740e941f0 --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 43.31202666666667, + "config": { + "model_name": "mistralai/Mistral-7B-Instruct-v0.2", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "PL", + "model": "mistralai/Mistral-7B-Instruct-v0.2", + "base_model": "MistralForCausalLM", + "revision": "63a8b081895390a26e140280378bc85ec8bce07a", + "submitted_time": "2023-12-11 13:18:44+00:00", + "num_params_billion": 7.241732096, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 49.11, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 50.46000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 49.11, + "stderr": 0.0 + } + ], + "average_accuracy": 49.56, + "best_prompt": 50.46000000000001, + "prompt_id": "p2", + "CPS": 50.005860000000006, + "is_dummy": false, + "std_accuracy": 0.7794228634059998 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 38.95, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 33.11, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 26.83, + "stderr": 0.0 + } + ], + "average_accuracy": 32.96333333333333, + "best_prompt": 38.95, + "prompt_id": "p1", + "CPS": 36.61819333333333, + "is_dummy": false, + "std_accuracy": 6.061330986947781 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + 
"stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_SK.json b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..db755fca2eefd4b03af0d9deea859e0616b8b878 --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 31.267611000000002, + "config": { + "model_name": "mistralai/Mistral-7B-Instruct-v0.2", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "mistralai/Mistral-7B-Instruct-v0.2", + "base_model": "MistralForCausalLM", + "revision": "63a8b081895390a26e140280378bc85ec8bce07a", + "submitted_time": "2023-12-11 13:18:44+00:00", + "num_params_billion": 7.241732096, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 40.29, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 37.940000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 40.29, + "stderr": 0.0 + } + ], + "average_accuracy": 39.50666666666667, + "best_prompt": 40.29, + "prompt_id": "p1", + "CPS": 39.974395, + "is_dummy": false, + "std_accuracy": 1.3567731325956172 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 21.55, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 19.48, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 22.93, + "stderr": 0.0 + } + ], + "average_accuracy": 21.32, + "best_prompt": 22.93, + "prompt_id": 
"p3", + "CPS": 22.560827, + "is_dummy": false, + "std_accuracy": 1.7364619201122722 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_SL.json b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..d8fe976be9e70fd68689e49a6e17ccab037e8e7c --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-7B-Instruct-v0.2_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 31.471755, + "config": { + "model_name": "mistralai/Mistral-7B-Instruct-v0.2", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "mistralai/Mistral-7B-Instruct-v0.2", + "base_model": "MistralForCausalLM", + "revision": "63a8b081895390a26e140280378bc85ec8bce07a", + "submitted_time": "2023-12-11 13:18:44+00:00", + "num_params_billion": 7.241732096, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 42.04, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 41.74, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 42.04, + "stderr": 0.0 + } + ], + "average_accuracy": 41.94, + "best_prompt": 42.04, + "prompt_id": "p1", + "CPS": 41.99796, + "is_dummy": 
false, + "std_accuracy": 0.1732050807568861 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 19.900000000000002, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 19.5, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 21.15, + "stderr": 0.0 + } + ], + "average_accuracy": 20.183333333333334, + "best_prompt": 21.15, + "prompt_id": "p3", + "CPS": 20.94555, + "is_dummy": false, + "std_accuracy": 0.8607167555783559 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_EN.json b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..1bd1b313b513abb3fd3fddc070192d2116dd584a --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 13.2821676, + "config": { + "model_name": "mistralai/Mistral-Nemo-Instruct-2407", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "base_model": "MistralForCausalLM", + "revision": "04d8a90549d23fc6bd7f642064003592df51e9b3", + "submitted_time": "2024-07-17 17:26:49+00:00", + "num_params_billion": 12.2477824, 
+ "language": "en_fr_de_es_it_pt_ru_zh_ja" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 27.67, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 22.99, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 17.48, + "stderr": 0.0 + } + ], + "average_accuracy": 22.713333333333335, + "best_prompt": 27.67, + "prompt_id": "p1", + "CPS": 26.298490333333337, + "is_dummy": false, + "std_accuracy": 5.100630679958444 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 36.94, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 34.82, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 32.41, + "stderr": 0.0 + } + ], + "average_accuracy": 34.72333333333333, + "best_prompt": 36.94, + "prompt_id": "p1", + "CPS": 36.12116333333333, + "is_dummy": false, + "std_accuracy": 2.26654656544562 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 3.85, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.03, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 1.2933333333333332, + "best_prompt": 3.85, + "prompt_id": "p1", + "CPS": 3.7515683333333336, + "is_dummy": false, + "std_accuracy": 2.21418909159388 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.24, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", 
+ "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.08, + "best_prompt": 0.24, + "prompt_id": "p2", + "CPS": 0.23961599999999997, + "is_dummy": false, + "std_accuracy": 0.13856406460551018 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_GR.json b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..9f950af6ff240caf2360ab845773046d6b8534b7 --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 18.510654333333335, + "config": { + "model_name": "mistralai/Mistral-Nemo-Instruct-2407", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "base_model": "MistralForCausalLM", + "revision": "04d8a90549d23fc6bd7f642064003592df51e9b3", + "submitted_time": "2024-07-17 17:26:49+00:00", + "num_params_billion": 12.2477824, + "language": "en_fr_de_es_it_pt_ru_zh_ja" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 7.32, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 6.87, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.32, + "stderr": 0.0 + } + ], + "average_accuracy": 7.170000000000001, + "best_prompt": 7.32, + "prompt_id": "p1", + "CPS": 7.30902, + "is_dummy": false, + "std_accuracy": 0.25980762113533173 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 15.75, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 21.17, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 32.87, + "stderr": 0.0 + } + ], + "average_accuracy": 23.263333333333332, + "best_prompt": 32.87, + "prompt_id": "p3", + "CPS": 29.712288666666666, + "is_dummy": false, + "std_accuracy": 8.749864760859639 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + 
"value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_IT.json b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..81bbd6c64c00d7d4701e0cdef93c518d24aefeda --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 10.599325933333333, + "config": { + "model_name": "mistralai/Mistral-Nemo-Instruct-2407", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "base_model": "MistralForCausalLM", + "revision": "04d8a90549d23fc6bd7f642064003592df51e9b3", + "submitted_time": "2024-07-17 17:26:49+00:00", + "num_params_billion": 12.2477824, + "language": "en_fr_de_es_it_pt_ru_zh_ja" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 27.92, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 17.72, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 13.16, + "stderr": 0.0 + } + ], + "average_accuracy": 19.599999999999998, + "best_prompt": 27.92, + "prompt_id": "p1", + "CPS": 25.597056, + "is_dummy": false, + "std_accuracy": 7.557459890730484 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": 
"f1", + "value": 28.49, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 23.84, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 18.61, + "stderr": 0.0 + } + ], + "average_accuracy": 23.646666666666665, + "best_prompt": 28.49, + "prompt_id": "p1", + "CPS": 27.11013433333333, + "is_dummy": false, + "std_accuracy": 4.942836567532183 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.29, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.09666666666666666, + "best_prompt": 0.29, + "prompt_id": "p1", + "CPS": 0.2894393333333333, + "is_dummy": false, + "std_accuracy": 0.16743157806499145 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_PL.json b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f617f8695146472f3082ad22891853a84acc20 --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_PL.json @@ -0,0 +1,121 @@ 
+{ + "average_CPS": 12.708361833333333, + "config": { + "model_name": "mistralai/Mistral-Nemo-Instruct-2407", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "base_model": "MistralForCausalLM", + "revision": "04d8a90549d23fc6bd7f642064003592df51e9b3", + "submitted_time": "2024-07-17 17:26:49+00:00", + "num_params_billion": 12.2477824, + "language": "en_fr_de_es_it_pt_ru_zh_ja" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 4.83, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 4.390000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 4.83, + "stderr": 0.0 + } + ], + "average_accuracy": 4.683333333333334, + "best_prompt": 4.83, + "prompt_id": "p1", + "CPS": 4.822916, + "is_dummy": false, + "std_accuracy": 0.25403411844343504 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 21.23, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 16.86, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 16.61, + "stderr": 0.0 + } + ], + "average_accuracy": 18.233333333333334, + "best_prompt": 21.23, + "prompt_id": "p1", + "CPS": 20.593807666666667, + "is_dummy": false, + "std_accuracy": 2.598198093551247 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + 
"best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_SK.json b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..171db4f330cc7f9d6b7d596285ad10637ded4d89 --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 12.570468000000002, + "config": { + "model_name": "mistralai/Mistral-Nemo-Instruct-2407", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "base_model": "MistralForCausalLM", + "revision": "04d8a90549d23fc6bd7f642064003592df51e9b3", + "submitted_time": "2024-07-17 17:26:49+00:00", + "num_params_billion": 12.2477824, + "language": "en_fr_de_es_it_pt_ru_zh_ja" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 6.8500000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 8.44, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 6.8500000000000005, + "stderr": 0.0 + } + ], + "average_accuracy": 7.38, + "best_prompt": 8.44, + "prompt_id": "p2", + "CPS": 8.350536, + "is_dummy": false, + "std_accuracy": 0.9179869280115044 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 16.96, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 13.96, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 16.96, + "stderr": 0.0 + } + ], + "average_accuracy": 15.96, + "best_prompt": 16.96, + "prompt_id": "p1", + "CPS": 16.7904, + "is_dummy": false, + "std_accuracy": 1.7320508075688772 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + 
"CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_SL.json b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..1cacb2f5cb2ed10a7172cdc1fe0420a1e0f1a7c7 --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 15.375161166666668, + "config": { + "model_name": "mistralai/Mistral-Nemo-Instruct-2407", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "base_model": "MistralForCausalLM", + "revision": "04d8a90549d23fc6bd7f642064003592df51e9b3", + "submitted_time": "2024-07-17 17:26:49+00:00", + "num_params_billion": 12.2477824, + "language": "en_fr_de_es_it_pt_ru_zh_ja" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 8.61, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 8.05, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.61, + "stderr": 0.0 + } + ], + "average_accuracy": 8.423333333333334, + "best_prompt": 8.61, + "prompt_id": "p1", + "CPS": 8.593928, + "is_dummy": false, + "std_accuracy": 0.3233161507461897 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 23.09, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 10.96, + "stderr": 0.0 + }, + { + "prompt": 
"p3", + "metric": "f1", + "value": 23.09, + "stderr": 0.0 + } + ], + "average_accuracy": 19.046666666666667, + "best_prompt": 23.09, + "prompt_id": "p1", + "CPS": 22.156394333333335, + "is_dummy": false, + "std_accuracy": 7.00325876527016 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_EN.json b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..01cbf61242c6ef0be3119462634742a963ea82a0 --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 42.75896306666667, + "config": { + "model_name": "mistralai/Mistral-Nemo-Instruct-2407", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "base_model": "MistralForCausalLM", + "revision": "04d8a90549d23fc6bd7f642064003592df51e9b3", + "submitted_time": "2024-07-17 17:26:49+00:00", + "num_params_billion": 12.2477824, + "language": "en_fr_de_es_it_pt_ru_zh_ja" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 57.769999999999996, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 58.41, + 
"stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 56.68, + "stderr": 0.0 + } + ], + "average_accuracy": 57.62, + "best_prompt": 58.41, + "prompt_id": "p2", + "CPS": 57.948561, + "is_dummy": false, + "std_accuracy": 0.8746999485537866 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 34.82, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 50.080000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 44.49, + "stderr": 0.0 + } + ], + "average_accuracy": 43.13, + "best_prompt": 50.080000000000005, + "prompt_id": "p2", + "CPS": 46.59944, + "is_dummy": false, + "std_accuracy": 7.720369162157988 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 24.990000000000002, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 27.18, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 23.56, + "stderr": 0.0 + } + ], + "average_accuracy": 25.243333333333336, + "best_prompt": 27.18, + "prompt_id": "p2", + "CPS": 26.653614, + "is_dummy": false, + "std_accuracy": 1.8232480175042929 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 30.34, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 41.760000000000005, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 57.78, + "stderr": 0.0 + } + ], + "average_accuracy": 43.29333333333333, + "best_prompt": 57.78, + "prompt_id": "p3", + "CPS": 49.409604, + "is_dummy": false, + "std_accuracy": 13.784111626555168 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 9.049999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 40.43, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 18.04, + "stderr": 0.0 + } + ], + "average_accuracy": 22.506666666666664, + "best_prompt": 40.43, + "prompt_id": "p2", + "CPS": 33.183596333333334, 
+ "is_dummy": false, + "std_accuracy": 16.15980919854357 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_GR.json b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..9d47f4da00b8572c6d9d7ff9e6a17522b8f6736d --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 40.65579733333333, + "config": { + "model_name": "mistralai/Mistral-Nemo-Instruct-2407", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "base_model": "MistralForCausalLM", + "revision": "04d8a90549d23fc6bd7f642064003592df51e9b3", + "submitted_time": "2024-07-17 17:26:49+00:00", + "num_params_billion": 12.2477824, + "language": "en_fr_de_es_it_pt_ru_zh_ja" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 50.81, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 49.88, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 50.81, + "stderr": 0.0 + } + ], + "average_accuracy": 50.5, + "best_prompt": 50.81, + "prompt_id": "p1", + "CPS": 50.652489, + "is_dummy": false, + "std_accuracy": 0.5369357503463518 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 20.29, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 22.96, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 33.23, + "stderr": 0.0 + } + ], + "average_accuracy": 25.49333333333333, + "best_prompt": 33.23, + "prompt_id": "p3", + "CPS": 30.65910566666666, + "is_dummy": false, + "std_accuracy": 6.831854311483326 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + 
"is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_IT.json b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..144ace83125b2722b6bd87adbd3135852773850f --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 39.459138, + "config": { + "model_name": "mistralai/Mistral-Nemo-Instruct-2407", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "base_model": "MistralForCausalLM", + "revision": "04d8a90549d23fc6bd7f642064003592df51e9b3", + "submitted_time": "2024-07-17 17:26:49+00:00", + "num_params_billion": 12.2477824, + "language": "en_fr_de_es_it_pt_ru_zh_ja" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 64.3, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 64.37, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 64.57000000000001, + "stderr": 0.0 + } + ], + "average_accuracy": 64.41333333333334, + "best_prompt": 64.57000000000001, + "prompt_id": "p3", + "CPS": 64.46884033333333, + "is_dummy": false, + "std_accuracy": 0.14011899704656258 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 27.08, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 40.99, + "stderr": 0.0 + }, + 
{ + "prompt": "p3", + "metric": "f1", + "value": 38.6, + "stderr": 0.0 + } + ], + "average_accuracy": 35.556666666666665, + "best_prompt": 40.99, + "prompt_id": "p2", + "CPS": 38.76287666666667, + "is_dummy": false, + "std_accuracy": 7.437636273261376 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 12.19, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 12.989999999999998, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 12.870000000000001, + "stderr": 0.0 + } + ], + "average_accuracy": 12.683333333333332, + "best_prompt": 12.989999999999998, + "prompt_id": "p2", + "CPS": 12.950164, + "is_dummy": false, + "std_accuracy": 0.43143172499635823 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 44.49, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 56.589999999999996, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 37.28, + "stderr": 0.0 + } + ], + "average_accuracy": 46.120000000000005, + "best_prompt": 56.589999999999996, + "prompt_id": "p2", + "CPS": 50.665027, + "is_dummy": false, + "std_accuracy": 9.757648282245059 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 6.9, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 38.46, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.53, + "stderr": 0.0 + } + ], + "average_accuracy": 17.63, + "best_prompt": 38.46, + "prompt_id": "p2", + "CPS": 30.448781999999998, + "is_dummy": false, + "std_accuracy": 18.042059195114064 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_PL.json b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..b64dc4233493142474036aff130aaaaff276a69f --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_PL.json @@ 
-0,0 +1,121 @@ +{ + "average_CPS": 36.890603, + "config": { + "model_name": "mistralai/Mistral-Nemo-Instruct-2407", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "PL", + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "base_model": "MistralForCausalLM", + "revision": "04d8a90549d23fc6bd7f642064003592df51e9b3", + "submitted_time": "2024-07-17 17:26:49+00:00", + "num_params_billion": 12.2477824, + "language": "en_fr_de_es_it_pt_ru_zh_ja" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 53.52, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 54.21, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 53.52, + "stderr": 0.0 + } + ], + "average_accuracy": 53.75, + "best_prompt": 54.21, + "prompt_id": "p2", + "CPS": 53.960634, + "is_dummy": false, + "std_accuracy": 0.3983716857408405 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 18.63, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 18.55, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 20.01, + "stderr": 0.0 + } + ], + "average_accuracy": 19.063333333333333, + "best_prompt": 20.01, + "prompt_id": "p3", + "CPS": 19.820572, + "is_dummy": false, + "std_accuracy": 0.8208126054912502 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + 
"prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_SK.json b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..eada31997236f6a8902716b90e2ebb7174c22701 --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 35.643439, + "config": { + "model_name": "mistralai/Mistral-Nemo-Instruct-2407", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "base_model": "MistralForCausalLM", + "revision": "04d8a90549d23fc6bd7f642064003592df51e9b3", + "submitted_time": "2024-07-17 17:26:49+00:00", + "num_params_billion": 12.2477824, + "language": "en_fr_de_es_it_pt_ru_zh_ja" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 50.24999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 50.4, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 50.24999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 50.29999999999999, + "best_prompt": 50.4, + "prompt_id": "p2", + "CPS": 50.349599999999995, + "is_dummy": false, + "std_accuracy": 0.08660254037844715 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 12.370000000000001, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 21.66, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 20.94, + "stderr": 0.0 + } + ], + "average_accuracy": 18.323333333333334, + "best_prompt": 21.66, + "prompt_id": "p2", + "CPS": 20.937278, + "is_dummy": false, + "std_accuracy": 5.16829114246995 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + 
"prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_SL.json b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..e58549cd882cffb336a9d6d8a333af7027b45f0b --- /dev/null +++ b/e3c_llm_results/mistralai/Mistral-Nemo-Instruct-2407_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 36.596855166666664, + "config": { + "model_name": "mistralai/Mistral-Nemo-Instruct-2407", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "mistralai/Mistral-Nemo-Instruct-2407", + "base_model": "MistralForCausalLM", + "revision": "04d8a90549d23fc6bd7f642064003592df51e9b3", + "submitted_time": "2024-07-17 17:26:49+00:00", + "num_params_billion": 12.2477824, + "language": "en_fr_de_es_it_pt_ru_zh_ja" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 53.23, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 53.349999999999994, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 53.23, + "stderr": 0.0 + } + ], + "average_accuracy": 53.26999999999999, + "best_prompt": 53.349999999999994, + "prompt_id": "p2", + "CPS": 53.30731999999999, + "is_dummy": false, + "std_accuracy": 0.06928203230275362 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 13.900000000000002, + "stderr": 0.0 + }, + { + "prompt": 
"p2", + "metric": "f1", + "value": 20.57, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 17.27, + "stderr": 0.0 + } + ], + "average_accuracy": 17.246666666666666, + "best_prompt": 20.57, + "prompt_id": "p2", + "CPS": 19.886390333333335, + "is_dummy": false, + "std_accuracy": 3.335061218828423 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_EN.json b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..10ae518593d1807a174e4fa432fd431d0ae97996 --- /dev/null +++ b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 13.158133999999999, + "config": { + "model_name": "tiiuae/Falcon3-10B-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "tiiuae/Falcon3-10B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "8799bc6aec0152757221dc6b272d824642db6202", + "submitted_time": "2024-12-14 05:17:25+00:00", + "num_params_billion": 10.30565376, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 22.7, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 27.089999999999996, 
+ "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 29.959999999999997, + "stderr": 0.0 + } + ], + "average_accuracy": 26.58333333333333, + "best_prompt": 29.959999999999997, + "prompt_id": "p3", + "CPS": 28.948350666666663, + "is_dummy": false, + "std_accuracy": 3.6564235713786393 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 21.57, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 38.35, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 38.48, + "stderr": 0.0 + } + ], + "average_accuracy": 32.800000000000004, + "best_prompt": 38.48, + "prompt_id": "p3", + "CPS": 36.294336, + "is_dummy": false, + "std_accuracy": 9.725682495331625 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.5499999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.18333333333333332, + "best_prompt": 0.5499999999999999, + "prompt_id": "p1", + "CPS": 0.5479833333333333, + "is_dummy": false, + "std_accuracy": 0.3175426480542941 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + 
} +} \ No newline at end of file diff --git a/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_GR.json b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..98ef301d1dc0f252a219466e3c5bb99d1df82d9b --- /dev/null +++ b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 14.248081500000001, + "config": { + "model_name": "tiiuae/Falcon3-10B-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "tiiuae/Falcon3-10B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "8799bc6aec0152757221dc6b272d824642db6202", + "submitted_time": "2024-12-14 05:17:25+00:00", + "num_params_billion": 10.30565376, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 21.3, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 4.95, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 21.3, + "stderr": 0.0 + } + ], + "average_accuracy": 15.85, + "best_prompt": 21.3, + "prompt_id": "p1", + "CPS": 20.13915, + "is_dummy": false, + "std_accuracy": 9.439676901250381 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 4.01, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 2.5, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.67, + "stderr": 0.0 + } + ], + "average_accuracy": 5.06, + "best_prompt": 8.67, + "prompt_id": "p3", + "CPS": 8.357013, + "is_dummy": false, + "std_accuracy": 3.2162244946520757 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + 
"std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_IT.json b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..a8d2736e1fdc621ba8a7865b0898212c3103f0a6 --- /dev/null +++ b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 9.225035466666668, + "config": { + "model_name": "tiiuae/Falcon3-10B-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "tiiuae/Falcon3-10B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "8799bc6aec0152757221dc6b272d824642db6202", + "submitted_time": "2024-12-14 05:17:25+00:00", + "num_params_billion": 10.30565376, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 12.61, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 23.27, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 24.44, + "stderr": 0.0 + } + ], + "average_accuracy": 20.106666666666666, + "best_prompt": 24.44, + "prompt_id": "p3", + "CPS": 23.380933333333335, + "is_dummy": false, + "std_accuracy": 6.5186067018446 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 24.04, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 16.99, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 14.92, + "stderr": 0.0 + } + ], + "average_accuracy": 18.650000000000002, + "best_prompt": 24.04, + "prompt_id": "p1", + "CPS": 22.744244000000002, + "is_dummy": false, + "std_accuracy": 
4.7812446078400965 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_PL.json b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..e0599da2f28b325c997b59c07291b8b641ae9509 --- /dev/null +++ b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 19.314392833333333, + "config": { + "model_name": "tiiuae/Falcon3-10B-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "tiiuae/Falcon3-10B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "8799bc6aec0152757221dc6b272d824642db6202", + "submitted_time": "2024-12-14 05:17:25+00:00", + "num_params_billion": 10.30565376, + "language": "" + }, + "tasks": { + "NER": { + 
"prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 24.52, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 23.380000000000003, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 24.52, + "stderr": 0.0 + } + ], + "average_accuracy": 24.14, + "best_prompt": 24.52, + "prompt_id": "p1", + "CPS": 24.426824, + "is_dummy": false, + "std_accuracy": 0.6581793068761717 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 15.010000000000002, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 1.23, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 12.64, + "stderr": 0.0 + } + ], + "average_accuracy": 9.626666666666667, + "best_prompt": 15.010000000000002, + "prompt_id": "p1", + "CPS": 14.201961666666667, + "is_dummy": false, + "std_accuracy": 7.367647747641939 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_SK.json b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..e049cbb1535fdab83832d8918c7233ae21389dce --- /dev/null +++ b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 
16.691507333333334, + "config": { + "model_name": "tiiuae/Falcon3-10B-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "tiiuae/Falcon3-10B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "8799bc6aec0152757221dc6b272d824642db6202", + "submitted_time": "2024-12-14 05:17:25+00:00", + "num_params_billion": 10.30565376, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 27.169999999999998, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 31.78, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 27.169999999999998, + "stderr": 0.0 + } + ], + "average_accuracy": 28.706666666666667, + "best_prompt": 31.78, + "prompt_id": "p2", + "CPS": 30.803294666666666, + "is_dummy": false, + "std_accuracy": 2.6615847409641766 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 1.43, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 2.6, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 1.43, + "stderr": 0.0 + } + ], + "average_accuracy": 1.82, + "best_prompt": 2.6, + "prompt_id": "p2", + "CPS": 2.57972, + "is_dummy": false, + "std_accuracy": 0.6754998149518622 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": 
true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_SL.json b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..63e6960b5e06d5b9c360d7afb172496c8bd16cfe --- /dev/null +++ b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 12.605178333333333, + "config": { + "model_name": "tiiuae/Falcon3-10B-Instruct", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "tiiuae/Falcon3-10B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "8799bc6aec0152757221dc6b272d824642db6202", + "submitted_time": "2024-12-14 05:17:25+00:00", + "num_params_billion": 10.30565376, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 25.19, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 18.529999999999998, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 25.19, + "stderr": 0.0 + } + ], + "average_accuracy": 22.97, + "best_prompt": 25.19, + "prompt_id": "p1", + "CPS": 24.630782, + "is_dummy": false, + "std_accuracy": 3.8451527928029097 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.47000000000000003, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.58, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.47000000000000003, + "stderr": 0.0 + } + ], + "average_accuracy": 0.5066666666666667, + "best_prompt": 0.58, + "prompt_id": "p2", + "CPS": 0.5795746666666666, + "is_dummy": false, + "std_accuracy": 0.06350852961085879 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": 
"f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_EN.json b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..ad6d65e78f5c187255c34d8ebaebcf4483a86c4f --- /dev/null +++ b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 34.895077066666666, + "config": { + "model_name": "tiiuae/Falcon3-10B-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "tiiuae/Falcon3-10B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "8799bc6aec0152757221dc6b272d824642db6202", + "submitted_time": "2024-12-14 05:17:25+00:00", + "num_params_billion": 10.30565376, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 58.4, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 54.21, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 59.28, + "stderr": 0.0 + } + ], + "average_accuracy": 57.29666666666666, + "best_prompt": 59.28, + "prompt_id": "p3", + "CPS": 58.104279999999996, + "is_dummy": false, + "std_accuracy": 2.709101942218737 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 43.35, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 55.86, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 55.15, + "stderr": 0.0 + } + ], + "average_accuracy": 51.45333333333334, + "best_prompt": 55.86, + "prompt_id": "p2", + 
"CPS": 53.398436000000004, + "is_dummy": false, + "std_accuracy": 7.026665876027785 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 27.92, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 18.16, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 3.5000000000000004, + "stderr": 0.0 + } + ], + "average_accuracy": 16.526666666666667, + "best_prompt": 27.92, + "prompt_id": "p1", + "CPS": 24.738981333333335, + "is_dummy": false, + "std_accuracy": 12.291661129942256 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 7.08, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 16.580000000000002, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.77, + "stderr": 0.0 + } + ], + "average_accuracy": 10.810000000000002, + "best_prompt": 16.580000000000002, + "prompt_id": "p2", + "CPS": 15.623334000000002, + "is_dummy": false, + "std_accuracy": 5.067908838959124 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 2.11, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 26.779999999999998, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 4.74, + "stderr": 0.0 + } + ], + "average_accuracy": 11.209999999999999, + "best_prompt": 26.779999999999998, + "prompt_id": "p2", + "CPS": 22.610354, + "is_dummy": false, + "std_accuracy": 13.547985090041987 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_GR.json b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..5d46686fafe4d41a7d6e122ac2cd8759bab92f23 --- /dev/null +++ b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 36.36378083333333, + "config": { + "model_name": "tiiuae/Falcon3-10B-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + 
"model": "tiiuae/Falcon3-10B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "8799bc6aec0152757221dc6b272d824642db6202", + "submitted_time": "2024-12-14 05:17:25+00:00", + "num_params_billion": 10.30565376, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 33.45, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 36.55, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 33.45, + "stderr": 0.0 + } + ], + "average_accuracy": 34.483333333333334, + "best_prompt": 36.55, + "prompt_id": "p2", + "CPS": 35.79463333333333, + "is_dummy": false, + "std_accuracy": 1.7897858344878366 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 37.49, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 37.55, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 32.68, + "stderr": 0.0 + } + ], + "average_accuracy": 35.906666666666666, + "best_prompt": 37.55, + "prompt_id": "p2", + "CPS": 36.93292833333333, + "is_dummy": false, + "std_accuracy": 2.7945363360195072 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_IT.json 
b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..ddef3463e79980f6f3fdb2bb1c641bddd79c3dac --- /dev/null +++ b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_IT.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 25.310261866666664, + "config": { + "model_name": "tiiuae/Falcon3-10B-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "tiiuae/Falcon3-10B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "8799bc6aec0152757221dc6b272d824642db6202", + "submitted_time": "2024-12-14 05:17:25+00:00", + "num_params_billion": 10.30565376, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 58.209999999999994, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 54.32, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 56.220000000000006, + "stderr": 0.0 + } + ], + "average_accuracy": 56.25, + "best_prompt": 58.209999999999994, + "prompt_id": "p1", + "CPS": 57.069084, + "is_dummy": false, + "std_accuracy": 1.94517351411127 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 46.22, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 54.58, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 55.97, + "stderr": 0.0 + } + ], + "average_accuracy": 52.25666666666666, + "best_prompt": 55.97, + "prompt_id": "p3", + "CPS": 53.89164733333333, + "is_dummy": false, + "std_accuracy": 5.273901149370675 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 7.21, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 3.4000000000000004, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 1.5699999999999998, + "stderr": 0.0 + } + ], + "average_accuracy": 4.06, + "best_prompt": 7.21, + "prompt_id": "p1", + "CPS": 6.9828850000000005, + "is_dummy": false, + 
"std_accuracy": 2.8773425239272434 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 1.8599999999999999, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 6.68, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 7.739999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 5.426666666666666, + "best_prompt": 7.739999999999999, + "prompt_id": "p3", + "CPS": 7.560947999999999, + "is_dummy": false, + "std_accuracy": 3.133964475442141 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 1.05, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.44, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.73, + "stderr": 0.0 + } + ], + "average_accuracy": 0.7399999999999999, + "best_prompt": 1.05, + "prompt_id": "p1", + "CPS": 1.046745, + "is_dummy": false, + "std_accuracy": 0.3051229260478472 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_PL.json b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..624360653ccd6f84d827609c1915ba0339b31d66 --- /dev/null +++ b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 48.75862866666667, + "config": { + "model_name": "tiiuae/Falcon3-10B-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "PL", + "model": "tiiuae/Falcon3-10B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "8799bc6aec0152757221dc6b272d824642db6202", + "submitted_time": "2024-12-14 05:17:25+00:00", + "num_params_billion": 10.30565376, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 43.04, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 41.23, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 43.04, + "stderr": 0.0 + } + ], + 
"average_accuracy": 42.43666666666667, + "best_prompt": 43.04, + "prompt_id": "p1", + "CPS": 42.78032533333333, + "is_dummy": false, + "std_accuracy": 1.045003987233224 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.29, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 55.71, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 54.89000000000001, + "stderr": 0.0 + } + ], + "average_accuracy": 53.96333333333334, + "best_prompt": 55.71, + "prompt_id": "p2", + "CPS": 54.736932, + "is_dummy": false, + "std_accuracy": 2.3511982760569863 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_SK.json b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..bc4305eba880b843f1805753bd555c7e84842d8a --- /dev/null +++ b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 44.8562175, + "config": { + "model_name": "tiiuae/Falcon3-10B-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "tiiuae/Falcon3-10B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "8799bc6aec0152757221dc6b272d824642db6202", + 
"submitted_time": "2024-12-14 05:17:25+00:00", + "num_params_billion": 10.30565376, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 45.45, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 41.160000000000004, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 45.45, + "stderr": 0.0 + } + ], + "average_accuracy": 44.02, + "best_prompt": 45.45, + "prompt_id": "p1", + "CPS": 44.800065000000004, + "is_dummy": false, + "std_accuracy": 2.476832654823494 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 37.5, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 46.949999999999996, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 43.38, + "stderr": 0.0 + } + ], + "average_accuracy": 42.60999999999999, + "best_prompt": 46.949999999999996, + "prompt_id": "p2", + "CPS": 44.912369999999996, + "is_dummy": false, + "std_accuracy": 4.771823550803192 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_SL.json b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_SL.json new file mode 100644 index 
0000000000000000000000000000000000000000..ae741da7f8339df7e872b5639bf0c48c40836de3 --- /dev/null +++ b/e3c_llm_results/tiiuae/Falcon3-10B-Instruct_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 38.88441916666667, + "config": { + "model_name": "tiiuae/Falcon3-10B-Instruct", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "tiiuae/Falcon3-10B-Instruct", + "base_model": "LlamaForCausalLM", + "revision": "8799bc6aec0152757221dc6b272d824642db6202", + "submitted_time": "2024-12-14 05:17:25+00:00", + "num_params_billion": 10.30565376, + "language": "" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 41.21, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 39.09, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 41.21, + "stderr": 0.0 + } + ], + "average_accuracy": 40.50333333333334, + "best_prompt": 41.21, + "prompt_id": "p1", + "CPS": 40.918782666666665, + "is_dummy": false, + "std_accuracy": 1.223982570682005 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 23.23, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 30.12, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 40.63, + "stderr": 0.0 + } + ], + "average_accuracy": 31.326666666666668, + "best_prompt": 40.63, + "prompt_id": "p3", + "CPS": 36.85005566666667, + "is_dummy": false, + "std_accuracy": 8.762535782142825 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + 
"prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/unsloth/phi-4_0_EN.json b/e3c_llm_results/unsloth/phi-4_0_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..2545b65e5fa76897736dfb11a0171847d68b5456 --- /dev/null +++ b/e3c_llm_results/unsloth/phi-4_0_EN.json @@ -0,0 +1,157 @@ +{ + "average_CPS": 9.439416133333335, + "config": { + "model_name": "unsloth/phi-4", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "EN", + "model": "unsloth/phi-4", + "base_model": "LlamaForCausalLM", + "revision": "c6220bde10fff762dbd72c3331894aa4cade249d", + "submitted_time": "2025-01-08 21:56:16+00:00", + "num_params_billion": 14.6595072, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 2.52, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 5.72, + "stderr": 0.0 + } + ], + "average_accuracy": 2.7466666666666666, + "best_prompt": 5.72, + "prompt_id": "p3", + "CPS": 5.549925333333333, + "is_dummy": false, + "std_accuracy": 2.8667286814997563 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 40.22, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 42.19, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 40.300000000000004, + "stderr": 0.0 + } + ], + "average_accuracy": 40.903333333333336, + "best_prompt": 42.19, + "prompt_id": "p2", + "CPS": 41.64715533333334, + "is_dummy": false, + "std_accuracy": 1.115003736914513 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + 
"prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/unsloth/phi-4_0_GR.json b/e3c_llm_results/unsloth/phi-4_0_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..5f40f450f1d4931b4a767b4fd91e91843b836bb7 --- /dev/null +++ b/e3c_llm_results/unsloth/phi-4_0_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 13.214538500000002, + "config": { + "model_name": "unsloth/phi-4", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "GR", + "model": "unsloth/phi-4", + "base_model": "LlamaForCausalLM", + "revision": "c6220bde10fff762dbd72c3331894aa4cade249d", + "submitted_time": "2025-01-08 21:56:16+00:00", + "num_params_billion": 14.6595072, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + 
"best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 29.01, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 22.08, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 9.25, + "stderr": 0.0 + } + ], + "average_accuracy": 20.113333333333333, + "best_prompt": 29.01, + "prompt_id": "p1", + "CPS": 26.429077000000003, + "is_dummy": false, + "std_accuracy": 10.025728568704288 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/unsloth/phi-4_0_IT.json b/e3c_llm_results/unsloth/phi-4_0_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..53e10415ddc7b52a4cde3be0563f84032758c292 --- /dev/null +++ b/e3c_llm_results/unsloth/phi-4_0_IT.json @@ -0,0 +1,163 @@ +{ + "average_CPS": 13.046800866666667, + "config": { + "model_name": "unsloth/phi-4", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "IT", + "model": "unsloth/phi-4", + "base_model": "LlamaForCausalLM", + "revision": "c6220bde10fff762dbd72c3331894aa4cade249d", + "submitted_time": "2025-01-08 21:56:16+00:00", + "num_params_billion": 14.6595072, + "language": "en" + }, + "tasks": { + "NER": { + 
"prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 17.24, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 34.28, + "stderr": 0.0 + } + ], + "average_accuracy": 17.173333333333332, + "best_prompt": 34.28, + "prompt_id": "p3", + "CPS": 28.415834666666665, + "is_dummy": false, + "std_accuracy": 17.1400972381528 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 33.54, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 37.37, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 36.77, + "stderr": 0.0 + } + ], + "average_accuracy": 35.89333333333334, + "best_prompt": 37.37, + "prompt_id": "p2", + "CPS": 36.81816966666667, + "is_dummy": false, + "std_accuracy": 2.0600080905989993 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + 
"average_accuracy": 0.0, + "best_prompt": 0.0, + "prompt_id": "p1", + "CPS": 0.0, + "is_dummy": false, + "std_accuracy": 0.0 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/unsloth/phi-4_0_PL.json b/e3c_llm_results/unsloth/phi-4_0_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..c43c1e9732a47cc964593f298f5929c8204806b1 --- /dev/null +++ b/e3c_llm_results/unsloth/phi-4_0_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 20.92978433333333, + "config": { + "model_name": "unsloth/phi-4", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "PL", + "model": "unsloth/phi-4", + "base_model": "LlamaForCausalLM", + "revision": "c6220bde10fff762dbd72c3331894aa4cade249d", + "submitted_time": "2025-01-08 21:56:16+00:00", + "num_params_billion": 14.6595072, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 2.36, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 3.66, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 2.36, + "stderr": 0.0 + } + ], + "average_accuracy": 2.793333333333333, + "best_prompt": 3.66, + "prompt_id": "p2", + "CPS": 3.62828, + "is_dummy": false, + "std_accuracy": 0.7505553499465136 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 37.99, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 38.29, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 38.129999999999995, + "stderr": 0.0 + } + ], + "average_accuracy": 38.13666666666666, + "best_prompt": 38.29, + "prompt_id": "p2", + "CPS": 38.231288666666664, + "is_dummy": false, + "std_accuracy": 0.15011106998930138 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + 
"prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/unsloth/phi-4_0_SK.json b/e3c_llm_results/unsloth/phi-4_0_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..29845de4500953b65d294d3414a6bf74349b7bff --- /dev/null +++ b/e3c_llm_results/unsloth/phi-4_0_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 21.629032, + "config": { + "model_name": "unsloth/phi-4", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SK", + "model": "unsloth/phi-4", + "base_model": "LlamaForCausalLM", + "revision": "c6220bde10fff762dbd72c3331894aa4cade249d", + "submitted_time": "2025-01-08 21:56:16+00:00", + "num_params_billion": 14.6595072, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 3.16, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 10.7, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 3.16, + "stderr": 0.0 + } + ], + "average_accuracy": 5.673333333333333, + "best_prompt": 10.7, + "prompt_id": "p2", + "CPS": 10.162146666666667, + "is_dummy": false, + "std_accuracy": 4.353221029689778 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 32.519999999999996, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 33.26, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 32.519999999999996, + "stderr": 0.0 + } + ], + "average_accuracy": 32.766666666666666, + "best_prompt": 33.26, + "prompt_id": "p2", + "CPS": 33.09591733333333, + 
"is_dummy": false, + "std_accuracy": 0.4272391992003242 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/unsloth/phi-4_0_SL.json b/e3c_llm_results/unsloth/phi-4_0_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..702cc3093b4d1afceef0fa35986f4156eb9ffd17 --- /dev/null +++ b/e3c_llm_results/unsloth/phi-4_0_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 28.7078975, + "config": { + "model_name": "unsloth/phi-4", + "num_fewshot": "0", + "batch_size": 1, + "LANG": "SL", + "model": "unsloth/phi-4", + "base_model": "LlamaForCausalLM", + "revision": "c6220bde10fff762dbd72c3331894aa4cade249d", + "submitted_time": "2025-01-08 21:56:16+00:00", + "num_params_billion": 14.6595072, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 28.7, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 9.81, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 28.7, + "stderr": 0.0 + } + ], + "average_accuracy": 22.403333333333332, + "best_prompt": 28.7, + "prompt_id": "p1", + "CPS": 26.892856666666667, + "is_dummy": false, + "std_accuracy": 10.90614658499203 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + 
"value": 32.09, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 17.44, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 32.09, + "stderr": 0.0 + } + ], + "average_accuracy": 27.206666666666667, + "best_prompt": 32.09, + "prompt_id": "p1", + "CPS": 30.522938333333336, + "is_dummy": false, + "std_accuracy": 8.45818144362802 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/unsloth/phi-4_10_EN.json b/e3c_llm_results/unsloth/phi-4_10_EN.json new file mode 100644 index 0000000000000000000000000000000000000000..1cb35bbe7e0f2550fa21c9cad46ea26e028595f9 --- /dev/null +++ b/e3c_llm_results/unsloth/phi-4_10_EN.json @@ -0,0 +1,163 @@ +{ + "average_CPS": 49.37233213333333, + "config": { + "model_name": "unsloth/phi-4", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "EN", + "model": "unsloth/phi-4", + "base_model": "LlamaForCausalLM", + "revision": "c6220bde10fff762dbd72c3331894aa4cade249d", + "submitted_time": "2025-01-08 21:56:16+00:00", + "num_params_billion": 14.6595072, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 60.980000000000004, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 57.11000000000001, + 
"stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 61.41, + "stderr": 0.0 + } + ], + "average_accuracy": 59.833333333333336, + "best_prompt": 61.41, + "prompt_id": "p3", + "CPS": 60.441769, + "is_dummy": false, + "std_accuracy": 2.36825533533302 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 49.120000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 56.26, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 55.54, + "stderr": 0.0 + } + ], + "average_accuracy": 53.63999999999999, + "best_prompt": 56.26, + "prompt_id": "p2", + "CPS": 54.785987999999996, + "is_dummy": false, + "std_accuracy": 3.9309540826623723 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 38.41, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 32.89, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 21.91, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 21.91, + "stderr": 0.0 + } + ], + "average_accuracy": 28.779999999999998, + "best_prompt": 38.41, + "prompt_id": "p1", + "CPS": 34.711116999999994, + "is_dummy": false, + "std_accuracy": 8.24667205604782 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 42.620000000000005, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 56.3, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 42.54, + "stderr": 0.0 + } + ], + "average_accuracy": 47.153333333333336, + "best_prompt": 56.3, + "prompt_id": "p2", + "CPS": 51.15042666666667, + "is_dummy": false, + "std_accuracy": 7.9213466868540285 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 37.36, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 50.2, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 36.58, + "stderr": 0.0 + } + ], + "average_accuracy": 
41.38, + "best_prompt": 50.2, + "prompt_id": "p2", + "CPS": 45.77236, + "is_dummy": false, + "std_accuracy": 7.648293927406296 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/unsloth/phi-4_10_GR.json b/e3c_llm_results/unsloth/phi-4_10_GR.json new file mode 100644 index 0000000000000000000000000000000000000000..8c57b4bec48137c4f2a59ad8a1b369ae56f8560c --- /dev/null +++ b/e3c_llm_results/unsloth/phi-4_10_GR.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 55.776253, + "config": { + "model_name": "unsloth/phi-4", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "GR", + "model": "unsloth/phi-4", + "base_model": "LlamaForCausalLM", + "revision": "c6220bde10fff762dbd72c3331894aa4cade249d", + "submitted_time": "2025-01-08 21:56:16+00:00", + "num_params_billion": 14.6595072, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 57.17, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 56.11000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 57.17, + "stderr": 0.0 + } + ], + "average_accuracy": 56.81666666666666, + "best_prompt": 57.17, + "prompt_id": "p1", + "CPS": 56.96799933333333, + "is_dummy": false, + "std_accuracy": 0.6119912853410006 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 49.35, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 52.61, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 56.779999999999994, + "stderr": 0.0 + } + ], + "average_accuracy": 52.913333333333334, + "best_prompt": 56.779999999999994, + "prompt_id": "p3", + "CPS": 54.58450666666666, + "is_dummy": false, + "std_accuracy": 3.724276216036252 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, 
+ "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/unsloth/phi-4_10_IT.json b/e3c_llm_results/unsloth/phi-4_10_IT.json new file mode 100644 index 0000000000000000000000000000000000000000..5063a9c7a1389e6aa6729b7ed5bf1bb21267733b --- /dev/null +++ b/e3c_llm_results/unsloth/phi-4_10_IT.json @@ -0,0 +1,163 @@ +{ + "average_CPS": 49.70468556666667, + "config": { + "model_name": "unsloth/phi-4", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "IT", + "model": "unsloth/phi-4", + "base_model": "LlamaForCausalLM", + "revision": "c6220bde10fff762dbd72c3331894aa4cade249d", + "submitted_time": "2025-01-08 21:56:16+00:00", + "num_params_billion": 14.6595072, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 66.47, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 67.32000000000001, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 68.97, + "stderr": 0.0 + } + ], + "average_accuracy": 67.58666666666667, + "best_prompt": 68.97, + "prompt_id": "p3", + "CPS": 68.015915, + "is_dummy": false, + "std_accuracy": 1.2711543310445554 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 56.08, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 58.199999999999996, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 56.879999999999995, + "stderr": 0.0 + } + ], + "average_accuracy": 57.053333333333335, + "best_prompt": 
58.199999999999996, + "prompt_id": "p2", + "CPS": 57.53264, + "is_dummy": false, + "std_accuracy": 1.0705761688611095 + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 17.59, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 16.75, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.1, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 8.1, + "stderr": 0.0 + } + ], + "average_accuracy": 12.635000000000002, + "best_prompt": 17.59, + "prompt_id": "p1", + "CPS": 16.7184155, + "is_dummy": false, + "std_accuracy": 5.247783659158725 + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 58.35, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 56.76, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 55.64, + "stderr": 0.0 + } + ], + "average_accuracy": 56.916666666666664, + "best_prompt": 58.35, + "prompt_id": "p1", + "CPS": 57.51365, + "is_dummy": false, + "std_accuracy": 1.3617758014200925 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.019999999999996, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 50.06, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 38.59, + "stderr": 0.0 + } + ], + "average_accuracy": 46.55666666666667, + "best_prompt": 51.019999999999996, + "prompt_id": "p1", + "CPS": 48.74280733333334, + "is_dummy": false, + "std_accuracy": 6.916012820500935 + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/unsloth/phi-4_10_PL.json b/e3c_llm_results/unsloth/phi-4_10_PL.json new file mode 100644 index 0000000000000000000000000000000000000000..3386ef045aa6738ecea0b40f79a48d241d4e4d66 --- /dev/null +++ b/e3c_llm_results/unsloth/phi-4_10_PL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 56.63946383333333, + "config": { + "model_name": "unsloth/phi-4", + "num_fewshot": "10", + "batch_size": 1, + "LANG": 
"PL", + "model": "unsloth/phi-4", + "base_model": "LlamaForCausalLM", + "revision": "c6220bde10fff762dbd72c3331894aa4cade249d", + "submitted_time": "2025-01-08 21:56:16+00:00", + "num_params_billion": 14.6595072, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 55.489999999999995, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 53.239999999999995, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 55.489999999999995, + "stderr": 0.0 + } + ], + "average_accuracy": 54.73999999999999, + "best_prompt": 55.489999999999995, + "prompt_id": "p1", + "CPS": 55.07382499999999, + "is_dummy": false, + "std_accuracy": 1.299038105676658 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 54.230000000000004, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 57.599999999999994, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 59.72, + "stderr": 0.0 + } + ], + "average_accuracy": 57.18333333333334, + "best_prompt": 59.72, + "prompt_id": "p3", + "CPS": 58.20510266666667, + "is_dummy": false, + "std_accuracy": 2.7686157792899535 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git 
a/e3c_llm_results/unsloth/phi-4_10_SK.json b/e3c_llm_results/unsloth/phi-4_10_SK.json new file mode 100644 index 0000000000000000000000000000000000000000..07330908077a33052c93af7b88f10bfc739383de --- /dev/null +++ b/e3c_llm_results/unsloth/phi-4_10_SK.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 54.49931766666667, + "config": { + "model_name": "unsloth/phi-4", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SK", + "model": "unsloth/phi-4", + "base_model": "LlamaForCausalLM", + "revision": "c6220bde10fff762dbd72c3331894aa4cade249d", + "submitted_time": "2025-01-08 21:56:16+00:00", + "num_params_billion": 14.6595072, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 55.61000000000001, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 54.49, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 55.61000000000001, + "stderr": 0.0 + } + ], + "average_accuracy": 55.23666666666667, + "best_prompt": 55.61000000000001, + "prompt_id": "p1", + "CPS": 55.40238933333334, + "is_dummy": false, + "std_accuracy": 0.6466323014923835 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.06, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 49.94, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 55.410000000000004, + "stderr": 0.0 + } + ], + "average_accuracy": 52.13666666666666, + "best_prompt": 55.410000000000004, + "prompt_id": "p3", + "CPS": 53.596246, + "is_dummy": false, + "std_accuracy": 2.8895732095472764 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": 
null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/e3c_llm_results/unsloth/phi-4_10_SL.json b/e3c_llm_results/unsloth/phi-4_10_SL.json new file mode 100644 index 0000000000000000000000000000000000000000..6891b0d5b79ca8f4d04c6f4d23fee553cd3c3a75 --- /dev/null +++ b/e3c_llm_results/unsloth/phi-4_10_SL.json @@ -0,0 +1,121 @@ +{ + "average_CPS": 55.04669683333333, + "config": { + "model_name": "unsloth/phi-4", + "num_fewshot": "10", + "batch_size": 1, + "LANG": "SL", + "model": "unsloth/phi-4", + "base_model": "LlamaForCausalLM", + "revision": "c6220bde10fff762dbd72c3331894aa4cade249d", + "submitted_time": "2025-01-08 21:56:16+00:00", + "num_params_billion": 14.6595072, + "language": "en" + }, + "tasks": { + "NER": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 55.86, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 55.58, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 55.86, + "stderr": 0.0 + } + ], + "average_accuracy": 55.76666666666667, + "best_prompt": 55.86, + "prompt_id": "p1", + "CPS": 55.807864, + "is_dummy": false, + "std_accuracy": 0.16165807537309587 + }, + "RE": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 51.17, + "stderr": 0.0 + }, + { + "prompt": "p2", + "metric": "f1", + "value": 52.32, + "stderr": 0.0 + }, + { + "prompt": "p3", + "metric": "f1", + "value": 55.78999999999999, + "stderr": 0.0 + } + ], + "average_accuracy": 53.093333333333334, + "best_prompt": 55.78999999999999, + "prompt_id": "p3", + "CPS": 54.28552966666666, + "is_dummy": false, + "std_accuracy": 2.405126469301211 + }, + "HIS": { + "prompts": [ + { + "prompt": "p1", + 
"metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "DIA": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + }, + "RML": { + "prompts": [ + { + "prompt": "p1", + "metric": "f1", + "value": 0.0, + "stderr": 0.0 + } + ], + "average_accuracy": null, + "std_accuracy": null, + "best_prompt": null, + "prompt_id": null, + "CPS": null, + "is_dummy": true + } + } +} \ No newline at end of file diff --git a/example_app.py b/example_app.py new file mode 100644 index 0000000000000000000000000000000000000000..e6e712f9ac66b7f5ae4305c0615540fde9141d85 --- /dev/null +++ b/example_app.py @@ -0,0 +1,324 @@ +import gradio as gr +from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns +import pandas as pd +from apscheduler.schedulers.background import BackgroundScheduler +from huggingface_hub import snapshot_download + +from src.about import ( + CITATION_BUTTON_LABEL, + CITATION_BUTTON_TEXT, + EVALUATION_QUEUE_TEXT, + INTRODUCTION_TEXT, + LLM_BENCHMARKS_TEXT, + TITLE, +) + +from src.tasks import ( + TE_DESCRIPTION, +) + +from src.display.css_html_js import custom_css +from src.display.utils import ( + BENCHMARK_COLS, + COLS, + EVAL_COLS, + EVAL_TYPES, + AutoEvalColumn, + ModelType, + fields, + WeightType, + Precision +) +from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN +from src.populate import get_evaluation_queue_df, get_leaderboard_df +from src.submission.submit import add_new_eval + + +def restart_space(): + API.restart_space(repo_id=REPO_ID) + +### Space initialisation +try: + print(EVAL_REQUESTS_PATH) + snapshot_download( + repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, 
def init_leaderboard(dataframe):
    """Build the leaderboard component using the default column layout.

    Thin wrapper kept for existing callers; all of the construction logic
    lives in ``init_leaderboard2`` so the two cannot drift apart.

    Args:
        dataframe: Table to display; must be a non-empty DataFrame.

    Returns:
        The configured ``Leaderboard`` instance.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    return init_leaderboard2(dataframe)


def init_leaderboard2(dataframe, default_selection=None, hidden_columns=None):
    """Build the leaderboard component, optionally overriding the column layout.

    Args:
        dataframe: Table to display; must be a non-empty DataFrame.
        default_selection: Column names selected initially. When ``None`` or
            empty, the ``AutoEvalColumn`` fields flagged
            ``displayed_by_default`` are used.
        hidden_columns: Column names to hide. When ``None`` or empty, the
            ``AutoEvalColumn`` fields flagged ``hidden`` are used.

    Returns:
        The configured ``Leaderboard`` instance.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    # Validate first so a bad input fails before any component is built.
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Evaluate the column metadata once instead of once per argument.
    columns = list(fields(AutoEvalColumn))

    # NOTE: `or` (not `is None`) is intentional here to preserve the existing
    # behaviour where an empty list also falls back to the defaults.
    selected = default_selection or [c.name for c in columns if c.displayed_by_default]
    hidden = hidden_columns or [c.name for c in columns if c.hidden]

    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in columns],
        select_columns=SelectColumns(
            default_selection=selected,
            cant_deselect=[c.name for c in columns if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=hidden,
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.01,
                max=150,
                label="Select the number of parameters (B)",
            ),
            ColumnFilter(
                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
", elem_id="llm-benchmark-tab-table", id=3): + with gr.Column(): + with gr.Row(): + gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") + + with gr.Column(): + with gr.Accordion( + f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", + open=False, + ): + with gr.Row(): + finished_eval_table = gr.components.Dataframe( + value=finished_eval_queue_df, + headers=EVAL_COLS, + datatype=EVAL_TYPES, + row_count=5, + ) + with gr.Accordion( + f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", + open=False, + ): + with gr.Row(): + running_eval_table = gr.components.Dataframe( + value=running_eval_queue_df, + headers=EVAL_COLS, + datatype=EVAL_TYPES, + row_count=5, + ) + + with gr.Accordion( + f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})", + open=False, + ): + with gr.Row(): + pending_eval_table = gr.components.Dataframe( + value=pending_eval_queue_df, + headers=EVAL_COLS, + datatype=EVAL_TYPES, + row_count=5, + ) + with gr.Row(): + gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text") + + with gr.Row(): + with gr.Column(): + model_name_textbox = gr.Textbox(label="Model name") + revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main") + model_type = gr.Dropdown( + choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown], + label="Model type", + multiselect=False, + value=None, + interactive=True, + ) + + with gr.Column(): + precision = gr.Dropdown( + choices=[i.value.name for i in Precision if i != Precision.Unknown], + label="Precision", + multiselect=False, + value="float16", + interactive=True, + ) + weight_type = gr.Dropdown( + choices=[i.value.name for i in WeightType], + label="Weights type", + multiselect=False, + value="Original", + interactive=True, + ) + base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)") + + submit_button = gr.Button("Submit Eval") + submission_result = gr.Markdown() + submit_button.click( + add_new_eval, + [ + 
model_name_textbox, + base_model_name_textbox, + revision_name_textbox, + precision, + weight_type, + model_type, + ], + submission_result, + ) + + + with gr.TabItem("TE", elem_id="llm-benchmark-tab-table", id=4): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + #leaderboard = init_leaderboard(LEADERBOARD_DF) + + LEADERBOARD_DF_TE = LEADERBOARD_DF.rename(columns={"TE Prompt Average": "Prompt Average", + "TE Best Prompt": "Best Prompt", + "TE Best Prompt Id": "Best Prompt Id", + "TE": "Combined Performance"}) + + leaderboard = init_leaderboard2( + LEADERBOARD_DF_TE, + default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'], + hidden_columns=[col for col in LEADERBOARD_DF.columns if + col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']] + ) + + + with gr.TabItem("SA", elem_id="llm-benchmark-tab-table", id=5): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + LEADERBOARD_DF_SA = LEADERBOARD_DF.rename(columns={"SA Prompt Average": "Prompt Average", + "SA Best Prompt": "Best Prompt", + "SA Best Prompt Id": "Best Prompt Id", + "SA": "Combined Performance"}) + + leaderboard = init_leaderboard2( + LEADERBOARD_DF_SA, + default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', + 'Best Prompt Id'], + hidden_columns=[col for col in LEADERBOARD_DF.columns if + col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', + 'Best Prompt Id']] + ) + + + + + with gr.TabItem("HS", elem_id="llm-benchmark-tab-table", id=6): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + LEADERBOARD_DF_HS = LEADERBOARD_DF.rename(columns={"HS Prompt Average": "Prompt Average", + "HS Best Prompt": "Best Prompt", + "HS Best Prompt Id": "Best Prompt Id", + "HS": "Combined Performance"}) + + leaderboard = init_leaderboard2( + LEADERBOARD_DF_HS, + default_selection=['T', 'Model', 'Combined Performance', 'Prompt 
Average', 'Best Prompt', + 'Best Prompt Id'], + hidden_columns=[col for col in LEADERBOARD_DF.columns if + col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', + 'Best Prompt Id']] + ) + + + + with gr.TabItem("AT", elem_id="llm-benchmark-tab-table", id=7): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + with gr.TabItem("WIC", elem_id="llm-benchmark-tab-table", id=8): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + with gr.TabItem("FAQ", elem_id="llm-benchmark-tab-table", id=9): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + with gr.TabItem("LS", elem_id="llm-benchmark-tab-table", id=10): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + with gr.TabItem("SU", elem_id="llm-benchmark-tab-table", id=11): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + with gr.TabItem("NER", elem_id="llm-benchmark-tab-table", id=12): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + with gr.TabItem("REL", elem_id="llm-benchmark-tab-table", id=13): + gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text") + + + with gr.Row(): + with gr.Accordion("📙 Citation", open=False): + citation_button = gr.Textbox( + value=CITATION_BUTTON_TEXT, + label=CITATION_BUTTON_LABEL, + lines=20, + elem_id="citation-button", + show_copy_button=True, + ) + +scheduler = BackgroundScheduler() +scheduler.add_job(restart_space, "interval", seconds=1800) +scheduler.start() +demo.queue(default_concurrency_limit=40).launch() \ No newline at end of file diff --git a/example_app2.py b/example_app2.py new file mode 100644 index 0000000000000000000000000000000000000000..9268e66807d66f4d99c6c97a748691f46972e4e8 --- /dev/null +++ b/example_app2.py @@ -0,0 +1,216 @@ +import gradio as gr +from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns +import pandas as pd +from apscheduler.schedulers.background import BackgroundScheduler +from huggingface_hub import snapshot_download + +from 
src.about import ( + CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, + INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE +) +from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION +from src.display.css_html_js import custom_css +from src.display.utils import ( + BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, + ModelType, fields, WeightType, Precision +) +from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN +from src.populate import get_evaluation_queue_df, get_leaderboard_df +from src.submission.submit import add_new_eval + + + + +# Define the task icons and names +TASK_ICONS = { + "TE": "📊", # Textual Entailment + "SA": "😃", # Sentiment Analysis + "HS": "⚠️", # Hate Speech + "AT": "🏥", # Admission Test + "WIC": "🔤", # Word in Context + "FAQ": "❓", # Frequently Asked Questions + "LS": "🔄", # Lexical Substitution + "SU": "📝", # Summarization + "NER": "🏷️", # Named Entity Recognition + "REL": "🔗", # Relation Extraction +} + +TASK_NAMES = { + "TE": "Textual Entailment", + "SA": "Sentiment Analysis", + "HS": "Hate Speech", + "AT": "Admission Test", + "WIC": "Word in Context", + "FAQ": "Frequently Asked Questions", + "LS": "Lexical Substitution", + "SU": "Summarization", + "NER": "Named Entity Recognition", + "REL": "Relation Extraction", +} + + +# Tooltip descriptions for each task +TASK_TOOLTIPS = { + "TE": "Identify logical relationships between two text segments.", + "SA": "Classify the sentiment (positive, negative, neutral) of a text.", + "HS": "Detect hate speech in a text.", + "AT": "Classify whether a clinical statement pertains to an admission test.", + "WIC": "Identify words in context and their meaning.", + "FAQ": "Answer frequently asked questions based on given text.", + "LS": "Identify alternative words in a given context.", + "SU": "Summarize long text into a shorter version.", + "NER": "Identify named entities (e.g., persons, locations, organizations) in text.", + 
"REL": "Extract and link laboratory test results to the respective tests in clinical narratives.", +} + + + + +def restart_space(): + """Restart the Hugging Face space.""" + API.restart_space(repo_id=REPO_ID) + + +def download_snapshot(repo, local_dir): + """Try to download a snapshot from the Hugging Face Hub, restarting space on failure.""" + try: + print(f"Downloading from {repo} to {local_dir}...") + snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN) + except Exception as e: + print(f"Error downloading {repo}: {e}") + restart_space() + + +# Space initialization +download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH) +download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH) + +# Load leaderboard and evaluation queue data +LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS) +finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS) + + +def init_leaderboard(dataframe, default_selection=None, hidden_columns=None): + """Initialize a leaderboard with specific columns.""" + if dataframe is None or dataframe.empty: + raise ValueError("Leaderboard DataFrame is empty or None.") + + return Leaderboard( + value=dataframe, + datatype=[c.type for c in fields(AutoEvalColumn)], + select_columns=SelectColumns( + default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default], + cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden], + label="Select Columns to Display:", + ), + search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name], + hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden], + filter_columns=[ + #ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"), + ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="Few-Shot Learning (FS)"), + 
#ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"), + ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"), + #ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True), + ], + bool_checkboxgroup_label="Hide models", + interactive=False, + ) + + +def prepare_leaderboard_df(df, task_prefix): + """Rename columns for a specific task to a standard format.""" + return df.rename(columns={ + f"{task_prefix} Prompt Average": "Prompt Average", + f"{task_prefix} Best Prompt": "Best Prompt", + f"{task_prefix} Best Prompt Id": "Best Prompt Id", + task_prefix: "Combined Performance" + }) + + +demo = gr.Blocks(css=custom_css) +with demo: + gr.HTML(TITLE) + gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") + + with gr.Tabs(elem_classes="tab-buttons") as tabs: + # Main leaderboard tab + with gr.TabItem("🏅 EVALITA-LLM Benchmark", elem_id="llm-benchmark-tab-table"): + leaderboard = init_leaderboard( + LEADERBOARD_DF, + default_selection=['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"], + hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in + ['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]] + ) + + # About tab + with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table"): + gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") + + ''' + # Submission tab + with gr.TabItem("🚀 Submit here! 
", elem_id="llm-benchmark-tab-table"): + gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") + + for queue_name, queue_df in [ + ("✅ Finished Evaluations", finished_eval_queue_df), + ("🔄 Running Evaluation Queue", running_eval_queue_df), + ("⏳ Pending Evaluation Queue", pending_eval_queue_df) + ]: + with gr.Accordion(f"{queue_name} ({len(queue_df)})", open=False): + gr.components.Dataframe(value=queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5) + + gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text") + with gr.Row(): + model_name_textbox = gr.Textbox(label="Model name") + revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main") + model_type = gr.Dropdown(choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown], + label="Model type", multiselect=False, interactive=True) + precision = gr.Dropdown(choices=[i.value.name for i in Precision if i != Precision.Unknown], + label="Precision", multiselect=False, value="float16", interactive=True) + weight_type = gr.Dropdown(choices=[i.value.name for i in WeightType], + label="Weights type", multiselect=False, value="Original", interactive=True) + base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)") + + submit_button = gr.Button("Submit Eval") + submission_result = gr.Markdown() + submit_button.click( + add_new_eval, + [model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type], + submission_result, + ) + ''' + + # Task-specific leaderboards + for task in ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]: + + with gr.TabItem(f"{TASK_ICONS[task]}{task}", elem_id="llm-benchmark-tab-table"): + + task_description = TASK_DESCRIPTIONS.get(task, "Description not available.") + + + + + gr.Markdown(task_description, elem_classes="markdown-text") + + + gr.Markdown(MEASURE_DESCRIPTION, elem_classes="markdown-text") + + + + leaderboard = init_leaderboard( + 
prepare_leaderboard_df(LEADERBOARD_DF, task), + default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'], + hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in + ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']] + ) + + # Citation section + with gr.Accordion("📙 Citation", open=False): + gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True) + +# Background job to restart space +scheduler = BackgroundScheduler() +scheduler.add_job(restart_space, "interval", seconds=1800) +scheduler.start() + +demo.queue(default_concurrency_limit=40).launch() \ No newline at end of file diff --git a/get_model_info.py b/get_model_info.py new file mode 100644 index 0000000000000000000000000000000000000000..1a1f893faac9c892eff482d8027cd1fe724a1c6c --- /dev/null +++ b/get_model_info.py @@ -0,0 +1,128 @@ +""" +MODEL METADATA EXTRACTOR + +This script processes model evaluation output files (input_folder) from the lm-eval-harness library, +extracts model identifiers, retrieves detailed metadata from HuggingFace +and saves the information as structured JSON files (output_folder). + +Input: Directory containing .out files from lm-eval-harness +Output: Directory with JSON files containing model metadata +""" + +# Example input file format (lm-eval-harness output): +''' +hf (pretrained=swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 1 +| Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr| +|------------------------|------:|------|-----:|--------|---|-----:|---|------| +|evalita-mp | 1|none | |acc |↑ |0.5605|± |0.0052| +... 
+Job completed +''' + +# Example output JSON format: +''' +{ + "model": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA", + "base_model": "LlamaForCausalLM", + "revision": "2b6e46e4c9d341dc8bf8350a167492c880116b66", + "submitted_time": "2024-04-29 09:34:12+00:00", + "num_params_billion": 8.030261248, + "language": "en_it" +} +''' + +import os +import re +import json +from huggingface_hub import HfApi + +# Configures the Hugging Face token (if needed) +# TOKEN = "YOUR_HUGGINGFACE_API_TOKEN" +api = HfApi() + +# Directory paths +# input_folder: Directory containing the output files of the lm-eval-harness library, including model accuracy metrics. +input_folder = "../evalita_llm_models_output/" +# output_folder: Directory where JSON files with model characteristics will be saved. +output_folder = "../evalita_llm_requests/" + +# Creates the output folder if it doesn't exist +os.makedirs(output_folder, exist_ok=True) + +# Regular expression to find the model name +model_pattern = re.compile(r"pretrained=([\w\-./]+)") + +# Scans files in the input folder +for filename in os.listdir(input_folder): + if filename.endswith('.out'): + file_path = os.path.join(input_folder, filename) + + # Reads the file content + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + + # Extracts the model name + match = model_pattern.search(content) + if match: + model_name = match.group(1) + print(f"Processing model: {model_name}") + + try: + # Retrieves model information from HuggingFace + model_info = api.model_info(model_name) + + # Calculates the number of parameters in billions, if available + num_params = None + if model_info.safetensors and "BF16" in model_info.safetensors.parameters: + num_params = model_info.safetensors.parameters["BF16"] / 1e9 # Convert to billions + + # Extracts and concatenates languages + language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else "" + + #print(model_info) + + # Builds the dictionary with required 
metadata + model_data = { + "model": model_name, + "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "", + "revision": model_info.sha, + # "precision": "bfloat16", # If available, replace with real value + # "weight_type": "Original", + # "status": "FINISHED", + "submitted_time": str(model_info.created_at), + # "model_type": "pretrained", + # "likes": model_info.likes, + # "params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None, + # "license": model_info.license, + # "private": model_info.private, + "num_params_billion": num_params, # Number of parameters in billions + "language": language, # Extracted language + } + + # Separates the model_name into two parts: directory name and file name + if "/" in model_name: + dir_name, file_name = model_name.split("/", 1) + else: + dir_name, file_name = model_name, model_name # If no "/", use the same name + + # Creates the folder for saving the produced json files + model_output_folder = os.path.join(output_folder, dir_name) + os.makedirs(model_output_folder, exist_ok=True) + + # Saves the JSON file in the appropriate folder + output_file = os.path.join(model_output_folder, f"{file_name}.json") + + # Check if the file already exists + if os.path.exists(output_file): + print(f"File {output_file} already exists. 
Skipping...") + continue + + with open(output_file, "w", encoding="utf-8") as f: + json.dump(model_data, f, indent=4) + + print(f"Saved metadata for {model_name} in {output_file}") + + except Exception as e: + print(f"Error retrieving info for {model_name}: {e}") + + print("Process finished!") \ No newline at end of file diff --git a/preprocess_models_output.py b/preprocess_models_output.py new file mode 100644 index 0000000000000000000000000000000000000000..62e2a1bcc07c528120cb90c29ab115b41dcc71ca --- /dev/null +++ b/preprocess_models_output.py @@ -0,0 +1,290 @@ +""" +EVALITA LLM EVALUATION PROCESSOR + +Transforms raw model evaluation outputs into structured performance reports for leaderboard integration. + +DATA PIPELINE OVERVIEW: + +1. Inputs: + - Evaluation Results: Raw .out files from lm-eval-harness + - Model Metadata: Pre-collected .json files from HuggingFace + +2. Output: + - Comprehensive evaluation reports in JSON format + - Ready for ingestion into the evaluation leaderboard + +-------------------------------------------------------------------- +INPUT SPECIFICATION + +Evaluation Results (.out format): + hf (pretrained=model-org/model-name), num_fewshot: 5, batch_size: 1 + | Task | Metric | Value | Stderr | + |---------------|--------|--------|--------| + | main-task | acc | 0.5605 | 0.0052 | + | - sub-task | acc | 0.4640 | 0.0088 | + | - prompt-1 | acc | 0.3720 | 0.0216 | + +Model Metadata (.json format): + { + "model": "model-org/model-name", + "base_model": "ModelArchitecture", + "revision": "git_commit_hash", + "parameters": 8.03, + "language": "en_it" + } + +-------------------------------------------------------------------- +OUTPUT SPECIFICATION + +Evaluation Report (.json format): + { + "summary_metrics": { + "average_CPS": 41.74, + "num_tasks": 12 + }, + "model_config": { + "identifier": "model-org/model-name", + "architecture": "ModelArchitecture", + "parameters": 8.03, + "evaluation_settings": { + "fewshot": 5, + "batch_size": 1 + } + }, + 
"task_results": { + "task-name": { + "average_score": 52.60, + "best_prompt": { + "id": "prompt-6", + "score": 66.57 + }, + "prompt_analysis": [ + { + "prompt_id": "prompt-1", + "score": 37.20, + "stderr": 0.0216 + } + ] + } + } + } +""" + +import json +import os +import re +import statistics + +TASKS ={"NER", "RE", "DIA", "RML", "HIS" } + +def safe_float(value): + """Safely converts a value to float, returning None if the conversion fails.""" + try: + return float(value) + except ValueError: + return None + + +def calculate_task_metrics(task_info): + """Calculates average accuracy, best prompt accuracy, and CPS for a given task.""" + accuracies = [prompt['value'] for prompt in task_info['prompts'] if prompt['value'] is not None] + + if not accuracies: + return None + + task_info['average_accuracy'] = sum(accuracies) / len(accuracies) + task_info['std_accuracy'] = statistics.stdev(accuracies) if len(accuracies) > 1 else 0.0 + best_prompt_data = max(task_info['prompts'], key=lambda x: x['value']) + task_info['best_prompt'] = best_prompt_data['value'] + task_info['prompt_id'] = best_prompt_data['prompt'] + + # Calculate CPS + avg_acc = task_info['average_accuracy'] + best_acc = task_info['best_prompt'] + task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc + + +def extract_data_from_file(file_path): + """Extracts task and prompt data from a specified file.""" + + + LANG="" + if file_path.find ("__en__")!=-1 : LANG="EN" + if file_path.find ("__sl__")!=-1 : LANG="SL" + if file_path.find ("__it__")!=-1 : LANG="IT" + if file_path.find ("__gr__")!=-1 : LANG="GR" + if file_path.find ("__sk__")!=-1 : LANG="SK" + if file_path.find ("__pl__")!=-1 : LANG="PL" + if LANG=="" : + print ("ERROR: ",file_path) + + with open(file_path, 'r') as file: + lines = file.readlines() + + tasks_data = {} + + current_task = None + + for line in lines: + line = line.strip() + + # Skips empty lines + if not line: + continue + + # Skips header lines + if line.startswith("| Tasks") or 
line.startswith(" | Task"): + continue + + # Extracts model configuration details + if line.startswith("hf (pretrained=") or line.startswith("hf(pretrained="): + start = line.find("pretrained=") + len("pretrained=") + end = line.find(" )", start) + pretrained_model = line[start:end] + + num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line) + num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None + + batch_size_match = re.search(r"batch_size:\s*(\d+)", line) + batch_size = int(batch_size_match.group(1)) if batch_size_match else None + + continue + + columns = line.split('|') + if len(columns) != 11: + continue + print (columns) + task_name = columns[1] + metric = columns[5].strip() + value = safe_float(columns[7]) + stderr = safe_float(columns[9]) + print (value) + # Skips normalized accuracy metrics + if metric == "acc_norm": + continue + + # Identifies task and prompt sections in the file + if task_name.startswith(" - "): + task_name = task_name[3:].strip() + current_task = task_name + tasks_data.setdefault(current_task, + {'prompts': [], 'average_accuracy': 0, 'best_prompt': None, 'prompt_id': None, + 'CPS': None, 'is_dummy': False }) + + elif task_name.startswith(" - ") and current_task: + prompt_name = task_name[4:].strip() + prompt_data = {'prompt': prompt_name, 'metric': metric, 'value': value * 100, + 'stderr': stderr} + tasks_data[current_task]['prompts'].append(prompt_data) + + # Special handling for evalita NER task to calculate weighted prompt averages + if "evalita NER" in tasks_data: + task_info = tasks_data["evalita NER"] + weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517, + "WN prompt-1": 2088, "WN prompt-2": 2088} + + weighted_values = {"prompt-1": 0, "prompt-2": 0} + total_weights = sum(weight_map.values()) + + for prompt in task_info['prompts']: + if prompt['prompt'] in weight_map: + if "prompt-1" in prompt['prompt']: + weighted_values["prompt-1"] += 
weight_map[prompt['prompt']] * prompt['value'] + elif "prompt-2" in prompt['prompt']: + weighted_values["prompt-2"] += weight_map[prompt['prompt']] * prompt['value'] + + task_info['prompts'] = [ + {"prompt": "prompt-1", "metric": "acc", "value": weighted_values["prompt-1"] / total_weights, + 'stderr': None}, + {"prompt": "prompt-2", "metric": "acc", "value": weighted_values["prompt-2"] / total_weights, + 'stderr': None}] + + # Inject dummy entries for any TASKS not present in the file + for must_have in TASKS: + if must_have not in tasks_data: + tasks_data[must_have] = { + 'prompts': [ + {'prompt': 'p1', 'metric': 'f1', 'value': 0.0, 'stderr': 0.0} + ], + 'average_accuracy': None, + 'std_accuracy': None, + 'best_prompt': None, + 'prompt_id': None, + 'CPS': None, + 'is_dummy': True # <--- mark as dummy + } + + # Calculates task metrics for each task + for task_info in tasks_data.values(): + if task_info.get('is_dummy'): + # leave metrics as None for dummy tasks + continue + calculate_task_metrics(task_info) + + # Calculates the average CPS across all tasks + tasks_with_cps = [ + t['CPS'] for t in tasks_data.values() if not t.get('is_dummy') and t.get('CPS') is not None ] + average_CPS = (sum(tasks_with_cps) / len(tasks_with_cps)) if tasks_with_cps else 0 + + #tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None] + #average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0 + + config = { + "model_name": pretrained_model, + "num_fewshot": num_fewshot, + "batch_size": batch_size, + "LANG": LANG + } + + return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data} + + +""" +MAIN PROCESSING PIPELINE + +This script executes the complete evaluation data processing workflow: + +1. Input Sources: + - Raw evaluation results (.out files) from: ../evalita_llm_models_output/ + - Model metadata JSON files from: ../evalita_llm_requests/ + +2. 
Processing Steps: + - Parses evaluation metrics from .out files + - Combines with model metadata + - Calculates aggregated performance statistics + +3. Output: + - Structured JSON results saved to: ../evalita_llm_results/ + - Organized by model organization/name + - Contains complete evaluation results with metadata +""" +directory_in_path = '/home/sfarzi/leaderboard/trail_leaderboard/csv_new/output/' +directory_in_requests_path = '/home/sfarzi/leaderboard/trail_leaderboard/e3c_llm_requests/' +directory_out_results_path = '/home/sfarzi/leaderboard/trail_leaderboard/e3c_llm_results/' + +for filename in os.listdir(directory_in_path): + if filename.endswith('.txt'): + file_path = os.path.join(directory_in_path, filename) + json_output = extract_data_from_file(file_path) + + model_org_name, model_name = json_output['config']['model_name'].split('/') + + + config_file_path = os.path.join(directory_in_requests_path, model_org_name, f"{model_name}.json") + + if os.path.exists(config_file_path): + with open(config_file_path, 'r', encoding='utf-8') as config_file: + additional_config = json.load(config_file) + json_output['config'].update(additional_config) + + + org_folder_path = os.path.join(directory_out_results_path, model_org_name) + os.makedirs(org_folder_path, exist_ok=True) + + file_suffix = f"{json_output['config']['num_fewshot']}" +"_"+ f"{json_output['config']['LANG']}" + output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json") + + with open(output_file_path, 'w', newline="\n") as outfile: + json.dump(json_output, outfile, indent=4) + + print(f"File {filename} processed and saved to {output_file_path}") \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..3b4737924b5a7d81c962a4e28b66ac6cdcc3b004 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,13 @@ +[tool.ruff] +# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default. 
+select = ["E", "F"] +ignore = ["E501"] # line too long (black is taking care of this) +line-length = 119 +fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] + +[tool.isort] +profile = "black" +line_length = 119 + +[tool.black] +line-length = 119 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..46d41ea882da58a810ff984860b8fda48abf8f04 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,17 @@ +APScheduler +black +datasets +gradio +gradio[oauth] +gradio_leaderboard==0.0.13 +gradio_client +huggingface-hub>=0.18.0 +matplotlib +numpy +pandas +python-dateutil +tqdm +transformers +tokenizers>=0.15.0 +sentencepiece +plotly diff --git a/run_instructions.txt b/run_instructions.txt new file mode 100644 index 0000000000000000000000000000000000000000..a750b69dfb0a3a63c8ef77fb6bfef3c5bc9b2f2c --- /dev/null +++ b/run_instructions.txt @@ -0,0 +1,42 @@ +Model Evaluation and Leaderboard + +1) Model Evaluation +Before integrating a model into the leaderboard, it must first be evaluated using the lm-eval-harness library in both zero-shot and 5-shot configurations. + +This can be done with the following command: + +lm_eval --model hf --model_args pretrained=google/gemma-3-12b-it \ + --tasks evalita-mp --device cuda:0 --batch_size 1 --trust_remote_code \ + --output_path model_output --num_fewshot 5 -- + +The output generated by the library will include the model's accuracy scores on the benchmark tasks. +This output is written to the standard output and should be saved in a txt file (e.g., slurm-8368.out), which needs to be placed in the + evalita_llm_models_output directory for further processing. 
+ +2) Extracting Model Metadata +To display model details on the leaderboard (e.g., organization/group, model name, and parameter count), metadata must be retrieved from Hugging Face. + +This can be done by running: + +python get_model_info.py + +This script processes the evaluation files from Step 1 and saves each model's metadata in a JSON file within the evalita_llm_requests directory. + +3) Generating Leaderboard Submission File +The leaderboard requires a structured file containing each model’s metadata along with its benchmark accuracy scores. + +To generate this file, run: + +python preprocess_model_output. + +This script combines the accuracy results from Step 1 with the metadata from Step 2 and outputs a JSON file in the evalita_llm_results directory. + +4) Updating the Hugging Face Repository +The evalita_llm_results repository on HuggingFace must be updated with the newly generated files from Step 3. + +5) Running the Leaderboard Application +Finally, execute the leaderboard application by running: + +python app.py + + diff --git a/src/.ipynb_checkpoints/about-checkpoint.py b/src/.ipynb_checkpoints/about-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..36db643246b90c32a8c8262d87af92c878ba7cfe --- /dev/null +++ b/src/.ipynb_checkpoints/about-checkpoint.py @@ -0,0 +1,198 @@ +from dataclasses import dataclass +from enum import Enum + +@dataclass +class Task: + benchmark: str + metric: str + metric_type: str + col_name: str + +# Select your tasks here +# --------------------------------------------------- +class Tasks(Enum): + # task_key in the json file, metric_key in the json file, name to display in the leaderboard + + task1 = Task("text-entailment_1", "acc", "CPS", "TE") + task2 = Task("text-entailment_2", "acc", "average_accuracy", "TE Prompt Average") + task3 = Task("text-entailment_3", "acc", "std_accuracy", "TE Prompt Std") + task4 = Task("text-entailment_4", "acc", "best_prompt", "TE Best Prompt") + task5 = 
Task("text-entailment_5", "acc", "prompt_id", "TE Best Prompt Id") + + task6 = Task("sentiment-analysis_1", "acc", "CPS", "SA") + task7 = Task("sentiment-analysis_2", "acc", "average_accuracy", "SA Prompt Average") + task8 = Task("sentiment-analysis_3", "acc", "std_accuracy", "SA STD Accuracy") + task9 = Task("sentiment-analysis_4", "acc", "best_prompt", "SA Best Prompt") + task10 = Task("sentiment-analysis_5", "acc", "prompt_id", "SA Best Prompt Id") + + task11 = Task("hate-speech-detection_1", "acc", "CPS", "HS") + task12 = Task("hate-speech-detection_2", "acc", "average_accuracy", "HS Prompt Average") + task13 = Task("hate-speech-detection_3", "acc", "std_accuracy", "HS Prompt Std") + task14 = Task("hate-speech-detection_4", "acc", "best_prompt", "HS Best Prompt") + task15 = Task("hate-speech-detection_5", "acc", "prompt_id", "HS Best Prompt Id") + + task16 = Task("admission-test_1", "acc", "CPS", "AT") + task17 = Task("admission-test_2", "acc", "average_accuracy", "AT Prompt Average") + task18 = Task("admission-test_3", "acc", "std_accuracy", "AT Prompt Std") + task19 = Task("admission-test_4", "acc", "best_prompt", "AT Best Prompt") + task20 = Task("admission-test_5", "acc", "prompt_id", "AT Best Prompt Id") + + task21 = Task("word-in-context_1", "acc", "CPS", "WIC") + task22 = Task("word-in-context_2", "acc", "average_accuracy", "WIC Prompt Average") + task23 = Task("word-in-context_3", "acc", "std_accuracy", "WIC Prompt Std") + task24 = Task("word-in-context_4", "acc", "best_prompt", "WIC Best Prompt") + task25 = Task("word-in-context_5", "acc", "prompt_id", "WIC Best Prompt Id") + + task26 = Task("faq_1", "acc", "CPS", "FAQ") + task27 = Task("faq_2", "acc", "average_accuracy", "FAQ Prompt Average") + task28 = Task("faq_3", "acc", "std_accuracy", "FAQ Prompt Std") + task29 = Task("faq_4", "acc", "best_prompt", "FAQ Best Prompt") + task30 = Task("faq_5", "acc", "prompt_id", "FAQ Best Prompt Id") + + task31 = Task("lexical-substitution_1", "acc", "CPS", "LS") 
+ task32 = Task("lexical-substitution_2", "acc", "average_accuracy", "LS Prompt Average") + task33 = Task("lexical-substitution_3", "acc", "std_accuracy", "LS Prompt Std") + task34 = Task("lexical-substitution_4", "acc", "best_prompt", "LS Best Prompt") + task35 = Task("lexical-substitution_5", "acc", "prompt_id", "LS Best Prompt Id") + + task36 = Task("summarization-fanpage_1", "acc", "CPS", "SU") + task37 = Task("summarization-fanpage_2", "acc", "average_accuracy", "SU Prompt Average") + task38 = Task("summarization-fanpage_3", "acc", "std_accuracy", "SU Prompt Std") + task39 = Task("summarization-fanpage_4", "acc", "best_prompt", "SU Best Prompt") + task40 = Task("summarization-fanpage_5", "acc", "prompt_id", "SU Best Prompt Id") + + task41 = Task("evalita NER_1", "acc", "CPS", "NER") + task42 = Task("evalita NER_2", "acc", "average_accuracy", "NER Prompt Average") + task43 = Task("evalita NER_3", "acc", "std_accuracy", "NER Prompt Std") + task44 = Task("evalita NER_4", "acc", "best_prompt", "NER Best Prompt") + task45 = Task("evalita NER_5", "acc", "prompt_id", "NER Best Prompt Id") + + task46 = Task("relation-extraction_1", "acc", "CPS", "REL") + task47 = Task("relation-extraction_2", "acc", "average_accuracy", "REL Prompt Average") + task48 = Task("relation-extraction_5", "acc", "std_accuracy", "REL Prompt Std") + task49 = Task("relation-extraction_3", "acc", "best_prompt", "REL Best Prompt") + task50 = Task("relation-extraction_4", "acc", "prompt_id", "REL Best Prompt Id") + + ''' + task0 = Task("TextualEntailment", "acc", "Textual Entailment") + task1 = Task("TextualEntailment_best", "acc", "TextualEntailment Best") + task2 = Task("Sentiment Analysis", "acc", "Sentiment Analysis") + task3 = Task("Sentiment Analysis_best", "acc", "Sentiment Analysis_best") + task4 = Task("Hate Speech", "acc", "Hate Speech") + task5 = Task("Hate Speech_best", "acc", "Hate Speech_best") + task6 = Task("Admission Test", "acc", "Admission Test") + task7 = Task("Admission 
Test_best", "acc", "Admission Test_best") + task8 = Task("Word in Context", "acc", "Word in Context") + task9 = Task("Word in Context_best", "acc", "Word in Context_best") + task10 = Task("FAQ", "acc", "FAQ") + task11 = Task("FAQ_best", "acc", "FAQ_best") + task12 = Task("Lexical Substitution", "acc", "Lexical Substitution") + task13 = Task("Lexical Substitution_best", "acc", "Lexical Substitution_best") + task14 = Task("Summarization", "acc", "Summarization") + task15 = Task("Summarization_best", "acc", "Summarization_best") + task16 = Task("NER", "acc", "NER") + task17 = Task("NER_best", "acc", "NER_best") + task18 = Task("REL", "acc", "REL") + task19 = Task("REL_best", "acc", "REL_best") + ''' + +# Your leaderboard name +TITLE = """

🚀 EVALITA-LLM Leaderboard 🚀

""" + +# What does your leaderboard evaluate? +INTRODUCTION_TEXT = """ +Evalita-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing features of Evalita-LLM are the following: (i) **all tasks are native Italian**, avoiding translation issues and potential cultural biases; (ii) the benchmark includes **generative** tasks, enabling more natural interaction with LLMs; (iii) **all tasks are evaluated against multiple prompts**, this way mitigating the model sensitivity to specific prompts and allowing a fairer evaluation. + +**Multiple-choice tasks:** 📊TE (Textual Entailment), 😃SA (Sentiment Analysis), ⚠️HS (Hate Speech Detection), 🏥AT (Admission Test), 🔤WIC (Word in Context), ❓FAQ (Frequently Asked Questions)
+**Generative tasks:** 🔄LS (Lexical Substitution), 📝SU (Summarization), 🏷️NER (Named Entity Recognition), 🔗REL (Relation Extraction) +""" + +# Which evaluations are you running? how can people reproduce what you have? +LLM_BENCHMARKS_TEXT = f""" +### Groups + +- `evalita-mp`: All tasks (perplexity and non-perplexity based). +- `evalita-mp_gen`: Only generative tasks. +- `evalita-mp_mc`: Only multiple-choice tasks. + +#### Tasks + +The following Evalita-LLM tasks can also be evaluated in isolation: + - `evalita-mp_te`: Textual Entailment (TE) + - `evalita-mp_sa`: Sentiment Analysis (SA) + - `evalita-mp_wic`: Word in Context (WIC) + - `evalita-mp_hs`: Hate Speech Detection (HS) + - `evalita-mp_at`: Admission Tests (AT) + - `evalita-mp_faq`: Frequently Asked Questions & Question Answering (FAQ) + - `evalita-mp_sum_fp`: Summarization (SU) + - `evalita-mp_ls`: Lexical Substitution LS) + - `evalita-mp_ner_group`: Named Entity Recognition (NER) + - `evalita-mp_re`: Relation Extraction (REL) + + +### Usage + +```bash + +lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf --tasks evalita-mp --device cuda:0 --batch_size 1 +``` + + + + +""" + +EVALUATION_QUEUE_TEXT = """ +## Some good practices before submitting a model + +### 1) Make sure you can load your model and tokenizer using AutoClasses: +```python +from transformers import AutoConfig, AutoModel, AutoTokenizer +config = AutoConfig.from_pretrained("your model name", revision=revision) +model = AutoModel.from_pretrained("your model name", revision=revision) +tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision) +``` +If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded. + +Note: make sure your model is public! +Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted! 
+ +### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index) +It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`! + +### 3) Make sure your model has an open license! +This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗 + +### 4) Fill up your model card +When we add extra information about models to the leaderboard, it will be automatically taken from the model card + +## In case of model failure +If your model is displayed in the `FAILED` category, its execution stopped. +Make sure you have followed the above steps first. +If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task). +""" + +CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" +CITATION_BUTTON_TEXT = r""" +@misc{magnini2025evalitallmbenchmarkinglargelanguage, + title={Evalita-LLM: Benchmarking Large Language Models on Italian}, + author={Bernardo Magnini and Roberto Zanoli and Michele Resta and Martin Cimmino and Paolo Albano and Marco Madeddu and Viviana Patti}, + year={2025}, + eprint={2502.02289}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2502.02289}, +} +""" diff --git a/src/.ipynb_checkpoints/envs-checkpoint.py b/src/.ipynb_checkpoints/envs-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..9db342d84e248b24bae574cb6cb33a42efa92c04 --- /dev/null +++ b/src/.ipynb_checkpoints/envs-checkpoint.py @@ -0,0 +1,46 @@ +import os + +from huggingface_hub import HfApi + +# Info to change for your repository +# ---------------------------------- +TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org + +#OWNER = "giux78" # Change to your org - 
don't forget to create a results and request dataset, with the correct format! +OWNER = "saeedfarzi" +# ---------------------------------- + +#REPO_ID = f"{OWNER}/leaderboard-evalita" +#QUEUE_REPO = f"{OWNER}/evalita-requests" +#RESULTS_REPO = f"{OWNER}/evalita-results" + +REPO_ID = f"{OWNER}/MediLingua_Leaderboard" +QUEUE_REPO = f"{OWNER}/e3c_llm_requests" +RESULTS_REPO = f"{OWNER}/e3c_llm_results" + +# If you setup a cache later, just change HF_HOME +#CACHE_PATH=os.getenv("HF_HOME", "/home/sfarzi/leaderboard/") + +# Local caches +#EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue") +#EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results") +#EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk") +#EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk") + +#EVAL_REQUESTS_PATH ='/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue") +#EVAL_RESULTS_PATH = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results'#os.path.join(CACHE_PATH, "eval-results") +#EVAL_REQUESTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue-bk") +#EVAL_RESULTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results' #os.path.join(CACHE_PATH, "eval-results-bk") + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) + +# Go one directory up from BASE_DIR +PARENT_DIR = os.path.dirname(BASE_DIR) + +# Now set the paths to the directories one level up +EVAL_REQUESTS_PATH = os.path.join(PARENT_DIR, "e3c_llm_requests") +EVAL_RESULTS_PATH = os.path.join(PARENT_DIR, "e3c_llm_results") +EVAL_REQUESTS_PATH_BACKEND = EVAL_REQUESTS_PATH +EVAL_RESULTS_PATH_BACKEND = EVAL_RESULTS_PATH + +API = HfApi(token=TOKEN) diff --git a/src/.ipynb_checkpoints/populate-checkpoint.py b/src/.ipynb_checkpoints/populate-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..5bbaf385b52d6edf173633353b9458b76c868158 --- 
/dev/null +++ b/src/.ipynb_checkpoints/populate-checkpoint.py @@ -0,0 +1,59 @@ +import json +import os + +import pandas as pd + +from src.display.formatting import has_no_nan_values, make_clickable_model +from src.display.utils import AutoEvalColumn, EvalQueueColumn +from src.leaderboard.read_evals import get_raw_eval_results + + +def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame: + """Creates a dataframe from all the individual experiment results""" + raw_data = get_raw_eval_results(results_path, requests_path) + all_data_json = [v.to_dict() for v in raw_data] + + df = pd.DataFrame.from_records(all_data_json) + df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False) + df = df[cols].round(decimals=2) + #df.to_csv("output.csv", index=False) + + # filter out if any of the benchmarks have not been produced + df = df[has_no_nan_values(df, benchmark_cols)] + return df + + +def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]: + """Creates the different dataframes for the evaluation queues requestes""" + entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")] + all_evals = [] + + for entry in entries: + if ".json" in entry: + file_path = os.path.join(save_path, entry) + with open(file_path) as fp: + data = json.load(fp) + + data[EvalQueueColumn.model.name] = make_clickable_model(data["model"]) + data[EvalQueueColumn.revision.name] = data.get("revision", "main") + + all_evals.append(data) + elif ".md" not in entry: + # this is a folder + sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")] + for sub_entry in sub_entries: + file_path = os.path.join(save_path, entry, sub_entry) + with open(file_path) as fp: + data = json.load(fp) + + data[EvalQueueColumn.model.name] = make_clickable_model(data["model"]) + data[EvalQueueColumn.revision.name] = data.get("revision", "main") + 
all_evals.append(data) + + pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]] + running_list = [e for e in all_evals if e["status"] == "RUNNING"] + finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"] + df_pending = pd.DataFrame.from_records(pending_list, columns=cols) + df_running = pd.DataFrame.from_records(running_list, columns=cols) + df_finished = pd.DataFrame.from_records(finished_list, columns=cols) + return df_finished[cols], df_running[cols], df_pending[cols] diff --git a/src/.ipynb_checkpoints/tasks-checkpoint.py b/src/.ipynb_checkpoints/tasks-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..1b3c7f6764cf25ec0f0a0b3705ed89617235a505 --- /dev/null +++ b/src/.ipynb_checkpoints/tasks-checkpoint.py @@ -0,0 +1,183 @@ +from dataclasses import dataclass +from enum import Enum + +@dataclass +class Task: + benchmark: str + # metric: str + accuracy: str + col_name: str + +NUM_FEWSHOT = 0 # Change with your few shot +# --------------------------------------------------- + +# Your leaderboard name +TITLE = """

🚀 EVALITA-LLM Leaderboard 🚀

""" + +# What does your leaderboard evaluate? +INTRODUCTION_TEXT = """ +Evalita-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding translation issues and potential cultural biases; (ii) the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, this way mitigating the model sensitivity to specific prompts and allowing a fairer evaluation. +""" + +#MEASURE_DESCRIPTION = "Combined Performance = (1 - (Best_Prompt - Prompt_Average) / 100) * Best_Prompt. Prompt Average = accuracy averaged over the six prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above)" +MEASURE_DESCRIPTION = "**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the assessed prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above)." +#MEASURE_DESCRIPTION = "**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = avg. accuracy over prompts. **Best Prompt** = accuracy of best prompt. **Prompt ID** = ID of the best prompt (see legend above)." + +# Tasks Descriptions +TE_DESCRIPTION = """### Textual Entailment (TE) --- *Multiple-choice task* + The input are two sentences: the text (T) and the hypothesis (H). The model has to determine whether the meaning of the hypothesis is logically entailed by the text. + +| # | Prompt | Answer Choices | +|-----|------------|--------------| +| 1 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] | +| 2 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? 
| ["Sì", "No"] | +| 3 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] | +| 4 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] | +| 5 | Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] | +| 6 | Devi risolvere un compito di inferenza semantica. Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the 6 prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +SA_DESCRIPTION = """### Sentiment Analysis (SA) --- *Multiple-choice task* + The input is a tweet. The model has to determine the sentiment polarity of the text, categorizing it into one of four classes: positive, negative, neutral, or mixed. + +| # | Prompt | Answer Choices | +|-----|--------------------------------------------------------------------------------|-----------------------------| +| 1 | Qual è il sentiment espresso nel seguente tweet: '{{text}}'? | ["Positivo", "Negativo", "Neutro", "Misto"] | +| 2 | Devi svolgere un compito di analisi del sentiment. Qual è il sentiment espresso nel seguente tweet: '{{text}}'? | ["Positivo", "Negativo", "Neutro", "Misto"] | +| 3 | Qual è il sentiment espresso nel seguente tweet: '{{text}}'?\\nA: Positivo\\nB: Negativo\\nC: Neutro\\nD: Misto\\nRisposta: | ["A", "B", "C", "D"] | +| 4 | Devi svolgere un compito di analisi del sentiment. 
Qual è il sentiment espresso nel seguente tweet: '{{text}}'?\\nA: Positivo\\nB: Negativo\\nC: Neutro\\nD: \\nRisposta: | ["A", "B", "C", "D"] | +| 5 | Il seguente tweet: '{{text}}' esprime un sentiment | ["Positivo", "Negativo", "Neutro", "Misto"] | +| 6 | Devi svolgere un compito di analisi del sentiment. Il seguente tweet: '{{text}}' esprime un sentiment | ["Positivo", "Negativo", "Neutro", "Misto"] | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1-macro averaged over the 6 prompts. **Best Prompt** = F1-macro of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +HS_DESCRIPTION = """### Hate Speech (HS) --- *Multiple-choice task* + The input is a tweet. The model has to determine whether the text contains hateful content directed towards marginalized or minority groups. The output is a binary classification: hateful or not hateful. + +| # | Prompt | Answer Choices | +|-----|--------------------------------------------------------------------------------|-------------------------------------------------| +| 1 | C'è incitamento all'odio nel seguente tweet: '{{full_text}}'? | ["Falso", "Vero"] | +| 2 | Devi svolgere un compito di identificazione di incitamento all'odio. C'è incitamento all'odio nel seguente tweet: '{{full_text}}'? | ["Falso", "Vero"] | +| 3 | C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: Vero\\nB: Falso\\nRisposta: | ["B", "A"] | +| 4 | Devi svolgere un compito di identificazione di incitamento all'odio. C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: Vero\\nB: Falso\\nRisposta: | ["B", "A"] | +| 5 | Il tweet: '{{full_text}}' | ["non contiene incitamento all'odio", "contiene incitamento all'odio"] | +| 6 | Devi svolgere un compito di identificazione di incitamento all'odio. 
Il tweet: '{{full_text}}' | ["non contiene incitamento all'odio", "contiene incitamento all'odio"] | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1-micro averaged over the 6 prompts. **Best Prompt** = F1-micro of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +AT_DESCRIPTION = """### Admission Tests (AT) --- *Multiple-choice task* + The input is a multiple-choice question with five options (A-E) from Italian medical specialty entrance exams, and the model must identify the correct answer. + +| # | Prompt | Answer Choices | +|-----|--------------------------------------------------------------------------------|-----------------------------| +| 1 | Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta? | ["A", "B", "C", "D", "E"] | +| 2 | Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta? | ["A", "B", "C", "D", "E"] | +| 3 | Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"] | +| 4 | Devi risolvere un compito a scelta multipla. Dato il seguente caso clinico: '{{background}}', qual è la risposta corretta alla domanda: '{{domanda}}'?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta:Devi risolvere un compito a scelta multipla. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"] | +| 5 | Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"] | +| 6 | Devi risolvere un compito di risposte a domande. 
Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"] | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the 6 prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +WIC_DESCRIPTION = """### Word in Context (WIC) --- *Multiple-choice task* + The input consists of a word (w) and two sentences. The model has to determine whether the word w has the same meaning in both sentences. The output is a binary classification: 1 (same meaning) or 0 (different meaning). + +| # | Prompt | Answer Choices | +|-----|--------------------------------------------------------------------------------|-------------------------------------------------| +| 1 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'? | ["No", "Sì"] | +| 2 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'? | ["No", "Sì"] | +| 3 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'?\\nA: Sì\\nB: No\\nRisposta: | ["B", "A"] | +| 4 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. 
La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'?\\nA: \\nB: No\\nRisposta: | ["B", "A"] | +| 5 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}' | ["non hanno lo stesso significato", "hanno lo stesso significato"] | +| 6 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}' | ["non hanno lo stesso significato", "hanno lo stesso significato"] | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1-macro averaged over the 6 prompts. **Best Prompt** = F1-macro of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ) --- *Multiple-choice task* + The input is a user query regarding the water supply service. The model must identify the correct answer from the 4 available options. + +| # | Prompt | Answer Choices | +|-----|--------------------------------------------------------------------------------|-----------------------------| +| 1 | Rispondi alla seguente domanda: '{{question}}' | {{[A, B, C, D]}} | +| 2 | Devi risolvere un compito di risposte a domande. Rispondi alla seguente domanda: '{{question}}' | {{[A, B, C, D]}} | +| 3 | Rispondi alla seguente domanda: '{{question}}'\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nRisposta: | ["A", "B", "C", "D"] | +| 4 | Devi risolvere un compito a scelta multipla. 
Rispondi alla seguente domanda: '{{question}}'\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nRisposta: | ["A", "B", "C", "D"] | +| 5 | La risposta alla domanda: '{{question}}' è: | {{[A, B, C, D]}} | +| 6 | Devi risolvere un compito di risposte a domande. La risposta alla domanda: '{{question}}' è: | {{[A, B, C, D]}} | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the 6 prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +LS_DESCRIPTION = """### Lexical Substitution (LS) --- *Generative task* + The input is a sentence containing a target word (w). The model has to replace the target word w with its most suitable synonyms that are contextually relevant. + +| # | Prompt | +|-----|--------------------------------------------------------------------------------| +| 7 | Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: | +| 8 | Devi risolvere un compito di sostituzione lessicale. Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). 
+ +""" + +SU_DESCRIPTION = """### Summarization (SUM) --- *Generative task* + The input is a news article. The model has to generate a concise summary of the input text, capturing the key information and main points. + +| # | Prompt | +|-----|--------------------------------------------------------------------------------| +| 7 | Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: | +| 8 | Devi risolvere un compito di sintesi automatica del testo. Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +NER_DESCRIPTION = """### Named Entity Recognition (NER) --- *Generative task* + The input is a sentence. The model has to identify and classify Named Entities into predefined categories such as person, organization, and location. + +| # | Prompt | +|-----|--------------------------------------------------------------------------------| +| 7 | Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: | +| 8 | Devi svolgere un compito di riconoscimento delle entità nei testi. Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. 
**Prompt ID** = ID of the best prompt (see legend above). + +""" + +REL_DESCRIPTION = """### Relation Extraction (REL) --- *Generative task* + The input is a sentence of a clinical text. The model must identify and extract relationships between laboratory test results (e.g., blood pressure) and the corresponding tests or procedures that generated them (e.g., blood pressure test). + +| # | Prompt | +|-----|--------------------------------------------------------------------------------| +| 7 | Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: | +| 8 | Devi svolgere un compito di estrazione di relazioni da documenti medici. Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). 
+ +""" + +# Create a dictionary to map task names to their descriptions +TASK_DESCRIPTIONS = { + "TE": TE_DESCRIPTION, + "SA": SA_DESCRIPTION, + "HS": HS_DESCRIPTION, + "AT": AT_DESCRIPTION, + "WIC": WIC_DESCRIPTION, + "FAQ": FAQ_DESCRIPTION, + "LS": LS_DESCRIPTION, + "SU": SU_DESCRIPTION, + "NER": NER_DESCRIPTION, + "REL": REL_DESCRIPTION +} \ No newline at end of file diff --git a/src/__pycache__/about.cpython-310.pyc b/src/__pycache__/about.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbda2e403d65ed8d4dd1241f0570fd9ba76c09c8 Binary files /dev/null and b/src/__pycache__/about.cpython-310.pyc differ diff --git a/src/__pycache__/envs.cpython-310.pyc b/src/__pycache__/envs.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6bd39ddac2141e91072782e80e67cb0024ba03bb Binary files /dev/null and b/src/__pycache__/envs.cpython-310.pyc differ diff --git a/src/__pycache__/populate.cpython-310.pyc b/src/__pycache__/populate.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58d0b6452f839a4f869e923bc26c831f6f5e7922 Binary files /dev/null and b/src/__pycache__/populate.cpython-310.pyc differ diff --git a/src/__pycache__/tasks.cpython-310.pyc b/src/__pycache__/tasks.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7398e8e67ffdf2626fd183050638841f51b03422 Binary files /dev/null and b/src/__pycache__/tasks.cpython-310.pyc differ diff --git a/src/about.py b/src/about.py new file mode 100644 index 0000000000000000000000000000000000000000..bca5fccde80ed6f165afbd30825b0f257b43ded0 --- /dev/null +++ b/src/about.py @@ -0,0 +1,222 @@ +from dataclasses import dataclass +from enum import Enum + +@dataclass +class Task: + benchmark: str + metric: str + metric_type: str + col_name: str + +# Select your tasks here +# --------------------------------------------------- +class Tasks(Enum): + # task_key in the json file, metric_key in the json file, name 
to display in the leaderboard + + #task1 = Task("text-entailment_1", "acc", "CPS", "TE") + #task2 = Task("text-entailment_2", "acc", "average_accuracy", "TE Prompt Average") + #task3 = Task("text-entailment_3", "acc", "std_accuracy", "TE Prompt Std") + #task4 = Task("text-entailment_4", "acc", "best_prompt", "TE Best Prompt") + #task5 = Task("text-entailment_5", "acc", "prompt_id", "TE Best Prompt Id") + + #task6 = Task("sentiment-analysis_1", "acc", "CPS", "SA") + #task7 = Task("sentiment-analysis_2", "acc", "average_accuracy", "SA Prompt Average") + #task8 = Task("sentiment-analysis_3", "acc", "std_accuracy", "SA STD Accuracy") + #task9 = Task("sentiment-analysis_4", "acc", "best_prompt", "SA Best Prompt") + #task10 = Task("sentiment-analysis_5", "acc", "prompt_id", "SA Best Prompt Id") + + #task11 = Task("hate-speech-detection_1", "acc", "CPS", "HS") + #task12 = Task("hate-speech-detection_2", "acc", "average_accuracy", "HS Prompt Average") + #task13 = Task("hate-speech-detection_3", "acc", "std_accuracy", "HS Prompt Std") + #task14 = Task("hate-speech-detection_4", "acc", "best_prompt", "HS Best Prompt") + #task15 = Task("hate-speech-detection_5", "acc", "prompt_id", "HS Best Prompt Id") + + #task16 = Task("admission-test_1", "acc", "CPS", "AT") + #task17 = Task("admission-test_2", "acc", "average_accuracy", "AT Prompt Average") + #task18 = Task("admission-test_3", "acc", "std_accuracy", "AT Prompt Std") + #task19 = Task("admission-test_4", "acc", "best_prompt", "AT Best Prompt") + #task20 = Task("admission-test_5", "acc", "prompt_id", "AT Best Prompt Id") + + #task21 = Task("word-in-context_1", "acc", "CPS", "WIC") + #task22 = Task("word-in-context_2", "acc", "average_accuracy", "WIC Prompt Average") + #task23 = Task("word-in-context_3", "acc", "std_accuracy", "WIC Prompt Std") + #task24 = Task("word-in-context_4", "acc", "best_prompt", "WIC Best Prompt") + #task25 = Task("word-in-context_5", "acc", "prompt_id", "WIC Best Prompt Id") + + #task26 = 
Task("faq_1", "acc", "CPS", "FAQ") + #task27 = Task("faq_2", "acc", "average_accuracy", "FAQ Prompt Average") + #task28 = Task("faq_3", "acc", "std_accuracy", "FAQ Prompt Std") + #task29 = Task("faq_4", "acc", "best_prompt", "FAQ Best Prompt") + #task30 = Task("faq_5", "acc", "prompt_id", "FAQ Best Prompt Id") + + #task31 = Task("lexical-substitution_1", "acc", "CPS", "LS") + #task32 = Task("lexical-substitution_2", "acc", "average_accuracy", "LS Prompt Average") + #task33 = Task("lexical-substitution_3", "acc", "std_accuracy", "LS Prompt Std") + #task34 = Task("lexical-substitution_4", "acc", "best_prompt", "LS Best Prompt") + #task35 = Task("lexical-substitution_5", "acc", "prompt_id", "LS Best Prompt Id") + + #task36 = Task("summarization-fanpage_1", "acc", "CPS", "SU") + #task37 = Task("summarization-fanpage_2", "acc", "average_accuracy", "SU Prompt Average") + #task38 = Task("summarization-fanpage_3", "acc", "std_accuracy", "SU Prompt Std") + #task39 = Task("summarization-fanpage_4", "acc", "best_prompt", "SU Best Prompt") + #task40 = Task("summarization-fanpage_5", "acc", "prompt_id", "SU Best Prompt Id") + + #task41 = Task("evalita NER_1", "acc", "CPS", "NER") + #task42 = Task("evalita NER_2", "acc", "average_accuracy", "NER Prompt Average") + #task43 = Task("evalita NER_3", "acc", "std_accuracy", "NER Prompt Std") + #task44 = Task("evalita NER_4", "acc", "best_prompt", "NER Best Prompt") + #task45 = Task("evalita NER_5", "acc", "prompt_id", "NER Best Prompt Id") + + #task46 = Task("relation-extraction_1", "acc", "CPS", "REL") + #task47 = Task("relation-extraction_2", "acc", "average_accuracy", "REL Prompt Average") + #task48 = Task("relation-extraction_5", "acc", "std_accuracy", "REL Prompt Std") + #task49 = Task("relation-extraction_3", "acc", "best_prompt", "REL Best Prompt") + #task50 = Task("relation-extraction_4", "acc", "prompt_id", "REL Best Prompt Id") + task1 = Task("RE_1", "acc", "CPS", "REL") + task2 = Task("RE_2", "acc", "average_accuracy", "REL 
Prompt Average") + task3 = Task("RE_5", "acc", "std_accuracy", "REL Prompt Std") + task4 = Task("RE_3", "acc", "best_prompt", "REL Best Prompt") + task5 = Task("RE_4", "acc", "prompt_id", "REL Best Prompt Id") + + task6 = Task("NER_1", "acc", "CPS", "NER") + task7 = Task("NER_2", "acc", "average_accuracy", "NER Prompt Average") + task8 = Task("NER_3", "acc", "std_accuracy", "NER Prompt Std") + task9 = Task("NER_4", "acc", "best_prompt", "NER Best Prompt") + task10 = Task("NER_5", "acc", "prompt_id", "NER Best Prompt Id") + + task11 = Task("RML_1", "acc", "CPS", "RML") + task12 = Task("RML_2", "acc", "average_accuracy", "RML Prompt Average") + task13 = Task("RML_3", "acc", "std_accuracy", "RML Prompt Std") + task14 = Task("RML_4", "acc", "best_prompt", "RML Best Prompt") + task15 = Task("RML_5", "acc", "prompt_id", "RML Best Prompt Id") + + + + task16 = Task("DIA_1", "acc", "CPS", "DIA") + task17 = Task("DIA_2", "acc", "average_accuracy", "DIA Prompt Average") + task18 = Task("DIA_3", "acc", "std_accuracy", "DIA Prompt Std") + task19 = Task("DIA_4", "acc", "best_prompt", "DIA Best Prompt") + task20 = Task("DIA_5", "acc", "prompt_id", "DIA Best Prompt Id") + + task21 = Task("HIS_1", "acc", "CPS", "HIS") + task22 = Task("HIS_2", "acc", "average_accuracy", "HIS Prompt Average") + task23 = Task("HIS_3", "acc", "std_accuracy", "HIS Prompt Std") + task24 = Task("HIS_4", "acc", "best_prompt", "HIS Best Prompt") + task25 = Task("HIS_5", "acc", "prompt_id", "HIS Best Prompt Id") + ''' + task0 = Task("TextualEntailment", "acc", "Textual Entailment") + task1 = Task("TextualEntailment_best", "acc", "TextualEntailment Best") + task2 = Task("Sentiment Analysis", "acc", "Sentiment Analysis") + task3 = Task("Sentiment Analysis_best", "acc", "Sentiment Analysis_best") + task4 = Task("Hate Speech", "acc", "Hate Speech") + task5 = Task("Hate Speech_best", "acc", "Hate Speech_best") + task6 = Task("Admission Test", "acc", "Admission Test") + task7 = Task("Admission Test_best", "acc", 
"Admission Test_best") + task8 = Task("Word in Context", "acc", "Word in Context") + task9 = Task("Word in Context_best", "acc", "Word in Context_best") + task10 = Task("FAQ", "acc", "FAQ") + task11 = Task("FAQ_best", "acc", "FAQ_best") + task12 = Task("Lexical Substitution", "acc", "Lexical Substitution") + task13 = Task("Lexical Substitution_best", "acc", "Lexical Substitution_best") + task14 = Task("Summarization", "acc", "Summarization") + task15 = Task("Summarization_best", "acc", "Summarization_best") + task16 = Task("NER", "acc", "NER") + task17 = Task("NER_best", "acc", "NER_best") + task18 = Task("REL", "acc", "REL") + task19 = Task("REL_best", "acc", "REL_best") + ''' + +# Your leaderboard name +TITLE = """

🚀 ECREAM-LLM Leaderboard 🚀

""" + +# What does your leaderboard evaluate? +INTRODUCTION_TEXT = """ +

The eCream-LLM leaderboard, developed within the eCream Project (enabling Clinical Research in Emergency and Acute care Medicine), is designed to evaluate Large Language Models (LLMs) on two tasks pertaining to the medical domain. Its distinguishing features are:
(i) all tasks are implemented for six languages including English, Italian, Slovak, Slovenian, Polish and Greek;
(ii) all tasks are generative, thus allowing for a more natural interaction with LLMs;
(iii) all tasks are evaluated against multiple prompts, thereby mitigating model sensitivity to specific prompts and allowing a fairer evaluation.
+

**Generative tasks:** 🏷️NER (Named Entity Recognition), 🔗REL (Relation Extraction), 😃RML (CRF RML) +
**Multiple-choice task:** 🏥DIA (CRF Diagnosis), 📝HIS (CRF History) + +""" + + + +# Which evaluations are you running? how can people reproduce what you have? +LLM_BENCHMARKS_TEXT = f""" +### Groups + +- `evalita-mp`: All tasks (perplexity and non-perplexity based). +- `evalita-mp_gen`: Only generative tasks. + +#### Tasks + +The following Evalita-LLM tasks can also be evaluated in isolation: + - `evalita-mp_ner_group`: Named Entity Recognition (NER) + - `evalita-mp_re`: Relation Extraction (REL) + + +### Usage + +```bash + +lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf --tasks evalita-mp_re --device cuda:0 --batch_size 1 +``` + + + + +""" + +EVALUATION_QUEUE_TEXT = """ +## Some good practices before submitting a model + +### 1) Make sure you can load your model and tokenizer using AutoClasses: +```python +from transformers import AutoConfig, AutoModel, AutoTokenizer +config = AutoConfig.from_pretrained("your model name", revision=revision) +model = AutoModel.from_pretrained("your model name", revision=revision) +tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision) +``` +If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded. + +Note: make sure your model is public! +Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted! + +### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index) +It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`! + +### 3) Make sure your model has an open license! 
+This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗 + +### 4) Fill up your model card +When we add extra information about models to the leaderboard, it will be automatically taken from the model card + +## In case of model failure +If your model is displayed in the `FAILED` category, its execution stopped. +Make sure you have followed the above steps first. +If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task). +""" + +CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" +CITATION_BUTTON_TEXT = r""" +@article{magnini2025cost, + title={A cost-effective approach to counterbalance the scarcity of medical datasets}, + author={Magnini, Bernardo and Farzi, Saeed and Ferrazzi, Pietro and Ghosh, Soumitra and Lavelli, Alberto and Mezzanotte, Giulia and Speranza, Manuela}, + journal={Frontiers in Disaster and Emergency Medicine}, + volume={3}, + pages={1558200}, + year={2025}, + publisher={Frontiers Media SA}, + url={https://www.frontiersin.org/journals/disaster-and-emergency-medicine/articles/10.3389/femer.2025.1558200/full} +} +""" diff --git a/src/display/.ipynb_checkpoints/css_html_js-checkpoint.py b/src/display/.ipynb_checkpoints/css_html_js-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..721bc2e346c9e2578013986ee477c46a5fe11b17 --- /dev/null +++ b/src/display/.ipynb_checkpoints/css_html_js-checkpoint.py @@ -0,0 +1,122 @@ +custom_css = """ + +.markdown-text { + font-size: 16px !important; +} + +#models-to-add-text { + font-size: 18px !important; +} + +#citation-button span { + font-size: 16px !important; +} + +#citation-button textarea { + font-size: 16px !important; +} + +#citation-button > label > button { + margin: 6px; + transform: scale(1.3); +} + +#leaderboard-table { + margin-top: 15px +} + 
+#leaderboard-table-lite { + margin-top: 15px +} + +#search-bar-table-box > div:first-child { + background: none; + border: none; +} + +#search-bar { + padding: 0px; +} + +/* Limit the width of the first AutoEvalColumn so that names don't expand too much */ +#leaderboard-table td:nth-child(2), +#leaderboard-table th:nth-child(2) { + max-width: 400px; + overflow: auto; + white-space: nowrap; +} + +.tab-buttons button { + font-size: 20px; +} + +#scale-logo { + border-style: none !important; + box-shadow: none; + display: block; + margin-left: auto; + margin-right: auto; + max-width: 600px; +} + +#scale-logo .download { + display: none; +} +#filter_type{ + border: 0; + padding-left: 0; + padding-top: 0; +} +#filter_type label { + display: flex; +} +#filter_type label > span{ + margin-top: var(--spacing-lg); + margin-right: 0.5em; +} +#filter_type label > .wrap{ + width: 103px; +} +#filter_type label > .wrap .wrap-inner{ + padding: 2px; +} +#filter_type label > .wrap .wrap-inner input{ + width: 1px +} +#filter-columns-type{ + border:0; + padding:0.5; +} +#filter-columns-size{ + border:0; + padding:0.5; +} +#box-filter > .form{ + border: 0 +} + +/* === Added scaling for plots === */ +#line-chart, +#boxplot-task { + max-width: 100%; + width: 100%; + height: auto; + margin: 0 auto; + display: block; +} + +/* nasconde la barra degli strumenti Plotly */ +.modebar { + display: none !important; +} + +""" + +get_window_url_params = """ + function(url_params) { + const params = new URLSearchParams(window.location.search); + url_params = Object.fromEntries(params); + return url_params; + } + """ + diff --git a/src/display/.ipynb_checkpoints/formatting-checkpoint.py b/src/display/.ipynb_checkpoints/formatting-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..ba340b8c51c98be420f01682eedda01099dc92a3 --- /dev/null +++ b/src/display/.ipynb_checkpoints/formatting-checkpoint.py @@ -0,0 +1,30 @@ +def model_hyperlink(link, model_name): + return 
f'{model_name}' + + +def make_clickable_model(model_name): + link = f"https://huggingface.co/{model_name}" + #Remove author prefix from model names for EVALITA-LLM + model_name = model_name.split("/")[-1] + #print(model_name) + return model_hyperlink(link, model_name) + + +def styled_error(error): + return f"

{error}

" + + +def styled_warning(warn): + return f"

{warn}

" + + +def styled_message(message): + return f"

{message}

" + + +def has_no_nan_values(df, columns): + return df[columns].notna().all(axis=1) + + +def has_nan_values(df, columns): + return df[columns].isna().any(axis=1) diff --git a/src/display/.ipynb_checkpoints/utils-checkpoint.py b/src/display/.ipynb_checkpoints/utils-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..f11c07243e402f48a8ba018a8942b2190e5747e7 --- /dev/null +++ b/src/display/.ipynb_checkpoints/utils-checkpoint.py @@ -0,0 +1,188 @@ +from dataclasses import dataclass, make_dataclass +from enum import Enum + +import pandas as pd + +from src.about import Tasks + +def fields(raw_class): + return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"] + + +# These classes are for user facing column names, +# to avoid having to change them all around the code +# when a modif is needed +@dataclass +class ColumnContent: + name: str + type: str + displayed_by_default: bool + hidden: bool = False + never_hidden: bool = False + +## Leaderboard columns +auto_eval_column_dict = [] +# Init +#auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)]) + +auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent("Rank", "number", True, never_hidden=True)]) +auto_eval_column_dict.append(["size_symbol", ColumnContent, ColumnContent("Size", "number", True, never_hidden=True)]) + +auto_eval_column_dict.append(["fewshot_symbol", ColumnContent, ColumnContent("FS", "str", True, never_hidden=True)]) +auto_eval_column_dict.append(["is_5fewshot", ColumnContent, ColumnContent("IS_FS", "bool", True)]) + +auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)]) +#auto_eval_column_dict.append(["fewshot", ColumnContent, ColumnContent("Few-Shot", "str", True)]) + +#Scores +auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg. Comb. Perf. 
⬆️", "number", True)]) +for task in Tasks: + auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]) + +# Model information +#auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)]) +auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)]) +auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)]) +#auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)]) +auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)]) +auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)]) +auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)]) +auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)]) +auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)]) +#auto_eval_column_dict.append(["submitted_time", ColumnContent, ColumnContent("Submitted time", "date", False)]) + +# We use make dataclass to dynamically fill the scores from Tasks +AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True) + +## For the queue columns in the submission tab +@dataclass(frozen=True) +class EvalQueueColumn: # Queue column + model = ColumnContent("model", "markdown", True) + revision = ColumnContent("revision", "str", True) + private = ColumnContent("private", "bool", True) + #precision = ColumnContent("precision", "str", True) + weight_type = ColumnContent("weight_type", "str", "Original") + status = ColumnContent("status", "str", True) + +## All the model information that we might need +@dataclass +class ModelDetails: + name: str + display_name: str = "" + symbol: str = "" # emoji + + +class 
@dataclass
class FewShotDetails:
    """Display metadata attached to each ``FewShotType`` member."""

    name: str  # human-readable label
    symbol: str = ""  # emoji


class FewShotType(Enum):
    """Few-shot settings that can be shown for a leaderboard entry."""

    ZS = FewShotDetails(name="zero-shot", symbol="🅾️")
    FS = FewShotDetails(name="5-few-shot", symbol="5️⃣")
    Unknown = FewShotDetails(name="unknown", symbol="❓")

    def to_str(self, separator=" "):
        """Return ``"<symbol><separator><name>"`` for this member."""
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_num_fewshot(is_5fewshot):
        """Map a boolean few-shot flag to a ``FewShotType``.

        Args:
            is_5fewshot: ``True`` for the few-shot setting, ``False`` for
                zero-shot.

        Returns:
            ``FewShotType.FS`` for ``True``, ``FewShotType.ZS`` for ``False``
            and ``FewShotType.Unknown`` for anything else.
        """
        # Identity checks on purpose: truthy/falsy non-bool values (1, 0, "",
        # None, ...) are reported as Unknown instead of being coerced.
        if is_5fewshot is False:
            return FewShotType.ZS
        if is_5fewshot is True:
            return FewShotType.FS
        return FewShotType.Unknown


@dataclass
class SizeDetails:
    """Display metadata attached to each ``SizeType`` member."""

    name: str  # human-readable label
    symbol: str = ""  # emoji


class SizeType(Enum):
    """Model size buckets that can be shown for a leaderboard entry."""

    SMALL = SizeDetails(name="small", symbol="🔵")
    MEDIUM = SizeDetails(name="medium", symbol="🔵🔵")
    LARGE = SizeDetails(name="large", symbol="🔵🔵🔵")
    Unknown = SizeDetails(name="unknown", symbol="❓")

    def to_str(self, separator=" "):
        """Return ``"<symbol><separator><name>"`` for this member."""
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def num2type(size):
        """Map a numeric model size to a ``SizeType`` bucket.

        Args:
            size: Number compared against the thresholds 10 and 50, or
                ``None`` when the size is not known.

        Returns:
            ``SMALL`` for ``size <= 10``, ``MEDIUM`` for ``10 < size <= 50``,
            ``LARGE`` for larger values and ``Unknown`` for ``None``.
        """
        # A missing size used to crash on the comparison below.
        if size is None:
            return SizeType.Unknown
        if size <= 10:
            return SizeType.SMALL
        if size <= 50:
            return SizeType.MEDIUM
        return SizeType.LARGE
ModelDetails("Original") + Delta = ModelDetails("Delta") + +class Precision(Enum): + float16 = ModelDetails("float16") + bfloat16 = ModelDetails("bfloat16") + Unknown = ModelDetails("?") + + def from_str(precision): + if precision in ["torch.float16", "float16"]: + return Precision.float16 + if precision in ["torch.bfloat16", "bfloat16"]: + return Precision.bfloat16 + return Precision.Unknown + +# Column selection +COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden] + +EVAL_COLS = [c.name for c in fields(EvalQueueColumn)] +EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)] + +BENCHMARK_COLS = [t.value.col_name for t in Tasks] + +''' +# Nuovi valori per CPS, AVERAGE, BEST, e ID nella tabella +@dataclass +class NewColumnContent: + name: str + type: str + displayed_by_default: bool + hidden: bool = False + never_hidden: bool = False +''' + +''' +new_column_dict = [] +# Aggiungi CPS, VERAGE, BEST, ID +new_column_dict.append(["CPS", NewColumnContent, NewColumnContent("CPS", "number", True)]) +new_column_dict.append(["AVERAGE", NewColumnContent, NewColumnContent("Average ⬆️", "number", True)]) +new_column_dict.append(["BEST", NewColumnContent, NewColumnContent("Best Performance", "number", True)]) +new_column_dict.append(["ID", NewColumnContent, NewColumnContent("ID", "str", True)]) +NewColumn = make_dataclass("NewColumn", new_column_dict, frozen=True) +NEW_COLS = [c.name for c in fields(NewColumn) if not c.hidden] +''' diff --git a/src/display/__pycache__/css_html_js.cpython-310.pyc b/src/display/__pycache__/css_html_js.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1449d56d8d763c9a9c0645e8c9c5598a7b29738f Binary files /dev/null and b/src/display/__pycache__/css_html_js.cpython-310.pyc differ diff --git a/src/display/__pycache__/formatting.cpython-310.pyc b/src/display/__pycache__/formatting.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e183b7463adcf188048f008b51c49e84531113a4 
Binary files /dev/null and b/src/display/__pycache__/formatting.cpython-310.pyc differ diff --git a/src/display/__pycache__/utils.cpython-310.pyc b/src/display/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c7f9272e5f5572d8022b193d6b66762b4d49c24 Binary files /dev/null and b/src/display/__pycache__/utils.cpython-310.pyc differ diff --git a/src/display/css_html_js.py b/src/display/css_html_js.py new file mode 100644 index 0000000000000000000000000000000000000000..70f9e59d881e34cf3b6a371bff0a9b56876dba17 --- /dev/null +++ b/src/display/css_html_js.py @@ -0,0 +1,139 @@ +custom_css = """ +/* gray background behind the combo row only */ +#filters-wrap { + background: #f2f3f5; /* light gray */ + border-radius: 12px; /* optional */ + padding: 12px 16px; /* breathing room */ + margin-bottom: 8px; /* space before plots */ + box-sizing: border-box; +} +.markdown-text1 { + font-size: 16px !important; + max-height: 300px; /* adjust height as you like */ + overflow-y: auto; /* vertical scroll when text is too long */ + overflow-x: hidden; /* hide horizontal scroll bar completely */ + white-space: normal; /* allow line wrapping */ + word-wrap: break-word; + display: block; + padding-right: 8px; /* optional: avoid text sticking to scrollbar */ +} +.markdown-text { + font-size: 16px !important; + +} +#models-to-add-text { + font-size: 18px !important; +} + +#citation-button span { + font-size: 16px !important; +} + +#citation-button textarea { + font-size: 16px !important; +} + +#citation-button > label > button { + margin: 6px; + transform: scale(1.3); +} + +#leaderboard-table { + margin-top: 15px +} + +#leaderboard-table-lite { + margin-top: 15px +} + +#search-bar-table-box > div:first-child { + background: none; + border: none; +} + +#search-bar { + padding: 0px; +} + +/* Limit the width of the first AutoEvalColumn so that names don't expand too much */ +#leaderboard-table td:nth-child(2), +#leaderboard-table 
th:nth-child(2) { + max-width: 400px; + overflow: auto; + white-space: nowrap; +} + +.tab-buttons button { + font-size: 20px; +} + +#scale-logo { + border-style: none !important; + box-shadow: none; + display: block; + margin-left: auto; + margin-right: auto; + max-width: 600px; +} + +#scale-logo .download { + display: none; +} +#filter_type{ + border: 0; + padding-left: 0; + padding-top: 0; +} +#filter_type label { + display: flex; +} +#filter_type label > span{ + margin-top: var(--spacing-lg); + margin-right: 0.5em; +} +#filter_type label > .wrap{ + width: 103px; +} +#filter_type label > .wrap .wrap-inner{ + padding: 2px; +} +#filter_type label > .wrap .wrap-inner input{ + width: 1px +} +#filter-columns-type{ + border:0; + padding:0.5; +} +#filter-columns-size{ + border:0; + padding:0.5; +} +#box-filter > .form{ + border: 0 +} + +/* === Added scaling for plots === */ +#line-chart, +#boxplot-task { + max-width: 100%; + width: 100%; + height: auto; + margin: 0 auto; + display: block; +} + +/* nasconde la barra degli strumenti Plotly */ +.modebar { + display: none !important; +} + +""" + +get_window_url_params = """ + function(url_params) { + const params = new URLSearchParams(window.location.search); + url_params = Object.fromEntries(params); + return url_params; + } + """ + diff --git a/src/display/formatting.py b/src/display/formatting.py new file mode 100644 index 0000000000000000000000000000000000000000..ba340b8c51c98be420f01682eedda01099dc92a3 --- /dev/null +++ b/src/display/formatting.py @@ -0,0 +1,30 @@ +def model_hyperlink(link, model_name): + return f'{model_name}' + + +def make_clickable_model(model_name): + link = f"https://huggingface.co/{model_name}" + #Remove author prefix from model names for EVALITA-LLM + model_name = model_name.split("/")[-1] + #print(model_name) + return model_hyperlink(link, model_name) + + +def styled_error(error): + return f"

{error}

" + + +def styled_warning(warn): + return f"

{warn}

" + + +def styled_message(message): + return f"

{message}

" + + +def has_no_nan_values(df, columns): + return df[columns].notna().all(axis=1) + + +def has_nan_values(df, columns): + return df[columns].isna().any(axis=1) diff --git a/src/display/utils.py b/src/display/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ce06546426f490f8841725ec7b93d7b8c3fab089 --- /dev/null +++ b/src/display/utils.py @@ -0,0 +1,189 @@ +from dataclasses import dataclass, make_dataclass +from enum import Enum + +import pandas as pd + +from src.about import Tasks + +def fields(raw_class): + return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"] + + +# These classes are for user facing column names, +# to avoid having to change them all around the code +# when a modif is needed +@dataclass +class ColumnContent: + name: str + type: str + displayed_by_default: bool + hidden: bool = False + never_hidden: bool = False + +## Leaderboard columns +auto_eval_column_dict = [] +# Init +#auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)]) + +auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent("Rank", "number", True, never_hidden=True)]) +auto_eval_column_dict.append(["size_symbol", ColumnContent, ColumnContent("Size", "number", True, never_hidden=True)]) + +auto_eval_column_dict.append(["fewshot_symbol", ColumnContent, ColumnContent("FS", "str", True, never_hidden=True)]) +auto_eval_column_dict.append(["is_5fewshot", ColumnContent, ColumnContent("IS_FS", "bool", True)]) +auto_eval_column_dict.append(["LANG", ColumnContent, ColumnContent("LANG", "str", True, never_hidden=True)]) + +auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)]) +#auto_eval_column_dict.append(["fewshot", ColumnContent, ColumnContent("Few-Shot", "str", True)]) + +#Scores +auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg. Comb. Perf. 
⬆️", "number", True)]) +for task in Tasks: + auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]) + +# Model information +#auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)]) +auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)]) +auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)]) +#auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)]) +auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)]) +auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)]) +auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)]) +auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)]) +auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)]) +#auto_eval_column_dict.append(["submitted_time", ColumnContent, ColumnContent("Submitted time", "date", False)]) + +# We use make dataclass to dynamically fill the scores from Tasks +AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True) + +## For the queue columns in the submission tab +@dataclass(frozen=True) +class EvalQueueColumn: # Queue column + model = ColumnContent("model", "markdown", True) + revision = ColumnContent("revision", "str", True) + private = ColumnContent("private", "bool", True) + #precision = ColumnContent("precision", "str", True) + weight_type = ColumnContent("weight_type", "str", "Original") + status = ColumnContent("status", "str", True) + +## All the model information that we might need +@dataclass +class ModelDetails: + name: str + display_name: str = "" + symbol: str = "" # emoji + + +class 
@dataclass
class FewShotDetails:
    """Label and emoji describing one few-shot setting."""

    name: str  # human-readable label
    symbol: str = ""  # emoji


class FewShotType(Enum):
    """Few-shot settings that can be shown for a leaderboard entry."""

    ZS = FewShotDetails(name="zero-shot", symbol="🅾️")
    FS = FewShotDetails(name="10-few-shot", symbol="🔟")
    Unknown = FewShotDetails(name="unknown", symbol="❓")

    def to_str(self, separator=" "):
        """Return the symbol and the name joined by ``separator``."""
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_num_fewshot(is_5fewshot):
        """Translate a boolean few-shot flag into a ``FewShotType``.

        Args:
            is_5fewshot: ``True`` selects the few-shot member, ``False`` the
                zero-shot member.

        Returns:
            ``FewShotType.FS``, ``FewShotType.ZS``, or ``FewShotType.Unknown``
            when the argument is neither ``True`` nor ``False``.
        """
        # Compared by identity so that non-bool values never get coerced into
        # one of the two known settings.
        if is_5fewshot is False:
            return FewShotType.ZS
        if is_5fewshot is True:
            return FewShotType.FS
        return FewShotType.Unknown


@dataclass
class SizeDetails:
    """Label and emoji describing one model size bucket."""

    name: str  # human-readable label
    symbol: str = ""  # emoji


class SizeType(Enum):
    """Model size buckets that can be shown for a leaderboard entry."""

    SMALL = SizeDetails(name="small", symbol="🔵")
    MEDIUM = SizeDetails(name="medium", symbol="🔵🔵")
    LARGE = SizeDetails(name="large", symbol="🔵🔵🔵")
    Unknown = SizeDetails(name="unknown", symbol="❓")

    def to_str(self, separator=" "):
        """Return the symbol and the name joined by ``separator``."""
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def num2type(size):
        """Pick the size bucket for a numeric model size.

        Args:
            size: Number compared against the thresholds 10 and 50, or
                ``None`` when the size is not known.

        Returns:
            ``SMALL`` up to and including 10, ``MEDIUM`` up to and including
            50, ``LARGE`` above that, and ``Unknown`` for ``None``.
        """
        # None cannot be ordered against an int; report it as Unknown rather
        # than raising TypeError.
        if size is None:
            return SizeType.Unknown
        if size <= 10:
            return SizeType.SMALL
        if size <= 50:
            return SizeType.MEDIUM
        return SizeType.LARGE
ModelDetails("Original") + Delta = ModelDetails("Delta") + +class Precision(Enum): + float16 = ModelDetails("float16") + bfloat16 = ModelDetails("bfloat16") + Unknown = ModelDetails("?") + + def from_str(precision): + if precision in ["torch.float16", "float16"]: + return Precision.float16 + if precision in ["torch.bfloat16", "bfloat16"]: + return Precision.bfloat16 + return Precision.Unknown + +# Column selection +COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden] + +EVAL_COLS = [c.name for c in fields(EvalQueueColumn)] +EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)] + +BENCHMARK_COLS = [t.value.col_name for t in Tasks] + +''' +# Nuovi valori per CPS, AVERAGE, BEST, e ID nella tabella +@dataclass +class NewColumnContent: + name: str + type: str + displayed_by_default: bool + hidden: bool = False + never_hidden: bool = False +''' + +''' +new_column_dict = [] +# Aggiungi CPS, VERAGE, BEST, ID +new_column_dict.append(["CPS", NewColumnContent, NewColumnContent("CPS", "number", True)]) +new_column_dict.append(["AVERAGE", NewColumnContent, NewColumnContent("Average ⬆️", "number", True)]) +new_column_dict.append(["BEST", NewColumnContent, NewColumnContent("Best Performance", "number", True)]) +new_column_dict.append(["ID", NewColumnContent, NewColumnContent("ID", "str", True)]) +NewColumn = make_dataclass("NewColumn", new_column_dict, frozen=True) +NEW_COLS = [c.name for c in fields(NewColumn) if not c.hidden] +''' diff --git a/src/envs.py b/src/envs.py new file mode 100644 index 0000000000000000000000000000000000000000..9db342d84e248b24bae574cb6cb33a42efa92c04 --- /dev/null +++ b/src/envs.py @@ -0,0 +1,46 @@ +import os + +from huggingface_hub import HfApi + +# Info to change for your repository +# ---------------------------------- +TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org + +#OWNER = "giux78" # Change to your org - don't forget to create a results and request dataset, with the correct format! 
+OWNER = "saeedfarzi" +# ---------------------------------- + +#REPO_ID = f"{OWNER}/leaderboard-evalita" +#QUEUE_REPO = f"{OWNER}/evalita-requests" +#RESULTS_REPO = f"{OWNER}/evalita-results" + +REPO_ID = f"{OWNER}/MediLingua_Leaderboard" +QUEUE_REPO = f"{OWNER}/e3c_llm_requests" +RESULTS_REPO = f"{OWNER}/e3c_llm_results" + +# If you setup a cache later, just change HF_HOME +#CACHE_PATH=os.getenv("HF_HOME", "/home/sfarzi/leaderboard/") + +# Local caches +#EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue") +#EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results") +#EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk") +#EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk") + +#EVAL_REQUESTS_PATH ='/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue") +#EVAL_RESULTS_PATH = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results'#os.path.join(CACHE_PATH, "eval-results") +#EVAL_REQUESTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue-bk") +#EVAL_RESULTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results' #os.path.join(CACHE_PATH, "eval-results-bk") + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) + +# Go one directory up from BASE_DIR +PARENT_DIR = os.path.dirname(BASE_DIR) + +# Now set the paths to the directories one level up +EVAL_REQUESTS_PATH = os.path.join(PARENT_DIR, "e3c_llm_requests") +EVAL_RESULTS_PATH = os.path.join(PARENT_DIR, "e3c_llm_results") +EVAL_REQUESTS_PATH_BACKEND = EVAL_REQUESTS_PATH +EVAL_RESULTS_PATH_BACKEND = EVAL_RESULTS_PATH + +API = HfApi(token=TOKEN) diff --git a/src/leaderboard/.ipynb_checkpoints/read_evals-checkpoint.py b/src/leaderboard/.ipynb_checkpoints/read_evals-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..dee8db5a6e176e56893d5d1e39b2d009c1835832 --- /dev/null +++ 
@dataclass
class EvalResult:
    """One model evaluation, built from a single results JSON file.

    Instances are normally created with :meth:`init_from_json_file` and turned
    into a dataframe row with :meth:`to_dict`.
    """

    eval_name: str  # unique id: "[<org>_]<model>_<is_5fewshot>"
    full_model: str  # org/model (path on hub)
    org: str  # None when the model name has no "org/" prefix
    model: str
    revision: str  # commit hash, "" if main
    results: Dict[str, Union[float, int]]  # benchmark -> score (float) or best prompt id (int)
    average_CPS: float
    is_5fewshot: bool
    fewshot_symbol: FewShotType = FewShotType.Unknown
    weight_type: WeightType = WeightType.Original  # Original or Adapter
    architecture: str = "Unknown"
    license: str = "?"
    likes: int = 0
    num_params: int = 0
    date: str = ""  # submission date of request file
    still_on_hub: bool = False
    rank: int = 0  # new field, defaults to 0
    size_symbol: SizeType = SizeType.Unknown

    @staticmethod
    def _parse_prompt_id(label):
        """Return the integer formed by the trailing digits of ``label``.

        A label ending in "3" yields 3 and one ending in "10" yields 10.
        Labels that do not end in a digit raise ``ValueError``.
        """
        import re  # local import: only this helper needs it

        match = re.search(r"(\d+)\Z", label)
        if match:
            return int(match.group(1))
        # No trailing digits: let int() raise ValueError on the last character.
        return int(label[-1:])

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """Build an ``EvalResult`` from one model result file.

        Args:
            json_filepath: Path of a JSON file holding a ``"config"`` object,
                a ``"tasks"`` object and, optionally, ``"average_CPS"``.

        Returns:
            A new instance populated from the file and from the hub lookup
            performed by ``is_model_on_hub``.

        Raises:
            KeyError: If the file has no ``"tasks"`` entry.
            AttributeError: If ``"config"`` is missing, or has neither
                ``"model_name"`` nor ``"model_args"``.
        """
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(json_filepath, encoding="utf-8") as fp:
            data = json.load(fp)

        config = data.get("config")

        # 0.0 is used when the file carries no average.
        average_CPS = float(data.get("average_CPS", 0.0))

        # Only the exact string "5" marks a few-shot run; every other value
        # (including a missing key) is treated as zero-shot.
        fewshot = config.get("num_fewshot", False)
        is_5fewshot = fewshot == "5"
        fewshot_symbol = FewShotType.from_num_fewshot(is_5fewshot)

        # Parameter count in billions, rounded up; 0 when not provided.
        num_params = 0
        num_params_billion = config.get("num_params_billion")
        if num_params_billion is not None:
            num_params = math.ceil(num_params_billion)
        size_symbol = SizeType.num2type(num_params)

        # Split "org/model" into its parts; a bare name has no org.
        org_and_model = config.get("model_name", config.get("model_args", None))
        org_and_model = org_and_model.split("/", 1)
        if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
            result_key = f"{model}_{is_5fewshot}"
        else:
            org = org_and_model[0]
            model = org_and_model[1]
            result_key = f"{org}_{model}_{is_5fewshot}"
        full_model = "/".join(org_and_model)

        still_on_hub, _, model_config = is_model_on_hub(
            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
        )
        architecture = "?"
        if model_config is not None:
            architectures = getattr(model_config, "architectures", None)
            if architectures:
                architecture = ";".join(architectures)

        # Collect one value per leaderboard column that the file provides.
        tasks_data = data["tasks"]
        results = {}
        for task in Tasks:
            task = task.value
            # The file is keyed by the benchmark name without its last two
            # characters.
            task_key = task.benchmark[:-2]
            if task_key not in tasks_data:
                continue
            raw_value = tasks_data[task_key][task.metric_type]
            if "Best Prompt Id" in task.col_name:
                results[task.benchmark] = cls._parse_prompt_id(raw_value)
            else:
                results[task.benchmark] = float(raw_value)

        return cls(
            eval_name=result_key,
            full_model=full_model,
            org=org,
            model=model,
            results=results,
            average_CPS=average_CPS,
            fewshot_symbol=fewshot_symbol,
            is_5fewshot=is_5fewshot,
            revision=config.get("model_sha", ""),
            still_on_hub=still_on_hub,
            architecture=architecture,
            num_params=num_params,
            rank=rank_default(),
            size_symbol=size_symbol,
        ) if False else cls(
            eval_name=result_key,
            full_model=full_model,
            org=org,
            model=model,
            results=results,
            average_CPS=average_CPS,
            fewshot_symbol=fewshot_symbol,
            is_5fewshot=is_5fewshot,
            revision=config.get("model_sha", ""),
            still_on_hub=still_on_hub,
            architecture=architecture,
            num_params=num_params,
            rank=0,
            size_symbol=size_symbol,
        )

    def to_dict(self):
        """Converts the Eval Result to a dict compatible with our dataframe display.

        Raises:
            KeyError: If ``self.results`` lacks the benchmark of any member of
                ``Tasks``.
        """
        average = self.average_CPS

        fewshot_symbol = (
            self.fewshot_symbol.value.symbol if isinstance(self.fewshot_symbol, FewShotType) else "❓"
        )

        size_symbol = (
            self.size_symbol.value.symbol if isinstance(self.size_symbol, SizeType) else "❓"
        )

        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name
            AutoEvalColumn.fewshot_symbol.name: fewshot_symbol,
            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
            AutoEvalColumn.architecture.name: self.architecture,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
            AutoEvalColumn.revision.name: self.revision,
            AutoEvalColumn.average.name: average,
            AutoEvalColumn.is_5fewshot.name: self.is_5fewshot,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
            AutoEvalColumn.rank.name: self.rank,
            AutoEvalColumn.size_symbol.name: size_symbol,
        }

        for task in Tasks:
            data_dict[task.value.col_name] = self.results[task.value.benchmark]

        return data_dict
@dataclass
class EvalResult:
    """Represents one full evaluation, built from a single model result JSON file.

    ``init_from_json_file`` is the alternate constructor; ``to_dict`` flattens
    the instance into the row format used for the leaderboard dataframe.
    """
    eval_name: str  # unique id: [org_]model_<is_5fewshot>_<Lang>
    full_model: str  # org/model (path on hub)
    org: str  # None when the model name carries no "org/" prefix
    model: str
    revision: str  # commit hash, "" if main
    results: Dict[str, Union[float, int, str]]  # score, best prompt id, or "n/a" for dummy entries
    average_CPS: float
    # NOTE(review): the name says 5 but the check is against 10 shots -- confirm intended count.
    is_5fewshot: bool
    fewshot_symbol: FewShotType = FewShotType.Unknown
    weight_type: WeightType = WeightType.Original  # Original or Adapter
    architecture: str = "Unknown"
    license: str = "?"
    likes: int = 0
    Lang: str = "EN"  # value of the config's "LANG" key
    num_params: int = 0
    date: str = ""  # submission date of request file
    still_on_hub: bool = False
    rank: int = 0  # always initialised to 0 by init_from_json_file
    size_symbol: SizeType = SizeType.Unknown

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """Build an EvalResult from one model result file.

        Args:
            json_filepath: path of a result JSON holding "config", "tasks"
                and optionally "average_CPS".

        Returns:
            A new EvalResult; ``results`` only contains the benchmarks that
            are present in the file.

        Raises:
            ValueError: if the config names no model.
            KeyError: if the file has no "tasks" section.
        """
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(json_filepath, encoding="utf-8") as fp:
            data = json.load(fp)

        config = data.get("config") or {}
        average_CPS = float(data.get("average_CPS", 0.0))  # 0.0 when absent
        Lang = config.get("LANG", "EN")

        # num_fewshot may be stored as a string or a number; accept both.
        fewshot = config.get("num_fewshot", False)
        is_5fewshot = str(fewshot) == "10"
        fewshot_symbol = FewShotType.from_num_fewshot(is_5fewshot)

        # Model size in billions of parameters, rounded up; 0 means unknown.
        num_params = 0
        num_params_billion = config.get("num_params_billion")
        if num_params_billion is not None:
            num_params = math.ceil(num_params_billion)
        size_symbol = SizeType.num2type(num_params)

        # Model identifier: "org/model" or a bare "model".
        model_name = config.get("model_name", config.get("model_args"))
        if not model_name:
            raise ValueError(f"{json_filepath}: config has neither 'model_name' nor 'model_args'")
        org_and_model = model_name.split("/", 1)
        if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
            result_key = f"{model}_{is_5fewshot}"
        else:
            org, model = org_and_model
            result_key = f"{org}_{model}_{is_5fewshot}"
        full_model = "/".join(org_and_model)

        still_on_hub, _, model_config = is_model_on_hub(
            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
        )
        architecture = "?"
        if model_config is not None:
            architectures = getattr(model_config, "architectures", None)
            if architectures:
                architecture = ";".join(architectures)

        # Each Tasks member maps to the file entry named by its benchmark id
        # minus the two-character suffix.
        task_entries = data["tasks"]
        results = {}
        for task in Tasks:
            task = task.value
            entry = task_entries.get(task.benchmark[:-2])
            if entry is None:
                continue
            if entry.get("is_dummy", False):
                # Files without the "is_dummy" flag are treated as real results.
                results[task.benchmark] = "n/a"
            elif "Best Prompt Id" in task.col_name:
                # The prompt id is the last character of the stored value.
                results[task.benchmark] = int(entry[task.metric_type][-1:])
            else:
                results[task.benchmark] = round(float(entry[task.metric_type]), 2)

        return cls(
            eval_name=f"{result_key}_{Lang}",
            full_model=full_model,
            Lang=Lang,
            org=org,
            model=model,
            results=results,
            average_CPS=average_CPS,
            fewshot_symbol=fewshot_symbol,
            is_5fewshot=is_5fewshot,
            revision=config.get("model_sha", ""),
            still_on_hub=still_on_hub,
            architecture=architecture,
            num_params=num_params,
            rank=0,
            size_symbol=size_symbol,
        )

    def to_dict(self):
        """Convert the result into a dict compatible with the dataframe display.

        Raises:
            KeyError: if any task in ``Tasks`` has no entry in ``self.results``.
        """
        fewshot_symbol = (
            self.fewshot_symbol.value.symbol if isinstance(self.fewshot_symbol, FewShotType) else "❓"
        )
        size_symbol = (
            self.size_symbol.value.symbol if isinstance(self.size_symbol, SizeType) else "❓"
        )

        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name
            AutoEvalColumn.fewshot_symbol.name: fewshot_symbol,
            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
            AutoEvalColumn.architecture.name: self.architecture,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
            AutoEvalColumn.revision.name: self.revision,
            AutoEvalColumn.average.name: self.average_CPS,
            AutoEvalColumn.is_5fewshot.name: self.is_5fewshot,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
            AutoEvalColumn.rank.name: self.rank,
            AutoEvalColumn.size_symbol.name: size_symbol,
            AutoEvalColumn.LANG.name: self.Lang,
        }

        for task in Tasks:
            data_dict[task.value.col_name] = self.results[task.value.benchmark]

        return data_dict


def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
    """Collect one EvalResult per evaluation found under ``results_path``.

    Args:
        results_path: root folder of the result JSON files.
        requests_path: accepted for interface compatibility; not read here.

    Returns:
        The results whose ``to_dict()`` succeeds, i.e. those that have a value
        for every task. Files sharing an ``eval_name`` are merged into one entry.
    """
    model_result_filepaths = []

    for root, _, files in os.walk(results_path):
        # Only folders that contain nothing but json files are result folders.
        if len(files) == 0 or any(not f.endswith(".json") for f in files):
            continue

        # Order by file name with the "results_" prefix, ".json" suffix and the
        # last seven remaining characters removed. This is a plain string sort
        # and cannot raise, so it needs no exception handler.
        files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])

        for file in files:
            model_result_filepaths.append(os.path.join(root, file))

    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        eval_result = EvalResult.init_from_json_file(model_result_filepath)

        # Store results of the same eval together; later files win per benchmark.
        eval_name = eval_result.eval_name
        if eval_name in eval_results:
            eval_results[eval_name].results.update(
                {k: v for k, v in eval_result.results.items() if v is not None}
            )
        else:
            eval_results[eval_name] = eval_result

    results = []
    for v in eval_results.values():
        try:
            v.to_dict()  # raises KeyError when a task value is missing
        except KeyError:
            continue
        results.append(v)

    return results
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Create a dataframe from all the individual experiment results.

    Args:
        results_path: root folder of the result files.
        requests_path: forwarded unchanged to ``get_raw_eval_results``.
        cols: columns to keep, in display order.
        benchmark_cols: columns that must all be non-NaN for a row to be kept.

    Returns:
        Rows sorted by the average column (descending), rounded to two
        decimals. An empty frame with ``cols`` as columns when there are no
        results.
    """
    raw_data = get_raw_eval_results(results_path, requests_path)
    all_data_json = [v.to_dict() for v in raw_data]
    if not all_data_json:
        # Without records the frame has no columns and sort_values would raise.
        return pd.DataFrame(columns=cols)

    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    df = df[cols].round(decimals=2)

    # Filter out rows for which any of the benchmarks has not been produced.
    df = df[has_no_nan_values(df, benchmark_cols)]
    return df


def _load_queue_entry(file_path: str) -> dict:
    """Load one request JSON file and add the model and revision display columns."""
    with open(file_path, encoding="utf-8") as fp:
        data = json.load(fp)

    data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
    data[EvalQueueColumn.revision.name] = data.get("revision", "main")
    return data


def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Create the dataframes for the evaluation queue requests.

    Request files are read from ``save_path`` itself and from its direct
    sub-folders; hidden entries (leading ".") are ignored.

    Args:
        save_path: local folder of the evaluation queue.
        cols: columns of the returned frames.

    Returns:
        ``(finished, running, pending)`` dataframes, split on each request's
        "status" value.
    """
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []

    for entry in entries:
        entry_path = os.path.join(save_path, entry)
        if ".json" in entry:
            all_evals.append(_load_queue_entry(entry_path))
        elif ".md" not in entry and os.path.isdir(entry_path):
            # os.listdir returns bare names, so they must be joined with the
            # folder before being tested or opened.
            for sub_entry in os.listdir(entry_path):
                sub_path = os.path.join(entry_path, sub_entry)
                if sub_entry.startswith(".") or ".json" not in sub_entry:
                    continue
                if os.path.isfile(sub_path):
                    all_evals.append(_load_queue_entry(sub_path))

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [
        e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"
    ]
    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
def check_model_card(repo_id: str) -> tuple[bool, str]:
    """Check that the model card and license exist and have been filled.

    Returns:
        ``(True, "")`` when the card is acceptable, otherwise ``(False, reason)``.
    """
    try:
        card = ModelCard.load(repo_id)
    except huggingface_hub.utils.EntryNotFoundError:
        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."

    # Enforce license metadata: either `license` or the name/link pair.
    has_license_pair = "license_name" in card.data and "license_link" in card.data
    if card.data.license is None and not has_license_pair:
        return False, (
            "License not found. Please add a license to your model card using the `license` metadata or a"
            " `license_name`/`license_link` pair."
        )

    # Enforce card content
    if len(card.text) < 200:
        return False, "Please add a description to your model card, it is too short."

    return True, ""


def is_model_on_hub(
    model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
) -> "tuple[bool, str | None, object | None]":
    """Check that the model config (and optionally its tokenizer) can be loaded with AutoClasses.

    Returns:
        ``(True, None, config)`` on success, otherwise ``(False, message, None)``
        where ``message`` is a sentence fragment meant to follow the model name.
    """
    try:
        config = AutoConfig.from_pretrained(
            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
        )
        if test_tokenizer:
            try:
                # Only loadability matters; the tokenizer itself is discarded.
                AutoTokenizer.from_pretrained(
                    model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
                )
            except ValueError as e:
                return (
                    False,
                    f"uses a tokenizer which is not in a transformers release: {e}",
                    None,
                )
            except Exception:
                return (
                    False,
                    "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
                    None,
                )
        return True, None, config

    except ValueError:
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
            None,
        )

    except Exception:
        return False, "was not found on hub!", None


def get_model_size(model_info: "ModelInfo", precision: str):
    """Return the model size in billions of parameters, or 0 when it is unknown.

    The size is read from ``model_info.safetensors`` and multiplied by 8 when
    ``precision`` is "GPTQ" or the model id contains "gptq".
    """
    safetensors = getattr(model_info, "safetensors", None)
    # Accept both a mapping with a "total" key and an object with a `.total`
    # attribute. NOTE(review): newer huggingface_hub releases appear to return
    # the latter -- confirm against the pinned version.
    if isinstance(safetensors, dict):
        total = safetensors.get("total")
    else:
        total = getattr(safetensors, "total", None)

    try:
        model_size = round(total / 1e9, 3)
    except TypeError:
        return 0  # Unknown model sizes are indicated as 0

    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
    return size_factor * model_size


def get_model_arch(model_info: "ModelInfo"):
    """Return the "architectures" entry of the model config, or "Unknown" when absent."""
    config = getattr(model_info, "config", None)
    if config is None:
        return "Unknown"
    return config.get("architectures", "Unknown")


def already_submitted_models(requested_models_dir: str) -> "tuple[set[str], dict[str, list[str]]]":
    """Gather the already submitted models to avoid duplicates.

    Only JSON files located exactly one folder below ``requested_models_dir``
    are read.

    Returns:
        ``(submitted, users_to_submission_dates)``: the set of
        "model_revision_precision" keys, and a mapping from organisation to the
        "submitted_time" values of its requests.
    """
    depth = 1
    file_names = []
    users_to_submission_dates = defaultdict(list)

    for root, _, files in os.walk(requested_models_dir):
        # relpath is insensitive to a trailing separator on the base directory,
        # unlike comparing separator counts of the two strings.
        rel_root = os.path.relpath(root, requested_models_dir)
        current_depth = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1
        if current_depth != depth:
            continue

        for file in files:
            if not file.endswith(".json"):
                continue
            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                info = json.load(f)
            file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")

            # Select organisation
            if info["model"].count("/") == 0 or "submitted_time" not in info:
                continue
            # Split once so ids containing further slashes do not break unpacking.
            organisation = info["model"].split("/", 1)[0]
            users_to_submission_dates[organisation].append(info["submitted_time"])

    return set(file_names), users_to_submission_dates
# Lazily filled cache of what is already in the request queue (see add_new_eval).
REQUESTED_MODELS = None
USERS_TO_SUBMISSION_DATES = None


def add_new_eval(
    model: str,
    base_model: str,
    revision: str,
    precision: str,
    weight_type: str,
    model_type: str,
):
    """Validate a submission and upload its request file to the evaluation queue.

    Args:
        model: hub id of the model to evaluate ("org/name" or "name").
        base_model: hub id of the base model, checked for Delta/Adapter weights.
        revision: model revision; "" means "main".
        precision: precision label; only the part before the first space is kept.
        weight_type: "Original", "Delta" or "Adapter".
        model_type: model type chosen by the user; must not be empty.

    Returns:
        The result of ``styled_error`` / ``styled_warning`` when a check fails,
        or of ``styled_message`` once the request has been uploaded.
    """
    global REQUESTED_MODELS
    global USERS_TO_SUBMISSION_DATES
    # Test against None so an empty queue is scanned once, not on every call.
    if REQUESTED_MODELS is None:
        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)

    user_name = ""
    model_path = model
    if "/" in model:
        user_name = model.split("/")[0]
        model_path = model.split("/")[1]

    precision = precision.split(" ")[0]
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    if model_type is None or model_type == "":
        return styled_error("Please select a model type.")

    # Does the model actually exist?
    if revision == "":
        revision = "main"

    # Is the model on the hub?
    if weight_type in ["Delta", "Adapter"]:
        base_model_on_hub, error, _ = is_model_on_hub(
            model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True
        )
        if not base_model_on_hub:
            return styled_error(f'Base model "{base_model}" {error}')

    if weight_type != "Adapter":
        model_on_hub, error, _ = is_model_on_hub(
            model_name=model, revision=revision, token=TOKEN, test_tokenizer=True
        )
        if not model_on_hub:
            return styled_error(f'Model "{model}" {error}')

    # Is the model info correctly filled?
    try:
        model_info = API.model_info(repo_id=model, revision=revision)
    except Exception:
        return styled_error("Could not get your model information. Please fill it up properly.")

    model_size = get_model_size(model_info=model_info, precision=precision)

    # Were the model card and license filled?
    try:
        model_license = model_info.cardData["license"]
    except Exception:
        return styled_error("Please select a license for your model")

    modelcard_OK, error_msg = check_model_card(model)
    if not modelcard_OK:
        return styled_error(error_msg)

    # Seems good, creating the eval
    print("Adding new eval")

    eval_entry = {
        "model": model,
        "base_model": base_model,
        "revision": revision,
        "precision": precision,
        "weight_type": weight_type,
        "status": "PENDING",
        "submitted_time": current_time,
        "model_type": model_type,
        "likes": model_info.likes,
        "params": model_size,
        "license": model_license,
        "private": False,
    }

    # Check for duplicate submission
    submission_key = f"{model}_{revision}_{precision}"
    if submission_key in REQUESTED_MODELS:
        return styled_warning("This model has been already submitted.")

    print("Creating eval file")
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"

    try:
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(eval_entry))

        print("Uploading eval file")
        API.upload_file(
            path_or_fileobj=out_path,
            # NOTE(review): assumes EVAL_REQUESTS_PATH contains "eval-queue/" -- confirm in src.envs.
            path_in_repo=out_path.split("eval-queue/")[1],
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model} to eval queue",
        )
    finally:
        # Remove the local file even when the upload fails.
        if os.path.exists(out_path):
            os.remove(out_path)

    # Record the submission so a repeat in this process is reported as a duplicate.
    REQUESTED_MODELS.add(submission_key)
    if user_name:
        USERS_TO_SUBMISSION_DATES.setdefault(user_name, []).append(current_time)

    return styled_message(
        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
    )

🚀 ECREAM-LLM Leaderboard 🚀

""" + +# What does your leaderboard evaluate? +INTRODUCTION_TEXT = """ +ECREAM-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding translation issues and potential cultural biases; (ii) the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, this way mitigating the model sensitivity to specific prompts and allowing a fairer evaluation. +""" + +#MEASURE_DESCRIPTION = "Combined Performance = (1 - (Best_Prompt - Prompt_Average) / 100) * Best_Prompt. Prompt Average = accuracy averaged over the six prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above)" +MEASURE_DESCRIPTION = "**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the assessed prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above)." +#MEASURE_DESCRIPTION = "**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = avg. accuracy over prompts. **Best Prompt** = accuracy of best prompt. **Prompt ID** = ID of the best prompt (see legend above)." + +# Tasks Descriptions +RML_DESCRIPTION = """### RML (RML) --- *Multiple-choice task* +| # | Prompt (EN) | +|-----|--------------------------------------------------------------------------------| +| 1 | You are a clinical medical expert. You must answer a question about laboratory test results and measurements for a patient. To do this, you have access to the patient’s medical history. Here is the patient’s medical history: <<<{{clinical_note}}>>> If the test/exam was performed only once, answer with the test/exam results. 
If the information is not contained in the medical history, answer with “not available.” Answer this question based on the patient’s medical history: <<>> | +| 2 | Given a clinical note describing the history of a patient, answer a question about laboratory test results and measurements. Answer with the test/exam results exactly as reported in the text. If the information is not contained in the clinical history, answer with “not available.” Here is the patient’s clinical history: <<<{{clinical_note}}>>> What are the results and measurements of {{item}}?| +| 3 |Here it is the clinical history of a patient:<<<{{clinical_note}}>>> What are the results and measurements of {{item}}?| + +| # | Prompt (IT) | +|-----|--------------------------------------------------------------------------------| +| 1 |Sei un esperto medico clinico. Devi rispondere ad una domanda su risultati e misure di test di laboratorio rispetto ad un paziente. Per farlo, hai a disposizione la storia clinica del paziente. Ecco la storia clinica del paziente:<<<{{clinical_note}}>>> Se il test/esame è stato effettuato sono una volta, rispondi con i risultati del test/esame. Se l'informazione non è contenuta nella storia clinica, rispondi con 'non disponibile'. Rispondi a questa domanda sulla base della storia clinica del paziente: <<>> | +| 2 | Data una nota clinica che descrive la storia di un paziente, rispondi ad una domanda su risultati e misure di test di laboratorio. Rispondi con i risultati del test/esame così come riportati nel testo. Se l'informazione non è contenuta nella storia clinica, rispondi con 'non disponibile'. Ecco la storia clinica del paziente:<<<{{clinical_note}}>>> Quali sono i risultati e le misure di {{item}}?| +| 3 | Ecco la storia clinica di un paziente:<<<{{clinical_note}}>>> Quali sono i risultati e le misure di {{item}}?| + + + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. 
**Prompt Average** = accuracy averaged over the 6 prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +DIA_DESCRIPTION = """### DIAGNOSIS (DIA) --- *Multiple-choice task* +| # | Prompt (EN) | +|-----|--------------------------------------------------------------------------------| +| 1 | You are a clinical medical expert. You must answer a question about the patient’s diagnosis. To do this, you have access to the patient’s medical history. Answer “Yes” if the patient’s definitive diagnosis is the one indicated. If the information is not contained in the medical history, answer “not available. Patient medical history: <<<{{clinical_note}}>>> Answer this question based on the patient’s medical history: <<>>"| +| 2 | Given a patient's medical history, answer a question. Patient medical history: <<<{{clinical_note}}>>>. Question: <<>>. The answer can be 'Yes', 'No' or 'not available' if the information is not contained in the medical history.| +| 3 |Hai a disposizione una nota clinica relativa ad un paziente: <<<{{clinical_note}}>>>. Non è detto che la nota clinica contenga informazioni rilevanti per rispondere alla domanda. In tal caso, rispondi con 'non disponibile'. Data la storia clinica sopra presentata, la diagnosi è {{item}}?| + +| # | Prompt (IT) | +|-----|--------------------------------------------------------------------------------| +| 1 | Sei un esperto medico clinico. Devi rispondere ad una domanda sulla diagnosi del paziente. Per farlo, hai a disposizione la storia clinica del paziente. Risondi 'Sì' se la diagnosi definitiva del paziente è quella indicata. Se l'informazione non è contenuta nella storia clinica, rispondi con 'non disponibile'. Storia clinica: <<<{{clinical_note}}>>>. Rispondi a questa domanda sulla base della storia clinica del paziente: <<>> | +| 2 | Data la storia medica di un paziente, rispondi ad una domanda. Storia clinica: <<<{{clinical_note}}>>>. Domanda: <<>>. 
La risposta può essere 'Sì', 'No' oppure 'non disponibile' se l'informazione non è contenuta nella storia clinica| +| 3 |Hai a disposizione una nota clinica relativa ad un paziente: <<<{{clinical_note}}>>>. Non è detto che la nota clinica contenga informazioni rilevanti per rispondere alla domanda. In tal caso, rispondi con 'non disponibile'. Data la storia clinica sopra presentata, la diagnosi è {{item}}? | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1-macro averaged over the 6 prompts. **Best Prompt** = F1-macro of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +HIS_DESCRIPTION = """### HISTORY (HIS) --- *Multiple-choice task* + + +| # | Prompt (EN) | +|-----|--------------------------------------------------------------------------------| +| 1 | You are a clinical medical expert. You must answer a question about the patient’s history. To do this, you have access to the patient’s medical history. The answer must consist of three components: polarity, contextual modality, and chronicity. You must combine these three components to answer the question. Contextual modality can be: a) “Certainly” if the answer is certain, b) “Possibly” if the answer is hypothetical, c) “Probably” if the answer is probable. Polarity can be: a) “Yes” if the answer is affirmative, b) “No” if the answer is negative. Chronicity can be: a) “Chronic” if the condition in question is certainly permanent,b) “Certainly not chronic” if the condition is temporary or transient, c) “Possibly chronic” otherwise. Patient medical history: <<<{{clinical_note}}>>>. Answer this question based on the patient’s medical history: <<> | +| 2 | Given the patient’s medical history, answer a question. Patient medical history: <<<{{clinical_note}}>>>. Question: <<>>. 
The possible answers are: Yes, possibly chronic Certainly yes, certainly not chronic Probably yes, possibl chronic Possibly yes, chronic Certainly yes, chronic Possibly yes, possibly chronic Probably yes, chronic Yes, chronic Yes, certainly not chronic Probably yes, certainly not chronic Certainly yes, possibly chronic Possibly yes, certainly not chronic Not available Probably no, chronic Certainly no, chronic Possibly no, chronic Probably no, possibly chronic Certainly no, possibly chronic Possibly no, possibly chronic Certainly no, certainly not chronic Probably no, certainly not chronic Possibly no, certainly not chronic Probably no Certainly no Possibly no Probably no, certainly not chronic Certainly no, possibly chronic Possibly no, certainly not chronic Probably no, possibly chronic Certainly no, certainly not chronic| +| 3 | You are a clinical medical expert. You must answer a question about the patient’s history. To do this, you have access to the patient’s medical history. Patient medical history: <<<{{clinical_note}}>>>. The answer must consist of three components: polarity, contextual modality, and chronicity. You must combine these three components to answer the question. Contextual modality can be: a) “Certainly” if the answer is certain, b) “Possibly” if the answer is hypothetical, c) “Probably” if the answer is probable. Polarity can be: a) “Yes” if the answer is affirmative, b) “No” if the answer is negative. Chronicity can be: a) “Chronic” if the condition in question is certainly permanent, b) “Certainly not chronic” if the condition is temporary or transient, c) “Possibly chronic” otherwise. Answer this question based on the patient’s medical history: <<>| + +| # | Prompt (IT) | +|-----|--------------------------------------------------------------------------------| +| 1 | Sei un esperto medico clinico. Devi rispondere ad una domanda su la storia del paziente. Per farlo, hai a disposizione la storia clinica del paziente. 
La risposta è composta da tre componenti: polarità, modalità contestuale e permanenza. Devi mettere insieme queste tre componenti per rispondere alla domanda. Modalità contestuale può essere: a)'Certamente' se la risposta è certa, b)'Possibilmente' se la risposta è ipotetica, c)'Probabilmente' se la risposta è probabile. Polarità può essere: a)'sì' se la risposta è affermativa, b) 'no' se la risposta è negativa. Permanenza può essere: a)'cronico' se l'oggetto della domanda è sicuramente permanente per sempre, b)'certamente non cronico' se se l'oggetto della domanda è temporaeo o transitorio, c)'possibilmente cronico' altrimenti. Storia clinica: <<<{{clinical_note}}>>>. Rispondi a questa domanda sulla base della storia clinica del paziente: <<>> | +| 2 | Data la storia medica di un paziente, rispondi ad una domanda. Storia clinica: <<<{{clinical_note}}>>>. Domanda: <<>>. Le opzioni sono: - sì, possibilmente cronico - Certamente sì, Certamente non cronico - probabilmente sì, possibilmente cronico - possibilmente sì, cronico - Certamente sì, cronico - possibilmente sì, possibilmente cronico - probabilmente sì, cronico - sì, cronico - sì, Certamente non cronico - probabilmente sì, Certamente non cronico - Certamente sì, possibilmente cronico - possibilmente sì, Certamente non cronico - non disponibile - probabilmente no, cronico - Certamente no, cronico - possibilmente no, cronico - probabilmente no, possibilmente cronico - Certamente no, possibilmente cronico - possibilmente no, possibilmente cronico - Certamente no, Certamente non cronico - probabilmente no, Certamente non cronico - possibilmente no, Certamente non cronico - probabilmente no - Certamente no - possibilmente no - probabilmente no, Certamente non cronico - Certamente no, possibilmente cronico - possibilmente no, Certamente non cronico - probabilmente no, possibilment cronico - Certamente no, Certamente non cronico| +| 3 | Sei un esperto medico clinico. 
Devi rispondere ad una domanda su la storia del paziente. Per farlo, hai a disposizione la storia clinica del paziente. Storia clinica: <<<{{clinical_note}}>>>. La risposta è composta da tre componenti: polarità, modalità contestuale e permanenza. Devi mettere insieme queste tre componenti per rispondere alla domanda. - modalità contestuale può essere: a)'Certamente' se la risposta è certa, b)'Possibilmente' se la risposta è ipotetica, c)'Probabilmente' se la risposta è probabile. - polarità può essere: a)'sì' se la risposta è affermativa, b) 'no' se la risposta è negativa. - permanenza può essere: a)'cronico' se l'oggetto della domanda è sicuramente permanente per sempre, b)'certamente non cronico' se se l'oggetto della domanda è temporaeo o transitorio, c)'possibilmente cronico' altrimenti. Rispondi a questa domanda sulla base della storia clinica del paziente: <<>>| + + + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1-micro averaged over the 6 prompts. **Best Prompt** = F1-micro of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +AT_DESCRIPTION = """### Admission Tests (AT) --- *Multiple-choice task* + The input is a multiple-choice question with five options (A-E) from Italian medical specialty entrance exams, and the model must identify the correct answer. + +| # | Prompt | Answer Choices | +|-----|--------------------------------------------------------------------------------|-----------------------------| +| 1 | Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta? | ["A", "B", "C", "D", "E"] | +| 2 | Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta? 
| ["A", "B", "C", "D", "E"] | +| 3 | Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"] | +| 4 | Devi risolvere un compito a scelta multipla. Dato il seguente caso clinico: '{{background}}', qual è la risposta corretta alla domanda: '{{domanda}}'?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta:Devi risolvere un compito a scelta multipla. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"] | +| 5 | Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"] | +| 6 | Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"] | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the 6 prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +WIC_DESCRIPTION = """### Word in Context (WIC) --- *Multiple-choice task* + The input consists of a word (w) and two sentences. The model has to determine whether the word w has the same meaning in both sentences. The output is a binary classification: 1 (same meaning) or 0 (different meaning). + +| # | Prompt | Answer Choices | +|-----|--------------------------------------------------------------------------------|-------------------------------------------------| +| 1 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'? | ["No", "Sì"] | +| 2 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. 
La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'? | ["No", "Sì"] | +| 3 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'?\\nA: Sì\\nB: No\\nRisposta: | ["B", "A"] | +| 4 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'?\\nA: Sì\\nB: No\\nRisposta: | ["B", "A"] | +| 5 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}' | ["non hanno lo stesso significato", "hanno lo stesso significato"] | +| 6 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}' | ["non hanno lo stesso significato", "hanno lo stesso significato"] | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1-macro averaged over the 6 prompts. **Best Prompt** = F1-macro of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ) --- *Multiple-choice task* + The input is a user query regarding the water supply service. The model must identify the correct answer from the 4 available options. 
+ +| # | Prompt | Answer Choices | +|-----|--------------------------------------------------------------------------------|-----------------------------| +| 1 | Rispondi alla seguente domanda: '{{question}}' | {{[A, B, C, D]}} | +| 2 | Devi risolvere un compito di risposte a domande. Rispondi alla seguente domanda: '{{question}}' | {{[A, B, C, D]}} | +| 3 | Rispondi alla seguente domanda: '{{question}}'\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nRisposta: | ["A", "B", "C", "D"] | +| 4 | Devi risolvere un compito a scelta multipla. Rispondi alla seguente domanda: '{{question}}'\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nRisposta: | ["A", "B", "C", "D"] | +| 5 | La risposta alla domanda: '{{question}}' è: | {{[A, B, C, D]}} | +| 6 | Devi risolvere un compito di risposte a domande. La risposta alla domanda: '{{question}}' è: | {{[A, B, C, D]}} | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the 6 prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +LS_DESCRIPTION = """### Lexical Substitution (LS) --- *Generative task* + The input is a sentence containing a target word (w). The model has to replace the target word w with its most suitable synonyms that are contextually relevant. + +| # | Prompt | +|-----|--------------------------------------------------------------------------------| +| 7 | Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: | +| 8 | Devi risolvere un compito di sostituzione lessicale. 
Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +SU_DESCRIPTION = """### Summarization (SUM) --- *Generative task* + The input is a news article. The model has to generate a concise summary of the input text, capturing the key information and main points. + +| # | Prompt | +|-----|--------------------------------------------------------------------------------| +| 7 | Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: | +| 8 | Devi risolvere un compito di sintesi automatica del testo. Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: | + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +NER_DESCRIPTION = """### Named Entity Recognition (NER) --- *Generative task* + The input is a sentence of a clinical text. The model has to identify the clinical entities, which include all occurrences of clinical disorders (i.e. diseases and symptoms). + +| # | Prompt (IT) | +|-----|--------------------------------------------------------------------------------| +| 1 | Devi svolgere un compito di riconoscimento di entità in testi medici. Dalla seguente frase, estrai tutte le entità del tipo CLINENTITY, che include tutti i disturbi di carattere medico in una singola categoria (cioè, sia malattie che sintomi). 
Riporta ogni entità nel formato: Entity$CLINENTITY, separando ogni coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'. | +| 2 | Devi svolgere un compito di riconoscimento di entità in testi medici. Dalla seguente frase, estrai tutte le entità del tipo CLINENTITY, che include tutti i disturbi di carattere medico (un disturbo è definito come un processo patologico definito, con un insieme caratteristico di segni e sintomi). Restituisci ogni entità nel seguente formato: Entity$CLINENTITY, separando ogni coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.| +| 3 | Devi svolgere un compito di riconoscimento di entità in note cliniche. Dalla seguente frase, estrai tutte le entità del tipo CLINENTITY, che include tutti i disturbi di carattere medico in una singola categoria (cioè, sia malattie che sintomi). Restituisci ogni entità nel seguente formato: Entity$CLINENTITY, separando ogni coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.| + +| # | Prompt (SK) | +|-----|--------------------------------------------------------------------------------| +| 1 | Máš za úlohu rozpoznať entity v lekárskych textoch. Z nasledujúcej vety vyber všetky entity typu CLINENTITY, ktoré zahŕňajú všetky zdravotné poruchy v jednej kategórii (t. j. choroby aj symptómy). Každú entitu uveď vo formáte: Entity$CLINENTITY, pričom každú dvojicu oddeľ znakom „,“. Ak nie sú žiadne entity, ktoré by sa mohli/dali vybrať, odpovedz/vráť „&&NOENT&&“.| +| 2 | Máš za úlohu rozpoznať entity v lekárskych textoch. Z nasledujúcej vety vyber všetky entity typu CLINENTITY, ktoré zahŕňajú všetky lekárske poruchy (porucha je definovaná ako určitý patologický proces s charakteristickým súborom príznakov a symptómov). Vráť každú entitu v nasledujúcom formáte: Entity$CLINENTITY, pričom každú dvojicu oddeľ znakom „,“. 
Ak nie sú žiadne entity, ktoré by sa mohli/dali vybrať, odpovedz/vráť „&&NOENT&&“.| +| 3 | Máš za úlohu rozpoznať entity v klinických poznámkach. Z nasledujúcej vety vyber všetky entity typu CLINENTITY, ktoré zahŕňajú všetky zdravotné poruchy v jednej kategórii (t. j. choroby aj symptómy). Vráť každú entitu v nasledujúcom formáte: Entity$CLINENTITY, pričom každú dvojicu oddeľ znakom „,“. Ak nie sú žiadne entity, ktoré by bolo možné vybrať, odpovedz/vráť „&&NOENT&&“.| + +| # | Prompt (SL) | +|-----|--------------------------------------------------------------------------------| +| 1 | Tvoja naloga je prepoznavanje entitet v medicinskih besedilih. Iz naslednjega stavka izlušči vse entitete tipa CLINENTITY, kamor spadajo vse medicinske motnje v posamezni kategoriji (tj. tako bolezni kot simptomi). Vsako entiteto zapiši v obliki: Entity$CLINENTITY, posamezne pare pa loči z vejico ','. Če ni nobene entitete za izluščiti, vrni &&NOENT&&.| +| 2 | Tvoja naloga je prepoznavanje entitet v medicinskih besedilih. Iz naslednjega stavka izlušči vse entitete tipa CLINENTITY, kamor spadajo vse medicinske motnje (motnja je opredeljena kot določen patološki proces s značilnim naborom znakov in simptomov). Vsako entiteto zapiši v obliki: Entity$CLINENTITY, posamezne pare pa loči z vejico ','. Če ni nobene entitete za izluščiti, vrni &&NOENT&&.| +| 3 | Tvoja naloga je prepoznavanje entitet v kliničnih zapisih. Iz naslednjega stavka izlušči vse entitete tipa CLINENTITY, kamor spadajo vse medicinske motnje v posamezni kategoriji (tako bolezni kot simptomi). Vsako entiteto zapiši v obliki: Entity$CLINENTITY, posamezne pare pa loči z vejico ','. Če ni nobene entitete za izluščiti, vrni &&NOENT&&.| + +| # | Prompt (GR) | +|-----|--------------------------------------------------------------------------------| +| 1 | Έχεις να εκτελέσεις τη δραστηριότητα του να εντοπίσεις οντότητες μέσα σε ιατρικά κείμενα. 
Στην παρακάτω πρόταση, να εξάγεις όλες τις οντότητες του τύπου CLINENTITY, η οποία περιλαμβάνει όλες τις ιατρικές διαταραχές σε μία μόνο κατηγορία (δλδ τόσο νοσήματα όσο και συμπτώματα). Να αναφέρεις κάθε οντότητα με την μορφή: Οντότητα$CLINENTITY, χωρίζοντας κάθε ζευγάρι με ','. Αν δεν υπάρχουν οντότητες για να εξαχθούν, απάντησε με το '&&NOENT&&'.| +| 2 | Έχεις να εκτελέσεις μία δραστηριότητα αναγνώρισης οντοτήτων σε ιατρικά κείμενα. Από τις ακόλουθες προτάσεις, να εξάγεις όλες τις οντότητες του τύπου CLINENTITY, ο οποίος περιλαμβάνει όλες τις ιατρικές διαταραχές (μια διαταραχή ορίζεται ως μία ξεκάθαρα παθολογική διαδικασία με ένα χαρακτηριστικό συνδυασμό σημείων και συμπτωμάτων). Επέστρεφε κάθε οντότητα με την ακόλουθη μορφή: Οντότητα$CLINENTITY, χωρίζοντας κάθε ζευγάρι με ','. Αν δεν υπάρχουν οντότητες να εξαχθούν, απάντησε με το '&&NOENT&&'.| +| 3 | Έχεις να εκτελέσεις μια δραστηριότητα αναγνώρισης οντοτήτων σε κλινικά σημειώματα. Από την ακόλουθη πρόταση να εξάγεις όλες τις οντότητες του τύπου CLINENTITY, που περιλαμβάνει όλες τις ιατρικές διαταραχές σε μία μόνο κατηγορία (δλδ τόσο νοσήματα όσο και συμπτώματα). Επέστρεψε κάθε οντότητα με την ακόλουθη μορφή: Οντότητα$CLINENTITY, χωρίζοντας κάθε ζευγάρι με ','. Αν δεν υπάρχουν οντότητες για να εξαχθούν, απάντησε με το '&&NOENT&&'.| + +| # | Prompt (PL) | +|-----|--------------------------------------------------------------------------------| +| 1 | Zadanie polega na rozpoznawania jednostek (chorobowych) w tekstach medycznych. Z poniższego zdania wyodrębnij wszystkie jednostki typu CLINENTITY, które obejmują wszystkie schorzenia medyczne danej kategorii (tj. zarówno choroby jak i objawy). Każda jednostka powinna być zgłoszona w formacie: Entity$CLINENTITY, z oddzieleniem każdej pary znakiem ”,”. Jeśli nie ma żadnych jednostek do wyodrębnienia, odpowiedz '&&NOENT&&'. | +| 2 |Zadanie polega na rozpoznawaniu jednostek (chorobowych) w tekstach medycznych. 
Z poniższego zdania wyodrębnij wszystkie jednostki typu CLINENTITY, które obejmują wszystkie schorzenia medyczne (schorzenie definiuje się jako określony proces patologiczny z charakterystycznym zestawem objawów). Zwróć każdą jednostkę w następującym formacie: Entity$CLINENTITY, oddzielając każdą parę znakiem ”,”. Jeśli nie ma jednostek do wyodrębnienia, odpowiedz '&&NOENT&&'. | +| 3 | Zadanie polega na rozpoznawania jednostek (chorobowych) w notatkach klinicznych. Z poniższego zdania wyodrębnij wszystkie jednostki typu CLINENTITY, które obejmują wszystkie schorzenia medyczne z danej kategorii (tj. zarówno choroby jak i objawy). Zapisz każdą jednostkę w następującym formacie: Entity$CLINENTITY, oddzielając każdą parę znakiem ”,”. Jeśli nie ma jednostek do wyodrębnienia, odpowiedz '&&NOENT&&'.| + +| # | Prompt (EN) | +|-----|--------------------------------------------------------------------------------------------------------------------------------------| +| 1 | You have to perform a task of entity recognition in medical texts. From the following sentence, extract all the entities of type CLINENTITY, which includes all medical disorders in a single category (i.e. both diseases and symptoms). Report each entity with the format: Entity$CLINENTITY, separating each pair with ','. If there are no entities to extract, answer with '&&NOENT&&'. | +| 2 | You have to perform a task of entity recognition in medical texts. From the following sentence, extract all the entities of type CLINENTITY, which includes all medical disorders (a disorder is defined as a definite pathologic process with a characteristic set of signs and symptoms). Return each entity in the following format: Entity$CLINENTITY, separating each pair with ','. If there are no entities to extract, answer with '&&NOENT&&'.| +| 3 | You have to perform a task of entity recognition in clinical notes. 
From the following sentence, extract all the entities of type CLINENTITY, which includes all medical disorders in a single category (i.e. both diseases and symptoms). Return each entity in the following format: Entity$CLINENTITY, separating each pair with ','. If there are no entities to extract, answer with '&&NOENT&&'.| + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 3 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). + +""" + +REL_DESCRIPTION = """### Relation Extraction (REL) --- *Generative task* + The input is a sentence of a clinical text. The model must identify and extract relations between laboratory test results (e.g., 122/81 mmHg) and the corresponding tests or procedures (e.g., blood pressure). + +| # | Prompt (IT) | +|-----|--------------------------------------------------------------------------------| +| 1 | Devi estrarre relazioni da una frase nel campo medico. Data una frase in italiano, estrai tutti i test di laboratorio con i loro valori. Ritorna i risultati come: valore$voce_medica&valore$voce_medica. Usa '&&NOREL&&' se non trovi nessuna relazione. | +| 2 | Estrai tutte le coppie test-valore menzionate nella seguente frase nel campo medico. Includi solamente misurazioni esplicite in cui il nome di un test di laboratorio è chiaramente collegato alla sua misurazione. Scrivi ciascuna coppia nel formato: valore$nome_test. Congiungi coppie multiple usando '&'. Se nessuma coppia valida esiste, ritorna esattamente: '&&NOREL&&'.| +| 3 | Estrai tutte le coppie test-valore dalla seguente frase medica. Includi solamente test di laboratorio e i valori delle corrispondenti misurazioni. Formatta ciascuna coppia come valore$nome_test, e separa coppie multiple usando '&'. 
Se non c'è nessuna coppia, ritorna '&&NOREL&&'.| + +| # | Prompt (SK) | +|-----|--------------------------------------------------------------------------------| +| 1 | Vyber súvislosti z lekárskej vety. Na základe slovenskej vety vyber všetky položky laboratórnych testov spolu s ich hodnotami. Vráť výsledky v tvare: value$medical_item&value$medical_item. Ak sa nenašli žiadne súvislosti, použi ‚&&NOREL&&‘. | +| 2 | Vyber všetky dvojice laboratórnych testov a hodnôt uvedené v nasledujúcej lekárskej vete. Zahrň iba explicitné/jednoznačné merania, kde je názov laboratórneho testu jasne prepojený/spätý s nameranou hodnotou. Každú dvojicu zapíš vo formáte: value$test_name. Viaceré dvojice spoj pomocou znaku '&'. Ak neexistujú žiadne platné dvojice, vráť: „&&NOREL&&“.| +| 3 | Vyber všetky dvojice laboratórnych testov a hodnôt z nasledujúcej lekárskej vety. Zahrň iba laboratórne testy a ich zodpovedajúce namerané hodnoty. Každý pár formátuj ako value$test_name a viacero párov oddeľ znakom „&“. Ak takéto páry neexistujú, vráť „&&NOREL&&“.| + +| # | Prompt (SL) | +|-----|--------------------------------------------------------------------------------| +| 1 | Iz medicinskega stavka je treba izluščiti relacije. Iz slovenskega stavka izlušči vse laboratorijske preiskave skupaj z njihovimi vrednostmi. Rezultat vrni v obliki: value$medical_item&value$medical_item. Če v stavku ni mogoče najti nobene relacije, uporabi '&&NOREL&&'.| +| 2 |Iz danega medicinskega stavka izlušči vse pare laboratorijska preiskava–vrednost. Vključi samo tiste vrednosti, kjer je ime laboratorijske preiskave jasno povezano z izmerjeno vrednostjo. Vsak par zapiši v obliki: value$test_name. Če je parov več, jih poveži z znakom '&'. Če veljavnih parov ni, vrni natanko: &&NOREL&&. | +| 3 | Iz naslednjega medicinskega stavka izlušči vse pare laboratorijska preiskava–vrednost. Vključi samo laboratorijske preiskave in njihove pripadajoče izmerjene vrednosti. 
Vsak par zapiši v obliki: value$test_name, več parov pa loči z znakom '&'. Če takih parov ni, vrni &&NOREL&&.| + +| # | Prompt (GR) | +|-----|--------------------------------------------------------------------------------| +| 1 | Πρέπει να εξάγεις σχέσεις από μια πρόταση ενός ιατρικού αρχείου. Θα σου δοθεί μία πρόταση στα Ελληνικά και θα πρέπει να εξάγεις όλες τις τιμές εργαστηριακών αποτελεσμάτων με τις τιμές τους. Παρουσίασε τα αποτελέσματα με την παρακάτω μορφή: τιμή$ιατρικό_αντικείμενο&τιμή$ιατρικό_αντικείμενο. Χρησιμοποίησε '&&NOREL&&' αν δεν βρεθούν σχέσεις.| +| 2 | Πρέπει να εξάγεις όλα τα ζευγάρια εργαστηριακών εξετάσεων και αποτελεσμάτων από την παρακάτω πρόταση ενός ιατρικού αρχείου. Να περιλάβεις μόνο συγκεκριμένες μετρήσεις όπου βρεις ότι το όνομα μιας εργαστηριακής εξέτασης συνδέεται ξεκάθαρα με την τιμή που μετρήθηκε. Γράψε κάθε ζευγάρι με την μορφή: τιμή$όνομα_εξέτασης. Ένωσε πολλαπλά ζευγάρια χρησιμοποιώντας το σύμβολο'&'. Αν δεν υπάρχουν ζευγάρια να επιστρέψεις την τιμή: '&&NOREL&&'.| +| 3 | Να εξάγεις όλα τα ζευγάρια εργαστηριακών εξετάσεων-τιμών από την παρακάτω πρόταση ενός ιατρικού αρχείου. Να συμπεριλάβεις μόνο εργαστηριακές εξετάσεις και τις αντίστοιχες μετρημένες τιμές τους. Φτιάξε κάθε ζευγάρι με τη μορφή τιμή$όνομα_εξέτασης, και χώρισε πολλαπλά ζευγάρια χρησιμοποιώντας το σύμβολο '&'. Αν δεν υπάρχουν τέτοια ζευγάρια να επιστρέψεις την τιμή '&&NOREL&&'.| + +| # | Prompt (PL) | +|-----|--------------------------------------------------------------------------------| +| 1 | Wyodrębnij zależności z wyrażenia medycznego. Dla danego polskiego zdania wyodrębnij wszystkie pozycje badań laboratoryjnych wraz z ich wartościami. Wyniki należy zapisać w formacie: value$medical_item&value$medical_item. Jeśli nie znaleziono żadnych zależności, napisz '&&NOREL&&'.| +| 2 |Wyodrębnij wszystkie pary badań laboratoryjnych – wartość wymienione w poniższym wyrażeniu medycznym. 
Uwzględnij tylko wyraźne pomiary, w których nazwa badania laboratoryjnego jest wyraźnie powiązana z jego wartością pomiarową. Zapisz każdą parę w formacie: value$test_name. wyodrębnij pary za pomocą znaku '&'. Jeśli nie ma żadnych par, napisz '&&NOREL&&'. | +| 3 | Wyodrębnij wszystkie pary badań laboratoryjnych – wartość z poniższego wyrażenia medycznego. Uwzględnij tylko badania laboratoryjne i odpowiadające im wartości pomiarowe. Zapisz każdą parę jako value$test_name i oddziel pary za pomocą znaku '&'. Jeśli nie ma żadnych par, napisz '&&NOREL&&'.| + +| # | Prompt (EN) | +|-----|--------------------------------------------------------------------------------------------------------------------------------------| +| 1 | You have to extract relations from a medical sentence. Given an English sentence, extract all lab test items with their values. Return results like: value$medical_item&value$medical_item. Use '&&NOREL&&' if no relations are found. | +| 2 | Extract all lab test–value pairs mentioned in the following medical sentence. Include only explicit measurements where a lab test name is clearly linked to its measured value. Write each pair in the format: value$test_name. Join multiple pairs using '&'. If no valid pairs exist, return exactly: '&&NOREL&&'.| +| 3 | Extract all lab test–value pairs from the following medical sentence. Only include lab tests and their corresponding measured values. Format each pair as value$test_name, and separate multiple pairs using '&'. If there are no such pairs, return '&&NOREL&&'.| + + +**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 3 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). 
+ +""" + +# Lookup table: task acronym (e.g. "NER") -> the module-level *_DESCRIPTION Markdown string for that task +TASK_DESCRIPTIONS = { + "RML": RML_DESCRIPTION, + "DIA": DIA_DESCRIPTION, + "HIS": HIS_DESCRIPTION, + "AT": AT_DESCRIPTION, + "WIC": WIC_DESCRIPTION, + "FAQ": FAQ_DESCRIPTION, + "LS": LS_DESCRIPTION, + "SU": SU_DESCRIPTION, + "NER": NER_DESCRIPTION, + "REL": REL_DESCRIPTION +} \ No newline at end of file