Commit 3adbd07 · Jimin Huang committed · Parent(s): 2e3dc13

Add dataset
Files changed:
- backend/app/services/leaderboard.py (+4 -29)
- backend/poetry.lock (+0 -0)
- backend/pyproject.toml (+1 -0)
- contents.py (+4 -0)
- frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/defaults.js (+2 -32)
- frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/tooltips.js (+5 -93)
- frontend/src/pages/LeaderboardPage/components/Leaderboard/utils/columnUtils.js (+3 -48)
- frontend/src/pages/QuotePage/QuotePage.js (+1 -1)
backend/app/services/leaderboard.py
CHANGED

@@ -107,35 +107,10 @@ class LeaderboardService:
 
         evaluations = {
             "ifeval": {
-                "name": "IFEval",
-                "value": data.get("IFEval Raw", 0),
-                "normalized_score": data.get("IFEval", 0)
+                "name": "MultiFin",
+                "value": data.get("MultiFin Raw", 0),
+                "normalized_score": data.get("MultiFin", 0)
             },
-            "bbh": {
-                "name": "BBH",
-                "value": data.get("BBH Raw", 0),
-                "normalized_score": data.get("BBH", 0)
-            },
-            "math": {
-                "name": "MATH Level 5",
-                "value": data.get("MATH Lvl 5 Raw", 0),
-                "normalized_score": data.get("MATH Lvl 5", 0)
-            },
-            "gpqa": {
-                "name": "GPQA",
-                "value": data.get("GPQA Raw", 0),
-                "normalized_score": data.get("GPQA", 0)
-            },
-            "musr": {
-                "name": "MUSR",
-                "value": data.get("MUSR Raw", 0),
-                "normalized_score": data.get("MUSR", 0)
-            },
-            "mmlu_pro": {
-                "name": "MMLU-PRO",
-                "value": data.get("MMLU-PRO Raw", 0),
-                "normalized_score": data.get("MMLU-PRO", 0)
-            }
         }
 
         features = {

@@ -205,4 +180,4 @@ class LeaderboardService:
 
         except Exception as e:
             logger.error(LogFormatter.error(f"Failed to transform data for {data.get('fullname', 'Unknown')}", e))
-            raise
+            raise
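The hunk above keeps the dictionary key "ifeval" while relabeling its contents as MultiFin. A minimal sketch of the resulting transform, assuming rows shaped like the leaderboard contents dataset (the build_evaluations helper is illustrative, not a function in the commit):

# Illustrative sketch of the post-commit mapping (not the full LeaderboardService).
# Assumes raw rows carry "MultiFin Raw" and "MultiFin" columns, as the diff implies.
def build_evaluations(data: dict) -> dict:
    return {
        # The key stays "ifeval" even though the benchmark is now MultiFin.
        "ifeval": {
            "name": "MultiFin",
            "value": data.get("MultiFin Raw", 0),
            "normalized_score": data.get("MultiFin", 0),
        }
    }

row = {"MultiFin Raw": 0.41, "MultiFin": 52.3}
print(build_evaluations(row))
# -> {'ifeval': {'name': 'MultiFin', 'value': 0.41, 'normalized_score': 52.3}}

Keeping "ifeval" here matters later: the frontend changes below register the column under evaluations.multifin.normalized_score, so the two sides end up pointing at different paths.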
backend/poetry.lock
ADDED

The diff for this file is too large to render. See raw diff.
backend/pyproject.toml
CHANGED

@@ -19,6 +19,7 @@ safetensors = "^0.4.5"
 aiofiles = "^24.1.0"
 fastapi-cache2 = "^0.2.1"
 python-dotenv = "^1.0.1"
+pydantic = "^2.10.4"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.3.4"
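The only substantive change is an explicit pydantic 2.x pin. As a hedged sketch of how the evaluation payload above could be validated with pydantic v2 (the EvaluationScore model is hypothetical; the commit adds no such model):

# Hypothetical pydantic v2 model for one evaluation entry; not part of this commit.
from pydantic import BaseModel, Field

class EvaluationScore(BaseModel):
    name: str
    value: float = Field(default=0, description="raw benchmark score")
    normalized_score: float = Field(default=0, description="normalized score")

print(EvaluationScore(name="MultiFin", value=0.41, normalized_score=52.3).model_dump())
# -> {'name': 'MultiFin', 'value': 0.41, 'normalized_score': 52.3}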
contents.py
ADDED

@@ -0,0 +1,4 @@
+from datasets import load_dataset
+
+ds = load_dataset("open-llm-leaderboard/contents")
+ds.push_to_hub("TheFinAI/greek-contents")
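This four-line script is the "dataset" in the commit message: it mirrors open-llm-leaderboard/contents into the TheFinAI org under the name greek-contents. push_to_hub needs write credentials; a sketch with explicit token handling (reading HF_TOKEN from the environment is an assumption, not something the committed script does):

# Sketch: the same mirroring with an explicit token (assumed; the committed
# script relies on whatever credentials are already cached locally).
import os
from datasets import load_dataset

ds = load_dataset("open-llm-leaderboard/contents")
ds.push_to_hub("TheFinAI/greek-contents", token=os.environ["HF_TOKEN"])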
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/defaults.js
CHANGED

@@ -125,41 +125,11 @@ const COLUMNS = {
     },
   },
   EVALUATION: {
-    "evaluations.ifeval.normalized_score": {
+    "evaluations.multifin.normalized_score": {
       group: "evaluation",
       size: COLUMN_SIZES.BENCHMARK,
       defaultVisible: true,
-      label: "IFEval",
-    },
-    "evaluations.bbh.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "BBH",
-    },
-    "evaluations.math.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "MATH",
-    },
-    "evaluations.gpqa.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "GPQA",
-    },
-    "evaluations.musr.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "MUSR",
-    },
-    "evaluations.mmlu_pro.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "MMLU-PRO",
+      label: "MultiFin",
     },
   },
   MODEL_INFO: {
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/tooltips.js
CHANGED

@@ -40,107 +40,19 @@ export const COLUMN_TOOLTIPS = {
     },
   ]),
 
-  IFEVAL: createTooltipContent("Instruction-Following Evaluation (IFEval):", [
+  MultiFin: createTooltipContent("Multilingual Financial NLP (MultiFin):", [
     {
       label: "Purpose",
       description:
-        "Tests model's ability to follow explicit formatting instructions",
-      subItems: ["Format following", "Instruction understanding"],
+        "Tests model's ability to understand real-world financial article headlines",
+      subItems: ["Language Understanding", "Classification"],
     },
     {
-      label: "Scoring: Strict Accuracy",
-      description: "Was the format asked in the prompt respected.",
+      label: "Scoring: Micro F1",
+      description: "Was the correct choice selected among the options.",
     },
   ]),
 
-  BBH: createTooltipContent("Big Bench Hard (BBH):", [
-    {
-      label: "Overview",
-      description: "Collection of challenging for LLM tasks across domains, for example",
-      subItems: [
-        "Language understanding",
-        "Mathematical reasoning",
-        "Common sense and world knowledge",
-      ],
-    },
-    {
-      label: "Scoring: Accuracy",
-      description:
-        "Was the correct choice selected among the options.",
-    },
-  ]),
-
-  MATH: createTooltipContent(
-    "Mathematics Aptitude Test of Heuristics (MATH), level 5:",
-    [
-      {
-        label: "Content",
-        description: "High school level competitions mathematical problems",
-        subItems: ["Complex algebra", "Geometry problems", "Advanced calculus"],
-      },
-      {
-        label: "Scoring: Exact match",
-        description:
-          "Was the solution generated correct and in the expected format",
-      },
-    ]
-  ),
-
-  GPQA: createTooltipContent("Graduate-Level Google-Proof Q&A (GPQA):", [
-    {
-      label: "Focus",
-      description: "PhD-level knowledge multiple choice questions in science",
-      subItems: [
-        "Chemistry",
-        "Biology",
-        "Physics",
-      ],
-    },
-    {
-      label: "Scoring: Accuracy",
-      description:
-        "Was the correct choice selected among the options.",
-    },
-  ]),
-
-  MUSR: createTooltipContent("Multistep Soft Reasoning (MuSR):", [
-    {
-      label: "Scope",
-      description: "Reasoning and understanding on/of long texts",
-      subItems: [
-        "Language understanding",
-        "Reasoning capabilities",
-        "Long context reasoning",
-      ],
-    },
-    {
-      label: "Scoring: Accuracy",
-      description:
-        "Was the correct choice selected among the options.",
-    },
-  ]),
-
-  MMLU_PRO: createTooltipContent(
-    "Massive Multitask Language Understanding - Professional (MMLU-Pro):",
-    [
-      {
-        label: "Coverage",
-        description: "Expertly reviewed multichoice questions across domains, for example:",
-        subItems: [
-          "Medicine and healthcare",
-          "Law and ethics",
-          "Engineering",
-          "Mathematics",
-        ],
-      },
-      {
-        label: "Scoring: Accuracy",
-        description:
-          "Was the correct choice selected among the options.",
-      },
-    ]
-  ),
-
   ARCHITECTURE: createTooltipContent("Model Architecture Information:", [
     {
       label: "Definition",
frontend/src/pages/LeaderboardPage/components/Leaderboard/utils/columnUtils.js
CHANGED

@@ -752,56 +752,11 @@ export const createColumns = (
   const evaluationColumns = [
     {
       accessorKey: "evaluations.ifeval.normalized_score",
-      header: createHeaderCell("IFEval", COLUMN_TOOLTIPS.IFEVAL),
+      header: createHeaderCell("MultiFin", COLUMN_TOOLTIPS.IFEVAL),
       cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.ifeval.normalized_score"),
+        createScoreCell(getValue, row, "evaluations.multifin.normalized_score"),
       size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.ifeval.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.bbh.normalized_score",
-      header: createHeaderCell("BBH", COLUMN_TOOLTIPS.BBH),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.bbh.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.bbh.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.math.normalized_score",
-      header: createHeaderCell("MATH", COLUMN_TOOLTIPS.MATH),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.math.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.math.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.gpqa.normalized_score",
-      header: createHeaderCell("GPQA", COLUMN_TOOLTIPS.GPQA),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.gpqa.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.gpqa.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.musr.normalized_score",
-      header: createHeaderCell("MUSR", COLUMN_TOOLTIPS.MUSR),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.musr.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.musr.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.mmlu_pro.normalized_score",
-      header: createHeaderCell("MMLU-PRO", COLUMN_TOOLTIPS.MMLU_PRO),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.mmlu_pro.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.mmlu_pro.normalized_score"
+        "evaluations.multifin.normalized_score"
       ],
     },
   ];
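Two loose ends are worth noting in this hunk: accessorKey still reads evaluations.ifeval.normalized_score (matching the backend, which also kept its "ifeval" key), while createScoreCell and the COLUMN_SIZES lookup now use evaluations.multifin.normalized_score; and the header still passes COLUMN_TOOLTIPS.IFEVAL even though tooltips.js renamed that entry to MultiFin. A small sketch of dotted-path resolution against the backend payload shows which of the two paths actually resolves (Python for illustration; resolve is a hypothetical helper, not code from this repo):

# Hypothetical dotted-path lookup, to show which column paths resolve
# against the payload the backend now produces.
from functools import reduce

def resolve(obj, path):
    """Walk a dotted path like 'evaluations.ifeval.normalized_score'."""
    return reduce(
        lambda acc, key: acc.get(key) if isinstance(acc, dict) else None,
        path.split("."),
        obj,
    )

payload = {"evaluations": {"ifeval": {"name": "MultiFin", "normalized_score": 52.3}}}

print(resolve(payload, "evaluations.ifeval.normalized_score"))    # 52.3  (accessorKey path)
print(resolve(payload, "evaluations.multifin.normalized_score"))  # None  (score cell / sizes path)

The same half-rename shows up in QuotePage.js below, where the entry's title becomes MultiFin while the citation remains the Zhou et al. IFEval paper.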
frontend/src/pages/QuotePage/QuotePage.js
CHANGED

@@ -60,7 +60,7 @@ const priorWork = [
 
   const benchmarks = [
     {
-      title: "IFEval: Instruction-Following Evaluation",
+      title: "MultiFin: Instruction-Following Evaluation",
       authors: "Zhou et al.",
       citation: `@misc{zhou2023instructionfollowingevaluationlargelanguage,
         title={Instruction-Following Evaluation for Large Language Models},