Commit 3adbd07 · Jimin Huang committed · 1 parent: 2e3dc13

Add dataset
backend/app/services/leaderboard.py CHANGED
@@ -107,35 +107,10 @@ class LeaderboardService:
 
         evaluations = {
             "ifeval": {
-                "name": "IFEval",
-                "value": data.get("IFEval Raw", 0),
-                "normalized_score": data.get("IFEval", 0)
+                "name": "MultiFin",
+                "value": data.get("MultiFin Raw", 0),
+                "normalized_score": data.get("MultiFin", 0)
             },
-            "bbh": {
-                "name": "BBH",
-                "value": data.get("BBH Raw", 0),
-                "normalized_score": data.get("BBH", 0)
-            },
-            "math": {
-                "name": "MATH Level 5",
-                "value": data.get("MATH Lvl 5 Raw", 0),
-                "normalized_score": data.get("MATH Lvl 5", 0)
-            },
-            "gpqa": {
-                "name": "GPQA",
-                "value": data.get("GPQA Raw", 0),
-                "normalized_score": data.get("GPQA", 0)
-            },
-            "musr": {
-                "name": "MUSR",
-                "value": data.get("MUSR Raw", 0),
-                "normalized_score": data.get("MUSR", 0)
-            },
-            "mmlu_pro": {
-                "name": "MMLU-PRO",
-                "value": data.get("MMLU-PRO Raw", 0),
-                "normalized_score": data.get("MMLU-PRO", 0)
-            }
         }
 
         features = {
@@ -205,4 +180,4 @@ class LeaderboardService:
 
         except Exception as e:
             logger.error(LogFormatter.error(f"Failed to transform data for {data.get('fullname', 'Unknown')}", e))
-            raise
+            raise
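For reference, a minimal standalone sketch of what the reduced transform now produces. This is not code from the commit: the helper name and the example row are illustrative, and the dict key stays "ifeval" while carrying MultiFin scores, exactly as in the diff above.

def build_evaluations(data: dict) -> dict:
    # Mirrors the trimmed-down transform in leaderboard.py: a single benchmark slot
    # remains, and it now reads the "MultiFin Raw" / "MultiFin" columns of the row.
    return {
        "ifeval": {
            "name": "MultiFin",
            "value": data.get("MultiFin Raw", 0),
            "normalized_score": data.get("MultiFin", 0),
        },
    }

# Hypothetical row, for illustration only:
# build_evaluations({"MultiFin Raw": 0.41, "MultiFin": 53.2})
# -> {"ifeval": {"name": "MultiFin", "value": 0.41, "normalized_score": 53.2}}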
backend/poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
backend/pyproject.toml CHANGED
@@ -19,6 +19,7 @@ safetensors = "^0.4.5"
 aiofiles = "^24.1.0"
 fastapi-cache2 = "^0.2.1"
 python-dotenv = "^1.0.1"
+pydantic = "^2.10.4"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.3.4"
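pydantic ^2.10.4 becomes a runtime dependency here. A minimal Pydantic v2 sketch of how one transformed evaluation entry could be validated; the model and field names are assumptions for illustration, not schemas taken from this repository.

from pydantic import BaseModel, Field

class EvaluationScore(BaseModel):
    # Field names follow the evaluations payload built in leaderboard.py.
    name: str
    value: float = Field(default=0)
    normalized_score: float = Field(default=0)

entry = EvaluationScore(name="MultiFin", value=0.41, normalized_score=53.2)
print(entry.model_dump())  # {'name': 'MultiFin', 'value': 0.41, 'normalized_score': 53.2}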
contents.py ADDED
@@ -0,0 +1,4 @@
+from datasets import load_dataset
+
+ds = load_dataset("open-llm-leaderboard/contents")
+ds.push_to_hub("TheFinAI/greek-contents")
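The new script copies the open-llm-leaderboard/contents dataset to TheFinAI/greek-contents on the Hub. A quick way to check the pushed copy, assuming the push succeeded and your Hugging Face credentials can read the target repo (the split name is an assumption):

from datasets import load_dataset

greek_contents = load_dataset("TheFinAI/greek-contents")
print(greek_contents)              # available splits and row counts
print(greek_contents["train"][0])  # first record, assuming a "train" split exists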
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/defaults.js CHANGED
@@ -125,41 +125,11 @@ const COLUMNS = {
     },
   },
   EVALUATION: {
-    "evaluations.ifeval.normalized_score": {
+    "evaluations.multifin.normalized_score": {
       group: "evaluation",
       size: COLUMN_SIZES.BENCHMARK,
       defaultVisible: true,
-      label: "IFEval",
-    },
-    "evaluations.bbh.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "BBH",
-    },
-    "evaluations.math.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "MATH",
-    },
-    "evaluations.gpqa.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "GPQA",
-    },
-    "evaluations.musr.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "MUSR",
-    },
-    "evaluations.mmlu_pro.normalized_score": {
-      group: "evaluation",
-      size: COLUMN_SIZES.BENCHMARK,
-      defaultVisible: true,
-      label: "MMLU-PRO",
+      label: "MultiFin",
     },
   },
   MODEL_INFO: {
frontend/src/pages/LeaderboardPage/components/Leaderboard/constants/tooltips.js CHANGED
@@ -40,107 +40,19 @@ export const COLUMN_TOOLTIPS = {
     },
   ]),
 
-  IFEVAL: createTooltipContent("Instruction-Following Evaluation (IFEval):", [
+  MultiFin: createTooltipContent("Multilingual Financial NLP (MultiFin):", [
     {
       label: "Purpose",
       description:
-        "Tests model's ability to follow explicit formatting instructions",
-      subItems: ["Instruction following", "Formatting", "Generation"],
+        "Tests model's ability to understand real-world financial article headlines",
+      subItems: ["Language Understanding", "Classification"],
     },
     {
-      label: "Scoring: Accuracy",
-      description: "Was the format asked for strictly respected.",
+      label: "Scoring: Micro F1",
+      description: "Was the correct choice selected among the options.",
     },
   ]),
 
-  BBH: createTooltipContent("Big Bench Hard (BBH):", [
-    {
-      label: "Overview",
-      description: "Collection of challenging for LLM tasks across domains, for example",
-      subItems: [
-        "Language understanding",
-        "Mathematical reasoning",
-        "Common sense and world knowledge",
-      ],
-    },
-    {
-      label: "Scoring: Accuracy",
-      description:
-        "Was the correct choice selected among the options.",
-    },
-  ]),
-
-  MATH: createTooltipContent(
-    "Mathematics Aptitude Test of Heuristics (MATH), level 5:",
-    [
-      {
-        label: "Content",
-        description: "High school level competitions mathematical problems",
-        subItems: ["Complex algebra", "Geometry problems", "Advanced calculus"],
-      },
-      {
-        label: "Scoring: Exact match",
-        description:
-          "Was the solution generated correct and in the expected format",
-      },
-    ]
-  ),
-
-  GPQA: createTooltipContent("Graduate-Level Google-Proof Q&A (GPQA):", [
-    {
-      label: "Focus",
-      description: "PhD-level knowledge multiple choice questions in science",
-      subItems: [
-        "Chemistry",
-        "Biology",
-        "Physics",
-      ],
-    },
-    {
-      label: "Scoring: Accuracy",
-      description:
-        "Was the correct choice selected among the options.",
-    },
-  ]),
-
-  MUSR: createTooltipContent("Multistep Soft Reasoning (MuSR):", [
-    {
-      label: "Scope",
-      description: "Reasoning and understanding on/of long texts",
-      subItems: [
-        "Language understanding",
-        "Reasoning capabilities",
-        "Long context reasoning",
-      ],
-    },
-    {
-      label: "Scoring: Accuracy",
-      description:
-        "Was the correct choice selected among the options.",
-    },
-  ]),
-
-  MMLU_PRO: createTooltipContent(
-    "Massive Multitask Language Understanding - Professional (MMLU-Pro):",
-    [
-      {
-        label: "Coverage",
-        description: "Expertly reviewed multichoice questions across domains, for example:",
-        subItems: [
-          "Medicine and healthcare",
-          "Law and ethics",
-          "Engineering",
-          "Mathematics",
-        ],
-      },
-      {
-        label: "Scoring: Accuracy",
-        description:
-          "Was the correct choice selected among the options.",
-      },
-    ]
-  ),
-
   ARCHITECTURE: createTooltipContent("Model Architecture Information:", [
     {
       label: "Definition",
frontend/src/pages/LeaderboardPage/components/Leaderboard/utils/columnUtils.js CHANGED
@@ -752,56 +752,11 @@ export const createColumns = (
   const evaluationColumns = [
     {
       accessorKey: "evaluations.ifeval.normalized_score",
-      header: createHeaderCell("IFEval", COLUMN_TOOLTIPS.IFEVAL),
+      header: createHeaderCell("MultiFin", COLUMN_TOOLTIPS.IFEVAL),
       cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.ifeval.normalized_score"),
+        createScoreCell(getValue, row, "evaluations.multifin.normalized_score"),
       size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.ifeval.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.bbh.normalized_score",
-      header: createHeaderCell("BBH", COLUMN_TOOLTIPS.BBH),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.bbh.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.bbh.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.math.normalized_score",
-      header: createHeaderCell("MATH", COLUMN_TOOLTIPS.MATH),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.math.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.math.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.gpqa.normalized_score",
-      header: createHeaderCell("GPQA", COLUMN_TOOLTIPS.GPQA),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.gpqa.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.gpqa.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.musr.normalized_score",
-      header: createHeaderCell("MUSR", COLUMN_TOOLTIPS.MUSR),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.musr.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.musr.normalized_score"
-      ],
-    },
-    {
-      accessorKey: "evaluations.mmlu_pro.normalized_score",
-      header: createHeaderCell("MMLU-PRO", COLUMN_TOOLTIPS.MMLU_PRO),
-      cell: ({ row, getValue }) =>
-        createScoreCell(getValue, row, "evaluations.mmlu_pro.normalized_score"),
-      size: TABLE_DEFAULTS.COLUMNS.COLUMN_SIZES[
-        "evaluations.mmlu_pro.normalized_score"
+        "evaluations.multifin.normalized_score"
       ],
     },
   ];
frontend/src/pages/QuotePage/QuotePage.js CHANGED
@@ -60,7 +60,7 @@ const priorWork = [
 
 const benchmarks = [
   {
-    title: "IFEval: Instruction-Following Evaluation",
+    title: "MultiFin: Instruction-Following Evaluation",
     authors: "Zhou et al.",
     citation: `@misc{zhou2023instructionfollowingevaluationlargelanguage,
       title={Instruction-Following Evaluation for Large Language Models},