luisrguerra committed: Update index.html

index.html CHANGED (+53 -3)
@@ -31,12 +31,48 @@
     <div><canvas id="radarChart" height="750"></canvas></div>
     <p>The MMLU (Massive Multitask Language Understanding) test is a benchmark that measures language understanding and performance on 57 tasks.</p>
     <p>MT-Bench: Benchmark test with questions prepared by the Chatbot Arena team. Uses GPT-4 to evaluate responses.</p>
+    <p>GSM8K is a dataset of 8.5K high quality linguistically diverse grade school math word problems created by human problem writers. A bright middle school student should be able to solve every problem.</p>
     <div id="tableBenchMark"></div>
+    <h4>Best models for solving math problems:</h4>
+    <ul>
+        <li>gpt-4-1106-preview (turbo)</li>
+        <li>gpt-4-0613</li>
+        <li>gpt-4-0314</li>
+        <li>Gemini Ultra</li>
+    </ul>
+    <h4>Models with the best cost benefit:</h4>
+    <ul>
+        <li>Gemini Pro</li>
+        <li>gpt-3.5-turbo-0613</li>
+        <li>gpt-3.5-turbo-1106</li>
+        <li>Claude Instant 1</li>
+        <li>Mixtral 8x7B Instruct</li>
+        <li>Mistral Medium</li>
+    </ul>
+    <h4>Models with fewer hallucinations:</h4>
+    <ul>
+        <li>gpt-4-1106-preview (turbo)</li>
+        <li>gpt-4-0613</li>
+        <li>gpt-4-0314</li>
+        <li>Gemini Ultra</li>
+        <li>Claude 2.1</li>
+    </ul>
+    <h4>Models with a high level of hallucinations:</h4>
+    <ul>
+        <li>Mixtral 8x7B Instruct</li>
+        <li>Yi 34B</li>
+    </ul>
+    <h4>Open Models:</h4>
+    <ul>
+        <li>Mixtral 8x7B Instruct</li>
+        <li>Yi 34B</li>
+    </ul>
+
 
     <script>
         const benchmarkData = [
             {
-                name: 'gpt-4-1106-preview',
+                name: 'gpt-4-1106-preview (turbo)',
                 mmlu: null,
                 mtbench: 9.32,
                 arenaelo:1249,
@@ -105,6 +141,20 @@
                 organization: 'OpenAI',
                 license: 'Proprietary',
             },
+            {
+                name: 'gpt-3.5-turbo-1106',
+                mmlu: null,
+                mtbench: 8.32,
+                arenaelo:1072,
+                gsm8k: null,
+                winogrande: null,
+                truthfulqa: null,
+                hellaswag:null,
+                arc:null,
+                parameters: '20B - 175B (not confirmed)',
+                organization: 'OpenAI',
+                license: 'Proprietary',
+            },
             {
                 name: 'Claude 2.1',
                 mmlu: null,
@@ -254,7 +304,7 @@
                 '<th>MMLU</th>' +
                 '<th>MT-Bench</th>' +
                 '<th>Arena Elo</th>' +
-                '<th>
+                '<th>GSM8k</th>' +
                 '<th>Winogrande</th>' +
                 '<th>TruthfulQA</th>' +
                 '<th>HellaSwag</th>' +
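Note: this hunk adds a GSM8k header cell to the results table that the page presumably renders into <div id="tableBenchMark">. Only the header strings are visible in the diff; the row-building code is outside the changed lines. A minimal sketch of how such a table is typically assembled with string concatenation like the header above (the function name and the reduced set of columns are assumptions, not taken from index.html):

    // Sketch under assumptions; the real table code is not shown in this diff.
    function renderBenchmarkTable(data) {
        let html = '<table><tr>' +
            '<th>Name</th>' +
            '<th>MT-Bench</th>' +
            '<th>Arena Elo</th>' +
            '<th>GSM8k</th>' +   // new column added by this commit
            '</tr>';
        for (const model of data) {
            html += '<tr>' +
                '<td>' + model.name + '</td>' +
                '<td>' + (model.mtbench ?? '-') + '</td>' +
                '<td>' + (model.arenaelo ?? '-') + '</td>' +
                '<td>' + (model.gsm8k ?? '-') + '</td>' + // each new <th> needs a matching <td>
                '</tr>';
        }
        document.getElementById('tableBenchMark').innerHTML = html + '</table>';
    }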
@@ -336,7 +386,7 @@
         }
         const dataSetRadar = getDataSetRadar(benchmarkData);
         let data = {
-            labels: ['MMLU', 'MT-bench','Arena Elo','
+            labels: ['MMLU', 'MT-bench','Arena Elo','GSM8k','Winogrande','TruthfulQA','HellaSwag','ARC'],
             datasets: getDataSetRadar(benchmarkData)
         };
 
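Note: the last hunk adds 'GSM8k' to the radar chart's axis labels; the body of getDataSetRadar is not part of the diff. Because the benchmarks use very different scales (MMLU, GSM8K, Winogrande, TruthfulQA, HellaSwag and ARC are percentages, MT-Bench is scored 0-10, Arena Elo sits around 1000+), a radar chart of this kind usually rescales each axis. A sketch of one way the datasets might be built, with the axis maxima chosen here as rough assumptions rather than values from the file:

    // Assumption: getDataSetRadar's real implementation is outside this diff.
    // Maps each model onto the eight axes in the same order as the labels array,
    // scaling every benchmark to 0-1 so the axes are visually comparable.
    function getDataSetRadarSketch(data) {
        const axes = [            // [field, assumed maximum used for scaling]
            ['mmlu', 100], ['mtbench', 10], ['arenaelo', 1300], ['gsm8k', 100],
            ['winogrande', 100], ['truthfulqa', 100], ['hellaswag', 100], ['arc', 100],
        ];
        return data.map(model => ({
            label: model.name,    // one Chart.js radar dataset per model
            data: axes.map(([field, max]) =>
                typeof model[field] === 'number' ? model[field] / max : 0),
        }));
    }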