luisrguerra committed on
Commit a56da64 · verified · 1 Parent(s): 512c89a

Update index.html

Files changed (1)
  1. index.html +53 -3
index.html CHANGED
@@ -31,12 +31,48 @@
  <div><canvas id="radarChart" height="750"></canvas></div>
  <p>The MMLU (Massive Multitask Language Understanding) test is a benchmark that measures language understanding and performance on 57 tasks.</p>
  <p>MT-Bench: Benchmark test with questions prepared by the Chatbot Arena team. Uses GPT-4 to evaluate responses.</p>
+ <p>GSM8K is a dataset of 8.5K high quality linguistically diverse grade school math word problems created by human problem writers. A bright middle school student should be able to solve every problem.</p>
  <div id="tableBenchMark"></div>
+ <h4>Best models for solving math problems:</h4>
+ <ul>
+ <li>gpt-4-1106-preview (turbo)</li>
+ <li>gpt-4-0613</li>
+ <li>gpt-4-0314</li>
+ <li>Gemini Ultra</li>
+ </ul>
+ <h4>Models with the best cost benefit:</h4>
+ <ul>
+ <li>Gemini Pro</li>
+ <li>gpt-3.5-turbo-0613</li>
+ <li>gpt-3.5-turbo-1106</li>
+ <li>Claude Instant 1</li>
+ <li>Mixtral 8x7B Instruct</li>
+ <li>Mistral Medium</li>
+ </ul>
+ <h4>Models with fewer hallucinations:</h4>
+ <ul>
+ <li>gpt-4-1106-preview (turbo)</li>
+ <li>gpt-4-0613</li>
+ <li>gpt-4-0314</li>
+ <li>Gemini Ultra</li>
+ <li>Claude 2.1</li>
+ </ul>
+ <h4>Models with a high level of hallucinations:</h4>
+ <ul>
+ <li>Mixtral 8x7B Instruct</li>
+ <li>Yi 34B</li>
+ </ul>
+ <h4>Open Models:</h4>
+ <ul>
+ <li>Mixtral 8x7B Instruct</li>
+ <li>Yi 34B</li>
+ </ul>
+
 
  <script>
  const benchmarkData = [
  {
- name: 'gpt-4-1106-preview',
+ name: 'gpt-4-1106-preview (turbo)',
  mmlu: null,
  mtbench: 9.32,
  arenaelo:1249,
@@ -105,6 +141,20 @@
  organization: 'OpenAI',
  license: 'Proprietary',
  },
+ {
+ name: 'gpt-3.5-turbo-1106',
+ mmlu: null,
+ mtbench: 8.32,
+ arenaelo:1072,
+ gsm8k: null,
+ winogrande: null,
+ truthfulqa: null,
+ hellaswag:null,
+ arc:null,
+ parameters: '20B - 175B (not confirmed)',
+ organization: 'OpenAI',
+ license: 'Proprietary',
+ },
  {
  name: 'Claude 2.1',
  mmlu: null,
@@ -254,7 +304,7 @@
  '<th>MMLU</th>' +
  '<th>MT-Bench</th>' +
  '<th>Arena Elo</th>' +
- '<th>GSM-8k</th>' +
+ '<th>GSM8k</th>' +
  '<th>Winogrande</th>' +
  '<th>TruthfulQA</th>' +
  '<th>HellaSwag</th>' +
@@ -336,7 +386,7 @@
  }
  const dataSetRadar = getDataSetRadar(benchmarkData);
  let data = {
- labels: ['MMLU', 'MT-bench','Arena Elo','GSM-8k','Winogrande','TruthfulQA','HellaSwag','ARC'],
+ labels: ['MMLU', 'MT-bench','Arena Elo','GSM8k','Winogrande','TruthfulQA','HellaSwag','ARC'],
  datasets: getDataSetRadar(benchmarkData)
  };
 
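
Note on the GSM-8k → GSM8k rename: the radar labels, the table headers, and the per-model fields are index-aligned, so only the displayed axis/column name changes while the underlying gsm8k property keeps its name. The snippet below is a minimal sketch, not the page's actual getDataSetRadar: the field order is taken from the labels array in the diff, but the per-axis rescaling factors are assumptions added purely for illustration.

// Hedged sketch (assumed, not the page's real getDataSetRadar): map one
// benchmarkData entry onto the eight radar axes in label order
// ['MMLU','MT-bench','Arena Elo','GSM8k','Winogrande','TruthfulQA','HellaSwag','ARC'].
const radarFields = ['mmlu', 'mtbench', 'arenaelo', 'gsm8k',
                     'winogrande', 'truthfulqa', 'hellaswag', 'arc'];

function toRadarPoints(model) {
  return radarFields.map((field) => {
    const value = model[field];
    if (value === null || value === undefined) return null; // missing score -> gap in the chart
    // Illustrative rescaling (assumption) so 0-10 MT-Bench and ~1000-1300 Elo
    // can share one 0-100 radar scale with the percentage-style benchmarks.
    if (field === 'mtbench') return value * 10;
    if (field === 'arenaelo') return value / 13;
    return value;
  });
}

// Example with the entry added in this commit:
const gpt35turbo1106 = {
  name: 'gpt-3.5-turbo-1106',
  mmlu: null, mtbench: 8.32, arenaelo: 1072,
  gsm8k: null, winogrande: null, truthfulqa: null, hellaswag: null, arc: null,
};
console.log(toRadarPoints(gpt35turbo1106));
// -> [null, 83.2, ~82.5, null, null, null, null, null]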