luisrguerra
committed on
Update index.html
Browse files- index.html +59 -6
index.html
CHANGED
@@ -154,6 +154,14 @@
|
|
154 |
</ul>
|
155 |
<h4>Versions of models already surpassed by fine-tune, new versions or new architectures:</h4>
|
156 |
<ul>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
<li>gpt-4-0314</li>
|
158 |
<li>Claude 2-2.1</li>
|
159 |
<li>Claude Instant 1-1.2</li>
|
@@ -423,10 +431,58 @@
|
|
423 |
license: 'Proprietary',
|
424 |
},
|
425 |
{
|
426 |
-
name: 'Gemini
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
427 |
mmlu: 81.9,
|
428 |
mtbench: null,
|
429 |
-
arenaelo:
|
430 |
gsm8k: 91.7,
|
431 |
winogrande: null,
|
432 |
truthfulqa: null,
|
@@ -690,8 +746,6 @@
|
|
690 |
function getDataSetRadar(data) {
|
691 |
const mmluMaxValue = getBenchmarkMaxValue("mmlu",data);
|
692 |
const mmluMultiplier = 100/mmluMaxValue;
|
693 |
-
const mtbenchMaxValue = getBenchmarkMaxValue("mtbench",data);
|
694 |
-
const mtbenchMultiplier = 100/mtbenchMaxValue;
|
695 |
const arenaeloMaxValue = getBenchmarkMaxValue("arenaelo",data);
|
696 |
const arenaeloMultiplier = 100/arenaeloMaxValue;
|
697 |
const gsm8kMaxValue = getBenchmarkMaxValue("gsm8k",data);
|
@@ -714,7 +768,6 @@
|
|
714 |
label: data[i].name,
|
715 |
data: [
|
716 |
(data[i].mmlu*mmluMultiplier),
|
717 |
-
(data[i].mtbench*mtbenchMultiplier),
|
718 |
(data[i].arenaelo*arenaeloMultiplier),
|
719 |
(data[i].gsm8k*gsm8kMultiplier),
|
720 |
(data[i].winogrande*winograndeMultiplier),
|
@@ -731,7 +784,7 @@
|
|
731 |
}
|
732 |
const dataSetRadar = getDataSetRadar(benchmarkData);
|
733 |
let data = {
|
734 |
-
labels: ['MMLU',
|
735 |
datasets: getDataSetRadar(benchmarkData)
|
736 |
};
|
737 |
|
|
|
154 |
</ul>
|
155 |
<h4>Versions of models already surpassed by fine-tune, new versions or new architectures:</h4>
|
156 |
<ul>
|
157 |
+
<li>Gemini Pro 1.0</li>
|
158 |
+
<li>Grok 1</li>
|
159 |
+
<li>DBRX Instruct</li>
|
160 |
+
<li>Mistral Medium</li>
|
161 |
+
<li>Gemma 1.0 7B</li>
|
162 |
+
<li>Zephyr-ORPO-141b-A35b-v0.1</li>
|
163 |
+
<li>Yi 1.0 34B</li>
|
164 |
+
<li>gpt-4-0613</li>
|
165 |
<li>gpt-4-0314</li>
|
166 |
<li>Claude 2-2.1</li>
|
167 |
<li>Claude Instant 1-1.2</li>
|
|
|
431 |
license: 'Proprietary',
|
432 |
},
|
433 |
{
|
434 |
+
name: 'Gemini-Advanced-0514',
|
435 |
+
mmlu: null,
|
436 |
+
mtbench: null,
|
437 |
+
arenaelo:1267,
|
438 |
+
gsm8k: null,
|
439 |
+
winogrande: null,
|
440 |
+
truthfulqa: null,
|
441 |
+
hellaswag:null,
|
442 |
+
arc:null,
|
443 |
+
nothallucination: null,
|
444 |
+
alpacaeval: null,
|
445 |
+
parameters: null,
|
446 |
+
organization: 'Google',
|
447 |
+
license: 'Proprietary',
|
448 |
+
},
|
449 |
+
{
|
450 |
+
name: 'Gemini-1.5-Flash-API-0514',
|
451 |
+
mmlu: 78.9,
|
452 |
+
mtbench: null,
|
453 |
+
arenaelo:1230,
|
454 |
+
gsm8k: null,
|
455 |
+
winogrande: null,
|
456 |
+
truthfulqa: null,
|
457 |
+
hellaswag:null,
|
458 |
+
arc:null,
|
459 |
+
nothallucination: null,
|
460 |
+
alpacaeval: null,
|
461 |
+
parameters: null,
|
462 |
+
organization: 'Google',
|
463 |
+
license: 'Proprietary',
|
464 |
+
},
|
465 |
+
{
|
466 |
+
name: 'Gemini-1.5-Pro-API-0514',
|
467 |
+
mmlu: 85.9,
|
468 |
+
mtbench: null,
|
469 |
+
arenaelo:1265,
|
470 |
+
gsm8k: null,
|
471 |
+
winogrande: null,
|
472 |
+
truthfulqa: null,
|
473 |
+
hellaswag:null,
|
474 |
+
arc:null,
|
475 |
+
nothallucination: null,
|
476 |
+
alpacaeval: null,
|
477 |
+
parameters: null,
|
478 |
+
organization: 'Google',
|
479 |
+
license: 'Proprietary',
|
480 |
+
},
|
481 |
+
{
|
482 |
+
name: 'Gemini-1.5-Pro-API-0409-Preview',
|
483 |
mmlu: 81.9,
|
484 |
mtbench: null,
|
485 |
+
arenaelo:1258,
|
486 |
gsm8k: 91.7,
|
487 |
winogrande: null,
|
488 |
truthfulqa: null,
|
|
|
746 |
function getDataSetRadar(data) {
|
747 |
const mmluMaxValue = getBenchmarkMaxValue("mmlu",data);
|
748 |
const mmluMultiplier = 100/mmluMaxValue;
|
|
|
|
|
749 |
const arenaeloMaxValue = getBenchmarkMaxValue("arenaelo",data);
|
750 |
const arenaeloMultiplier = 100/arenaeloMaxValue;
|
751 |
const gsm8kMaxValue = getBenchmarkMaxValue("gsm8k",data);
|
|
|
768 |
label: data[i].name,
|
769 |
data: [
|
770 |
(data[i].mmlu*mmluMultiplier),
|
|
|
771 |
(data[i].arenaelo*arenaeloMultiplier),
|
772 |
(data[i].gsm8k*gsm8kMultiplier),
|
773 |
(data[i].winogrande*winograndeMultiplier),
|
|
|
784 |
}
|
785 |
const dataSetRadar = getDataSetRadar(benchmarkData);
|
786 |
let data = {
|
787 |
+
labels: ['MMLU','Arena Elo','GSM8k','Winogrande','TruthfulQA','HellaSwag','ARC','AlpacaEval','Not Hallucination'],
|
788 |
datasets: getDataSetRadar(benchmarkData)
|
789 |
};
|
790 |
|