luisrguerra committed on
Commit a56da64 · verified · 1 Parent(s): 512c89a

Update index.html

Files changed (1)
  1. index.html +53 -3
index.html CHANGED
@@ -31,12 +31,48 @@
  <div><canvas id="radarChart" height="750"></canvas></div>
  <p>The MMLU (Massive Multitask Language Understanding) test is a benchmark that measures language understanding and performance on 57 tasks.</p>
  <p>MT-Bench: Benchmark test with questions prepared by the Chatbot Arena team. Uses GPT-4 to evaluate responses.</p>
+ <p>GSM8K is a dataset of 8.5K high quality linguistically diverse grade school math word problems created by human problem writers. A bright middle school student should be able to solve every problem.</p>
  <div id="tableBenchMark"></div>
+ <h4>Best models for solving math problems:</h4>
+ <ul>
+ <li>gpt-4-1106-preview (turbo)</li>
+ <li>gpt-4-0613</li>
+ <li>gpt-4-0314</li>
+ <li>Gemini Ultra</li>
+ </ul>
+ <h4>Models with the best cost benefit:</h4>
+ <ul>
+ <li>Gemini Pro</li>
+ <li>gpt-3.5-turbo-0613</li>
+ <li>gpt-3.5-turbo-1106</li>
+ <li>Claude Instant 1</li>
+ <li>Mixtral 8x7B Instruct</li>
+ <li>Mistral Medium</li>
+ </ul>
+ <h4>Models with fewer hallucinations:</h4>
+ <ul>
+ <li>gpt-4-1106-preview (turbo)</li>
+ <li>gpt-4-0613</li>
+ <li>gpt-4-0314</li>
+ <li>Gemini Ultra</li>
+ <li>Claude 2.1</li>
+ </ul>
+ <h4>Models with a high level of hallucinations:</h4>
+ <ul>
+ <li>Mixtral 8x7B Instruct</li>
+ <li>Yi 34B</li>
+ </ul>
+ <h4>Open Models:</h4>
+ <ul>
+ <li>Mixtral 8x7B Instruct</li>
+ <li>Yi 34B</li>
+ </ul>
+
 
  <script>
  const benchmarkData = [
  {
- name: 'gpt-4-1106-preview',
+ name: 'gpt-4-1106-preview (turbo)',
  mmlu: null,
  mtbench: 9.32,
  arenaelo:1249,
@@ -105,6 +141,20 @@
  organization: 'OpenAI',
  license: 'Proprietary',
  },
+ {
+ name: 'gpt-3.5-turbo-1106',
+ mmlu: null,
+ mtbench: 8.32,
+ arenaelo:1072,
+ gsm8k: null,
+ winogrande: null,
+ truthfulqa: null,
+ hellaswag:null,
+ arc:null,
+ parameters: '20B - 175B (not confirmed)',
+ organization: 'OpenAI',
+ license: 'Proprietary',
+ },
  {
  name: 'Claude 2.1',
  mmlu: null,
@@ -254,7 +304,7 @@
  '<th>MMLU</th>' +
  '<th>MT-Bench</th>' +
  '<th>Arena Elo</th>' +
- '<th>GSM-8k</th>' +
+ '<th>GSM8k</th>' +
  '<th>Winogrande</th>' +
  '<th>TruthfulQA</th>' +
  '<th>HellaSwag</th>' +
@@ -336,7 +386,7 @@
  }
  const dataSetRadar = getDataSetRadar(benchmarkData);
  let data = {
- labels: ['MMLU', 'MT-bench','Arena Elo','GSM-8k','Winogrande','TruthfulQA','HellaSwag','ARC'],
+ labels: ['MMLU', 'MT-bench','Arena Elo','GSM8k','Winogrande','TruthfulQA','HellaSwag','ARC'],
  datasets: getDataSetRadar(benchmarkData)
  };
 
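
Note on the GSM-8k → GSM8k rename: the radar labels, the table headers, and the per-model fields are index-aligned, so only the displayed axis/column name changes while the underlying gsm8k property keeps its name. The snippet below is a minimal sketch, not the page's actual getDataSetRadar: the field order is taken from the labels array in the diff, but the per-axis rescaling factors are assumptions added purely for illustration.

// Hedged sketch (assumed, not the page's real getDataSetRadar): map one
// benchmarkData entry onto the eight radar axes in label order
// ['MMLU','MT-bench','Arena Elo','GSM8k','Winogrande','TruthfulQA','HellaSwag','ARC'].
const radarFields = ['mmlu', 'mtbench', 'arenaelo', 'gsm8k',
                     'winogrande', 'truthfulqa', 'hellaswag', 'arc'];

function toRadarPoints(model) {
  return radarFields.map((field) => {
    const value = model[field];
    if (value === null || value === undefined) return null; // missing score -> gap in the chart
    // Illustrative rescaling (assumption) so 0-10 MT-Bench and ~1000-1300 Elo
    // can share one 0-100 radar scale with the percentage-style benchmarks.
    if (field === 'mtbench') return value * 10;
    if (field === 'arenaelo') return value / 13;
    return value;
  });
}

// Example with the entry added in this commit:
const gpt35turbo1106 = {
  name: 'gpt-3.5-turbo-1106',
  mmlu: null, mtbench: 8.32, arenaelo: 1072,
  gsm8k: null, winogrande: null, truthfulqa: null, hellaswag: null, arc: null,
};
console.log(toRadarPoints(gpt35turbo1106));
// -> [null, 83.2, ~82.5, null, null, null, null, null]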