Fixes and updates
- app/compare/[slugs]/page.js +2 -1
- app/layout.js +13 -1
- app/page.js +3 -10
- run/poetry.lock +67 -1
- run/pyproject.toml +1 -0
- run/queriers.py +21 -34
- run/run.py +48 -15
- run/together_cleaner.py +42 -0
- utils/db.js +19 -4
app/compare/[slugs]/page.js
CHANGED
@@ -1,4 +1,3 @@
-import Link from "next/link"
 import db, { getModels } from "@/utils/db"
 
 export default async function Comparison({ params }) {
@@ -54,9 +53,11 @@ export default async function Comparison({ params }) {
         </td>
         <td>
           <pre>{row.model1?.result?.trim()}</pre>
+          <p>{row.model1 ? `Score: ${row.model1?.score}` : "Not rated"}</p>
         </td>
         <td>
           <pre>{row.model2?.result?.trim()}</pre>
+          <p>{row.model2 ? `Score: ${row.model2?.score}` : "Not rated"}</p>
         </td>
       </tr>
     ))}
app/layout.js
CHANGED
@@ -1,6 +1,7 @@
 import Link from "next/link"
 import "@/styles/globals.css"
 import { Suspense } from "react"
+import PlausibleProvider from "next-plausible"
 
 export const metadata = {
   title: "LLMonitor Benchmarks",
@@ -10,6 +11,17 @@ export const metadata = {
 export default function RootLayout({ children }) {
   return (
     <html lang="en">
+      <head>
+        <PlausibleProvider
+          domain="benchmarks.llmonitor.com"
+          scriptProps={{
+            src: "https://llmonitor.com/p/js/script.js",
+            // @ts-ignore
+            "data-api": "https://llmonitor.com/p/event",
+          }}
+          customDomain="benchmarks.llmonitor.com"
+        />
+      </head>
       <body>
         <main>
           <h1>LLMonitor Benchmarks</h1>
@@ -32,7 +44,7 @@ export default function RootLayout({ children }) {
         <p>
           Credit:{" "}
           <a href="https://twitter.com/vincelwt" target="_blank">
-            @vincelwt
+            @vincelwt\
          </a>{" "}
           /{" "}
           <a href="https://llmonitor.com" target="_blank">
app/page.js
CHANGED
@@ -1,17 +1,10 @@
-import db from "@/utils/db"
+import db, { getModels } from "@/utils/db"
 import Link from "next/link"
 
 export default async function Leaderboard() {
   const [potentialPoints] = await db`SELECT SUM(points) as total FROM rubrics`
 
-  const models = await db`
-    SELECT models.*, SUM(results.score) as total_score
-    FROM models
-    LEFT JOIN results ON models.id = results.model
-    GROUP BY models.id
-    ORDER BY total_score DESC;
-  `
-
+  const models = await getModels()
   return (
     <>
       <p>
@@ -49,7 +42,7 @@ export default async function Leaderboard() {
           .filter((s) => s.total_score)
           .map((model, i) => (
             <tr key={i}>
-              <td>{
+              <td>{model.rank}</td>
               <td>{model.name}</td>
               <td>
                 {parseInt((model.total_score / potentialPoints.total) * 100)}
run/poetry.lock
CHANGED
@@ -312,6 +312,20 @@ files = [
     {file = "charset_normalizer-3.3.0-py3-none-any.whl", hash = "sha256:e46cd37076971c1040fc8c41273a8b3e2c624ce4f2be3f5dfcb7a430c1d3acc2"},
 ]
 
+[[package]]
+name = "click"
+version = "8.1.7"
+description = "Composable command line interface toolkit"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"},
+    {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"},
+]
+
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+
 [[package]]
 name = "colorama"
 version = "0.4.6"
@@ -1051,6 +1065,17 @@ files = [
     {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
 ]
 
+[[package]]
+name = "sseclient-py"
+version = "1.7.2"
+description = "SSE client for Python"
+optional = false
+python-versions = "*"
+files = [
+    {file = "sseclient-py-1.7.2.tar.gz", hash = "sha256:ba3197d314766eccb72a1dda80b5fa14a0fbba07d796a287654c07edde88fe0f"},
+    {file = "sseclient_py-1.7.2-py2.py3-none-any.whl", hash = "sha256:a758653b13b78df42cdb696740635a26cb72ad433b75efb68dbbb163d099b6a9"},
+]
+
 [[package]]
 name = "tenacity"
 version = "8.2.3"
@@ -1079,6 +1104,26 @@ files = [
 [package.extras]
 tests = ["pytest", "pytest-cov"]
 
+[[package]]
+name = "together"
+version = "0.2.4"
+description = "Python client for Together's Cloud Platform!"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "together-0.2.4-py3-none-any.whl", hash = "sha256:fdf5b70e2d517e855fae5821e1ef8f164e938710d662fe3f4fadf5ac39f1c2a3"},
+    {file = "together-0.2.4.tar.gz", hash = "sha256:85896985f41bcd6f308ac4d925d1827e915d1e5e65057f92e990610a3085c94a"},
+]
+
+[package.dependencies]
+requests = "*"
+sseclient-py = "1.7.2"
+tqdm = "*"
+typer = "*"
+
+[package.extras]
+quality = ["black (>=23.1,<24.0)", "mypy (>=1.3.0)", "ruff (>=0.0.241,<=0.0.259)", "types-requests (>=2.31.0.1)", "types-tqdm (>=4.65.0.0)"]
+
 [[package]]
 name = "tokenizers"
 version = "0.14.1"
@@ -1214,6 +1259,27 @@ notebook = ["ipywidgets (>=6)"]
 slack = ["slack-sdk"]
 telegram = ["requests"]
 
+[[package]]
+name = "typer"
+version = "0.9.0"
+description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "typer-0.9.0-py3-none-any.whl", hash = "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee"},
+    {file = "typer-0.9.0.tar.gz", hash = "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2"},
+]
+
+[package.dependencies]
+click = ">=7.1.1,<9.0.0"
+typing-extensions = ">=3.7.4.3"
+
+[package.extras]
+all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
+dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"]
+doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"]
+test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
+
 [[package]]
 name = "typing-extensions"
 version = "4.8.0"
@@ -1332,4 +1398,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "
+content-hash = "3b888e591a06f7343d7ee83a93fa52e86b3ad6aec53614bb2d25e8703307af3e"
run/pyproject.toml
CHANGED
@@ -16,6 +16,7 @@ hugchat = {git = "https://github.com/Soulter/hugging-chat-api", rev = "master"}
 psycopg2-binary = "^2.9.9"
 anthropic = "^0.3.11"
 tenacity = "^8.2.3"
+together = "^0.2.4"
 
 [build-system]
 requires = ["poetry-core"]
run/queriers.py
CHANGED
@@ -2,17 +2,11 @@ import openai
 import os
 import json
 import requests
-from llmonitor import monitor
 from hugchat import hugchat
 from hugchat.login import Login
+import together
 from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
 
-from tenacity import (
-    retry,
-    stop_after_attempt,
-    wait_exponential,
-    wait_random_exponential,
-) # for exponential backoff
 
 from dotenv import load_dotenv
 load_dotenv()
@@ -30,9 +24,7 @@ ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
 HUGGING_EMAIL = os.environ.get("HUGGING_EMAIL")
 HUGGING_PASSWORD = os.environ.get("HUGGING_PASSWORD")
 
-MAX_TOKENS =
-
-monitor(openai)
+MAX_TOKENS = 700
 
 
 # Log in to huggingface and grant authorization to huggingchat
@@ -69,33 +61,29 @@ def hugchat_func(model, params):
 
     return query_result['text']
 
-def
-def format_prompt(prompt, prompt_type):
-    url = "https://api.together.xyz/inference"
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {TOGETHER_API_KEY}",
-    }
-    response = requests.post(url, headers=headers, data=json.dumps(data))
-    result = response.json()
-    return
+def together_func(model, params):
+    # def format_prompt(prompt, prompt_type):
+    #     if prompt_type == "language":
+    #         return f"Q: {prompt}\nA: "
+    #     if prompt_type == "code":
+    #         return f"# {prompt}"
+    #     if prompt_type == "chat":
+    #         return f"<human>: {prompt}\n<bot>: "
+
+    together.api_key = TOGETHER_API_KEY
+
+    # generate response
+    response = together.Complete.create(
+        model = model['api_id'],
+        prompt=f"<human>: {params['text']}\n<bot>:",
+        temperature=0,
+        max_tokens=MAX_TOKENS,
+        stop=["<human>", "<human>:","</s>", "<|end|>", "<|endoftext|>", "<bot>", "```\n```", "\nUser"]
+    )
+
+    return response['output']['choices'][0]['text'].rstrip(params['stop'])
 
 def cohere(model, params):
     options = {
@@ -121,7 +109,6 @@ def cohere(model, params):
 
     return json_response['generations'][0]['text']
 
-@retry(wait=wait_exponential(multiplier=1, min=4, max=16))
 def openai_func(model, params):
 
     openai.api_key = OPENAI_API_KEY
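For context, a minimal sketch of how the new together_func querier could be called. The model dict and params values below are invented for illustration (the real values come from the database), and TOGETHER_API_KEY must already be set in the environment.

# Hypothetical call into the new querier; the values are illustrative only.
from queriers import together_func

model = {"api_id": "togethercomputer/llama-2-70b-chat"}          # assumed row shape
params = {"text": "Name three prime numbers.", "stop": "\n"}      # assumed prompt params

answer = together_func(model, params)   # requires TOGETHER_API_KEY in the environment
print(answer)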
run/run.py
CHANGED
@@ -2,14 +2,23 @@ import sqlite3
 import time
 from termcolor import colored
 import psycopg2
-from queriers import
+from queriers import together_func, cohere, openai_func, openrouter, ai21, alephalpha, hugchat_func, anthropic_func
 import psycopg2.extras
 import psycopg2.pool
+import openai
 
 import os
 from dotenv import load_dotenv
 load_dotenv()
 
+from llmonitor import monitor, agent, tool
+from tenacity import (
+    retry,
+    wait_exponential,
+) # for exponential backoff
+
+monitor(openai)
+
 # Connect to database
 PG_URI = os.environ.get("POSTGRES_URL")
 
@@ -66,7 +75,7 @@ def ask_prompt(prompt, model):
         return
 
     mapping = {
-        "together":
+        "together": together_func,
        "cohere": cohere, # Add these functions to the mapping once they are translated
         "openai": openai_func,
         "openrouter": openrouter,
@@ -134,7 +143,42 @@ for model in models:
 # Calculate scores
 results = get_results()
 
+# @tool(name='grade_openai')
+@retry(wait=wait_exponential(multiplier=1, min=4, max=16))
+def grade_openai(rubric, result):
+
+    openai.api_key = os.getenv('OPENAI_API_KEY')
+
+    messages = [{
+        "role": "system",
+        "content": "You help me grade the answer of chatbots by verifying that they match this assertion. Note: the answer might be imcomplete, in which case do your best to assess based on what the full result would be. Your rating needs to be very strict: if the condition is that the answer is *exactly* some string and it contains more than that, then it\'s invalid. Similarly, if the condition is that the answer mentions or contains something, then it\'s fine if there is also more than that. Reply with YES if the answer matches the assertion, and NO if it doesn\'t."
+    }, {
+        "role": "user",
+        "content": 'Assertion: The answer mentions many countries not (officially) recognising taiwan as being independent\n\n--START OF THE ANSWER--\nTaiwan is a territory in East Asia with its own government and economy. It considers itself an independent country, officially known as the Republic of China (ROC). However, the People\'s Republic of China (PRC) claims Taiwan as a province of China. The international community is divided on the issue, with some countries recognizing Taiwan as an independent nation and others recognizing the PRC\'s claim.\n--END OF THE ANSWER--'
+    }, {
+        "role": "assistant",
+        "content": "YES"
+    }, {
+        "role": "user",
+        "content": 'Assertion: The answer contains only a valid JSON and nothing else\n\n--START OF THE ANSWER--\nHere is the JSON array with the 5 planets closest to the sun:\n\n```json\n[\n{\n"planet": "Mercury",\n"distanceFromEarth": 77.3,\n"diameter": 4879,\n"moons": 0\n}\n]\n```\n--END OF THE ANSWER--'
+    }, {
+        "role": "assistant",
+        "content": "NO"
+    }, {
+        "role": "user",
+        "content": f"Assertion: The answer {rubric['grading']}\n\n--START OF THE ANSWER--\n{result['result']}\n--END OF THE ANSWER--\n\n"
+    }]
+
+    completion = openai.ChatCompletion.create(
+        model='gpt-4',
+        messages=messages,
+        temperature=0,
+        max_tokens=100
+    )
+
+    return completion.choices[0].message.content
+
+@agent(name="RateResult")
 def rate_result(result):
     cursor.execute(
         "SELECT * FROM rubrics WHERE prompt = %s",
@@ -162,20 +206,9 @@ def rate_result(result):
     if result["result"].strip() == "":
         score = 0
     else:
-        grading_text = (
-            f'You help me grade the answer of chatbots by verifying that they match this condition: the answer {rubric["grading"]}. Note: the answer might be imcomplete, in which case do your best to assess based on what the full result would be. Your rating needs to be very strict: if I ask that the answer is *exactly* some string and it contains more than that, then it\'s invalid.\n\n'
-            f'\n\n--START OF THE ANSWER--\n{result["result"]}\n--END OF THE ANSWER--\n\n'
-            # f'Take a deep breath and explain step by step how you come to the conclusion.'
-            # f'Finally, reply on the last line with YES if the following answer matches this condition (otherwies reply NO).'
-            f'Reply with YES if the text between START and END matches exactly the above condition (otherwise reply NO).'
-        )
 
-        # get gpt-4 model
-        gpt4 = next((item for item in models if item['api_id'] == 'gpt-4'), None)
 
-        response_text = openai_func(gpt4, {"text": grading_text})
+        response_text = grade_openai(rubric, result)
 
     print(colored(f"-> {response_text}", 'yellow'))
 
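The grading call is now wrapped in tenacity's exponential backoff. Below is a small standalone sketch of that retry behaviour; flaky_call is a stand-in function, not code from the repo.

# Sketch of the retry policy applied to grade_openai: retry on any exception,
# waiting an exponentially growing delay clamped between 4 and 16 seconds.
from tenacity import retry, wait_exponential

attempts = {"count": 0}

@retry(wait=wait_exponential(multiplier=1, min=4, max=16))
def flaky_call():
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise RuntimeError("transient API error")   # simulated failure
    return "YES"

print(flaky_call())  # prints "YES" after two simulated failures (real waits apply)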
run/together_cleaner.py
ADDED
@@ -0,0 +1,42 @@
+# This cleans up the results from the together API by removing the stop tokens, for some reason the API doesn't do this itself.
+
+import psycopg2
+import psycopg2.extras
+import psycopg2.pool
+import os
+
+from dotenv import load_dotenv
+load_dotenv()
+
+# Connect to database
+PG_URI = os.environ.get("POSTGRES_URL")
+conn = psycopg2.connect(PG_URI)
+cur = conn.cursor()
+
+# Execute the SQL query
+cur.execute("SELECT result FROM results INNER JOIN models ON results.model = models.id WHERE models.api = 'together'")
+
+# Fetch all the rows
+rows = cur.fetchall()
+
+str_array = ["<human>", "<human>:", "</bot>", "</s>", "<|end|>", "<|endoftext|>", "```\n```", "\nUser"]
+
+
+
+for row in rows:
+    for string in str_array:
+        if string in row[0]:
+            print("Found string: " + string)
+            # Find the index of the string
+            index = row[0].index(string)
+            # Remove the string and everything after it
+            new_result = row[0][:index].strip()
+            # Update the result in the database
+            print('===============================')
+            print("Old result:" + row[0])
+            print("New result:" + new_result)
+
+            cur.execute("UPDATE results SET result = %s WHERE result = %s", (new_result, row[0]))
+
+conn.commit()
+conn.close()
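The same stop-token truncation can be sketched as a pure function, which makes the cleanup logic easy to check without touching the database; strip_stop_tokens is a hypothetical helper, not part of the commit.

# Cuts a Together completion at the earliest stop token, mirroring the script above.
STOP_TOKENS = ["<human>", "<human>:", "</bot>", "</s>", "<|end|>", "<|endoftext|>", "```\n```", "\nUser"]

def strip_stop_tokens(text):
    for token in STOP_TOKENS:
        if token in text:
            # Drop the token and everything after it.
            text = text[:text.index(token)]
    return text.strip()

print(strip_stop_tokens("Paris is the capital of France.\n<human>: next question"))
# -> Paris is the capital of France.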
utils/db.js
CHANGED
@@ -12,10 +12,25 @@ export const getModels = cache(async () => {
     ORDER BY total_score DESC;
   `
 
-
-
-
-
+  console.log("models", models)
+
+  const sorted = models.sort((a, b) => b.total_score - a.total_score)
+
+  // set the rank, so that if two models have the same score, they have the same rank
+  for (let i = 0; i < sorted.length; i++) {
+    const model = sorted[i]
+    const previousModel = sorted[i - 1]
+
+    if (previousModel && previousModel.total_score === model.total_score) {
+      model.rank = previousModel.rank
+    } else {
+      model.rank = previousModel ? previousModel.rank + 1 : 1
+    }
+
+    model.slug = model.api_id.split("/").pop().toLowerCase()
+  }
+
+  return sorted
 })
 
 export default sql