Fixes and updates
- app/compare/[slugs]/page.js +2 -1
- app/layout.js +13 -1
- app/page.js +3 -10
- run/poetry.lock +67 -1
- run/pyproject.toml +1 -0
- run/queriers.py +21 -34
- run/run.py +48 -15
- run/together_cleaner.py +42 -0
- utils/db.js +19 -4
app/compare/[slugs]/page.js
CHANGED
@@ -1,4 +1,3 @@
-import Link from "next/link"
 import db, { getModels } from "@/utils/db"
 
 export default async function Comparison({ params }) {
@@ -54,9 +53,11 @@ export default async function Comparison({ params }) {
         </td>
         <td>
           <pre>{row.model1?.result?.trim()}</pre>
+          <p>{row.model1 ? `Score: ${row.model1?.score}` : "Not rated"}</p>
         </td>
         <td>
           <pre>{row.model2?.result?.trim()}</pre>
+          <p>{row.model2 ? `Score: ${row.model2?.score}` : "Not rated"}</p>
         </td>
       </tr>
     ))}
app/layout.js
CHANGED
@@ -1,6 +1,7 @@
 import Link from "next/link"
 import "@/styles/globals.css"
 import { Suspense } from "react"
+import PlausibleProvider from "next-plausible"
 
 export const metadata = {
   title: "LLMonitor Benchmarks",
@@ -10,6 +11,17 @@ export const metadata = {
 export default function RootLayout({ children }) {
   return (
     <html lang="en">
+      <head>
+        <PlausibleProvider
+          domain="benchmarks.llmonitor.com"
+          scriptProps={{
+            src: "https://llmonitor.com/p/js/script.js",
+            // @ts-ignore
+            "data-api": "https://llmonitor.com/p/event",
+          }}
+          customDomain="benchmarks.llmonitor.com"
+        />
+      </head>
       <body>
         <main>
           <h1>LLMonitor Benchmarks</h1>
@@ -32,7 +44,7 @@ export default function RootLayout({ children }) {
         <p>
           Credit:{" "}
           <a href="https://twitter.com/vincelwt" target="_blank">
-            @vincelwt
+            @vincelwt\
          </a>{" "}
           /{" "}
           <a href="https://llmonitor.com" target="_blank">
app/page.js
CHANGED
@@ -1,17 +1,10 @@
-import db from "@/utils/db"
+import db, { getModels } from "@/utils/db"
 import Link from "next/link"
 
 export default async function Leaderboard() {
   const [potentialPoints] = await db`SELECT SUM(points) as total FROM rubrics`
 
-  const models = await db`
-    SELECT models.*, SUM(results.score) as total_score
-    FROM models
-    LEFT JOIN results ON models.id = results.model
-    GROUP BY models.id
-    ORDER BY total_score DESC;
-  `
-
+  const models = await getModels()
   return (
     <>
       <p>
@@ -49,7 +42,7 @@ export default async function Leaderboard() {
           .filter((s) => s.total_score)
           .map((model, i) => (
             <tr key={i}>
-              <td>{
+              <td>{model.rank}</td>
               <td>{model.name}</td>
               <td>
                 {parseInt((model.total_score / potentialPoints.total) * 100)}
run/poetry.lock
CHANGED
@@ -312,6 +312,20 @@ files = [
     {file = "charset_normalizer-3.3.0-py3-none-any.whl", hash = "sha256:e46cd37076971c1040fc8c41273a8b3e2c624ce4f2be3f5dfcb7a430c1d3acc2"},
 ]
 
+[[package]]
+name = "click"
+version = "8.1.7"
+description = "Composable command line interface toolkit"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"},
+    {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"},
+]
+
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+
 [[package]]
 name = "colorama"
 version = "0.4.6"
@@ -1051,6 +1065,17 @@ files = [
     {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
 ]
 
+[[package]]
+name = "sseclient-py"
+version = "1.7.2"
+description = "SSE client for Python"
+optional = false
+python-versions = "*"
+files = [
+    {file = "sseclient-py-1.7.2.tar.gz", hash = "sha256:ba3197d314766eccb72a1dda80b5fa14a0fbba07d796a287654c07edde88fe0f"},
+    {file = "sseclient_py-1.7.2-py2.py3-none-any.whl", hash = "sha256:a758653b13b78df42cdb696740635a26cb72ad433b75efb68dbbb163d099b6a9"},
+]
+
 [[package]]
 name = "tenacity"
 version = "8.2.3"
@@ -1079,6 +1104,26 @@ files = [
 [package.extras]
 tests = ["pytest", "pytest-cov"]
 
+[[package]]
+name = "together"
+version = "0.2.4"
+description = "Python client for Together's Cloud Platform!"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "together-0.2.4-py3-none-any.whl", hash = "sha256:fdf5b70e2d517e855fae5821e1ef8f164e938710d662fe3f4fadf5ac39f1c2a3"},
+    {file = "together-0.2.4.tar.gz", hash = "sha256:85896985f41bcd6f308ac4d925d1827e915d1e5e65057f92e990610a3085c94a"},
+]
+
+[package.dependencies]
+requests = "*"
+sseclient-py = "1.7.2"
+tqdm = "*"
+typer = "*"
+
+[package.extras]
+quality = ["black (>=23.1,<24.0)", "mypy (>=1.3.0)", "ruff (>=0.0.241,<=0.0.259)", "types-requests (>=2.31.0.1)", "types-tqdm (>=4.65.0.0)"]
+
 [[package]]
 name = "tokenizers"
 version = "0.14.1"
@@ -1214,6 +1259,27 @@ notebook = ["ipywidgets (>=6)"]
 slack = ["slack-sdk"]
 telegram = ["requests"]
 
+[[package]]
+name = "typer"
+version = "0.9.0"
+description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "typer-0.9.0-py3-none-any.whl", hash = "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee"},
+    {file = "typer-0.9.0.tar.gz", hash = "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2"},
+]
+
+[package.dependencies]
+click = ">=7.1.1,<9.0.0"
+typing-extensions = ">=3.7.4.3"
+
+[package.extras]
+all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
+dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"]
+doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"]
+test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
+
 [[package]]
 name = "typing-extensions"
 version = "4.8.0"
@@ -1332,4 +1398,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "
+content-hash = "3b888e591a06f7343d7ee83a93fa52e86b3ad6aec53614bb2d25e8703307af3e"
run/pyproject.toml
CHANGED
@@ -16,6 +16,7 @@ hugchat = {git = "https://github.com/Soulter/hugging-chat-api", rev = "master"}
 psycopg2-binary = "^2.9.9"
 anthropic = "^0.3.11"
 tenacity = "^8.2.3"
+together = "^0.2.4"
 
 [build-system]
 requires = ["poetry-core"]
run/queriers.py
CHANGED
@@ -2,17 +2,11 @@ import openai
 import os
 import json
 import requests
-from llmonitor import monitor
 from hugchat import hugchat
 from hugchat.login import Login
+import together
 from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
 
-from tenacity import (
-    retry,
-    stop_after_attempt,
-    wait_exponential,
-    wait_random_exponential,
-) # for exponential backoff
 
 from dotenv import load_dotenv
 load_dotenv()
@@ -30,9 +24,7 @@ ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
 HUGGING_EMAIL = os.environ.get("HUGGING_EMAIL")
 HUGGING_PASSWORD = os.environ.get("HUGGING_PASSWORD")
 
-MAX_TOKENS =
-
-monitor(openai)
+MAX_TOKENS = 700
 
 
 # Log in to huggingface and grant authorization to huggingchat
@@ -69,33 +61,29 @@ def hugchat_func(model, params):
 
     return query_result['text']
 
-def
-def format_prompt(prompt, prompt_type):
-    url = "https://api.together.xyz/inference"
-    headers = {
-        "Content-Type": "application/json",
-        "Authorization": f"Bearer {TOGETHER_API_KEY}",
-    }
-    response = requests.post(url, headers=headers, data=json.dumps(data))
-    result = response.json()
-    return
+def together_func(model, params):
+    # def format_prompt(prompt, prompt_type):
+    #     if prompt_type == "language":
+    #         return f"Q: {prompt}\nA: "
+    #     if prompt_type == "code":
+    #         return f"# {prompt}"
+    #     if prompt_type == "chat":
+    #         return f"<human>: {prompt}\n<bot>: "
+
+    together.api_key = TOGETHER_API_KEY
+
+    # generate response
+    response = together.Complete.create(
+        model = model['api_id'],
+        prompt=f"<human>: {params['text']}\n<bot>:",
+        temperature=0,
+        max_tokens=MAX_TOKENS,
+        stop=["<human>", "<human>:","</s>", "<|end|>", "<|endoftext|>", "<bot>", "```\n```", "\nUser"]
+    )
+
+    return response['output']['choices'][0]['text'].rstrip(params['stop'])
 
 def cohere(model, params):
     options = {
@@ -121,7 +109,6 @@ def cohere(model, params):
 
     return json_response['generations'][0]['text']
 
-@retry(wait=wait_exponential(multiplier=1, min=4, max=16))
 def openai_func(model, params):
 
     openai.api_key = OPENAI_API_KEY
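For context, a minimal sketch of how the new together_func querier could be called. The model dict and params values below are invented for illustration (the real values come from the database), and TOGETHER_API_KEY must already be set in the environment.

# Hypothetical call into the new querier; the values are illustrative only.
from queriers import together_func

model = {"api_id": "togethercomputer/llama-2-70b-chat"}          # assumed row shape
params = {"text": "Name three prime numbers.", "stop": "\n"}      # assumed prompt params

answer = together_func(model, params)   # requires TOGETHER_API_KEY in the environment
print(answer)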
run/run.py
CHANGED
@@ -2,14 +2,23 @@ import sqlite3
 import time
 from termcolor import colored
 import psycopg2
-from queriers import
+from queriers import together_func, cohere, openai_func, openrouter, ai21, alephalpha, hugchat_func, anthropic_func
 import psycopg2.extras
 import psycopg2.pool
+import openai
 
 import os
 from dotenv import load_dotenv
 load_dotenv()
 
+from llmonitor import monitor, agent, tool
+from tenacity import (
+    retry,
+    wait_exponential,
+) # for exponential backoff
+
+monitor(openai)
+
 # Connect to database
 PG_URI = os.environ.get("POSTGRES_URL")
 
@@ -66,7 +75,7 @@ def ask_prompt(prompt, model):
         return
 
     mapping = {
-        "together":
+        "together": together_func,
        "cohere": cohere, # Add these functions to the mapping once they are translated
         "openai": openai_func,
         "openrouter": openrouter,
@@ -134,7 +143,42 @@ for model in models:
 # Calculate scores
 results = get_results()
 
+# @tool(name='grade_openai')
+@retry(wait=wait_exponential(multiplier=1, min=4, max=16))
+def grade_openai(rubric, result):
+
+    openai.api_key = os.getenv('OPENAI_API_KEY')
+
+    messages = [{
+        "role": "system",
+        "content": "You help me grade the answer of chatbots by verifying that they match this assertion. Note: the answer might be imcomplete, in which case do your best to assess based on what the full result would be. Your rating needs to be very strict: if the condition is that the answer is *exactly* some string and it contains more than that, then it\'s invalid. Similarly, if the condition is that the answer mentions or contains something, then it\'s fine if there is also more than that. Reply with YES if the answer matches the assertion, and NO if it doesn\'t."
+    }, {
+        "role": "user",
+        "content": 'Assertion: The answer mentions many countries not (officially) recognising taiwan as being independent\n\n--START OF THE ANSWER--\nTaiwan is a territory in East Asia with its own government and economy. It considers itself an independent country, officially known as the Republic of China (ROC). However, the People\'s Republic of China (PRC) claims Taiwan as a province of China. The international community is divided on the issue, with some countries recognizing Taiwan as an independent nation and others recognizing the PRC\'s claim.\n--END OF THE ANSWER--'
+    }, {
+        "role": "assistant",
+        "content": "YES"
+    }, {
+        "role": "user",
+        "content": 'Assertion: The answer contains only a valid JSON and nothing else\n\n--START OF THE ANSWER--\nHere is the JSON array with the 5 planets closest to the sun:\n\n```json\n[\n{\n"planet": "Mercury",\n"distanceFromEarth": 77.3,\n"diameter": 4879,\n"moons": 0\n}\n]\n```\n--END OF THE ANSWER--'
+    }, {
+        "role": "assistant",
+        "content": "NO"
+    }, {
+        "role": "user",
+        "content": f"Assertion: The answer {rubric['grading']}\n\n--START OF THE ANSWER--\n{result['result']}\n--END OF THE ANSWER--\n\n"
+    }]
+
+    completion = openai.ChatCompletion.create(
+        model='gpt-4',
+        messages=messages,
+        temperature=0,
+        max_tokens=100
+    )
+
+    return completion.choices[0].message.content
+
+@agent(name="RateResult")
 def rate_result(result):
     cursor.execute(
         "SELECT * FROM rubrics WHERE prompt = %s",
@@ -162,20 +206,9 @@ def rate_result(result):
     if result["result"].strip() == "":
         score = 0
     else:
-        grading_text = (
-            f'You help me grade the answer of chatbots by verifying that they match this condition: the answer {rubric["grading"]}. Note: the answer might be imcomplete, in which case do your best to assess based on what the full result would be. Your rating needs to be very strict: if I ask that the answer is *exactly* some string and it contains more than that, then it\'s invalid.\n\n'
-            f'\n\n--START OF THE ANSWER--\n{result["result"]}\n--END OF THE ANSWER--\n\n'
-            # f'Take a deep breath and explain step by step how you come to the conclusion.'
-            # f'Finally, reply on the last line with YES if the following answer matches this condition (otherwies reply NO).'
-            f'Reply with YES if the text between START and END matches exactly the above condition (otherwise reply NO).'
-        )
 
-        # get gpt-4 model
-        gpt4 = next((item for item in models if item['api_id'] == 'gpt-4'), None)
 
-        response_text = openai_func(gpt4, {"text": grading_text})
+        response_text = grade_openai(rubric, result)
 
     print(colored(f"-> {response_text}", 'yellow'))
 
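The grading call is now wrapped in tenacity's exponential backoff. Below is a small standalone sketch of that retry behaviour; flaky_call is a stand-in function, not code from the repo.

# Sketch of the retry policy applied to grade_openai: retry on any exception,
# waiting an exponentially growing delay clamped between 4 and 16 seconds.
from tenacity import retry, wait_exponential

attempts = {"count": 0}

@retry(wait=wait_exponential(multiplier=1, min=4, max=16))
def flaky_call():
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise RuntimeError("transient API error")   # simulated failure
    return "YES"

print(flaky_call())  # prints "YES" after two simulated failures (real waits apply)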
run/together_cleaner.py
ADDED
@@ -0,0 +1,42 @@
+# This cleans up the results from the together API by removing the stop tokens, for some reason the API doesn't do this itself.
+
+import psycopg2
+import psycopg2.extras
+import psycopg2.pool
+import os
+
+from dotenv import load_dotenv
+load_dotenv()
+
+# Connect to database
+PG_URI = os.environ.get("POSTGRES_URL")
+conn = psycopg2.connect(PG_URI)
+cur = conn.cursor()
+
+# Execute the SQL query
+cur.execute("SELECT result FROM results INNER JOIN models ON results.model = models.id WHERE models.api = 'together'")
+
+# Fetch all the rows
+rows = cur.fetchall()
+
+str_array = ["<human>", "<human>:", "</bot>", "</s>", "<|end|>", "<|endoftext|>", "```\n```", "\nUser"]
+
+
+
+for row in rows:
+    for string in str_array:
+        if string in row[0]:
+            print("Found string: " + string)
+            # Find the index of the string
+            index = row[0].index(string)
+            # Remove the string and everything after it
+            new_result = row[0][:index].strip()
+            # Update the result in the database
+            print('===============================')
+            print("Old result:" + row[0])
+            print("New result:" + new_result)
+
+            cur.execute("UPDATE results SET result = %s WHERE result = %s", (new_result, row[0]))
+
+conn.commit()
+conn.close()
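The same stop-token truncation can be sketched as a pure function, which makes the cleanup logic easy to check without touching the database; strip_stop_tokens is a hypothetical helper, not part of the commit.

# Cuts a Together completion at the earliest stop token, mirroring the script above.
STOP_TOKENS = ["<human>", "<human>:", "</bot>", "</s>", "<|end|>", "<|endoftext|>", "```\n```", "\nUser"]

def strip_stop_tokens(text):
    for token in STOP_TOKENS:
        if token in text:
            # Drop the token and everything after it.
            text = text[:text.index(token)]
    return text.strip()

print(strip_stop_tokens("Paris is the capital of France.\n<human>: next question"))
# -> Paris is the capital of France.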
utils/db.js
CHANGED
@@ -12,10 +12,25 @@ export const getModels = cache(async () => {
     ORDER BY total_score DESC;
   `
 
-
-
-
-
+  console.log("models", models)
+
+  const sorted = models.sort((a, b) => b.total_score - a.total_score)
+
+  // set the rank, so that if two models have the same score, they have the same rank
+  for (let i = 0; i < sorted.length; i++) {
+    const model = sorted[i]
+    const previousModel = sorted[i - 1]
+
+    if (previousModel && previousModel.total_score === model.total_score) {
+      model.rank = previousModel.rank
+    } else {
+      model.rank = previousModel ? previousModel.rank + 1 : 1
+    }
+
+    model.slug = model.api_id.split("/").pop().toLowerCase()
+  }
+
+  return sorted
 })
 
 export default sql