vincelwt committed
Commit 19513c9
1 Parent(s): a4ef64a

add llmonitor & start scoring

Files changed (5)
  1. pages/index.js +22 -16
  2. run/database.db +0 -0
  3. run/queriers.py +4 -0
  4. run/requirements.txt +2 -1
  5. run/run.py +86 -9
pages/index.js CHANGED
@@ -47,7 +47,7 @@ export default function Home({ prompts, models }) {
         <meta name="viewport" content="width=device-width, initial-scale=1" />
       </Head>
       <main>
-        <h1>Asking 60+ LLMs a set of 20 questions</h1>
+        <h1>Crowdsourced LLM Benchmark</h1>
         <br />
         <p>
           Benchmarks like HellaSwag are a bit too abstract for me to get a sense
@@ -69,13 +69,13 @@ export default function Home({ prompts, models }) {
         <br />
         <p>
           {`view: `}
+          <a href="#" onClick={() => changeView("model")}>
+            models
+          </a>{" "}
+          /
           <a href="#" onClick={() => changeView("prompt")}>
-            all prompts
+            prompts
           </a>{" "}
-          /{" "}
-          <a href="#" onClick={() => changeView("model")}>
-            all models
-          </a>
         </p>
         <br />
         {viewBy === "prompt" ? (
@@ -103,16 +103,22 @@ export default function Home({ prompts, models }) {
           </>
         ) : (
           <ul>
-            {models.map((model, i) => (
-              <li key={i}>
-                {model.name} -{" "}
-                <Link
-                  href={`/model/${model.api_id.split("/").pop().toLowerCase()}`}
-                >
-                  results
-                </Link>
-              </li>
-            ))}
+            {models
+              .score((s) => s.score)
+              .map((model, i) => (
+                <li key={i}>
+                  {model.name} -{" "}
+                  <Link
+                    href={`/model/${model.api_id
+                      .split("/")
+                      .pop()
+                      .toLowerCase()}`}
+                  >
+                    results
+                  </Link>{" "}
+                  - score: {model.score}
+                </li>
+              ))}
           </ul>
         )}
         <br />
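Editor's note: the reworked model view lists each model with a score, presumably ordered by it (the .score(...) call reads like a work-in-progress sort). As a rough Python sketch of where those per-model totals could come from on the data side; the score column on results and the ./run/database.db path are assumptions rather than anything this commit shows:

# Hypothetical helper (not part of this commit): aggregate per-model score
# totals from the results table so the "models" view can display them sorted.
# Assumes results has model and score columns, as implied by run/run.py below.
import sqlite3

def scores_by_model(db_path="./run/database.db"):
    db = sqlite3.connect(db_path)
    db.row_factory = sqlite3.Row
    rows = db.execute(
        "SELECT model, SUM(COALESCE(score, 0)) AS total_score "
        "FROM results GROUP BY model ORDER BY total_score DESC"
    ).fetchall()
    db.close()
    return [dict(row) for row in rows]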
run/database.db CHANGED
Binary files a/run/database.db and b/run/database.db differ
 
run/queriers.py CHANGED
@@ -4,6 +4,8 @@ import json
 import requests
 from dotenv import load_dotenv
 
+from llmonitor import monitor
+
 load_dotenv()
 
 TOGETHER_API_KEY = os.getenv('TOGETHER_API_KEY')
@@ -15,6 +17,8 @@ OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
 
 MAX_TOKENS = 300
 
+monitor(openai)
+
 def together(model, params):
     def format_prompt(prompt, prompt_type):
        if prompt_type == "language":
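Editor's note: the only functional change here is wrapping the openai module with llmonitor's monitor() so calls made through it are traced. A minimal usage sketch under assumptions: the app ID is read from an LLMONITOR_APP_ID environment variable, and the pre-1.0 openai client with ChatCompletion is in use (neither is confirmed by this diff):

# Minimal sketch of the llmonitor wrapping pattern shown above.
# Assumptions: LLMONITOR_APP_ID is set in the environment for llmonitor,
# and the legacy (pre-1.0) openai.ChatCompletion API is available.
import os
import openai
from llmonitor import monitor

openai.api_key = os.getenv("OPENAI_API_KEY")
monitor(openai)  # patches the module so each call is reported to llmonitor

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hello"}],
)
print(response.choices[0].message.content)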
run/requirements.txt CHANGED
@@ -2,4 +2,5 @@ openai
 pandas
 requests
 python-dotenv
-gradio
+gradio
+llmonitor
run/run.py CHANGED
@@ -1,6 +1,7 @@
 import sqlite3
 import time
-
+from termcolor import colored
+from llmonitor import agent
 from queriers import together, cohere, openai_func, openrouter, ai21, alephalpha
 
 db = sqlite3.connect("./database.db")
@@ -22,6 +23,12 @@ models = [dict(model) for model in models]
 prompts = cursor.execute("SELECT * FROM prompts").fetchall()
 prompts = [dict(prompt) for prompt in prompts]
 
+
+def get_results():
+    results = cursor.execute("SELECT * FROM results").fetchall()
+    print(results[0].keys())
+    return [dict(result) for result in results]
+
 def insert_result(modelId, promptId, result, duration, rate):
     cursor.execute(
         "INSERT INTO results (model, prompt, result, duration, rate) VALUES (?, ?, ?, ?, ?)",
@@ -89,15 +96,85 @@ def ask_prompt(prompt, model):
 total_benchmarks = len(models) * len(prompts)
 print(f"Running {total_benchmarks} benchmarks")
 
+# # Run prompts
+# for model in models:
+#     if model["type"] == "language":
+#         continue
+#     for prompt in prompts:
+#         if prompt["type"] != "code" and model["type"] == "code":
+#             print("Skipping non-code benchmark for code model")
+#             continue
+
+#         ask_prompt(prompt, model)
+
+# Calculate scores
+results = get_results()
+
+@agent(name="RateResult")
+def rate_result(result):
+    rubrics = cursor.execute(
+        "SELECT * FROM rubrics WHERE prompt = ?",
+        (result["prompt"],)
+    ).fetchall()
+
+    has_rubrics = len(rubrics) > 0
+
+    if not has_rubrics:
+        return
+
+    print(colored('---------------------------', 'white'))
+    print(colored('----------RATING-----------', 'white'))
+    print(colored('---------------------------', 'white'))
+    print(colored(result["result"], 'cyan'))
+    print(colored('---------------------------', 'white'))
+
+    score = None
+
+    for rubric in rubrics:
+
+        print('Rubric: '+colored(rubric["grading"], 'magenta'))
+
+        if result["result"].strip() == "":
+            score = 0
+        else:
+            grading_text = (
+                f'You help verify that the following answer matches this condition: the answer {rubric["grading"]}. Note: the answer might be incomplete, in which case do your best to assess based on what the full result would be.\n\n'
+                f'\n\n--START OF THE ANSWER--\n{result["result"]}\n--END OF THE ANSWER--\n\n'
+                f'Take a deep breath and explain step by step how you come to the conclusion.'
+                f'Finally, reply on the last line with YES if the following answer matches this condition (otherwise reply NO).'
+            )
+
+            # get gpt-4 model
+            gpt4 = next((item for item in models if item['api_id'] == 'gpt-4'), None)
+
+            prompt = { }
+
+            response_text = openai_func(gpt4, {"text": grading_text})
+
+            print(colored(f"-> {response_text}", 'yellow'))
+
+            last_line = response_text.splitlines()[-1]
+
+            # If it includes a yes, then it's valid
+            if "YES" in last_line:
+                print(colored(f'Valid! + {rubric["points"]} points', 'green'))
+                score = rubric["points"] if score is None else score + rubric["points"]
+
+    print('Final score: '+colored(score, 'cyan'))
+
+    return score
+
+
 
-for model in models:
-    if model["type"] == "language":
-        continue
-    for prompt in prompts:
-        if prompt["type"] != "code" and model["type"] == "code":
-            print("Skipping non-code benchmark for code model")
-            continue
+for result in results:
+    if not result["score"]:
+        score = rate_result(result)
 
-        ask_prompt(prompt, model)
+        if score is not None:
+            cursor.execute(
+                "UPDATE results SET score = ? WHERE id == ?",
+                (score, result["id"])
+            )
+            db.commit()
 
 db.close()
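Editor's note: the new scoring pass reads a rubrics table and writes a score column on results, neither of which is visible in the binary database.db. A sketch of a schema that would satisfy those queries; the column names are taken from the code above, while the types and constraints are assumptions:

# Schema sketch inferred from the queries in run/run.py; the real database.db
# may differ (types, keys, and constraints here are assumptions).
import sqlite3

db = sqlite3.connect("./database.db")
db.executescript(
    """
    CREATE TABLE IF NOT EXISTS rubrics (
        id      INTEGER PRIMARY KEY,
        prompt  INTEGER NOT NULL,   -- prompts row this rubric grades
        grading TEXT NOT NULL,      -- natural-language condition given to GPT-4
        points  INTEGER NOT NULL    -- points awarded when the condition holds
    );
    """
)
# One-time migration for the score column; ignore the error if it already exists.
try:
    db.execute("ALTER TABLE results ADD COLUMN score INTEGER")
except sqlite3.OperationalError:
    pass
db.commit()
db.close()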