File size: 1,865 Bytes
97826e4
7e5cb25
 
 
 
 
97826e4
7e5cb25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97826e4
7e5cb25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import db, { getModels } from "@/utils/db"
import Link from "next/link"

export default async function Leaderboard() {
  const [potentialPoints] = await db`SELECT SUM(points) as total FROM rubrics`

  const models = await getModels()
  return (
    <>
      <p>
        Traditional LLMs benchmarks have drawbacks: they quickly become part of
        training datasets and are hard to relate-to in terms of real-world
        use-cases.
      </p>
      <p>
        I made this as an experiment to address these issues. Here the dataset
        is dynamic (changes every week) and composed of crowdsourced real-world
        prompts.
      </p>
      <p>
        We then use GPT-4 to grade each model's response against a set of
        rubrics (more details on the about page). The prompt dataset is easily
        explorable as the score is only 1 dimension.
      </p>
      <p>
        The results are stored in Postgres database and those are the raw
        results.
      </p>

      <br />
      <table style={{ maxWidth: 600 }}>
        <thead>
          <tr>
            <th width={70}>Rank</th>
            <th width={250}>Model</th>
            <th>Score</th>
            <th>Results</th>
          </tr>
        </thead>
        <tbody>
          {models
            .filter((s) => s.total_score)
            .map((model, i) => (
              <tr key={i}>
                <td>{model.rank}</td>
                <td>{model.name}</td>
                <td>
                  {parseInt((model.total_score / potentialPoints.total) * 100)}
                </td>
                <td>
                  <Link
                    href={`/${model.api_id.split("/").pop().toLowerCase()}`}
                  >
                    view
                  </Link>
                </td>
              </tr>
            ))}
        </tbody>
      </table>
    </>
  )
}