import db, { getModels } from "@/utils/db"
import Link from "next/link"
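
// Server component: fetches the ranked models and the maximum attainable
// rubric points, then renders each model's score as a percentage of that max.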
export default async function Leaderboard() {
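  // Total points attainable across all rubrics: the 100% baseline for scores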
  const [potentialPoints] = await db`SELECT SUM(points) AS total FROM rubrics`
  const models = await getModels()

  return (
    <>
      <p>
        Traditional LLM benchmarks have drawbacks: they quickly become part of
        training datasets and are hard to relate to in terms of real-world use
        cases.
      </p>
      <p>
        I made this as an experiment to address these issues. Here, the dataset
        is dynamic (it changes every week) and is composed of crowdsourced
        real-world prompts.
      </p>
      <p>
        We then use GPT-4 to grade each model's response against a set of
        rubrics (more details on the about page). The prompt dataset is easily
        explorable.
      </p>
      <p>
        Everything is then stored in a Postgres database, and this page shows
        the raw results.
      </p>
      <br />
      <table style={{ maxWidth: 600 }}>
        <thead>
          <tr>
            <th style={{ width: 70 }}>Rank</th>
            <th style={{ width: 250 }}>Model</th>
            <th>Score</th>
            <th>Results</th>
          </tr>
        </thead>
        <tbody>
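          {/* Show only models that have at least one graded response */}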
          {models
            .filter((model) => model.total_score)
            .map((model) => (
              <tr key={model.api_id}>
                <td>{model.rank}</td>
                <td>{model.name}</td>
                <td>
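                  {/* Score as a percentage of the total attainable rubric points */}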
                  {Math.floor((model.total_score / potentialPoints.total) * 100)}
                </td>
                <td>
                  <Link href={`/${model.api_id.split("/").pop().toLowerCase()}`}>
                    view
                  </Link>
                </td>
              </tr>
            ))}
        </tbody>
      </table>
    </>
  )
}
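
// Neither "@/utils/db" nor the grading pipeline is part of this file. Below is
// a minimal sketch of what the db module might export, assuming the
// postgres.js client and hypothetical table/column names ("results", "score"):
//
//   import postgres from "postgres"
//
//   const db = postgres(process.env.DATABASE_URL)
//
//   // Rank models by total points earned across all graded responses.
//   export async function getModels() {
//     return db`
//       SELECT name, api_id, SUM(score) AS total_score,
//              RANK() OVER (ORDER BY SUM(score) DESC) AS rank
//       FROM results
//       GROUP BY name, api_id
//       ORDER BY rank`
//   }
//
//   export default db
//
// And a rough sketch of the GPT-4 grading step described in the page copy,
// using the official openai Node client; the prompt wording and the
// one-rubric-per-call shape are assumptions, not the author's actual pipeline:
//
//   import OpenAI from "openai"
//
//   const openai = new OpenAI()
//
//   async function gradeAgainstRubric(rubric, answer) {
//     const res = await openai.chat.completions.create({
//       model: "gpt-4",
//       messages: [
//         {
//           role: "user",
//           content:
//             `Rubric (worth ${rubric.points} points): ${rubric.text}\n\n` +
//             `Answer:\n${answer}\n\n` +
//             `Reply with only the number of points earned.`,
//         },
//       ],
//     })
//     return Number(res.choices[0].message.content)
//   }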