```jsx
import db, { getModels } from "@/utils/db"
import Link from "next/link"

export default async function Leaderboard() {
  const [potentialPoints] = await db`SELECT SUM(points) as total FROM rubrics`
  const models = await getModels()

  return (
    <>
```
Traditional LLM benchmarks have drawbacks: they quickly become part of training datasets, and they are hard to relate to real-world use cases.
I made this as an experiment to address these issues. Here, the dataset is dynamic (it changes every week) and is composed of crowdsourced real-world prompts.
We then use GPT-4 to grade each model's response against a set of rubrics (more details on the about page). The prompt dataset is easy to explore.
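To make the grading step concrete, here is a minimal sketch of what one grading call could look like, assuming the `openai` Node client and a rubric object with `grading` instructions and a `points` value; the exact prompt wording and rubric format are assumptions, not taken from this page:

```js
// Minimal sketch of a single grading call.
// Assumptions: the `openai` Node client, a `rubric` object with `grading`
// instructions and a `points` value, and a model `answer` string.
import OpenAI from "openai"

const openai = new OpenAI()

async function gradeAnswer(rubric, prompt, answer) {
  const res = await openai.chat.completions.create({
    model: "gpt-4",
    messages: [
      {
        role: "system",
        content: "You are a strict grader. Reply PASS or FAIL only, based on the rubric.",
      },
      {
        role: "user",
        content: `Prompt: ${prompt}\nRubric: ${rubric.grading}\nAnswer: ${answer}`,
      },
    ],
  })

  // Award the rubric's points only when the grader replies PASS.
  const verdict = res.choices[0].message.content.trim().toUpperCase()
  return verdict.startsWith("PASS") ? rubric.points : 0
}
```

In practice the grader's verdict would be parsed more defensively, but a PASS/FAIL check keeps the sketch short.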
Everything is stored in a Postgres database, and this page shows the raw results.
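For context on how the raw results could turn into the table below, here is a plausible sketch of the `getModels` helper from `@/utils/db`, assuming a postgres.js client and a hypothetical `results` table with `model` and `score` columns; only `total_score` and `rank` are names that actually appear on this page:

```js
// Plausible sketch of the getModels helper in @/utils/db.
// Assumptions: a postgres.js client and a `results` table with `model` and
// `score` columns; only `total_score` and `rank` come from the page itself.
import postgres from "postgres"

const db = postgres(process.env.DATABASE_URL)

export async function getModels() {
  return db`
    SELECT
      model AS name,
      SUM(score) AS total_score,
      RANK() OVER (ORDER BY SUM(score) DESC) AS rank
    FROM results
    GROUP BY model
    ORDER BY total_score DESC
  `
}
```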
| Rank | Model | Score (%) | Results |
|------|-------|-----------|---------|
| {model.rank} | {model.name} | {parseInt((model.total_score / potentialPoints.total) * 100)} | view |
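Each row of the table is rendered once per model. Purely for illustration, here is how that row template might be wired up, factored into a standalone component; the `Link` href pattern and the `key` prop are assumptions rather than something shown in the source:

```jsx
// Sketch of how each table row above is likely rendered inside the component's <tbody>.
// The Link href pattern and the key prop are assumptions; the score expression is from the page.
import Link from "next/link"

function LeaderboardRows({ models, potentialPoints }) {
  return models.map((model) => (
    <tr key={model.name}>
      <td>{model.rank}</td>
      <td>{model.name}</td>
      {/* Score: percentage of the total points available across all rubrics */}
      <td>{parseInt((model.total_score / potentialPoints.total) * 100)}</td>
      <td>
        <Link href={`/model/${model.name}`}>view</Link>
      </td>
    </tr>
  ))
}
```

The score cell divides a model's summed points by the total points available across all rubrics (the `potentialPoints` query at the top of the page), so it reads as a percentage of the maximum possible score.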