	typos
app/page.js (+5 -5)
@@ -9,22 +9,22 @@ export default async function Leaderboard() {
       <>
         <p>
           Traditional LLMs benchmarks have drawbacks: they quickly become part of
-          training datasets and are hard to relate
+          training datasets and are hard to relate to in terms of real-world
           use-cases.
         </p>
         <p>
-          I made this as an experiment to address these issues. Here the dataset
+          I made this as an experiment to address these issues. Here, the dataset
           is dynamic (changes every week) and composed of crowdsourced real-world
           prompts.
         </p>
         <p>
           We then use GPT-4 to grade each model's response against a set of
           rubrics (more details on the about page). The prompt dataset is easily
-          explorable
+          explorable.
         </p>
         <p>
-
-          results.
+          Everything is then stored in a Postgres database and this page shows the
+          raw results.
         </p>
 
         <br />
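
The copy edited above describes the Space's pipeline at a high level: GPT-4 grades each model's response against a set of rubrics, the results land in a Postgres database, and this page reads the raw results back out. As a rough illustration only, not code from this repo, an async Next.js App Router server component like app/page.js could fetch those results along these lines; the pg client, the DATABASE_URL variable, the "grades" table, and its columns are assumptions made for the sketch:

// Hypothetical sketch: reading aggregated results from Postgres in an async
// server component. DATABASE_URL, the "grades" table, and the model/score
// columns are placeholders, not this Space's actual schema.
import { Pool } from "pg";

const pool = new Pool({ connectionString: process.env.DATABASE_URL });

export default async function Leaderboard() {
  // One row per model with its average rubric score, best first.
  const { rows } = await pool.query(
    "SELECT model, AVG(score) AS avg_score FROM grades GROUP BY model ORDER BY avg_score DESC"
  );

  return (
    <>
      <p>Intro copy from the diff above goes here.</p>
      <table>
        <tbody>
          {rows.map((row) => (
            <tr key={row.model}>
              <td>{row.model}</td>
              <td>{Number(row.avg_score).toFixed(2)}</td>
            </tr>
          ))}
        </tbody>
      </table>
    </>
  );
}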