Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	
		burtenshaw
		
	committed on
		
		
					Commit 
							
							·
						
						1c7c01e
	
1
								Parent(s):
							
							b5eec3d
								
update app to use lighteval format
Browse files
    	
        app.py
    CHANGED
    
    | 
         @@ -9,7 +9,7 @@ from datasets import load_dataset 
     | 
|
| 9 | 
         
             
            abs_path = Path(__file__).parent
         
     | 
| 10 | 
         
             
            submissions = json.load(open(abs_path / "submissions.json"))
         
     | 
| 11 | 
         | 
| 12 | 
         
            -
            TASKS = [" 
     | 
| 13 | 
         
             
            TYPES = [
         
     | 
| 14 | 
         
             
                "markdown",
         
     | 
| 15 | 
         
             
                "markdown",
         
     | 
| 
         @@ -21,14 +21,45 @@ COLUMNS = ["User", "Model Name", "MMLU", "Average ⬆️", "Results"] 
     | 
|
| 21 | 
         
             
            WIDTHS = ["25%", "25%", "15%", "15%", "10%"]
         
     | 
| 22 | 
         | 
| 23 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 24 | 
         
             
            def load_submissions():
         
     | 
| 25 | 
         
             
                leaderboard = []
         
     | 
| 26 | 
         
            -
             
     | 
| 27 | 
         
             
                for submission in submissions["submissions"]:
         
     | 
| 28 | 
         
             
                    ds = load_dataset(submission["results-dataset"], "results")
         
     | 
| 29 | 
         
            -
                    ds = ds.filter(lambda x: x["task"] in TASKS)
         
     | 
| 30 | 
         | 
| 31 | 
         
            -
                     
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 32 | 
         | 
| 33 | 
         
             
                    leaderboard_row = {}
         
     | 
| 34 | 
         | 
| 
         @@ -40,11 +71,12 @@ def load_submissions(): 
     | 
|
| 40 | 
         
             
                        f"[{submission['model_name']}](https://huggingface.co/{submission['model_name']})"
         
     | 
| 41 | 
         
             
                    )
         
     | 
| 42 | 
         | 
| 43 | 
         
            -
                    for result in  
     | 
| 44 | 
         
            -
                        leaderboard_row[ 
     | 
| 45 | 
         
            -
                        all_accuracy.append(result["accuracy"])
         
     | 
| 46 | 
         | 
| 47 | 
         
            -
                    leaderboard_row["Average ⬆️"] = sum( 
     | 
| 
         | 
|
| 
         | 
|
| 48 | 
         | 
| 49 | 
         
             
                    leaderboard_row["results-dataset"] = (
         
     | 
| 50 | 
         
             
                        f"[🔗](https://huggingface.co/datasets/{submission['results-dataset']})"
         
     | 
| 
         | 
|
| 9 | 
         
             
            abs_path = Path(__file__).parent
         
     | 
| 10 | 
         
             
            submissions = json.load(open(abs_path / "submissions.json"))
         
     | 
| 11 | 
         | 
| 12 | 
         
            +
            TASKS = [("gsm8k", "lighteval|gsm8k|0", "extractive_match")]
         
     | 
| 13 | 
         
             
            TYPES = [
         
     | 
| 14 | 
         
             
                "markdown",
         
     | 
| 15 | 
         
             
                "markdown",
         
     | 
| 
         | 
|
| 21 | 
         
             
            WIDTHS = ["25%", "25%", "15%", "15%", "10%"]
         
     | 
| 22 | 
         | 
| 23 | 
         | 
| 24 | 
         
            +
            def load_results(dataset):
         
     | 
| 25 | 
         
            +
                results = []
         
     | 
| 26 | 
         
            +
             
     | 
| 27 | 
         
            +
                try:
         
     | 
| 28 | 
         
            +
                    output = dataset["latest"]["results"]
         
     | 
| 29 | 
         
            +
                    output = output[-1]
         
     | 
| 30 | 
         
            +
                except KeyError as e:
         
     | 
| 31 | 
         
            +
                    raise ValueError("Cannot find 'latest' key in the dataset")
         
     | 
| 32 | 
         
            +
             
     | 
| 33 | 
         
            +
                try:
         
     | 
| 34 | 
         
            +
                    output = json.loads(output)
         
     | 
| 35 | 
         
            +
                except ValueError as e:
         
     | 
| 36 | 
         
            +
                    raise ValueError("Cannot parse the output as JSON")
         
     | 
| 37 | 
         
            +
             
     | 
| 38 | 
         
            +
                for name, task, metric in TASKS:
         
     | 
| 39 | 
         
            +
                    try:
         
     | 
| 40 | 
         
            +
                        output = output[task]
         
     | 
| 41 | 
         
            +
                    except KeyError as e:
         
     | 
| 42 | 
         
            +
                        raise ValueError(f"Cannot find '{task}' key in the dataset")
         
     | 
| 43 | 
         
            +
             
     | 
| 44 | 
         
            +
                    try:
         
     | 
| 45 | 
         
            +
                        output = (name, output[metric])
         
     | 
| 46 | 
         
            +
                    except KeyError as e:
         
     | 
| 47 | 
         
            +
                        raise ValueError("Cannot find 'extractive_match' key in the dataset")
         
     | 
| 48 | 
         
            +
             
     | 
| 49 | 
         
            +
                    results.append(output)
         
     | 
| 50 | 
         
            +
             
     | 
| 51 | 
         
            +
                return results
         
     | 
| 52 | 
         
            +
             
     | 
| 53 | 
         
            +
             
     | 
| 54 | 
         
             
            def load_submissions():
         
     | 
| 55 | 
         
             
                leaderboard = []
         
     | 
| 
         | 
|
| 56 | 
         
             
                for submission in submissions["submissions"]:
         
     | 
| 57 | 
         
             
                    ds = load_dataset(submission["results-dataset"], "results")
         
     | 
| 
         | 
|
| 58 | 
         | 
| 59 | 
         
            +
                    try:
         
     | 
| 60 | 
         
            +
                        results = load_results(ds)
         
     | 
| 61 | 
         
            +
                    except ValueError as e:
         
     | 
| 62 | 
         
            +
                        raise ValueError(f"Cannot load results for {ds['results-dataset']}") from e
         
     | 
| 63 | 
         | 
| 64 | 
         
             
                    leaderboard_row = {}
         
     | 
| 65 | 
         | 
| 
         | 
|
| 71 | 
         
             
                        f"[{submission['model_name']}](https://huggingface.co/{submission['model_name']})"
         
     | 
| 72 | 
         
             
                    )
         
     | 
| 73 | 
         | 
| 74 | 
         
            +
                    for name, result in results:
         
     | 
| 75 | 
         
            +
                        leaderboard_row[name] = result
         
     | 
| 
         | 
|
| 76 | 
         | 
| 77 | 
         
            +
                    leaderboard_row["Average ⬆️"] = sum(result for _, result in results) / len(
         
     | 
| 78 | 
         
            +
                        results
         
     | 
| 79 | 
         
            +
                    )
         
     | 
| 80 | 
         | 
| 81 | 
         
             
                    leaderboard_row["results-dataset"] = (
         
     | 
| 82 | 
         
             
                        f"[🔗](https://huggingface.co/datasets/{submission['results-dataset']})"
         
     | 
    	
        docs.md
    CHANGED
    
    | 
         @@ -65,7 +65,14 @@ Open a pull request on the [leaderboard space](https://huggingface.co/spaces/smo 
     | 
|
| 65 | 
         
             
            ```json
         
     | 
| 66 | 
         
             
            {
         
     | 
| 67 | 
         
             
                "submissions": [
         
     | 
| 68 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 69 | 
         
             
                    ... # existing submissions
         
     | 
| 70 | 
         | 
| 71 | 
         
             
                    {
         
     | 
| 
         | 
|
| 65 | 
         
             
            ```json
         
     | 
| 66 | 
         
             
            {
         
     | 
| 67 | 
         
             
                "submissions": [
         
     | 
| 68 | 
         
            +
                    {
         
     | 
| 69 | 
         
            +
                        "username": "HuggingFaceTB",
         
     | 
| 70 | 
         
            +
                        "model_name": "SmolLM3-3B",
         
     | 
| 71 | 
         
            +
                        "chapter": "1",
         
     | 
| 72 | 
         
            +
                        "submission_date": "2025-09-02",
         
     | 
| 73 | 
         
            +
                        "results-dataset": "smol-course/details_HuggingFaceTB__SmolLM3-3B_private"
         
     | 
| 74 | 
         
            +
                    },
         
     | 
| 75 | 
         
            +
                    
         
     | 
| 76 | 
         
             
                    ... # existing submissions
         
     | 
| 77 | 
         | 
| 78 | 
         
             
                    {
         
     |