File size: 4,283 Bytes
0855f92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python3
"""

Download external models data from the Open Portuguese LLM Leaderboard
and convert it to CSV format for import into the benchmark.

"""

import requests
import pandas as pd
import json
import sys

def download_external_models():
    """Download the leaderboard's external-model results and save them as CSV.

    Fetches ``external_models_results.json`` from the Open Portuguese LLM
    Leaderboard space, keeps the model name, link and four benchmark scores
    of every entry, writes them to ``external_models.csv`` (relative to the
    current working directory) and prints a preview plus summary statistics.

    Entries that are not JSON objects, or whose ``result_metrics`` is not an
    object, are skipped with a warning. A metric that is absent from an entry
    is recorded as ``0.0``.

    Raises:
        SystemExit: with status 1 when the download fails, the body is not
            valid JSON, the top-level JSON value is not an array, or any
            other unexpected error occurs.
    """
    url = "https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard/raw/main/external_models_results.json"
    output_file = 'external_models.csv'
    # requests never times out by default; bound the wait so a stalled
    # connection cannot hang the script forever.
    timeout_seconds = 30
    # (CSV column, label used in the printed statistics)
    metrics = (
        ('assin2_sts', 'ASSIN2 STS'),
        ('assin2_rte', 'ASSIN2 RTE'),
        ('faquad_nli', 'FaQuAD-NLI'),
        ('hatebr_offensive', 'HateBR'),
    )
    metric_columns = [column for column, _ in metrics]

    print("Downloading external models data...")

    try:
        response = requests.get(url, timeout=timeout_seconds)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Decode separately: recent requests versions raise a JSONDecodeError
        # that is also a RequestException, so handling it in the outer
        # handlers would report a malformed body as a download failure.
        # ValueError covers every JSON decode error requests may raise.
        try:
            data = response.json()
        except ValueError as e:
            print(f"Error parsing JSON: {e}")
            sys.exit(1)

        if not isinstance(data, list):
            print("Error: Expected JSON array, got:", type(data))
            # Exit like every other failure path so the caller does not go on
            # to report a successful download.
            sys.exit(1)

        print(f"Downloaded {len(data)} external models")

        # Extract data for each model
        extracted_data = []

        for item in data:
            if not isinstance(item, dict):
                print(f"Warning: Skipping non-dict item: {type(item)}")
                continue

            model = item.get('model', '')
            result_metrics = item.get('result_metrics', {})

            if not isinstance(result_metrics, dict):
                print(f"Warning: Skipping model '{model}' - result_metrics is not a dict")
                continue

            row_data = {'model': model, 'link': item.get('link', '')}
            for column in metric_columns:
                row_data[column] = result_metrics.get(column, 0.0)

            extracted_data.append(row_data)

        # Passing the columns explicitly keeps the CSV header present even
        # when no entry survived the filtering above.
        df = pd.DataFrame(extracted_data, columns=['model', 'link'] + metric_columns)

        # Save to CSV
        df.to_csv(output_file, index=False)

        print(f"\nSuccessfully extracted {len(df)} models to {output_file}")

        # Show first few entries as preview
        print("\nFirst 5 entries:")
        print(df.head().to_string(index=False))

        # Show some statistics
        if not df.empty:
            print("\nStatistics:")
            print(f"Total models: {len(df)}")

            # Count models with non-zero scores for each metric
            print("\nModels with scores:")
            for column, label in metrics:
                print(f"{label}: {(df[column] > 0).sum()}")

            # Average scores
            print("\nAverage scores:")
            print(df[metric_columns].mean().round(3))

            # DataFrame.info() prints its report itself and returns None, so
            # it must not be wrapped in print().
            print("\nDataFrame info:")
            df.info()

    except requests.exceptions.RequestException as e:
        print(f"Error downloading data: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}")
        sys.exit(1)

def main():
    """Script entry point: print a banner, run the download, report the result.

    Exits the process with status 1 if the download raises an exception.
    """
    title = "External Models Data Downloader"
    separator = "=" * 40
    print(title)
    print(separator)

    try:
        download_external_models()
        print("\nDownload completed successfully!")
    except Exception as err:
        # Last-resort boundary: report the failure and signal it via exit code.
        print(f"Error during download: {err}")
        sys.exit(1)

# Run the downloader only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()