File size: 4,041 Bytes
ee9e25e
 
 
 
 
 
513e813
cc32c4f
b58e1f0
0a77c60
b58e1f0
 
ee9e25e
 
513e813
83a34f0
ee9e25e
7f2d984
 
 
 
30fa96a
ee9e25e
 
7f2d984
 
 
ee9e25e
f228d38
 
 
 
 
 
 
 
c32735e
513e813
25b87bf
38d88f9
 
 
 
7622af3
b58e1f0
38d88f9
 
b58e1f0
38d88f9
25b87bf
b58e1f0
25b87bf
 
b58e1f0
 
 
25b87bf
 
b58e1f0
 
 
38d88f9
9adae3c
0a77c60
9adae3c
0a77c60
b58e1f0
0a77c60
9adae3c
0a77c60
b58e1f0
0a77c60
b58e1f0
25b87bf
513e813
9adae3c
38d88f9
83a34f0
 
 
 
 
 
 
 
 
 
 
cc32c4f
 
 
 
0a77c60
 
 
 
 
 
cc32c4f
 
 
c32735e
 
7f2d984
 
30fa96a
 
7f2d984
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import pandas as pd
import os
import fnmatch
import json
import re
import numpy as np
import requests
from urllib.parse import quote
from datetime import datetime
import uuid



class DetailsDataProcessor:
    # Download 
    #url example https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/64bits/LexPodLM-13B/details_harness%7ChendrycksTest-moral_scenarios%7C5_2023-07-25T13%3A41%3A51.227672.json
    
    # def __init__(self, directory='results', pattern='results*.json'):
    #     self.directory = directory
    #     self.pattern = pattern

    def __init__(self, directory='results', pattern='results*.json'):
        self.directory = directory
        self.pattern = pattern
        if not os.path.exists('details_data'):
            os.makedirs('details_data')


    def _find_files(self, directory='results', pattern='results*.json'):
        matching_files = []  # List to hold matching filenames
        for root, dirs, files in os.walk(directory):
            for basename in files:
                if fnmatch.fnmatch(basename, pattern):
                    filename = os.path.join(root, basename)
                    matching_files.append(filename)  # Append the matching filename to the list
        return matching_files  # Return the list of matching filenames

    @staticmethod
    def download_file(url, directory='details_data'):
        # Extract relevant parts from the URL
        segments = url.split('/')
        organization = segments[-3]
        model_name = segments[-2]
        task = url.split('%7ChendrycksTest-')[1].split('%7C')[0]

        # Construct the filename
        safe_file_name = f"{organization}_{model_name}_{task}.json"

        # Create the full save file path
        save_file_path = os.path.join(directory, safe_file_name)

        error_count = 0
        success_count = 0
        try:
            # Sending a GET request
            r = requests.get(url, allow_redirects=True)
            r.raise_for_status()

            # Writing the content to the specified file
            with open(save_file_path, 'wb') as file:
                file.write(r.content)
            print(save_file_path)

            success_count += 1
        except requests.ConnectionError as e:
            error_count += 1
        except requests.HTTPError as e:
            error_count += 1
        except FileNotFoundError as e:
            error_count += 1
        except Exception as e:
            error_count += 1

        return error_count, success_count



    @staticmethod
    def single_file_pipeline(url, filename):
        DetailsDataProcessor.download_file(url, filename)
        # read file
        with open(filename) as f:
            data = json.load(f)
        # convert to dataframe
        df = pd.DataFrame(data)
        return df

    @staticmethod
    def build_url(file_path):
        segments = file_path.split('/')
        bits = segments[1]
        model_name = segments[2]

        try:
            timestamp = segments[3].split('_')[1]
        except IndexError:
            print(f"Error: {file_path}")
            return None
        
        url = f'https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/{bits}/{model_name}/details_harness%7ChendrycksTest-moral_scenarios%7C5_{quote(timestamp, safe="")}'
        return url
    
    def pipeline(self):
        error_count = 0
        success_count = 0
        file_paths = self._find_files(self.directory, self.pattern)
        for file_path in file_paths:
            print(f"Processing file path: {file_path}")
            url = self.build_url(file_path)
            if url:
                errors, successes = self.download_file(url)
                error_count += errors
                success_count += successes
            else:
                print(f"Error building URL for file path: {file_path}")
                error_count += 1

        print(f"Downloaded {success_count} files successfully. Encountered {error_count} errors.")
        return success_count, error_count