File size: 4,859 Bytes
35378f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
import pandas as pd
import requests, json
from io import StringIO

def get_github_data():
    '''
    Get leaderboard data from the csv files on Github

    Reads benchmark_runs.json from the clembench/clembench-runs repository,
    downloads the result CSV of every listed version, passes each one through
    process_df and sorts it by its second column in descending order.

    Args:
        None
    Returns:
        latest_df: singular list containing dataframe of the latest version of the leaderboard with only 4 columns
        all_dfs: list of dataframes for previous versions + latest version including columns for all games
        all_vnames: list of the names for the previous versions + latest version (For Details and Versions Tab Dropdown)

        all_dfs and all_vnames are ordered newest first and always line up
        one-to-one; versions whose CSV could not be downloaded are skipped.
        None is returned (after printing a message) when the JSON index or
        every CSV fails to download.
    '''
    uname = "clembench"
    repo = "clembench-runs"
    base_url = f"https://raw.githubusercontent.com/{uname}/{repo}/main/"

    resp = requests.get(base_url + "benchmark_runs.json")
    if resp.status_code != 200:
        print(f"Failed to read JSON file: Status Code : {resp.status_code}")
        return None

    versions = json.loads(resp.text)['versions']

    # Remember each version's own CSV path (result_file without its leading
    # directory) so one version cannot pick up the file name of another.
    csv_paths = {}
    for ver in versions:
        csv_paths[ver['version']] = '/'.join(ver['result_file'].split('/')[1:])

    # Sort by latest version. Sorting with a key keeps the names exactly as
    # published, so they still match the directory names in the repository.
    version_names = sorted(csv_paths, key=lambda name: float(name[1:]), reverse=True)

    all_dfs = []
    all_vnames = []
    for version in version_names:
        result_url = base_url + version + '/' + csv_paths[version]
        csv_response = requests.get(result_url)
        if csv_response.status_code != 200:
            print(f"Failed to read CSV file for version : {version}. Status Code : {csv_response.status_code}")
            continue
        df = process_df(pd.read_csv(StringIO(csv_response.text)))
        df = df.sort_values(by=list(df.columns)[1], ascending=False)  # Sort by clemscore
        # Record the name together with the dataframe so a failed download
        # cannot shift the pairing between versions and their data.
        all_dfs.append(df)
        all_vnames.append(version)

    if not all_dfs:
        print("Failed to read any CSV file")
        return None

    # Only keep relevant columns for the main leaderboard
    latest_df = [all_dfs[0].iloc[:, 0:4].copy()]
    return latest_df, all_dfs, all_vnames

def process_df(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Process dataframe
    - Remove repetition in model names
    - Convert datatypes to sort by "float" instead of "str" for sorting
    - Update column names
    The input dataframe is not modified; a processed copy is returned.
    Args:
        df: Unprocessed Dataframe. The first column holds the model names
            (expected to be strings) and every other column must be
            convertible to float.
    Returns:
        df: Processed Dataframe
    '''
    # Work on a copy so the caller's dataframe is left untouched
    df = df.copy()
    list_column_names = list(df.columns)
    model_col_name = list_column_names[0]

    # Change column type to float from str
    for col in list_column_names[1:]:
        df[col] = df[col].astype(float)

    # Remove repetition in model names, if any
    models_list = []
    for model_name in df[model_col_name]:
        parts = model_name.split('--')
        parts = [part.replace('-t0.0', '') for part in parts]  # Comment to not remove -t0.0
        # A name without "--" has a single part and is kept as it is
        if len(parts) < 2 or parts[0] == parts[1]:
            models_list.append(parts[0])
        else:
            models_list.append(parts[0] + "--" + parts[1])
    df[model_col_name] = models_list

    # Update column names: the first four are fixed, the remaining
    # "<game>,<metric>" columns become "<Game><metric>"
    update = ['Model', 'Clemscore', '% Played', 'Quality Score']
    for col in list_column_names[4:]:
        splits = col.split(',')
        metric = splits[1] if len(splits) > 1 else ''  # tolerate a name without ","
        update.append(splits[0].capitalize() + metric)

    # zip stops at the shorter sequence, so fewer than four columns is fine
    map_cols = dict(zip(list_column_names, update))
    return df.rename(columns=map_cols)

def filter_search(df: pd.DataFrame, query: str) -> pd.DataFrame:
    '''
    Filter the dataframe based on the search query
    Matching is a case-insensitive substring test against the first column.
    Args:
        df: Unfiltered dataframe; its first column is expected to hold the
            model names as strings
        query: a string of queries separated by ";"
    Return:
        filtered_df: Dataframe containing the rows whose model name matches
            at least one query. The unfiltered dataframe is returned when the
            query holds no search term at all.
    '''
    # Drop empty terms (e.g. from a trailing ";"): an empty string is a
    # substring of every name and would silently disable the filter.
    terms = [term.lower().strip() for term in query.split(';')]
    terms = [term for term in terms if term]
    if not terms:
        return df

    model_col = list(df.columns)[0]
    filtered_models = [
        model_name
        for model_name in df[model_col]
        if any(term in model_name.lower() for term in terms)
    ]
    return df[df[model_col].isin(filtered_models)]