Corey Morris committed on
Commit
ee5ac8e
1 Parent(s): 843a5ef

Refactor. Extracted methods.

Browse files
Files changed (1) hide show
  1. result_data_processor.py +39 -46
result_data_processor.py CHANGED
@@ -4,65 +4,58 @@ import fnmatch
4
  import json
5
 
6
  class ResultDataProcessor:
7
- def __init__(self):
 
 
 
8
  self.data = self.process_data()
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  def process_data(self):
11
- dataframes = []
12
-
13
- def find_files(directory, pattern):
14
- for root, dirs, files in os.walk(directory):
15
- for basename in files:
16
- if fnmatch.fnmatch(basename, pattern):
17
- filename = os.path.join(root, basename)
18
- yield filename
19
-
20
- for filename in find_files('results', 'results*.json'):
21
- model_name = filename.split('/')[2]
22
- with open(filename) as f:
23
- data = json.load(f)
24
- df = pd.DataFrame(data['results']).T
25
-
26
-
27
- # data cleanup
28
- df = df.rename(columns={'acc': model_name})
29
- # Replace 'hendrycksTest-' with a more descriptive column name
30
- df.index = df.index.str.replace('hendrycksTest-', 'MMLU_', regex=True)
31
- df.index = df.index.str.replace('harness\|', '', regex=True)
32
- # remove |5 from the index
33
- df.index = df.index.str.replace('\|5', '', regex=True)
34
 
35
-
36
- dataframes.append(df[[model_name]])
37
-
38
- data = pd.concat(dataframes, axis=1)
39
-
40
- data = data.transpose()
41
  data['Model Name'] = data.index
42
  cols = data.columns.tolist()
43
  cols = cols[-1:] + cols[:-1]
44
  data = data[cols]
45
 
46
- # remove the Model Name column
47
- data = data.drop(['Model Name'], axis=1)
48
-
49
- # remove the all column
50
- data = data.drop(['all'], axis=1)
51
-
52
- # remove the truthfulqa:mc|0 column
53
- data = data.drop(['truthfulqa:mc|0'], axis=1)
54
-
55
- # create a new column that averages the results from each of the columns with a name that start with MMLU
56
  data['MMLU_average'] = data.filter(regex='MMLU').mean(axis=1)
57
 
58
- # move the MMLU_average column to the third column in the dataframe
59
  cols = data.columns.tolist()
60
  cols = cols[:2] + cols[-1:] + cols[2:-1]
61
  data = data[cols]
62
 
63
- return data
64
-
65
- # filter data based on the index
66
  def get_data(self, selected_models):
67
- filtered_data = self.data[self.data.index.isin(selected_models)]
68
- return filtered_data
 
4
  import json
5
 
6
  class ResultDataProcessor:
7
+
8
+ def __init__(self, directory='results', pattern='results*.json'):
9
+ self.directory = directory
10
+ self.pattern = pattern
11
  self.data = self.process_data()
12
 
13
+ @staticmethod
14
+ def _find_files(directory, pattern):
15
+ for root, dirs, files in os.walk(directory):
16
+ for basename in files:
17
+ if fnmatch.fnmatch(basename, pattern):
18
+ filename = os.path.join(root, basename)
19
+ yield filename
20
+
21
+ def _read_and_transform_data(self, filename):
22
+ with open(filename) as f:
23
+ data = json.load(f)
24
+ df = pd.DataFrame(data['results']).T
25
+ return df
26
+
27
+ def _cleanup_dataframe(self, df, model_name):
28
+ df = df.rename(columns={'acc': model_name})
29
+ df.index = (df.index.str.replace('hendrycksTest-', 'MMLU_', regex=True)
30
+ .str.replace('harness\|', '', regex=True)
31
+ .str.replace('\|5', '', regex=True))
32
+ return df[[model_name]]
33
+
34
  def process_data(self):
35
+ dataframes = [self._cleanup_dataframe(self._read_and_transform_data(filename), filename.split('/')[2])
36
+ for filename in self._find_files(self.directory, self.pattern)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
+ data = pd.concat(dataframes, axis=1).transpose()
39
+
40
+ # Add Model Name and rearrange columns
 
 
 
41
  data['Model Name'] = data.index
42
  cols = data.columns.tolist()
43
  cols = cols[-1:] + cols[:-1]
44
  data = data[cols]
45
 
46
+ # Remove the 'Model Name' column
47
+ data = data.drop(columns=['Model Name'])
48
+
49
+ # Add average column
 
 
 
 
 
 
50
  data['MMLU_average'] = data.filter(regex='MMLU').mean(axis=1)
51
 
52
+ # Reorder columns to move 'MMLU_average' to the third position
53
  cols = data.columns.tolist()
54
  cols = cols[:2] + cols[-1:] + cols[2:-1]
55
  data = data[cols]
56
 
57
+ # Drop specific columns
58
+ return data.drop(columns=['all', 'truthfulqa:mc|0'])
59
+
60
  def get_data(self, selected_models):
61
+ return self.data[self.data.index.isin(selected_models)]