Corey Morris committed on
Commit 74822dd
1 Parent(s): 30fa96a

removed most commented out code from details processor

Files changed (1)
  1. details_data_processor.py +0 -149
details_data_processor.py CHANGED
@@ -51,15 +51,6 @@ class DetailsDataProcessor:
         constructed_url = base_url + organization + '/' + model + '/' + other_chunk + filename
         return constructed_url
 
-    # @staticmethod
-    # def _find_files(directory, pattern):
-    #     for root, dirs, files in os.walk(directory):
-    #         for basename in files:
-    #             if fnmatch.fnmatch(basename, pattern):
-    #                 filename = os.path.join(root, basename)
-    #                 yield filename
-
-
 
     def _find_files(self, directory, pattern):
         matching_files = []  # List to hold matching filenames
@@ -81,143 +72,3 @@ class DetailsDataProcessor:
         df = self.single_file_pipeline(url, file_path)
         dataframes.append(df)
         return dataframes
-    # @staticmethod
-    # def _find_files(directory, pattern):
-    #     for root, dirs, files in os.walk(directory):
-    #         for basename in files:
-    #             if fnmatch.fnmatch(basename, pattern):
-    #                 filename = os.path.join(root, basename)
-    #                 yield filename
-
-    # def _read_and_transform_data(self, filename):
-    #     with open(filename) as f:
-    #         data = json.load(f)
-    #     df = pd.DataFrame(data['results']).T
-    #     return df
-
-    # def _cleanup_dataframe(self, df, model_name):
-    #     df = df.rename(columns={'acc': model_name})
-    #     df.index = (df.index.str.replace('hendrycksTest-', 'MMLU_', regex=True)
-    #                 .str.replace('harness\|', '', regex=True)
-    #                 .str.replace('\|5', '', regex=True))
-    #     return df[[model_name]]
-
-    # def _extract_mc1(self, df, model_name):
-    #     df = df.rename(columns={'mc1': model_name})
-    #     # rename row harness|truthfulqa:mc|0 to truthfulqa:mc1
-    #     df.index = (df.index.str.replace('mc\|0', 'mc1', regex=True))
-    #     # just return the harness|truthfulqa:mc1 row
-    #     df = df.loc[['harness|truthfulqa:mc1']]
-    #     return df[[model_name]]
-
-    # def _extract_mc2(self, df, model_name):
-    #     # rename row harness|truthfulqa:mc|0 to truthfulqa:mc2
-    #     df = df.rename(columns={'mc2': model_name})
-    #     df.index = (df.index.str.replace('mc\|0', 'mc2', regex=True))
-    #     df = df.loc[['harness|truthfulqa:mc2']]
-    #     return df[[model_name]]
-
-    # # remove extreme outliers from column harness|truthfulqa:mc1
-    # def _remove_mc1_outliers(self, df):
-    #     mc1 = df['harness|truthfulqa:mc1']
-    #     # Identify the outliers
-    #     # outliers_condition = mc1 > mc1.quantile(.95)
-    #     outliers_condition = mc1 == 1.0
-    #     # Replace the outliers with NaN
-    #     df.loc[outliers_condition, 'harness|truthfulqa:mc1'] = np.nan
-    #     return df
-
-
-
-    # @staticmethod
-    # def _extract_parameters(model_name):
-    #     """
-    #     Function to extract parameters from model name.
-    #     It handles names with 'b/B' for billions and 'm/M' for millions.
-    #     """
-    #     # pattern to match a number followed by 'b' (representing billions) or 'm' (representing millions)
-    #     pattern = re.compile(r'(\d+\.?\d*)([bBmM])')
-
-    #     match = pattern.search(model_name)
-
-    #     if match:
-    #         num, magnitude = match.groups()
-    #         num = float(num)
-
-    #         # convert millions to billions
-    #         if magnitude.lower() == 'm':
-    #             num /= 1000
-
-    #         return num
-
-    #     # return NaN if no match
-    #     return np.nan
-
-
-    # def process_data(self):
-
-    #     dataframes = []
-    #     organization_names = []
-    #     for filename in self._find_files(self.directory, self.pattern):
-    #         raw_data = self._read_and_transform_data(filename)
-    #         split_path = filename.split('/')
-    #         model_name = split_path[2]
-    #         organization_name = split_path[1]
-    #         cleaned_data = self._cleanup_dataframe(raw_data, model_name)
-    #         mc1 = self._extract_mc1(raw_data, model_name)
-    #         mc2 = self._extract_mc2(raw_data, model_name)
-    #         cleaned_data = pd.concat([cleaned_data, mc1])
-    #         cleaned_data = pd.concat([cleaned_data, mc2])
-    #         organization_names.append(organization_name)
-    #         dataframes.append(cleaned_data)
-
-
-    #     data = pd.concat(dataframes, axis=1).transpose()
-
-    #     # Add organization column
-    #     data['organization'] = organization_names
-
-    #     # Add Model Name and rearrange columns
-    #     data['Model Name'] = data.index
-    #     cols = data.columns.tolist()
-    #     cols = cols[-1:] + cols[:-1]
-    #     data = data[cols]
-
-    #     # Remove the 'Model Name' column
-    #     data = data.drop(columns=['Model Name'])
-
-    #     # Add average column
-    #     data['MMLU_average'] = data.filter(regex='MMLU').mean(axis=1)
-
-    #     # Reorder columns to move 'MMLU_average' to the third position
-    #     cols = data.columns.tolist()
-    #     cols = cols[:2] + cols[-1:] + cols[2:-1]
-    #     data = data[cols]
-
-    #     # Drop specific columns
-    #     data = data.drop(columns=['all', 'truthfulqa:mc|0'])
-
-    #     # Add parameter count column using extract_parameters function
-    #     data['Parameters'] = data.index.to_series().apply(self._extract_parameters)
-
-    #     # move the parameters column to the front of the dataframe
-    #     cols = data.columns.tolist()
-    #     cols = cols[-1:] + cols[:-1]
-    #     data = data[cols]
-
-    #     # remove extreme outliers from column harness|truthfulqa:mc1
-    #     data = self._remove_mc1_outliers(data)
-
-    #     return data
-
-    # def rank_data(self):
-    #     # add rank for each column to the dataframe
-    #     # copy the data dataframe to avoid modifying the original dataframe
-    #     rank_data = self.data.copy()
-    #     for col in list(rank_data.columns):
-    #         rank_data[col + "_rank"] = rank_data[col].rank(ascending=False, method='min')
-
-    #     return rank_data
-
-    # def get_data(self, selected_models):
-    #     return self.data[self.data.index.isin(selected_models)]
 
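For reference, the commented-out helper deleted in both hunks walked a directory tree and yielded every file whose name matched a glob pattern. A self-contained, runnable sketch of that same logic (the module-level name find_files and the example arguments are illustrative, not part of this repository) is:

import fnmatch
import os

def find_files(directory, pattern):
    # Walk the tree rooted at `directory` and yield the full path of every
    # file whose basename matches the glob `pattern` (e.g. '*.json').
    for root, dirs, files in os.walk(directory):
        for basename in files:
            if fnmatch.fnmatch(basename, pattern):
                yield os.path.join(root, basename)

# Hypothetical usage: print every JSON result file under a results directory.
# for path in find_files('results', '*.json'):
#     print(path)

The instance method _find_files that the class keeps appears to follow the same walk-and-match idea but, per its "List to hold matching filenames" comment, collects matches into a list rather than yielding them.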