rahulnair23 committed
Commit 0de1d17
1 Parent(s): ab6548f

test commit

.gitignore ADDED
@@ -0,0 +1,3 @@
+ __pycache__
+ .vscode/
+ .DS_Store
app.py CHANGED
@@ -1,7 +1,259 @@
  import gradio as gr
 
- def greet(name):
-     return "Hello " + name + "! Space for LLM-rank-themselves."
 
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
1
  import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ from rouge_score import rouge_scorer
5
+ from joblib import Parallel, delayed
6
+ from selfrank.algos.greedy import SelfRankGreedy
7
+ from selfrank.algos.iterative import SelfRank
8
+ from selfrank.algos.baseline import MCARank
9
+ from selfrank.algos.triplet import equality, rouge
10
+ import matplotlib.pyplot as plt
11
 
 
 
12
 
13
+ class UI:
14
+
15
+ def __init__(self):
16
+ """Load any static assets"""
17
+ pass
18
+
19
+ def header_block(self):
20
+ """Title/description"""
21
+
22
+ gr.Markdown(
23
+ """<h1 style='text-align: center; color: black;'>🥇 Ranking LLMs without ground truth </h1>"""
24
+ )
25
+ gr.Markdown(
26
+ "This space demonstrates reference-free ranking of large language models describe in our ACL Findings paper [Ranking Large Language Models without Ground Truth](https://arxiv.org/abs/2402.14860). <br>"
27
+ "Inspired by real life where both an expert and a knowledgeable person can identify a novice the main idea is to consider triplets of models, where each one of them evaluates the other two, correctly identifying the worst model in the triplet with high probability. Iteratively performing such evaluations yields a estimated ranking that doesn't require ground truth/reference data which can be expensive to gather. The methods are a viable low-resource ranking mechanism for practical use.<br>"
28
+ "[Source code](https://huggingface.co/spaces/ibm/llm-rank-themselves/tree/main).<br>"
29
+ )
30
+ gr.Markdown('---')
31
+ gr.Markdown('<br>')
32
+
33
+
34
+ def selection_panel(self):
35
+ """user selections"""
36
+ gr.Markdown("""<h2 style='color: purple;'> Benchmark experiments </h2> """)
37
+ with gr.Column(variant='compact'):
38
+ self.data = gr.Dropdown(
39
+ choices=["CNN/DM", "XSUM", "MMLU"],
40
+ multiselect=False, value='CNN/DM',
41
+ label="Choose a dataset.",
42
+ info="The dataset describes a task",
43
+ interactive=True,
44
+ )
45
+ self.evaluation = gr.Dropdown(
46
+ choices=["Rouge", "Equality"],
47
+ multiselect=False, value='Rouge',
48
+ interactive=True,
49
+ label="Evaluation function",
50
+ info="How should the Judge model decide the winner? Demo limited to use 'Rouge' for generative tasks like summarization, and 'equality' for multiple choice or classification tasks. In practice you can use any function that compares judge responses to the contestant models.",
51
+ )
52
+ self.nmodels = gr.Dropdown(
53
+ choices=[None, 10, 20, 30],
54
+ label="Number of models",
55
+ info="Sample a subset of LLMs to rank.",
56
+ value=10,
57
+ interactive=True,
58
+ )
59
+ self.nrows = gr.Dropdown(
60
+ choices=[None, 10, 20, 30],
61
+ label="Number of instances",
62
+ info="Sample a subset of instances to evaluate (smaller is faster).",
63
+ value=10,
64
+ interactive=True,
65
+ )
66
+ self.method = gr.Dropdown(
67
+ choices=["Greedy", "Full"],
68
+ label="Algorithm variant to use",
69
+ info="Choose from one of two variants. 'Full' (FTR in the paper) runs all triplet combinations, recommended when evaluations are cheap or for smaller datasets, or 'greedy' (GTR) a faster variant suggested for more complex evaluations.",
70
+ value='Full',
71
+ interactive=True,
72
+ )
73
+ self.btn_execute = gr.Button("Run")
74
+
75
+
76
+ def output_panel(self):
77
+ """Plots/leaderboard/bump charts"""
78
+ with gr.Column(variant='default'):
79
+ gr.Markdown("""<h2 style='color: purple;'> Estimated ranking </h2> """)
80
+ self.leaderboard = gr.DataFrame()
81
+
82
+ with gr.Column(variant='default'):
83
+ gr.Markdown("""<h2 style='color: purple;'> Comparison to 'true' ranking </h2> """)
84
+ #self.bumpchart = gr.Plot(format='png')
85
+ self.bumpchart = gr.Image()
86
+ self.eval_metrics = gr.Markdown()
87
+
88
+ def synth_panel(self):
89
+ """ Synthetic data experiments """
90
+ gr.Markdown('<br>')
91
+ gr.Markdown('---')
92
+ gr.Markdown("""<h2 style='color: purple;'>Synthetic multiple choice </h2> """)
93
+
94
+ def byod_panel(self):
95
+ """ Synthetic data experiments """
96
+ gr.Markdown('<br>')
97
+ gr.Markdown('---')
98
+ gr.Markdown("""<h2 style='color: purple;'>BYOD </h2> """)
99
+
100
+
101
+ def layout(self):
102
+ """ Assemble the overall layout """
103
+
104
+ with gr.Blocks(theme=gr.themes.Default()) as demo:
105
+ self.header_block()
106
+
107
+ with gr.Row():
108
+
109
+ # Selection panel
110
+ with gr.Column():
111
+ self.selection_panel()
112
+
113
+ # Output panel/leaderboard
114
+ self.output_panel()
115
+
116
+ self.synth_panel()
117
+ self.byod_panel()
118
+
119
+ # Register event listeners
120
+ self.btn_execute.click(
121
+ fn=self.benchmark_executor, inputs=[self.data, self.evaluation, self.nmodels, self.nrows, self.method],
122
+ outputs=[self.leaderboard, self.bumpchart, self.eval_metrics]
123
+ )
124
+
125
+ return demo
126
+
127
+ def benchmark_executor(self, data, evaluation, nmodels, nrows, method) -> tuple[pd.DataFrame, str, str]:
128
+ """ Main execution flow for benchmarks """
129
+
130
+ #gr.Info(f"Loaded run config: {data}, {evaluation}, {nmodels}.")
131
+
132
+ match data:
133
+ case 'MMLU':
134
+ adf = pd.read_pickle(f"data/mmlu_subject_abstract_algebra.pkl")
135
+ MODELS = adf.model.unique()
136
+
137
+ case 'CNN/DM':
138
+ adf = pd.read_pickle(f"data/cnndm.pkl")
139
+ MODELS = adf.model.unique()
140
+
141
+ case 'XSUM':
142
+ raise NotImplementedError
143
+
144
+ case _:
145
+ raise ValueError(f"'{data}' not understood.")
146
+
147
+ # Sample fewer models if so needed
148
+ if nmodels is not None:
149
+ if nmodels < len(MODELS):
150
+
151
+ MODELS = np.random.choice(MODELS, nmodels, replace=False).tolist()
152
+ adf = adf[adf.model.isin(MODELS)]
153
+
154
+ match data:
155
+ case 'MMLU':
156
+ keys = ["id", "trial_id", "perturbation"] # MMLU has this extra parameter
157
+ case 'CNN/DM':
158
+ keys = ["id", "trial_id"]
159
+ case _:
160
+ pass
161
+
162
+ df = adf.pivot_table(
163
+ columns="model",
164
+ index=keys,
165
+ values="output",
166
+ aggfunc="first",
167
+ )
168
+
169
+ # Filter by number of rows
170
+ df.dropna(inplace=True)
171
+ if nrows is not None:
172
+ if nrows < df.shape[0]:
173
+ df = df.sample(nrows)
174
+
175
+ # Compute true ranking
176
+ adf = adf.set_index(keys).loc[df.index].reset_index()
177
+
178
+ if evaluation == "Rouge":
179
+
180
+ def __true_rouge(x, scorer):
181
+ return scorer.score(x["reference"], x["output"])["rouge2"].fmeasure
182
+
183
+ scorer = rouge_scorer.RougeScorer(["rouge2"], use_stemmer=True)
184
+ adf["rouge"] = Parallel(n_jobs=-1, batch_size=128)(
185
+ delayed(__true_rouge)(i, scorer) for _, i in adf.iterrows()
186
+ )
187
+
188
+ # Method 2 - look at "win rates" - for each question, see which model
189
+ # wins (i.e. has the best ROUGE score)
190
+ idx = adf.groupby(["id", "trial_id"])["rouge"].idxmax()
191
+ win_rates = adf.loc[idx].model.value_counts()
192
+ win_rate_rank = win_rates.index.tolist()
193
+
194
+ # include models with no wins at the bottom
195
+ no_wins = list(set(MODELS) - set(win_rate_rank))
196
+ true_ranking = win_rate_rank + no_wins
197
+ evaluator = rouge
198
+
199
+ elif evaluation == 'Equality':
200
+
201
+ # Compute the true ranking (multiple choice - so use equality between
202
+ # LLM response and reference-value)
203
+ adf["C"] = (adf.output == adf.reference).astype(int)
204
+ true_ranking = (
205
+ adf.groupby("model")["C"]
206
+ .apply(lambda x: sum(x) / len(x))
207
+ .sort_values(ascending=False)
208
+ .index.tolist()
209
+ )
210
+ evaluator = equality
211
+
212
+ else:
213
+ raise ValueError(f"'{evaluation}' not understood.")
214
+
215
+ match method:
216
+ case 'Full':
217
+ ranker = SelfRank(MODELS, evaluator, true_ranking)
218
+
219
+ case 'Greedy':
220
+ ranker = SelfRankGreedy(MODELS, evaluator, true_ranking)
221
+
222
+ case 'MCA':
223
+ raise NotImplementedError
224
+ case _:
225
+ raise ValueError(f"'{method}' not understood.")
226
+
227
+
228
+ # generate outputs
229
+ ranker.fit(df)
230
+ out_df = pd.DataFrame({'rank': range(1, len(true_ranking)+1), 'model': ranker.ranking})
231
+
232
+ out_metrics = {"rbo": ranker.measure(metric="rbo"),
233
+ "map-1": ranker.measure(metric="mapk", k=1),
234
+ "map-3": ranker.measure(metric="mapk", k=3),
235
+ "map-5": ranker.measure(metric="mapk", k=5),
236
+ "map-10": ranker.measure(metric="mapk", k=10),
237
+ "evaluations": evaluator.calls
238
+ }
239
+ eval_metrics = (f"Evaluation measures: <br>"
240
+ f"Rank-Biased Overlap: {out_metrics['rbo']:0.3f}<br>"
241
+ f"MAP-3 : {out_metrics['map-3']:0.3f}<br>"
242
+ f"MAP-5 : {out_metrics['map-5']:0.3f}<br>"
243
+ f"MAP-10 : {out_metrics['map-10']: 0.3f}.")
244
+
245
+ out_plot = ranker.plot()
246
+
247
+ return out_df, "output.png", eval_metrics
248
+
249
+
250
+ def run(self):
251
+ self.ui = self.layout()
252
+ self.ui.queue().launch(show_error=True)
253
+
254
+
255
+ #if __name__ == "__main__":
256
+ ui = UI()
257
+ #ui.run()
258
+ demo = ui.layout()
259
+ demo.launch()
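
For reviewers who want to exercise the pipeline above outside Gradio, here is a minimal headless sketch of the `benchmark_executor` flow for the CNN/DM path. It is not part of the commit; it assumes the Git LFS data files are fetched, and the model subset and row counts are arbitrary.

```python
import pandas as pd
from selfrank.algos.iterative import SelfRank
from selfrank.algos.triplet import rouge

adf = pd.read_pickle("data/cnndm.pkl")
models = adf.model.unique().tolist()[:5]      # small subset keeps the N^3 triplet evaluations cheap
adf = adf[adf.model.isin(models)]

# One column per model, one row per benchmark instance (id, trial_id)
df = (adf.pivot_table(columns="model", index=["id", "trial_id"],
                      values="output", aggfunc="first")
         .dropna()
         .sample(10, random_state=0))

ranker = SelfRank(models, rouge, true_ranking=None)
ranker.fit(df)
print(ranker.ranking)   # estimated ranking, best to worst
```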
css.css ADDED
@@ -0,0 +1,157 @@
1
+ html {
2
+ font-family: Inter;
3
+ font-size: 16px;
4
+ font-weight: 400;
5
+ line-height: 1.5;
6
+ -webkit-text-size-adjust: 100%;
7
+ background: #fff;
8
+ color: #323232;
9
+ -webkit-font-smoothing: antialiased;
10
+ -moz-osx-font-smoothing: grayscale;
11
+ text-rendering: optimizeLegibility;
12
+ }
13
+
14
+ :root {
15
+ --space: 1;
16
+ --vspace: calc(var(--space) * 1rem);
17
+ --vspace-0: calc(3 * var(--space) * 1rem);
18
+ --vspace-1: calc(2 * var(--space) * 1rem);
19
+ --vspace-2: calc(1.5 * var(--space) * 1rem);
20
+ --vspace-3: calc(0.5 * var(--space) * 1rem);
21
+ }
22
+
23
+ .app {
24
+ max-width: 748px !important;
25
+ }
26
+
27
+ .prose p {
28
+ margin: var(--vspace) 0;
29
+ line-height: var(--vspace * 2);
30
+ font-size: 1rem;
31
+ }
32
+
33
+ code {
34
+ font-family: "Inconsolata", sans-serif;
35
+ font-size: 16px;
36
+ }
37
+
38
+ h1,
39
+ h1 code {
40
+ font-weight: 400;
41
+ line-height: calc(2.5 / var(--space) * var(--vspace));
42
+ }
43
+
44
+ h1 code {
45
+ background: none;
46
+ border: none;
47
+ letter-spacing: 0.05em;
48
+ padding-bottom: 5px;
49
+ position: relative;
50
+ padding: 0;
51
+ }
52
+
53
+ h2 {
54
+ margin: var(--vspace-1) 0 var(--vspace-2) 0;
55
+ line-height: 1em;
56
+ }
57
+
58
+ h3,
59
+ h3 code {
60
+ margin: var(--vspace-1) 0 var(--vspace-2) 0;
61
+ line-height: 1em;
62
+ }
63
+
64
+ h4,
65
+ h5,
66
+ h6 {
67
+ margin: var(--vspace-3) 0 var(--vspace-3) 0;
68
+ line-height: var(--vspace);
69
+ }
70
+
71
+ .bigtitle,
72
+ h1,
73
+ h1 code {
74
+ font-size: calc(8px * 4.5);
75
+ word-break: break-word;
76
+ }
77
+
78
+ .title,
79
+ h2,
80
+ h2 code {
81
+ font-size: calc(8px * 3.375);
82
+ font-weight: lighter;
83
+ word-break: break-word;
84
+ border: none;
85
+ background: none;
86
+ }
87
+
88
+ .subheading1,
89
+ h3,
90
+ h3 code {
91
+ font-size: calc(8px * 1.8);
92
+ font-weight: 600;
93
+ border: none;
94
+ background: none;
95
+ letter-spacing: 0.1em;
96
+ text-transform: uppercase;
97
+ }
98
+
99
+ h2 code {
100
+ padding: 0;
101
+ position: relative;
102
+ letter-spacing: 0.05em;
103
+ }
104
+
105
+ blockquote {
106
+ font-size: calc(8px * 1.1667);
107
+ font-style: italic;
108
+ line-height: calc(1.1667 * var(--vspace));
109
+ margin: var(--vspace-2) var(--vspace-2);
110
+ }
111
+
112
+ .subheading2,
113
+ h4 {
114
+ font-size: calc(8px * 1.4292);
115
+ text-transform: uppercase;
116
+ font-weight: 600;
117
+ }
118
+
119
+ .subheading3,
120
+ h5 {
121
+ font-size: calc(8px * 1.2917);
122
+ line-height: calc(1.2917 * var(--vspace));
123
+
124
+ font-weight: lighter;
125
+ text-transform: uppercase;
126
+ letter-spacing: 0.15em;
127
+ }
128
+
129
+ h6 {
130
+ font-size: calc(8px * 1.1667);
131
+ font-size: 1.1667em;
132
+ font-weight: normal;
133
+ font-style: italic;
134
+ font-family: "le-monde-livre-classic-byol", serif !important;
135
+ letter-spacing: 0px !important;
136
+ }
137
+
138
+ #start .md > *:first-child {
139
+ margin-top: 0;
140
+ }
141
+
142
+ h2 + h3 {
143
+ margin-top: 0;
144
+ }
145
+
146
+ .md hr {
147
+ border: none;
148
+ border-top: 1px solid var(--block-border-color);
149
+ margin: var(--vspace-2) 0 var(--vspace-2) 0;
150
+ }
151
+ .prose ul {
152
+ margin: var(--vspace-2) 0 var(--vspace-1) 0;
153
+ }
154
+
155
+ .gap {
156
+ gap: 0;
157
+ }
data/cnndm.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa7a24100439d2a8a9f0a72b6064280eb5d63dadcb250123e62abaf795c1bc2b
+ size 19238934
data/mmlu_subject_abstract_algebra.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e446aa00757e4de201de6c6dc26d57763c0adf6f235594dffc661d2414345b65
+ size 9301843
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ gradio
+ certifi==2023.7.22
+ charset-normalizer==3.2.0
+ idna==3.4
+ numpy==1.25.2
+ pandas==2.1.0
+ requests==2.31.0
+ tzdata==2023.3
+ urllib3==2.0.4
+ matplotlib
+ seaborn
+ rouge-score
+ transformers
selfrank/__init__.py ADDED
File without changes
selfrank/algos/__init__.py ADDED
File without changes
selfrank/algos/baseline.py ADDED
@@ -0,0 +1,145 @@
1
+ """
2
+ Baseline: based on most-common answer
3
+ """
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+ from .metrics import mapk, rank_biased_overlap
9
+ from .plots import plot_ranks
10
+ import logging
11
+ from typing import List, Callable, Optional
12
+ from rouge_score import rouge_scorer as rs
13
+ from collections import Counter
14
+ import random
15
+
16
+ logger = logging.getLogger(__name__)
17
+ tol = 0.001
18
+
19
+
20
+ class MCARank:
21
+ """
22
+ Baseline method: based on most common answer
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ MODELS: List,
28
+ evaluator: Callable,
29
+ true_ranking: Optional[List] = None,
30
+ show_progress: Optional[bool] = False,
31
+ ):
32
+ self.MODELS = MODELS
33
+ self.N = len(MODELS)
34
+ self.evaluate = evaluator
35
+ self.true_ranking = true_ranking
36
+ self.show_progress = show_progress
37
+
38
+
39
+ def fit(self, df: pd.DataFrame, measure: Optional[str]='equality', p: float = 0):
40
+ """
41
+ df: Dataframe where each row is a benchmark instance,
42
+ and there is a column with the output for each Model
43
+
44
+ measure: decides how the most common answer is determined.
45
+ p - is the noise level to include (only used for noisy-equality)
46
+ """
47
+
48
+ assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."
49
+
50
+ if measure == 'equality':
51
+
52
+ # Select the most common answer per question
53
+ mca = df.mode(axis=1).iloc[:, 0]
54
+
55
+ # Count all the times each model answered the most common one
56
+ wins = df.eq(mca, axis=0).astype(int)
57
+
58
+ self.ranking = wins.sum().sort_values(ascending=False).index.to_list()
59
+
60
+ elif measure == 'noisy_equality':
61
+
62
+ # Most common answer
63
+ mca = df.mode(axis=1).iloc[:, 0]
64
+
65
+ perturb = lambda x: not x if (random.random() <= p) else x
66
+
67
+ def __noisy_equality(x, mca):
68
+ wins = (x == mca).apply(perturb)
69
+ return wins
70
+
71
+ wins = df.apply(__noisy_equality, axis='rows', args=(mca, ))
72
+
73
+ self.ranking = wins.sum().sort_values(ascending=False).index.to_list()
74
+
75
+ elif measure == 'rouge':
76
+
77
+ MODELS = df.columns.to_list()
78
+ SIZE = 256
79
+
80
+ def __mca(x):
81
+ """ Most Commmon Answer, as the top k bigrams across all outputs """
82
+
83
+ cs = [rs._create_ngrams(x[m], n=2) for m in MODELS]
84
+ c = sum(cs, Counter())
85
+ return Counter(dict(c.most_common(SIZE)))
86
+
87
+ def __score_mca(x):
88
+ """ Rouge score computed relative to most-common-answer """
89
+
90
+ res = {}
91
+ for m in MODELS:
92
+ p_n = rs._create_ngrams(x[m], n=2)
93
+ res[m] = rs._score_ngrams(x.mca, p_n).fmeasure
94
+ return pd.Series(res)
95
+
96
+ df['mca'] = df.apply(__mca, axis=1)
97
+
98
+ # Winning model based on best ROUGE score for each question
99
+ win_rates = df.apply(__score_mca, axis=1).idxmax(axis=1).value_counts()
100
+ win_rate_rank = win_rates.index.tolist()
101
+
102
+ # include models with no wins at the bottom
103
+ no_wins = list(set(MODELS) - set(win_rate_rank))
104
+
105
+ self.ranking = win_rate_rank + no_wins
106
+
107
+
108
+ else:
109
+ raise ValueError(f"Measure {measure} not understood.")
110
+
111
+
112
+ logger.info(f"Estimated ranks (best to worst): {self.ranking}")
113
+ logger.info(f"True ranking: {self.true_ranking}")
114
+ logger.info(f"RBO measure: {self.measure()}")
115
+ return self.ranking # Best to worst
116
+
117
+
118
+ def measure(self, metric='rbo', k=5, p=0.95) -> float:
119
+ """
120
+ Report metric related to self-rank
121
+ """
122
+ if metric not in ['rbo', 'mapk']:
123
+ raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")
124
+
125
+ if hasattr(self, 'ranking'):
126
+ if self.true_ranking is not None:
127
+ if metric == 'mapk':
128
+ if k > len(self.true_ranking):
129
+ logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
130
+ actual = [self.true_ranking[:k]]
131
+ pred = [self.ranking[:k]]
132
+ return mapk(actual, pred, k=k)
133
+ elif metric == 'rbo':
134
+ return rank_biased_overlap(self.true_ranking, self.ranking, p=p)
135
+ else:
136
+ raise ValueError(f"Metric {metric} not understood.")
137
+ else:
138
+ raise ValueError("True ranking not available for metric calculation.")
139
+ else:
140
+ raise ValueError("Ranking not estimated. Run 'fit' first.")
141
+
142
+
143
+ def plot(self, caselabel="output"):
144
+ if hasattr(self, 'ranking') & (self.true_ranking is not None):
145
+ plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)
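
A toy illustration of `MCARank`'s 'equality' mode (not from the commit; the data and model names are made up). Each row is a question and each column a model's answer; models that match the per-row most common answer most often are ranked first.

```python
import pandas as pd
from selfrank.algos.baseline import MCARank

answers = pd.DataFrame({
    "m1": ["A", "B", "C", "D"],
    "m2": ["A", "B", "C", "A"],
    "m3": ["A", "B", "D", "B"],
})

# A true_ranking is passed because fit() logs an RBO score at the end.
ranker = MCARank(["m1", "m2", "m3"], evaluator=None,
                 true_ranking=["m1", "m2", "m3"])
print(ranker.fit(answers, measure="equality"))   # -> ['m2', 'm1', 'm3']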
selfrank/algos/greedy.py ADDED
@@ -0,0 +1,568 @@
1
+ """
2
+ Script for the greedy ranking scheme.
3
+
4
+ Assumptions:
5
+ - complete pairwise comparisons available, i.e. evaluations are cheap
6
+ -
7
+ """
8
+
9
+ import pandas as pd
10
+ import numpy as np
11
+ from tqdm import tqdm
12
+ from selfrank.algos.metrics import mapk, rank_biased_overlap
13
+ from selfrank.algos.plots import plot_ranks
14
+ import logging
15
+ from typing import List, Callable, Optional
16
+ import random
17
+
18
+ logger = logging.getLogger(__name__)
19
+ tol = 0.001
20
+
21
+
22
+ class LLM_Model:
23
+ def __init__(self, model_name, all_model_data):
24
+ self.model_name = model_name
25
+
26
+ def name(self):
27
+ return self.model_name
28
+
29
+ def __eq__(self, other):
30
+ return self.name() == other.name()
31
+
32
+ def __lt__(self, other):
33
+ return self.name() < other.name()
34
+
35
+
36
+
37
+ class SelfRankGreedy:
38
+
39
+ def __init__(self, MODELS: List, evaluator: Callable, true_ranking: Optional[List]=None, show_progress: Optional[bool]=False):
40
+ self.MODELS = MODELS
41
+ self.N = len(MODELS)
42
+ self.evaluate = evaluator
43
+ self.true_ranking = true_ranking
44
+ self.show_progress = show_progress
45
+ self.df = None
46
+ self.DEBUG = False
47
+ self.model_eval = None
48
+ self.cnt=0
49
+
50
+ def getEvaluation(self, a, b , c, df, eval_arr, modelsList):
51
+ '''
52
+ model c in is evaluating a and b
53
+ It check in eval_arr is already evaluated; if not, evaluates and stores
54
+ '''
55
+ idx_a = modelsList.index(a)
56
+ idx_b = modelsList.index(b)
57
+ idx_c = modelsList.index(c)
58
+ val = eval_arr[idx_c, idx_a, idx_b] # stores c evaluating a to b
59
+ if val > -1:
60
+ return val
61
+ else:
62
+ val = self.evaluate(a, b, c, df)
63
+ eval_arr[idx_c, idx_a, idx_b] = val
64
+ eval_arr[idx_c, idx_b, idx_a] = 1 - val
65
+ return val
66
+
67
+ def __evaluateModelTriplet(self, df, triplet, eval_arr, modelsList):
68
+ model1 = triplet[0]
69
+ model2 = triplet[1]
70
+ model3 = triplet[2]
71
+ res = np.array([0, 0, 0])
72
+ m1_cmp_2_3 = self.getEvaluation(a=model2.name(), b=model3.name(), c=model1.name(), df=df, eval_arr=eval_arr, modelsList=modelsList) #model1.compareModels(model2, model3)
73
+ m2_cmp_1_3 = self.getEvaluation(a=model1.name(), b=model3.name(), c=model2.name(), df=df, eval_arr=eval_arr, modelsList=modelsList) #model2.compareModels(model1, model3)
74
+ m3_cmp_1_2 = self.getEvaluation(a=model1.name(), b=model2.name(), c=model3.name(), df=df, eval_arr=eval_arr, modelsList=modelsList) #model3.compareModels(model1, model2)
75
+ if m1_cmp_2_3 >= 0.5:
76
+ res[1]+=1
77
+ else:
78
+ res[2]+=1
79
+
80
+ if m2_cmp_1_3 >= 0.5:
81
+ res[0]+=1
82
+ else:
83
+ res[2]+=1
84
+
85
+ if m3_cmp_1_2 >= 0.5:
86
+ res[0]+=1
87
+ else:
88
+ res[1]+=1
89
+
90
+ #print(res)
91
+ #print(res.tolist())
92
+ zipped_pairs = zip(res.tolist(), triplet)
93
+ z = [(x,y, x.name()) for y, x in sorted(zipped_pairs, reverse=True)]
94
+ return z
95
+
96
+ def __printNames(self, ll):
97
+ print([i.name() for i in ll])
98
+
99
+ def __evaluateModels(self, df, evaluators, modelsToBeEvaluated, eval_arr, modelsList):
100
+ # rewrittten method to allow usage with updated code
101
+ # modelsToBeEvaluated can have 2 or 3 models only. evaluators will have only 1 model. Use evaluators to rank and return list of models in modelsToBeEvaluated
102
+ if len(evaluators) > 1:
103
+ raise Exception
104
+ if len(modelsToBeEvaluated) > 3 or len(modelsToBeEvaluated) < 2:
105
+ raise Exception
106
+ if len(modelsToBeEvaluated) == 2:
107
+ r = self.getEvaluation(a=modelsToBeEvaluated[0].name(), b=modelsToBeEvaluated[1].name(), c=evaluators[0].name(), df=df, eval_arr=eval_arr, modelsList=modelsList)
108
+ if r >= 0.5:
109
+ return [modelsToBeEvaluated[0],modelsToBeEvaluated[1]]
110
+ else:
111
+ return [modelsToBeEvaluated[1],modelsToBeEvaluated[0]]
112
+ if len(modelsToBeEvaluated) == 3:
113
+ r01 = self.getEvaluation(a=modelsToBeEvaluated[0].name(), b=modelsToBeEvaluated[1].name(), c=evaluators[0].name(), df=df, eval_arr=eval_arr, modelsList=modelsList)
114
+ r12 = self.getEvaluation(a=modelsToBeEvaluated[1].name(), b=modelsToBeEvaluated[2].name(), c=evaluators[0].name(), df=df, eval_arr=eval_arr, modelsList=modelsList)
115
+ r02 = self.getEvaluation(a=modelsToBeEvaluated[0].name(), b=modelsToBeEvaluated[2].name(), c=evaluators[0].name(), df=df, eval_arr=eval_arr, modelsList=modelsList)
116
+ res = np.array([0, 0, 0])
117
+ if r01 >= 0.5:
118
+ res[0]+=1
119
+ else:
120
+ res[1]+=1
121
+
122
+ if r12 >= 0.5:
123
+ res[1]+=1
124
+ else:
125
+ res[2]+=1
126
+
127
+ if r02 >= 0.5:
128
+ res[0]+=1
129
+ else:
130
+ res[2]+=1
131
+
132
+ zipped_pairs = zip(res.tolist(), modelsToBeEvaluated)
133
+ z = [x for y, x in sorted(zipped_pairs, reverse=True)]
134
+ return z
135
+
136
+
137
+ def __rankModels(self, df, eval_arr, modelsList, triplet, prev_model_ranking, unrankedModelList, rankedModelList, bottomModelList):
138
+
139
+ if len(triplet) < 3:
140
+ return [], list(triplet), []
141
+ self.cnt = self.cnt + 1
142
+ model_ranking = self.__evaluateModelTriplet(df, triplet, eval_arr, modelsList)
143
+ if self.DEBUG:
144
+ print("Cnt: ", self.cnt)
145
+ print("\n\n\nFIRST")
146
+ self.__printNames(triplet)
147
+ self.__printNames(unrankedModelList)
148
+ self.__printNames(rankedModelList)
149
+ self.__printNames(bottomModelList)
150
+ print(model_ranking)
151
+ print(prev_model_ranking)
152
+ print("END FIRST")
153
+
154
+ first_rank = model_ranking[0][1]
155
+ second_rank = model_ranking[1][1]
156
+ third_rank = model_ranking[2][1]
157
+ if first_rank == 2: # first model is better than the other two
158
+
159
+ if len(unrankedModelList) == 0 and len(bottomModelList) == 0: # CASE 1
160
+ # no more unranked models left to consider and none in bottomModels,
161
+ # so add the models in rank order to rankedModelList
162
+ if second_rank == 1 and third_rank == 0:
163
+ if self.DEBUG:
164
+ print('CASE 1a')
165
+ rankedModelList.extend([model_ranking[0][0], model_ranking[1][0], model_ranking[2][0]])
166
+ elif second_rank == 0 and third_rank == 0:
167
+ if self.DEBUG:
168
+ print('CASE 1b')
169
+ rankedModelList.append(model_ranking[0][0])
170
+ #use current best model to rank the bottom 2 and add to rankedList in order
171
+ z = self.__evaluateModels(df, [rankedModelList[0]],[model_ranking[1][0], model_ranking[2][0]], eval_arr, modelsList)
172
+ rankedModelList.extend(z)
173
+ else:
174
+ raise Exception("Error: Should not have occurred CASE 1")
175
+ if self.DEBUG:
176
+ self.__printNames(rankedModelList)
177
+ return [], rankedModelList, []
178
+
179
+ if len(unrankedModelList) == 0 and len(bottomModelList) == 1: # CASE 2
180
+ # no more unranked models left to consider and only 1 bottomModels in all,
181
+ if second_rank == 1 and third_rank == 0:
182
+ # so add the models in rank order to rankedModelList
183
+ if self.DEBUG:
184
+ print('CASE 2a')
185
+ rankedModelList.extend([model_ranking[0][0], model_ranking[1][0]])
186
+ #TODO Use top model in rankedModelList to rank the two models below and then add them according to ranking
187
+ z = self.__evaluateModels(df, [rankedModelList[0]],[model_ranking[2][0], bottomModelList[0]], eval_arr, modelsList)
188
+ rankedModelList.extend(z)
189
+ if self.DEBUG:
190
+ self.__printNames(rankedModelList)
191
+ return [], rankedModelList, []
192
+ elif second_rank == 0 and third_rank == 0:
193
+ if self.DEBUG:
194
+ print('CASE 2b')
195
+
196
+ rankedModelList.append(model_ranking[0][0])
197
+ modelsToCompare = [model_ranking[1][0], model_ranking[2][0], bottomModelList[0]]
198
+
199
+ if self.DEBUG:
200
+ self.__printNames(tuple(modelsToCompare))
201
+ self.__printNames(rankedModelList)
202
+ return self.__rankModels(df, eval_arr, modelsList, tuple(modelsToCompare), model_ranking, [], rankedModelList, [])
203
+ else:
204
+ raise Exception("Error: Should not have occurred CASE 2")
205
+
206
+
207
+ if len(unrankedModelList) == 0 and len(bottomModelList) > 1: # CASE 3
208
+ # no more unranked models left to consider but there are at least 2 models in bottomModelList
209
+ if second_rank == 1 and third_rank == 0:
210
+ if self.DEBUG:
211
+ print('CASE 3a')
212
+ rankedModelList.extend([model_ranking[0][0], model_ranking[1][0]]) # add top two models to ranked list
213
+ bottomModelList.append(model_ranking[2][0]) # add worst model to bottomModelList
214
+ elif second_rank == 0 and third_rank == 0:
215
+ if self.DEBUG:
216
+ print('CASE 3b')
217
+ rankedModelList.append(model_ranking[0][0]) # add top model to ranked list
218
+ bottomModelList.extend([model_ranking[1][0], model_ranking[2][0]]) # add bottom two model to bottomModelList
219
+ else:
220
+ raise Exception("Error: Should not have occurred CASE 3")
221
+
222
+ modelsToCompare = random.sample(bottomModelList, 3)
223
+ bottomModelList = [i for i in bottomModelList if i not in modelsToCompare]
224
+ if self.DEBUG:
225
+ self.__printNames(tuple(modelsToCompare))
226
+ self.__printNames(bottomModelList)
227
+ self.__printNames(rankedModelList)
228
+ print([])
229
+ return self.__rankModels(df, eval_arr, modelsList, tuple(modelsToCompare), model_ranking, bottomModelList, rankedModelList, [])
230
+
231
+ # CASE 4 len(unrankedModelList) > 0
232
+
233
+ #check the previous model ranking and model ranking. if either first or second ranked model previously is now the bottom ranked model,
234
+ # move all bottom to unranked and call with new triple
235
+ #if (prev_model_ranking is not None) and ((prev_model_ranking[0][0] == model_ranking[2][0]) or (prev_model_ranking[1][0] == model_ranking[2][0])):
236
+ # unrankedModelList.extend(bottomModelList)
237
+ # if self.DEBUG:
238
+ # print('Case 4a NEW ONE')
239
+ # self.__printNames(triplet)
240
+ # self.__printNames(unrankedModelList)
241
+ # self.__printNames(rankedModelList)
242
+ # self.__printNames([])
243
+ # return self.__rankModels(df, (triplet, None, unrankedModelList, rankedModelList, [])
244
+ if second_rank == 1 and third_rank == 0:
245
+ if self.DEBUG:
246
+ print('CASE 4a')
247
+ bottomModelList.append(model_ranking[2][0]) # add worst model to bottomModelList
248
+
249
+ newModel = random.sample(unrankedModelList, 1)
250
+ unrankedModelList.remove(newModel[0])
251
+ triplet = (model_ranking[0][0], model_ranking[1][0], newModel[0])
252
+
253
+ if self.DEBUG:
254
+ self.__printNames(triplet)
255
+ self.__printNames(unrankedModelList)
256
+ self.__printNames(rankedModelList)
257
+ self.__printNames(bottomModelList)
258
+ return self.__rankModels(df, eval_arr, modelsList, triplet, model_ranking, unrankedModelList, rankedModelList, bottomModelList)
259
+ elif second_rank == 0 and third_rank == 0:
260
+
261
+ # if unrankedModelList has 2 or more elements, put both 2nd and 3rd model into bottom; if unrankedModelList has only one,
262
+ # then randomly choose one of the two and put in bottom
263
+ if len(unrankedModelList) > 1:
264
+ if self.DEBUG:
265
+ print('CASE 4b')
266
+ bottomModelList.append(model_ranking[2][0])
267
+ bottomModelList.append(model_ranking[1][0])
268
+ newModels = random.sample(unrankedModelList, 2)
269
+ triplet = (model_ranking[0][0],) + tuple(newModels)
270
+ unrankedModelList.remove(newModels[0])
271
+ unrankedModelList.remove(newModels[1])
272
+
273
+ if self.DEBUG:
274
+ self.__printNames(triplet)
275
+ self.__printNames(unrankedModelList)
276
+ self.__printNames(rankedModelList)
277
+ self.__printNames(bottomModelList)
278
+ return self.__rankModels(df, eval_arr, modelsList, triplet, model_ranking, unrankedModelList, rankedModelList, bottomModelList)
279
+ else:
280
+ if self.DEBUG:
281
+ print('CASE 4c')
282
+ #200, UR==1
283
+ #add third model to bottom. replace in tuple with one from unranked. and rank
284
+ #newModel = random.sample(unrankedModelList, 1)
285
+ #unrankedModelList.remove(newModel[0])
286
+ #bottomModelList.append(model_ranking[2][0]) # add third model to bottomModelList
287
+ #triplet = (model_ranking[0][0], model_ranking[1][0], newModel[0])
288
+
289
+ #add both 0s to bottom. Create tuple with 2, the one from UR and 1 from B. Call self.__rankModels(df, (triple,B,R,[])
290
+ newModel = random.sample(unrankedModelList, 1)
291
+ unrankedModelList.remove(newModel[0])
292
+ bottomModelList.append(model_ranking[2][0]) # add third model to bottomModelList
293
+ bottomModelList.append(model_ranking[1][0]) # add second model to bottomModelList
294
+ newBottomModel = random.sample(bottomModelList, 1)
295
+ bottomModelList.remove(newBottomModel[0])
296
+ triplet = (model_ranking[0][0], newModel[0], newBottomModel[0])
297
+ if self.DEBUG:
298
+ self.__printNames(triplet)
299
+ self.__printNames(unrankedModelList)
300
+ self.__printNames(rankedModelList)
301
+ self.__printNames(bottomModelList)
302
+ return self.__rankModels(df, eval_arr, modelsList, triplet, model_ranking, bottomModelList, rankedModelList, [])
303
+ else:
304
+ raise Exception("Error: Should not have occurred CASE 4")
305
+
306
+ else:
307
+ # some problem with ranking all three models
308
+ if len(unrankedModelList) == 0 and len(bottomModelList) == 0: # CASE 1
309
+ #use top model from rankedlist to rank the three and append to ranked list in order
310
+ if self.DEBUG:
311
+ print('CASE ELSE_1')
312
+ z = self.__evaluateModels(df, [rankedModelList[0]], list(triplet), eval_arr, modelsList)
313
+ if self.DEBUG:
314
+ self.__printNames(z)
315
+ rankedModelList.extend(z)
316
+ if self.DEBUG:
317
+ self.__printNames(rankedModelList)
318
+ return [], rankedModelList, []
319
+
320
+ if len(unrankedModelList) == 0 and len(bottomModelList) == 1: # CASE 2
321
+ if self.DEBUG:
322
+ print('CASE ELSE_2')
323
+
324
+ #ALTERNATIVE
325
+ ##use top model from rankedlist to rank the three and append to ranked list in order; THEN, add the sole model from bottom list
326
+ if len(rankedModelList) > 0:
327
+ z = self.__evaluateModels(df, [rankedModelList[0]], list(triplet), eval_arr, modelsList)
328
+ else:
329
+ z = list(triplet)
330
+ if self.DEBUG:
331
+ self.__printNames(z)
332
+ rankedModelList.extend(z)
333
+ rankedModelList.append(bottomModelList[0])
334
+ if self.DEBUG:
335
+ self.__printNames(rankedModelList)
336
+ return [], rankedModelList, []
337
+
338
+
339
+ if len(unrankedModelList) == 0 and len(bottomModelList) > 1: # CASE 3
340
+ # ranks are 1xx or 000
341
+ if self.DEBUG:
342
+ print('CASE ELSE_3')
343
+
344
+ ##use top model from rankedlist to rank the three and add top 2 to ranked list in order;
345
+ if len(rankedModelList) > 0:
346
+ z = self.__evaluateModels(df, [rankedModelList[0]], list(triplet), eval_arr, modelsList)
347
+ else:
348
+ z = list(triplet)
349
+ if self.DEBUG:
350
+ self.__printNames(z)
351
+ rankedModelList.append(z[0])
352
+ rankedModelList.append(z[1])
353
+
354
+ bottomModelList.append(z[2])
355
+ #Sample 3 from bottom to create triple. call self.__rankModels(df, (tripler, B, R, [])
356
+ newModels = random.sample(bottomModelList, 3)
357
+ for mod in newModels:
358
+ bottomModelList.remove(mod)
359
+ if self.DEBUG:
360
+ self.__printNames(tuple(newModels))
361
+ self.__printNames(unrankedModelList)
362
+ self.__printNames(rankedModelList)
363
+ self.__printNames(bottomModelList)
364
+ return self.__rankModels(df, eval_arr, modelsList, tuple(newModels), model_ranking, bottomModelList, rankedModelList, [])
365
+
366
+
367
+ # CASE 4 len(unrankedModelList) > 0
368
+
369
+ # if the three models are 1,1,1 or 0,0,0 i.e. indistinguishable
370
+
371
+ #check the previous model ranking and model ranking. if either first or second ranked model previously is now the bottom ranked model,
372
+ # move all bottom to unranked and call with new triple
373
+ #if (prev_model_ranking is not None) and ((prev_model_ranking[0][0] == model_ranking[2][0]) or (prev_model_ranking[1][0] == model_ranking[2][0])):
374
+ # unrankedModelList.extend(bottomModelList)
375
+ # if self.DEBUG:
376
+ # print('Case ELSE_4 NEW ONE')
377
+ # self.__printNames(triplet)
378
+ # self.__printNames(unrankedModelList)
379
+ # self.__printNames(rankedModelList)
380
+ # self.__printNames([])
381
+ # return self.__rankModels(df, (triplet, None, unrankedModelList, rankedModelList, [])
382
+
383
+ # choose one of the tuple models and add to unrankedlIst. Remove random model from unrankedList and add to tuple. rank again
384
+ if first_rank == second_rank and first_rank == third_rank:
385
+ if self.DEBUG:
386
+ print('CASE ELSE_4a')
387
+ ##use top model from rankedlist to rank the three and add third one to Bottomlist ;
388
+ ##then create tuple with top 2 and one from unranked
389
+
390
+ if len(rankedModelList) > 0:
391
+ z = self.__evaluateModels(df, [rankedModelList[0]], list(triplet), eval_arr, modelsList)
392
+ else:
393
+ z = list(triplet)
394
+ if self.DEBUG:
395
+ print('z: ', z)
396
+ self.__printNames(z)
397
+
398
+ bottomModelList.append(z[2])
399
+ newModel = random.sample(unrankedModelList, 1)
400
+ unrankedModelList.remove(newModel[0])
401
+ triplet = (z[0], z[1], newModel[0])
402
+ if self.DEBUG:
403
+ print(1)
404
+ print('triplet:', triplet)
405
+ self.__printNames(triplet)
406
+ print(2)
407
+ self.__printNames(unrankedModelList)
408
+ print(3)
409
+ self.__printNames(rankedModelList)
410
+ print(4)
411
+ self.__printNames(bottomModelList)
412
+ print(5)
413
+
414
+ else: # there are one or two models with 0
415
+ # if only 1, add to bottom and replace with one from unranked
416
+ # if two are 0, then both replace with unranked if unranked has more than 1
417
+ # otherwise randomly add one of the 0s to bottom and replace with unranked.
418
+ if second_rank == 1: # then only third is 0
419
+ if self.DEBUG:
420
+ print('CASE ELSE_4b')
421
+ newModel = random.sample(unrankedModelList, 1)
422
+ unrankedModelList.remove(newModel[0])
423
+
424
+ bottomModelList.append(model_ranking[2][0])
425
+ triplet = (model_ranking[0][0], model_ranking[1][0], newModel[0])
426
+ else: # both second and third are zero
427
+ if len(unrankedModelList) > 1:
428
+ if self.DEBUG:
429
+ print('CASE ELSE_4c')
430
+ bottomModelList.append(model_ranking[2][0])
431
+ bottomModelList.append(model_ranking[1][0])
432
+ newModels = random.sample(unrankedModelList, 2)
433
+ triplet = (model_ranking[0][0],) + tuple(newModels)
434
+ unrankedModelList.remove(newModels[0])
435
+ unrankedModelList.remove(newModels[1])
436
+ else:
437
+ if self.DEBUG:
438
+ print('CASE ELSE_4d')
439
+ #add third model to bottom. replace in tuple with one from unranked. and rank
440
+ #newModel = random.sample(unrankedModelList, 1)
441
+ #unrankedModelList.remove(newModel[0])
442
+ #bottomModelList.append(model_ranking[2][0]) # add third model to bottomModelList
443
+ #triplet = (model_ranking[0][0], model_ranking[1][0], newModel[0])
444
+
445
+ # UR==1, 100
446
+ #Add both 0s to Bottom. Create tuple from the 1, one from UR, and one from Bottom
447
+ #Call self.__rankModels(df, (triple, B, R, [])
448
+ bottomModelList.append(model_ranking[2][0])
449
+ bottomModelList.append(model_ranking[1][0])
450
+ newModels = random.sample(unrankedModelList, 1)
451
+ unrankedModelList.remove(newModels[0])
452
+ newBottomModels = random.sample(bottomModelList, 1)
453
+ bottomModelList.remove(newBottomModels[0])
454
+ triplet = (model_ranking[0][0], newModels[0], newBottomModels[0])
455
+ if self.DEBUG:
456
+ self.__printNames(triplet)
457
+ self.__printNames(unrankedModelList)
458
+ self.__printNames(rankedModelList)
459
+ self.__printNames(bottomModelList)
460
+ return self.__rankModels(df, eval_arr, modelsList, triplet, model_ranking, bottomModelList, rankedModelList, [])
461
+
462
+
463
+ if self.DEBUG:
464
+ self.__printNames(triplet)
465
+ self.__printNames(unrankedModelList)
466
+ self.__printNames(rankedModelList)
467
+ self.__printNames(bottomModelList)
468
+ return self.__rankModels(df, eval_arr, modelsList, triplet, model_ranking, unrankedModelList, rankedModelList, bottomModelList)
469
+
470
+
471
+ def __printRanks(self, ll):
472
+ print([{i.name(): r} for r,i in enumerate(ll)])
473
+
474
+ def __estimate_rankings(self, df, numIter=1, modelSubset=None, numModels=None):
475
+ rankedLists = []
476
+ if modelSubset is not None:
477
+ model_list = modelSubset
478
+ elif numModels is not None:
479
+ model_list = self.MODELS.copy() #df.columns.tolist() #list(df['model'].unique())
480
+ model_list = random.sample(model_list, numModels)
481
+ else:
482
+ model_list = self.MODELS.copy() #df.columns.tolist() #list(df['model'].unique())
483
+
484
+ nModels = len(model_list)
485
+ self.model_eval = np.full((nModels, nModels, nModels), -1)
486
+
487
+ for it in tqdm(range(numIter)):
488
+ shuffled_list = model_list.copy()
489
+ random.shuffle(shuffled_list)
490
+
491
+ t = random.sample(shuffled_list, 3)
492
+ u = [i for i in shuffled_list if i not in t]
493
+
494
+ t = [LLM_Model(i, df) for i in t]
495
+ u = [LLM_Model(i, df) for i in u]
496
+
497
+ _,rankedList,_ = self.__rankModels(df, self.model_eval, model_list, tuple(t), None, u, [], [])
498
+ rankedLists.append(rankedList)
499
+
500
+ estimated_ranking_lists = []
501
+ ranks = []
502
+ for rl in rankedLists:
503
+ estimated_ranking = {i.name(): r+1 for r,i in enumerate(rl)}
504
+ rank = [estimated_ranking[name] for name in model_list] #sorted(model_list)]
505
+ estimated_ranking_lists.append(estimated_ranking)
506
+ ranks.append(rank)
507
+
508
+ average_estimated_scores = sorted(zip(np.mean(np.array(ranks), axis=0), model_list))
509
+ average_estimated_ranking = [mod for rnk, mod in average_estimated_scores]
510
+ #average_scores = [rnk for rnk, mod in zipped]
511
+
512
+ return model_list, estimated_ranking_lists, average_estimated_ranking, average_estimated_scores
513
+
514
+
515
+ def fit(self, df: pd.DataFrame):
516
+ """
517
+ df: Dataframe where each row is a benchmark instance,
518
+ and there is a column with the output for each Model
519
+ """
520
+ assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."
521
+
522
+ #process the dataset
523
+ self.df = df #self.__process_dataset(df)
524
+ # Build a pairwise preference matrix
525
+ #if self.show_progress:
526
+ # pbar = tqdm(total=self.N**3, position=0, leave=False, desc="Evaluations")
527
+
528
+ #if self.show_progress: pbar.update(1)
529
+
530
+ # Estimate the ranks
531
+ _, _, average_estimated_ranking, _ = self.__estimate_rankings(self.df, numIter=1)
532
+ #logging.info(f"Iteration {iter}:{delta}")
533
+
534
+
535
+ self.ranking = average_estimated_ranking
536
+
537
+ logger.info(f"Estimated 'greedy' ranks (best to worst): {self.ranking}")
538
+
539
+ return self.ranking # Best to worst
540
+
541
+ def measure(self, metric='rbo', k=5, p=0.95) -> float:
542
+ """
543
+ Report metrics related to self-rank
544
+ """
545
+ if metric not in ['rbo', 'mapk']:
546
+ raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")
547
+
548
+ if hasattr(self, 'ranking'):
549
+ if self.true_ranking is not None:
550
+ if metric == 'mapk':
551
+ if k > len(self.true_ranking):
552
+ logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
553
+ actual = [self.true_ranking[:k]]
554
+ pred = [self.ranking[:k]]
555
+ return mapk(actual, pred, k=k)
556
+ elif metric == 'rbo':
557
+ return rank_biased_overlap(self.true_ranking, self.ranking, p=p)
558
+ else:
559
+ raise ValueError(f"Metric {metric} not understood.")
560
+ else:
561
+ raise ValueError("True ranking not available for metric calculation.")
562
+ else:
563
+ raise ValueError("Ranking not estimated. Run 'fit' first.")
564
+
565
+
566
+ def plot(self, caselabel="output"):
567
+ if hasattr(self, 'ranking') & (self.true_ranking is not None):
568
+ return plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)
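
The greedy ranker only assumes an evaluator with the call signature `evaluate(a, b, c, df)` used in `getEvaluation` above: judge model `c` compares the columns of models `a` and `b` over the rows of `df` and returns a score in [0, 1], with values at or above 0.5 read as "`a` is at least as good as `b`". Below is a hedged sketch of such a judge for exact-match tasks; it is my own example, not code from this commit.

```python
import pandas as pd

def exact_match_judge(a: str, b: str, c: str, df: pd.DataFrame) -> float:
    """Judge c prefers whichever of a, b agrees with it on more rows of df."""
    a_agree = (df[a] == df[c]).sum()
    b_agree = (df[b] == df[c]).sum()
    if a_agree == b_agree:
        return 0.5
    return float(a_agree > b_agree)
```

`SelfRankGreedy(models, exact_match_judge, true_ranking).fit(df)` would then query only the triplets the recursion needs, rather than all N^3 combinations used by the full variant.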
selfrank/algos/iterative.py ADDED
@@ -0,0 +1,137 @@
1
+ """
2
+ Script for an iterative scheme.
3
+
4
+ Assumptions:
5
+ - complete pairwise comparisons available, i.e. evaluations are cheap
6
+ -
7
+ """
8
+
9
+ import pandas as pd
10
+ import numpy as np
11
+ from tqdm import tqdm
12
+ from .metrics import mapk, rank_biased_overlap
13
+ from .plots import plot_ranks
14
+ import logging
15
+ from typing import List, Callable, Optional
16
+
17
+ logger = logging.getLogger(__name__)
18
+ tol = 0.001
19
+
20
+ class SelfRank:
21
+
22
+ def __init__(self, MODELS: List, evaluator: Callable, true_ranking: Optional[List]=None, show_progress: Optional[bool]=False):
23
+ self.MODELS = MODELS
24
+ self.N = len(MODELS)
25
+ self.evaluate = evaluator
26
+ self.true_ranking = true_ranking
27
+ self.show_progress = show_progress
28
+
29
+
30
+ def fit(self, df: pd.DataFrame):
31
+ """
32
+ df: Dataframe where each row is a benchmark instance,
33
+ and there is a column with the output for each Model
34
+ """
35
+ assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."
36
+
37
+ # Build a pairwise preference matrix
38
+ if self.show_progress:
39
+ pbar = tqdm(total=self.N**3, position=0, leave=False, desc="Evaluations")
40
+
41
+ y = np.empty((self.N, self.N, self.N))
42
+
43
+ for i, a in enumerate(self.MODELS):
44
+ for j, b in enumerate(self.MODELS):
45
+ for k, c in enumerate(self.MODELS): # Judge
46
+
47
+ # Some checks to limit evaluations
48
+ if a == b:
49
+ y[i, j, k] = 0.5
50
+ y[j, i, k] = 0.5
51
+
52
+ elif a == c:
53
+ y[i, j, k] = 1
54
+ y[j, i, k] = 0
55
+
56
+ elif b == c:
57
+ y[i, j, k] = 0
58
+ y[j, i, k] = 1
59
+
60
+ elif j > i:
61
+ y[i, j, k] = self.evaluate(a=a, b=b, c=c, df=df)
62
+ y[j, i, k] = 1 - y[i, j, k] # complement in the other direction
63
+
64
+ if self.show_progress: pbar.update(1)
65
+
66
+ # Estimate the ranks
67
+ r = np.ones((self.N, ))
68
+ iter = 0
69
+ while True:
70
+
71
+ # weighted mean over k
72
+ m = np.einsum('ijk,i->ij', y, r) / self.N
73
+
74
+ # Aggregate preferences using majority voting
75
+ y_p = np.zeros_like(m)
76
+
77
+ for i in np.arange(self.N):
78
+ for j in np.arange(self.N):
79
+ if j > i:
80
+ if m[i, j] >= m[j, i]:
81
+ y_p[i,j] = 1.
82
+ y_p[j,i] = 0.
83
+ else:
84
+ y_p[i,j] = 0.
85
+ y_p[j,i] = 1.
86
+
87
+ # update reputation score by wins
88
+ r_k = y_p.sum(axis=1)/max(y_p.sum(axis=1))
89
+
90
+ # termination if reputation score converges
91
+ delta = np.sum(np.abs(r - r_k))
92
+ logging.info(f"Iteration {iter}:{delta}")
93
+ logging.info(f"Reputation score: {r}")
94
+ if delta<= tol:
95
+ break
96
+ else:
97
+ iter += 1
98
+ r = r_k
99
+
100
+ # Get ranked list from the reputation score
101
+ idx = np.argsort(r_k)[::-1]
102
+ self.ranking = np.array(self.MODELS)[idx].tolist()
103
+
104
+ logger.info(f"Estimated ranks (best to worst): {self.ranking}")
105
+ if self.true_ranking is not None:
106
+ logger.info(f"True ranking: {self.true_ranking}")
107
+ logger.info(f"RBO measure: {self.measure()}")
108
+ return self.ranking # Best to worst
109
+
110
+ def measure(self, metric='rbo', k=5, p=0.95) -> float:
111
+ """
112
+ Report metric related to self-rank
113
+ """
114
+ if metric not in ['rbo', 'mapk']:
115
+ raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")
116
+
117
+ if hasattr(self, 'ranking'):
118
+ if self.true_ranking is not None:
119
+ if metric == 'mapk':
120
+ if k > len(self.true_ranking):
121
+ logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
122
+ actual = [self.true_ranking[:k]]
123
+ pred = [self.ranking[:k]]
124
+ return mapk(actual, pred, k=k)
125
+ elif metric == 'rbo':
126
+ return rank_biased_overlap(self.true_ranking, self.ranking, p=p)
127
+ else:
128
+ raise ValueError(f"Metric {metric} not understood.")
129
+ else:
130
+ raise ValueError("True ranking not available for metric calculation.")
131
+ else:
132
+ raise ValueError("Ranking not estimated. Run 'fit' first.")
133
+
134
+
135
+ def plot(self, caselabel="output"):
136
+ if hasattr(self, 'ranking') & (self.true_ranking is not None):
137
+ return plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)
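
A sketch of the synthetic multiple-choice experiment the UI stubs out in `synth_panel`, using `SelfRank` with the `equality` triplet function. Note `equality`'s internals are not part of this commit, only its call signature, so this is an assumption; the skill levels and sizes are arbitrary.

```python
import numpy as np
import pandas as pd
from selfrank.algos.iterative import SelfRank
from selfrank.algos.triplet import equality

rng = np.random.default_rng(0)
models = [f"m{i}" for i in range(5)]
truth = rng.integers(0, 4, size=200)                       # hidden correct options
skill = {m: 0.9 - 0.15 * i for i, m in enumerate(models)}  # m0 best, m4 worst

# Each model answers correctly with probability equal to its skill.
df = pd.DataFrame({
    m: np.where(rng.random(200) < skill[m], truth, rng.integers(0, 4, size=200))
    for m in models
})

ranker = SelfRank(models, equality, true_ranking=models)   # best-to-worst by construction
ranker.fit(df)
print(ranker.ranking, ranker.measure(metric="rbo"))
```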
selfrank/algos/metrics.py ADDED
@@ -0,0 +1,107 @@
1
+ import numpy as np
2
+
3
+ def apk(actual, predicted, k=10):
4
+ """
5
+ Computes the average precision at k.
6
+ This function computes the average precision at k between two lists of
7
+ items.
8
+ Parameters
9
+ ----------
10
+ actual : list
11
+ A list of elements that are to be predicted (order doesn't matter)
12
+ predicted : list
13
+ A list of predicted elements (order does matter)
14
+ k : int, optional
15
+ The maximum number of predicted elements
16
+ Returns
17
+ -------
18
+ score : double
19
+ The average precision at k over the input lists
20
+ """
21
+ if not actual:
22
+ return 0.0
23
+
24
+ if len(predicted)>k:
25
+ predicted = predicted[:k]
26
+
27
+ score = 0.0
28
+ num_hits = 0.0
29
+
30
+ for i,p in enumerate(predicted):
31
+ # first condition checks whether it is valid prediction
32
+ # second condition checks if prediction is not repeated
33
+ if p in actual and p not in predicted[:i]:
34
+ num_hits += 1.0
35
+ score += num_hits / (i+1.0)
36
+
37
+ return score / min(len(actual), k)
38
+
39
+ def mapk(actual: list[list], predicted: list[list], k:int=10) -> float:
40
+ """
41
+ Computes the mean average precision at k.
42
+ This function computes the mean average precision at k between two lists
43
+ of lists of items.
44
+ Parameters
45
+ ----------
46
+ actual : list
47
+ A list of lists of elements that are to be predicted
48
+ (order doesn't matter in the lists)
49
+ predicted : list
50
+ A list of lists of predicted elements
51
+ (order matters in the lists)
52
+ k : int, optional
53
+ The maximum number of predicted elements
54
+ Returns
55
+ -------
56
+ score : double
57
+ The mean average precision at k over the input lists
58
+ """
59
+ return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)]).astype(float)
60
+
61
+
62
+
63
+ def rank_biased_overlap(l1,l2,p):
64
+ """
65
+ Returns RBO indefinite rank similarity metric, as described in:
66
+ Webber, W., Moffat, A., & Zobel, J. (2010).
67
+ A similarity measure for indefinite rankings.
68
+ ACM Transactions on Information Systems.
69
+ doi:10.1145/1852102.1852106.
70
+ """
71
+ sl,ll = sorted([(len(l1), l1),(len(l2),l2)])
72
+ s, S = sl
73
+ l, L = ll
74
+
75
+ # Calculate the overlaps at ranks 1 through l
76
+ # (the longer of the two lists)
77
+ ss = set([])
78
+ ls = set([])
79
+ overs = {}
80
+ for i in range(l):
81
+ ls.add(L[i])
82
+ if i<s:
83
+ ss.add(S[i])
84
+ X_d = len(ss.intersection(ls))
85
+ d = i+1
86
+ overs[d] = float(X_d)
87
+
88
+ # (1) \sum_{d=1}^l (X_d / d) * p^d
89
+ sum1 = 0
90
+ for i in range(l):
91
+ d=i+1
92
+ sum1+=overs[d]/d*pow(p,d)
93
+ X_s = overs[s]
94
+ X_l = overs[l]
95
+
96
+ # (2) \sum_{d=s+1}^l [(X_s (d - s)) / (sd)] * p^d
97
+ sum2 = 0
98
+ for i in range(s,l):
99
+ d=i+1
100
+ sum2+=(X_s*(d-s)/(s*d))*pow(p,d)
101
+
102
+ # (3) [(X_l - X_s) / l + X_s / s] * p^l
103
+ sum3 = ((X_l-X_s)/l+X_s/s)*pow(p,l)
104
+
105
+ # Equation 32.
106
+ rbo_ext = (1-p)/p*(sum1+sum2)+sum3
107
+ return rbo_ext
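
Two hand-checked calls against the metrics above (toy lists, not from the commit):

```python
from selfrank.algos.metrics import mapk, rank_biased_overlap

true = ["m1", "m2", "m3", "m4"]
pred = ["m2", "m1", "m3", "m4"]               # top pair swapped

# mapk treats each 'actual' list as a set, so only membership in the top-k matters.
print(mapk([true[:3]], [["m2", "m1", "m5"]], k=3))   # 0.667: two of the three predictions hit
print(rank_biased_overlap(true, pred, p=0.95))       # 0.95: lists agree except for the swapped top pair
```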
selfrank/algos/pairwise.py ADDED
@@ -0,0 +1,187 @@
1
+
2
+ import pandas as pd
3
+ import numpy as np
4
+ from tqdm import tqdm
5
+ from dotenv import load_dotenv
6
+ from genai import Client
7
+ from typing import Callable, List, Optional
8
+ from genai.text.generation import TextGenerationParameters, TextGenerationReturnOptions
9
+ from genai import Credentials, Client
10
+ from langchain.prompts import PromptTemplate
11
+ from .metrics import mapk, rank_biased_overlap
12
+ from .plots import plot_ranks
13
+ import logging
14
+ import random
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ load_dotenv()
19
+
20
+ credentials = Credentials.from_env()
21
+ client = Client(credentials=credentials)
22
+
23
+ _INSTRUCTION = "Compare the two responses."
24
+ _RUBRIC = "Which is the better response?"
25
+ _PROMETHEUS_PROMPT = """###Task Description:
26
+ An instruction (might include an Input inside it), a response to evaluate, and a score rubric representing a evaluation criteria are given.
27
+ 1. Write a detailed feedback that assess the quality of two responses strictly based on the given score rubric, not evaluating in general.
28
+ 2. After writing a feedback, choose a better response between Response 1 and Response 2. You should refer to the score rubric.
29
+ 3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (1 or 2)"
30
+ 4. Please do not generate any other opening, closing, and explanations.
31
+ ###Instruction:
32
+ {instruction}
33
+ ###Response 1:
34
+ {response_1}
35
+ ###Response 2:
36
+ {response_2}
37
+ ###Score Rubric:
38
+ {rubric}
39
+ ###Feedback:
40
+ """
41
+
42
+ template = PromptTemplate.from_template(_PROMETHEUS_PROMPT)
43
+
44
+ class LLMJudge:
45
+ """
46
+ Competing method based on an LLM-Judge (Prometheus)
47
+ """
48
+
49
+ def __init__(
50
+ self,
51
+ MODELS: List,
52
+ true_ranking: Optional[List] = None,
53
+ show_progress: Optional[bool] = True,
54
+ ):
55
+ self.MODELS = MODELS
56
+ self.N = len(MODELS)
57
+ self.evaluate = prometheus
58
+ self.true_ranking = true_ranking
59
+ self.show_progress = show_progress
60
+
61
+
62
+ def fit(self, df: pd.DataFrame):
63
+ """
64
+ df: Dataframe where each row is a benchmark instance,
65
+ and there is a column with the output for each Model
66
+
67
+ """
68
+
69
+ assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."
70
+ self.N = len(self.MODELS)
71
+
72
+ y = np.empty((self.N, self.N))
73
+
74
+ if self.show_progress:
75
+ pbar = tqdm(total=self.N**2, position=0, leave=False, desc="Evaluations")
76
+
77
+ for i, a in enumerate(self.MODELS):
78
+ for j, b in enumerate(self.MODELS):
79
+ if a == b:
80
+ y[i, j] = 0
81
+ else:
82
+ y[i, j] = self.evaluate(client, format_instruction, a=a, b=b, df=df)
83
+
84
+ if self.show_progress: pbar.update(1)
85
+
86
+ logger.debug(f"Win matrix:\n{y}")
87
+ # Just aggregate based on win rates
88
+ df = pd.DataFrame({'wins': y.sum(axis=1)}, index=self.MODELS)
89
+ df = df.sort_values(by='wins', ascending=False)
90
+ self.ranking = df.index.to_list()
91
+
92
+ return self.ranking
93
+
94
+ def measure(self, metric='rbo', k=5, p=0.95) -> float:
95
+ """
96
+ Report metric related to self-rank
97
+ """
98
+ if metric not in ['rbo', 'mapk']:
99
+ raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")
100
+
101
+ if hasattr(self, 'ranking'):
102
+ if self.true_ranking is not None:
103
+ if metric == 'mapk':
104
+ if k > len(self.true_ranking):
105
+ logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
106
+ actual = [self.true_ranking[:k]]
107
+ pred = [self.ranking[:k]]
108
+ return mapk(actual, pred, k=k)
109
+ elif metric == 'rbo':
110
+ return rank_biased_overlap(self.true_ranking, self.ranking, p=p)
111
+ else:
112
+ raise ValueError(f"Metric {metric} not understood.")
113
+ else:
114
+ raise ValueError("True ranking not available for metric calculation.")
115
+ else:
116
+ raise ValueError("Ranking not estimated. Run 'fit' first.")
117
+
118
+
119
+ def plot(self, caselabel="output"):
120
+ if hasattr(self, 'ranking') & (self.true_ranking is not None):
121
+ plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)
122
+
123
+
124
+ def format_instruction(x, a, b):
125
+ """instruction to score with Prometheus"""
126
+
127
+ # Build the instruction
128
+ response1 =f"{x[a]}"
129
+ response2 =f"{x[b]}"
130
+
131
+ instruction = _INSTRUCTION
132
+ rubric = _RUBRIC
133
+
134
+ instruction = template.format(
135
+ instruction=instruction, response_1=response1, response_2 = response2, rubric=rubric
136
+ )
137
+ return instruction
138
+
139
+
140
+ def prometheus(client: Client, formatter: Callable, a: str, b:str, df: pd.DataFrame) -> int:
141
+ """
142
+ Query the LLM-as-a-judge model Prometheus to compare responses from model "a" and model "b"
143
+
144
+ client: is the `genai` client (using BAM).
145
+ formatter: function that takes the model output and generates the Prometheus instruction
146
+ parameters: text generation parameters are fixed inside this function (max_new_tokens=500, random_seed=42).
147
+ a: name of model `a` to be evaluated (column in `df` with responses)
148
+ b: name of model `b` to be evaluated
149
+ df: DataFrame with responses
150
+ """
151
+
152
+ parameters = TextGenerationParameters(
153
+ max_new_tokens=500, return_options=TextGenerationReturnOptions(), random_seed=42
154
+ )
155
+
156
+ # Get the correct prompts
157
+ inst = df.apply(formatter, axis=1, args = (a,b))
158
+ adf = df.copy(deep=True)
159
+
160
+ results = []
161
+ for response in client.text.generation.create(
162
+ model_id="kaist-ai/prometheus-8x7b-v2",
163
+ inputs=inst.values.tolist(),
164
+ execution_options={"ordered": True, 'concurrency_limit': 10},
165
+ parameters=parameters,
166
+ ):
167
+ results.append(response.results[0])
168
+
169
+ adf["generated_text"] = [r.generated_text for r in results]
170
+
171
+ def _helper(x):
172
+ try:
173
+ return int(x.split("[RESULT]")[1])
174
+ except (IndexError, ValueError):
175
+ # Malformed judge output: fall back to a random verdict in {1, 2}.
176
+ return random.choice([1, 2])
177
+
178
+ adf['A'] = adf["generated_text"].apply(_helper)
179
+
180
+ n = adf.shape[0]
181
+ a_wins = sum(adf['A'] == 1)  # a verdict of 1 means Response 1 (model `a`) won
182
+ b_wins = n - a_wins
183
+
184
+ if a_wins >= b_wins:
185
+ return 1
186
+ else:
187
+ return 0
selfrank/algos/plots.py ADDED
@@ -0,0 +1,119 @@
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+ import numpy as np
4
+
5
+ from typing import List
6
+
7
+ class bcolors:
8
+ PURPLE = '\033[95m'
9
+ BLUE = '\033[94m'
10
+ GREEN = '\033[92m'
11
+ WARNING = '\033[93m'
12
+ RED = '\033[91m'
13
+ ENDC = '\033[0m'
14
+ BOLD = '\033[1m'
15
+ UNDERLINE = '\033[4m'
16
+
17
+
18
+ def plot_ranks(r1: List, r2: List, r1_label: str, r2_label: str, output: str) -> plt.Figure:
19
+ """
20
+ e.g.:
21
+ fig = plot_ranks(true_ranking, ranking, "actual", "predicted", "output")
22
+ """
23
+
24
+ items = list(set(r1 + r2))
25
+ xs = []
26
+
27
+ for i in items:
28
+ for lbl, l in zip((r1_label, r2_label), (r1, r2)):
29
+ try:
30
+ x = l.index(i)
31
+ except ValueError:
32
+ x = np.nan
33
+
34
+ xs.append({"item": i, "version": lbl, "rank": x + 1})
35
+
36
+ df = pd.DataFrame(xs).pivot(index="item", columns="version", values="rank").T
37
+
38
+ fig = plt.figure(figsize=(5, 10))
39
+ bumpchart(
40
+ df,
41
+ show_rank_axis=False,
42
+ scatter=True,
43
+ ax=fig.gca(),
44
+ holes=False,
45
+ line_args={"linewidth": 5, "alpha": 0.5},
46
+ scatter_args={"s": 100, "alpha": 0.8},
47
+ )
48
+
49
+ plt.savefig(f"{output}.png", dpi=150, bbox_inches="tight")
50
+ return fig
51
+
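For reference, a small sketch of how plot_ranks might be called; the rankings are made up.

actual    = ["m1", "m2", "m3", "m4"]
estimated = ["m2", "m1", "m3", "m4"]
fig = plot_ranks(actual, estimated, "actual", "estimated", "demo")
# Writes demo.png: one line per model, connecting its position in each ranking.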
52
+ def bumpchart(
53
+ df,
54
+ show_rank_axis=True,
55
+ rank_axis_distance=1.1,
56
+ ax=None,
57
+ scatter=False,
58
+ holes=False,
59
+ line_args={},
60
+ scatter_args={},
61
+ hole_args={},
62
+ ):
63
+ if ax is None:
64
+ left_yaxis = plt.gca()
65
+ else:
66
+ left_yaxis = ax
67
+
68
+ # Creating the right axis.
69
+ right_yaxis = left_yaxis.twinx()
70
+
71
+ axes = [left_yaxis, right_yaxis]
72
+
73
+ # Creating the far right axis if show_rank_axis is True
74
+ if show_rank_axis:
75
+ far_right_yaxis = left_yaxis.twinx()
76
+ axes.append(far_right_yaxis)
77
+
78
+ for col in df.columns:
79
+ y = df[col]
80
+ x = df.index.values
81
+ # Plotting blank points on the right axis/axes
82
+ # so that they line up with the left axis.
83
+ for axis in axes[1:]:
84
+ axis.plot(x, y, alpha=0)
85
+
86
+ left_yaxis.plot(x, y, **line_args, solid_capstyle="round")
87
+
88
+ # Adding scatter plots
89
+ if scatter:
90
+ left_yaxis.scatter(x, y, **scatter_args)
91
+
92
+ # Adding see-through holes
93
+ if holes:
94
+ bg_color = left_yaxis.get_facecolor()
95
+ left_yaxis.scatter(x, y, color=bg_color, **hole_args)
96
+
97
+ # Number of lines
98
+ lines = len(df.columns)
99
+
100
+ y_ticks = [*range(1, lines + 1)]
101
+
102
+ # Configuring the axes so that they line up well.
103
+ for axis in axes:
104
+ axis.invert_yaxis()
105
+ axis.set_yticks(y_ticks)
106
+ axis.set_ylim((lines + 0.5, 0.5))
107
+
108
+ # Sorting the labels to match the ranks.
109
+ left_labels = df.iloc[0].sort_values().index
110
+ right_labels = df.iloc[-1].sort_values().index
111
+
112
+ left_yaxis.set_yticklabels(left_labels)
113
+ right_yaxis.set_yticklabels(right_labels)
114
+
115
+ # Setting the position of the far right axis so that it doesn't overlap with the right axis
116
+ if show_rank_axis:
117
+ far_right_yaxis.spines["right"].set_position(("axes", rank_axis_distance))
118
+
119
+ return axes
selfrank/algos/triplet.py ADDED
@@ -0,0 +1,219 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from rouge_score import rouge_scorer
4
+ from joblib import Parallel, delayed
5
+ #from transformers import AutoTokenizer, DebertaForSequenceClassification
6
+ #import torch
7
+ from tqdm import tqdm
8
+ import logging
9
+ from .plots import bcolors
10
+ import random
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # Local only for now
15
+ #DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
16
+ DEVICE = 'cpu'
17
+
18
+ def call_counter(func):
19
+ def helper(*args, **kwargs):
20
+ helper.calls += 1
21
+ return func(*args, **kwargs)
22
+ helper.calls = 0
23
+ return helper
24
+
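call_counter simply records how many times a wrapped comparison function is invoked (useful for counting evaluations); a quick sketch:

@call_counter
def compare(a, b):
    return a >= b

compare(3, 2)
compare(1, 5)
print(compare.calls)  # -> 2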
25
+ # @call_counter
26
+ # def entailment(tokenizer: AutoTokenizer, model: DebertaForSequenceClassification, a: str, b:str, c:str, df: pd.DataFrame) -> float:
27
+ # """
28
+ # uses model c to evaluate a vs. b
29
+
30
+ # Entailment based on natural language inference - binary outcomes version.
31
+ # """
32
+
33
+ # def __helper(x, h):
34
+
35
+ # premise = x[c]
36
+ # hypothesis = x[h]
37
+
38
+ # formatted_text = f"{premise}{tokenizer.sep_token}{hypothesis}"
39
+ # inputs = tokenizer(formatted_text, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
40
+
41
+ # # Fetch class probabilities
42
+ # with torch.no_grad():
43
+ # predid = model(**inputs).logits.argmax(-1)
44
+ # out = model.config.id2label[predid.item()]
45
+
46
+ # if out == 'ENTAILMENT':
47
+ # return 1
48
+ # else:
49
+ # return 0
50
+
51
+ # a_ent = df.apply(__helper, args=(a,), axis=1)
52
+ # b_ent = df.apply(__helper, args=(b,), axis=1)
53
+
54
+ # if sum(a_ent) == sum(b_ent):
55
+ # logger.info(f"Judge: {c}, {bcolors.PURPLE}{bcolors.BOLD}Model {a}: {sum(a_ent)}, Model {b}: {sum(b_ent)} {bcolors.ENDC} (of {len(df)}).")
56
+ # return 0.5 # tied - in aggregate
57
+ # elif sum(a_ent) > sum(b_ent):
58
+ # logger.info(f"Judge: {c}, {bcolors.RED}{bcolors.BOLD}Model {a}: {sum(a_ent)}{bcolors.ENDC}, Model {b}: {sum(b_ent)} (of {len(df)}).")
59
+ # return 1 # a wins - in aggregate
60
+ # else:
61
+ # logger.info(f"Judge: {c}, Model {a}: {sum(a_ent)}, {bcolors.RED}{bcolors.BOLD}Model {b}: {sum(b_ent)}{bcolors.ENDC} (of {len(df)}).")
62
+ # return 0 # b wins
63
+
64
+ # @call_counter
65
+ # def entailment_p(tokenizer: AutoTokenizer, model: DebertaForSequenceClassification, a: str, b:str, c:str, df: pd.DataFrame) -> int:
66
+ # """
67
+ # uses model c to evaluate a vs. b
68
+
69
+ # Entailment based on natural language inference - PROBABILITY version.
70
+ # """
71
+
72
+
73
+ # def chunks(lst, batch_size):
74
+ # for i in range(0, len(lst), batch_size):
75
+ # yield lst[i:i + batch_size]
76
+
77
+ # def inference(ft):
78
+ # inputs = tokenizer(ft, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
79
+
80
+ # idx = model.config.label2id['ENTAILMENT']
81
+ # # Fetch entailment probabilities
82
+ # with torch.no_grad():
83
+ # logits = model(**inputs).logits
84
+ # p = torch.nn.functional.softmax(logits, dim=1).to("cpu").numpy()[:, idx]
85
+
86
+ # return p.tolist()
87
+
88
+ # # prepare inputs
89
+ # premise = df[c]
90
+ # formatted_text = (premise + tokenizer.sep_token + df[a]).to_list() + \
91
+ # (premise + tokenizer.sep_token + df[b]).to_list()
92
+
93
+
94
+ # p = []
95
+ # for i in chunks(formatted_text, 4):
96
+ # p += inference(i)
97
+
98
+ # # Compare entailment probs between model 'a' and 'b'
99
+ # ent_a = p[:len(p)//2]
100
+ # ent_b = p[len(p)//2:]
101
+
102
+ # values = [1 if i >= j else 0 for i, j in zip(ent_a, ent_b)] # 1-> "a" wins
103
+
104
+ # # Win percentage
105
+ # if sum(values) >= (0.5 * len(values)):
106
+ # return 1 # a wins
107
+ # else:
108
+ # return 0 # b wins
109
+
110
+ @call_counter
111
+ def equality(a: str, b:str, c:str, df:pd.DataFrame) -> int:
112
+ """
113
+ use model c to evaluate a vs. b
114
+
115
+ simple heuristic as the answers are multiple choice, so use equality.
116
+ """
117
+
118
+ ties = df[a] == df[b]
119
+ a_wins = sum((df[a] == df[c]) & ~(ties))
120
+ b_wins = sum((df[b] == df[c]) & ~(ties))
121
+
122
+ if a_wins >= b_wins:
123
+ return 1
124
+ else:
125
+ return 0
126
+
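A toy illustration of the triplet vote in equality, with hypothetical multiple-choice answers; rows where the two contestants agree are treated as ties and ignored.

import pandas as pd

toy = pd.DataFrame({
    "a": ["A", "B", "C", "D"],   # contestant a
    "b": ["A", "C", "B", "A"],   # contestant b
    "c": ["A", "B", "C", "B"],   # judge c
})
equality("a", "b", "c", toy)     # -> 1: a matches the judge on two non-tied rows, b on none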
127
+ @call_counter
128
+ def noisy_equality(a: str, b:str, c:str, df:pd.DataFrame, p: float) -> int:
129
+ """
130
+ use model c to evaluate a vs. b
131
+
132
+ noisy version of equality - where evaluations are flipped independently with
133
+ probability p (p=1 always flips, p=0 never flips).
134
+ """
135
+
136
+ perturb = lambda x: not x if (random.random() <= p) else x
137
+
138
+ ties = (df[a] == df[b])
139
+ a_w = (df[a] == df[c]).apply(perturb)
140
+ b_w = (df[b] == df[c]).apply(perturb)
141
+
142
+ a_wins = sum(a_w & ~(ties))
143
+ b_wins = sum(b_w & ~(ties))
144
+
145
+ if a_wins >= b_wins:
146
+ return 1
147
+ else:
148
+ return 0
149
+
150
+
151
+ @call_counter
152
+ def rouge(a: str, b: str, c:str, df: pd.DataFrame) -> float:
153
+ """
154
+ Summarization metric ROUGE2 - discrete version
155
+ """
156
+ scorer = rouge_scorer.RougeScorer(["rouge2"], use_stemmer=True)
157
+
158
+ def __helper(x) -> int:
159
+
160
+ score_a = scorer.score(x[c], x[a])['rouge2'].fmeasure
161
+ score_b = scorer.score(x[c], x[b])['rouge2'].fmeasure
162
+ #logger.info(f"{score_a}, {score_b}")
163
+
164
+ if score_a >= score_b:
165
+ return 1 # a wins this instance
166
+ else:
167
+ return 0 # b wins
168
+
169
+ outcomes = df.apply(__helper, axis=1)
170
+ a_wins = sum(outcomes)
171
+ b_wins = sum(outcomes==0)
172
+
173
+ if a_wins == b_wins:
174
+ logger.info(f"Judge: {c}, {bcolors.PURPLE}{bcolors.BOLD}Model {a}: {a_wins}, Model {b}: {b_wins} {bcolors.ENDC} (of {len(df)}).")
175
+ return 0.5 # tied overall
176
+ elif a_wins > b_wins:
177
+ logger.info(f"Judge: {c}, {bcolors.RED}{bcolors.BOLD}Model {a}: {a_wins}{bcolors.ENDC}, Model {b}: {b_wins} (of {len(df)}).")
178
+ return 1 # a wins overall
179
+ else:
180
+ logger.info(f"Judge: {c}, Model {a}: {a_wins}, {bcolors.RED}{bcolors.BOLD}Model {b}: {b_wins}{bcolors.ENDC} (of {len(df)}).")
181
+ return 0 # b wins
182
+
183
+
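A sketch of the discrete ROUGE vote with tiny made-up summaries (requires the rouge-score package): per instance, the contestant whose ROUGE-2 against the judge's output is higher takes the point.

import pandas as pd

toy = pd.DataFrame({
    "a": ["the cat sat on the mat", "stocks rose sharply today"],
    "b": ["a feline was present",   "numbers changed"],
    "c": ["the cat sat on a mat",   "stocks rose today"],
})
rouge("a", "b", "c", toy)   # -> 1: model a wins both instances against judge c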
184
+ @call_counter
185
+ def rouge_avg(a: str, b: str, c:str, df: pd.DataFrame) -> float:
186
+ """
187
+ Summarization metric ROUGE2 - based on averages
188
+
189
+ Following HELM returns the fmeasure
190
+ https://github.com/stanford-crfm/helm/blob/9be35a339347a9f2ad5644d7b72aede57486e3d4/src/helm/benchmark/metrics/basic_metrics.py#L256
191
+ """
192
+ def __true_rouge(x, m, scorer):
193
+ try:
194
+ scores = scorer.score(x[c], x[m])
195
+ value = scores["rouge2"].fmeasure
196
+ return value
197
+ except AttributeError:
198
+ #print(x[c], x[m])
199
+ return 0.0
200
+
201
+ if a == b:
202
+ return 0.5 # it's a tie
203
+ if a == c:
204
+ return 1. # a wins, since it is also the judge
205
+ if b == c:
206
+ return 0. # b wins, since it is also the judge
207
+
208
+ scorer = rouge_scorer.RougeScorer(["rouge2"], use_stemmer=True)
209
+ values = {}
210
+ for m in [a, b]:
211
+ values[m] = Parallel(n_jobs=-1, batch_size=128)(
212
+ delayed(__true_rouge)(i, m, scorer) for _, i in df.iterrows()
213
+ )
214
+
215
+ # Compare average rouge score over entire benchmark
216
+ if np.mean(values[a]) >= np.mean(values[b]):
217
+ return 1. # a wins
218
+ else:
219
+ return 0. # b wins
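By contrast, rouge_avg decides on the mean ROUGE-2 f-measure over the whole benchmark rather than per-instance votes; reusing the toy DataFrame from the previous sketch, it would also favour model a.

rouge_avg("a", "b", "c", toy)   # -> 1.0: mean ROUGE-2(a, judge) exceeds mean ROUGE-2(b, judge)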
selfrank/algos/utils.py ADDED
@@ -0,0 +1,29 @@
1
+ import numpy as np
2
+
3
+ def systematic_sampling(l: list, n: int) -> list:
4
+ """
5
+ l - (ordered) list to be sampled from
6
+ n - number of samples to fetch
7
+
8
+ returns a list of samples (far apart)
9
+ """
10
+
11
+ skip = len(l)/n
12
+ s = np.random.uniform(0, skip)
13
+ out = []
14
+ for _ in range(n):
15
+ out.append(l[np.floor(s).astype(int)])
16
+ s += skip
17
+
18
+ return out
19
+
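A quick sketch of systematic_sampling: it draws n items spaced roughly len(l)/n apart from a random starting offset, so the sample spans the whole ordered list (the list below is hypothetical).

models_by_size = [f"model_{i}" for i in range(30)]   # hypothetical ordered list
subset = systematic_sampling(models_by_size, 5)
# e.g. ['model_2', 'model_8', 'model_14', 'model_20', 'model_26'] (spread across the list)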
20
+
21
+ def close_sampling(l:list, n: int) -> list:
22
+ """
23
+ returns n samples drawn from a narrow window of l (items close together)
24
+ """
25
+
26
+ w = np.floor(n/2 + 2).astype(int)
27
+ s = np.floor(np.random.uniform(w, len(l) - w)).astype(int)
28
+ subset = [l[i] for i in range(s-w, s+w)]
29
+ return np.random.choice(subset, n, replace=False).tolist()
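close_sampling instead draws from a narrow window of the ordered list, which is handy for stress-testing the ranking on similarly performing models; reusing models_by_size from the sketch above:

subset = close_sampling(models_by_size, 5)
# e.g. ['model_17', 'model_14', 'model_19', 'model_15', 'model_18'] (clustered together)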