Spaces (Running)
rahulnair23 committed
Commit 0de1d17
Parent(s): ab6548f
test commit
Files changed:
- .gitignore +3 -0
- app.py +256 -4
- css.css +157 -0
- data/cnndm.pkl +3 -0
- data/mmlu_subject_abstract_algebra.pkl +3 -0
- requirements.txt +13 -0
- selfrank/__init__.py +0 -0
- selfrank/algos/__init__.py +0 -0
- selfrank/algos/baseline.py +145 -0
- selfrank/algos/greedy.py +568 -0
- selfrank/algos/iterative.py +137 -0
- selfrank/algos/metrics.py +107 -0
- selfrank/algos/pairwise.py +187 -0
- selfrank/algos/plots.py +119 -0
- selfrank/algos/triplet.py +219 -0
- selfrank/algos/utils.py +29 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
__pycache__
.vscode/
.DS_Store
app.py
CHANGED
@@ -1,7 +1,259 @@
 import gradio as gr
+import pandas as pd
+import numpy as np
+from rouge_score import rouge_scorer
+from joblib import Parallel, delayed
+from selfrank.algos.greedy import SelfRankGreedy
+from selfrank.algos.iterative import SelfRank
+from selfrank.algos.baseline import MCARank
+from selfrank.algos.triplet import equality, rouge
+import matplotlib.pyplot as plt
 
-def greet(name):
-    return "Hello " + name + "! Space for LLM-rank-themselves."
 
+
+class UI:
+
+    def __init__(self):
+        """Load any static assets"""
+        pass
+
+    def header_block(self):
+        """Title/description"""
+
+        gr.Markdown(
+            """<h1 style='text-align: center; color: black;'>🥇 Ranking LLMs without ground truth </h1>"""
+        )
+        gr.Markdown(
+            "This space demonstrates reference-free ranking of large language models described in our ACL Findings paper [Ranking Large Language Models without Ground Truth](https://arxiv.org/abs/2402.14860). <br>"
+            "Inspired by real life, where both an expert and a knowledgeable person can identify a novice, the main idea is to consider triplets of models, where each one of them evaluates the other two, correctly identifying the worst model in the triplet with high probability. Iteratively performing such evaluations yields an estimated ranking that doesn't require ground-truth/reference data, which can be expensive to gather. The methods are a viable low-resource ranking mechanism for practical use.<br>"
+            "[Source code](https://huggingface.co/spaces/ibm/llm-rank-themselves/tree/main).<br>"
+        )
+        gr.Markdown('---')
+        gr.Markdown('<br>')
+
+    def selection_panel(self):
+        """User selections"""
+        gr.Markdown("""<h2 style='color: purple;'> Benchmark experiments </h2> """)
+        with gr.Column(variant='compact'):
+            self.data = gr.Dropdown(
+                choices=["CNN/DM", "XSUM", "MMLU"],
+                multiselect=False, value='CNN/DM',
+                label="Choose a dataset.",
+                info="The dataset describes a task",
+                interactive=True,
+            )
+            self.evaluation = gr.Dropdown(
+                choices=["Rouge", "Equality"],
+                multiselect=False, value='Rouge',
+                interactive=True,
+                label="Evaluation function",
+                info="How should the judge model decide the winner? The demo is limited to 'Rouge' for generative tasks like summarization, and 'Equality' for multiple choice or classification tasks. In practice you can use any function that compares judge responses to the contestant models.",
+            )
+            self.nmodels = gr.Dropdown(
+                choices=[None, 10, 20, 30],
+                label="Number of models",
+                info="Sample a subset of LLMs to rank.",
+                value=10,
+                interactive=True,
+            )
+            self.nrows = gr.Dropdown(
+                choices=[None, 10, 20, 30],
+                label="Number of instances",
+                info="Sample a subset of instances to evaluate (smaller is faster).",
+                value=10,
+                interactive=True,
+            )
+            self.method = gr.Dropdown(
+                choices=["Greedy", "Full"],
+                label="Algorithm variant to use",
+                info="Choose one of two variants: 'Full' (FTR in the paper) runs all triplet combinations and is recommended when evaluations are cheap or for smaller datasets; 'Greedy' (GTR) is a faster variant suggested for more complex evaluations.",
+                value='Full',
+                interactive=True,
+            )
+            self.btn_execute = gr.Button("Run")
+
+    def output_panel(self):
+        """Plots/leaderboard/bump charts"""
+        with gr.Column(variant='default'):
+            gr.Markdown("""<h2 style='color: purple;'> Estimated ranking </h2> """)
+            self.leaderboard = gr.DataFrame()
+
+        with gr.Column(variant='default'):
+            gr.Markdown("""<h2 style='color: purple;'> Comparison to 'true' ranking </h2> """)
+            #self.bumpchart = gr.Plot(format='png')
+            self.bumpchart = gr.Image()
+            self.eval_metrics = gr.Markdown()
+
+    def synth_panel(self):
+        """Synthetic data experiments"""
+        gr.Markdown('<br>')
+        gr.Markdown('---')
+        gr.Markdown("""<h2 style='color: purple;'>Synthetic multiple choice </h2> """)
+
+    def byod_panel(self):
+        """Bring-your-own-data experiments"""
+        gr.Markdown('<br>')
+        gr.Markdown('---')
+        gr.Markdown("""<h2 style='color: purple;'>BYOD </h2> """)
+
+    def layout(self):
+        """Assemble the overall layout"""
+
+        with gr.Blocks(theme=gr.themes.Default()) as demo:
+            self.header_block()
+
+            with gr.Row():
+
+                # Selection panel
+                with gr.Column():
+                    self.selection_panel()
+
+                # Output panel/leaderboard
+                self.output_panel()
+
+            self.synth_panel()
+            self.byod_panel()
+
+            # Register event listeners
+            self.btn_execute.click(
+                fn=self.benchmark_executor, inputs=[self.data, self.evaluation, self.nmodels, self.nrows, self.method],
+                outputs=[self.leaderboard, self.bumpchart, self.eval_metrics]
+            )
+
+        return demo
+
+    def benchmark_executor(self, data, evaluation, nmodels, nrows, method) -> tuple[pd.DataFrame, str, str]:
+        """Main execution flow for benchmarks"""
+
+        #gr.Info(f"Loaded run config: {data}, {evaluation}, {nmodels}.")
+
+        match data:
+            case 'MMLU':
+                adf = pd.read_pickle("data/mmlu_subject_abstract_algebra.pkl")
+                MODELS = adf.model.unique()
+
+            case 'CNN/DM':
+                adf = pd.read_pickle("data/cnndm.pkl")
+                MODELS = adf.model.unique()
+
+            case 'XSUM':
+                raise NotImplementedError
+
+            case _:
+                raise ValueError(f"'{data}' not understood.")
+
+        # Sample fewer models if needed
+        if nmodels is not None:
+            if nmodels < len(MODELS):
+
+                MODELS = np.random.choice(MODELS, nmodels, replace=False).tolist()
+                adf = adf[adf.model.isin(MODELS)]
+
+        match data:
+            case 'MMLU':
+                keys = ["id", "trial_id", "perturbation"]  # MMLU has this extra parameter
+            case 'CNN/DM':
+                keys = ["id", "trial_id"]
+            case _:
+                pass
+
+        df = adf.pivot_table(
+            columns="model",
+            index=keys,
+            values="output",
+            aggfunc="first",
+        )
+
+        # Filter by number of rows
+        df.dropna(inplace=True)
+        if nrows is not None:
+            if nrows < df.shape[0]:
+                df = df.sample(nrows)
+
+        # Compute true ranking
+        adf = adf.set_index(keys).loc[df.index].reset_index()
+
+        if evaluation == "Rouge":
+
+            def __true_rouge(x, scorer):
+                return scorer.score(x["reference"], x["output"])["rouge2"].fmeasure
+
+            scorer = rouge_scorer.RougeScorer(["rouge2"], use_stemmer=True)
+            adf["rouge"] = Parallel(n_jobs=-1, batch_size=128)(
+                delayed(__true_rouge)(i, scorer) for _, i in adf.iterrows()
+            )
+
+            # Method 2 - look at "win rates" - for each question, see which model
+            # wins (i.e. has the best ROUGE score)
+            idx = adf.groupby(["id", "trial_id"])["rouge"].idxmax()
+            win_rates = adf.loc[idx].model.value_counts()
+            win_rate_rank = win_rates.index.tolist()
+
+            # include models with no wins at the bottom
+            no_wins = list(set(MODELS) - set(win_rate_rank))
+            true_ranking = win_rate_rank + no_wins
+            evaluator = rouge
+
+        elif evaluation == 'Equality':
+
+            # Compute the true ranking (multiple choice - so use equality between
+            # LLM response and reference-value)
+            adf["C"] = (adf.output == adf.reference).astype(int)
+            true_ranking = (
+                adf.groupby("model")["C"]
+                .apply(lambda x: sum(x) / len(x))
+                .sort_values(ascending=False)
+                .index.tolist()
+            )
+            evaluator = equality
+
+        else:
+            raise ValueError(f"'{evaluation}' not understood.")
+
+        match method:
+            case 'Full':
+                ranker = SelfRank(MODELS, evaluator, true_ranking)
+
+            case 'Greedy':
+                ranker = SelfRankGreedy(MODELS, evaluator, true_ranking)
+
+            case 'MCA':
+                raise NotImplementedError
+            case _:
+                raise ValueError(f"'{method}' not understood.")
+
+        # generate outputs
+        ranker.fit(df)
+        out_df = pd.DataFrame({'rank': range(1, len(true_ranking) + 1), 'model': ranker.ranking})
+
+        out_metrics = {"rbo": ranker.measure(metric="rbo"),
+                       "map-1": ranker.measure(metric="mapk", k=1),
+                       "map-3": ranker.measure(metric="mapk", k=3),
+                       "map-5": ranker.measure(metric="mapk", k=5),
+                       "map-10": ranker.measure(metric="mapk", k=10),
+                       "evaluations": evaluator.calls
+                       }
+        eval_metrics = (f"Evaluation measures: <br>"
+                        f"Rank-Biased Overlap: {out_metrics['rbo']:0.3f}<br>"
+                        f"MAP-3 : {out_metrics['map-3']:0.3f}<br>"
+                        f"MAP-5 : {out_metrics['map-5']:0.3f}<br>"
+                        f"MAP-10 : {out_metrics['map-10']: 0.3f}.")
+
+        out_plot = ranker.plot()
+
+        return out_df, "output.png", eval_metrics
+
+    def run(self):
+        self.ui = self.layout()
+        self.ui.queue().launch(show_error=True)
+
+
+#if __name__ == "__main__":
+ui = UI()
+#ui.run()
+demo = ui.layout()
+demo.launch()
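The description in header_block above summarizes the triplet idea: each model in a triplet judges the other two, and the model that loses both votes is taken as the worst. The actual evaluators used by the app are `equality` and `rouge` from selfrank/algos/triplet.py (added in this commit but not shown in this section). The sketch below is illustrative only, using a made-up exact-match judge and toy outputs, following the evaluator signature `evaluate(a, b, c, df)` used elsewhere in this commit.

# Illustrative sketch only (not part of the commit): one triplet-style judgment
# with a hypothetical exact-match judge standing in for equality/rouge.
import pandas as pd

def toy_equality(a: str, b: str, c: str, df: pd.DataFrame) -> float:
    """Judge c prefers contestant a over b if a agrees with c's answers at least as often."""
    wins_a = (df[a] == df[c]).sum()
    wins_b = (df[b] == df[c]).sum()
    return 1.0 if wins_a >= wins_b else 0.0

# Toy benchmark: three hypothetical models answering four multiple-choice questions.
outputs = pd.DataFrame({
    "m1": ["A", "B", "C", "D"],
    "m2": ["A", "B", "C", "A"],
    "m3": ["B", "B", "D", "A"],
})

print(toy_equality("m2", "m3", "m1", outputs))  # m1 judging m2 vs. m3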
css.css
ADDED
@@ -0,0 +1,157 @@
html {
  font-family: Inter;
  font-size: 16px;
  font-weight: 400;
  line-height: 1.5;
  -webkit-text-size-adjust: 100%;
  background: #fff;
  color: #323232;
  -webkit-font-smoothing: antialiased;
  -moz-osx-font-smoothing: grayscale;
  text-rendering: optimizeLegibility;
}

:root {
  --space: 1;
  --vspace: calc(var(--space) * 1rem);
  --vspace-0: calc(3 * var(--space) * 1rem);
  --vspace-1: calc(2 * var(--space) * 1rem);
  --vspace-2: calc(1.5 * var(--space) * 1rem);
  --vspace-3: calc(0.5 * var(--space) * 1rem);
}

.app {
  max-width: 748px !important;
}

.prose p {
  margin: var(--vspace) 0;
  line-height: var(--vspace * 2);
  font-size: 1rem;
}

code {
  font-family: "Inconsolata", sans-serif;
  font-size: 16px;
}

h1,
h1 code {
  font-weight: 400;
  line-height: calc(2.5 / var(--space) * var(--vspace));
}

h1 code {
  background: none;
  border: none;
  letter-spacing: 0.05em;
  padding-bottom: 5px;
  position: relative;
  padding: 0;
}

h2 {
  margin: var(--vspace-1) 0 var(--vspace-2) 0;
  line-height: 1em;
}

h3,
h3 code {
  margin: var(--vspace-1) 0 var(--vspace-2) 0;
  line-height: 1em;
}

h4,
h5,
h6 {
  margin: var(--vspace-3) 0 var(--vspace-3) 0;
  line-height: var(--vspace);
}

.bigtitle,
h1,
h1 code {
  font-size: calc(8px * 4.5);
  word-break: break-word;
}

.title,
h2,
h2 code {
  font-size: calc(8px * 3.375);
  font-weight: lighter;
  word-break: break-word;
  border: none;
  background: none;
}

.subheading1,
h3,
h3 code {
  font-size: calc(8px * 1.8);
  font-weight: 600;
  border: none;
  background: none;
  letter-spacing: 0.1em;
  text-transform: uppercase;
}

h2 code {
  padding: 0;
  position: relative;
  letter-spacing: 0.05em;
}

blockquote {
  font-size: calc(8px * 1.1667);
  font-style: italic;
  line-height: calc(1.1667 * var(--vspace));
  margin: var(--vspace-2) var(--vspace-2);
}

.subheading2,
h4 {
  font-size: calc(8px * 1.4292);
  text-transform: uppercase;
  font-weight: 600;
}

.subheading3,
h5 {
  font-size: calc(8px * 1.2917);
  line-height: calc(1.2917 * var(--vspace));

  font-weight: lighter;
  text-transform: uppercase;
  letter-spacing: 0.15em;
}

h6 {
  font-size: calc(8px * 1.1667);
  font-size: 1.1667em;
  font-weight: normal;
  font-style: italic;
  font-family: "le-monde-livre-classic-byol", serif !important;
  letter-spacing: 0px !important;
}

#start .md > *:first-child {
  margin-top: 0;
}

h2 + h3 {
  margin-top: 0;
}

.md hr {
  border: none;
  border-top: 1px solid var(--block-border-color);
  margin: var(--vspace-2) 0 var(--vspace-2) 0;
}
.prose ul {
  margin: var(--vspace-2) 0 var(--vspace-1) 0;
}

.gap {
  gap: 0;
}
data/cnndm.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fa7a24100439d2a8a9f0a72b6064280eb5d63dadcb250123e62abaf795c1bc2b
size 19238934
data/mmlu_subject_abstract_algebra.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e446aa00757e4de201de6c6dc26d57763c0adf6f235594dffc661d2414345b65
size 9301843
requirements.txt
ADDED
@@ -0,0 +1,13 @@
gradio
certifi==2023.7.22
charset-normalizer==3.2.0
idna==3.4
numpy==1.25.2
pandas==2.1.0
requests==2.31.0
tzdata==2023.3
urllib3==2.0.4
matplotlib
seaborn
rouge-score
transformers
selfrank/__init__.py
ADDED
File without changes
selfrank/algos/__init__.py
ADDED
File without changes
selfrank/algos/baseline.py
ADDED
@@ -0,0 +1,145 @@
"""
Baseline: based on most-common answer
"""

import pandas as pd
import numpy as np
from tqdm import tqdm
from .metrics import mapk, rank_biased_overlap
from .plots import plot_ranks
import logging
from typing import List, Callable, Optional
from rouge_score import rouge_scorer as rs
from collections import Counter
import random

logger = logging.getLogger(__name__)
tol = 0.001


class MCARank:
    """
    Baseline method: based on most common answer
    """

    def __init__(
        self,
        MODELS: List,
        evaluator: Callable,
        true_ranking: Optional[List] = None,
        show_progress: Optional[bool] = False,
    ):
        self.MODELS = MODELS
        self.N = len(MODELS)
        self.evaluate = evaluator
        self.true_ranking = true_ranking
        self.show_progress = show_progress

    def fit(self, df: pd.DataFrame, measure: Optional[str] = 'equality', p: float = 0):
        """
        df: Dataframe where each row is a benchmark instance,
        and there is a column with the output for each Model

        measure: decides how the most common answer is decided.
        p: the noise level to include (only used for noisy-equality)
        """

        assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."

        if measure == 'equality':

            # Select the most common answer per question
            mca = df.mode(axis=1).iloc[:, 0]

            # Count all the times each model answered the most common one
            wins = df.eq(mca, axis=0).astype(int)

            self.ranking = wins.sum().sort_values(ascending=False).index.to_list()

        elif measure == 'noisy_equality':

            # Most common answer
            mca = df.mode(axis=1).iloc[:, 0]

            perturb = lambda x: not x if (random.random() <= p) else x

            def __noisy_equality(x, mca):
                wins = (x == mca).apply(perturb)
                return wins

            wins = df.apply(__noisy_equality, axis='rows', args=(mca, ))

            self.ranking = wins.sum().sort_values(ascending=False).index.to_list()

        elif measure == 'rouge':

            MODELS = df.columns.to_list()
            SIZE = 256

            def __mca(x):
                """ Most common answer, as the top-k bigrams across all outputs """

                cs = [rs._create_ngrams(x[m], n=2) for m in MODELS]
                c = sum(cs, Counter())
                return Counter(dict(c.most_common(SIZE)))

            def __score_mca(x):
                """ Rouge score computed relative to most-common-answer """

                res = {}
                for m in MODELS:
                    p_n = rs._create_ngrams(x[m], n=2)
                    res[m] = rs._score_ngrams(x.mca, p_n).fmeasure
                return pd.Series(res)

            df['mca'] = df.apply(__mca, axis=1)

            # Winning model based on best ROUGE score for each question
            win_rates = df.apply(__score_mca, axis=1).idxmax(axis=1).value_counts()
            win_rate_rank = win_rates.index.tolist()

            # include models with no wins at the bottom
            no_wins = list(set(MODELS) - set(win_rate_rank))

            self.ranking = win_rate_rank + no_wins

        else:
            raise ValueError(f"Measure {measure} not understood.")

        logger.info(f"Estimated ranks (best to worst): {self.ranking}")
        logger.info(f"True ranking: {self.true_ranking}")
        logger.info(f"RBO measure: {self.measure()}")
        return self.ranking  # Best to worst

    def measure(self, metric='rbo', k=5, p=0.95) -> float:
        """
        Report metric related to self-rank
        """
        if metric not in ['rbo', 'mapk']:
            raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")

        if hasattr(self, 'ranking'):
            if self.true_ranking is not None:
                if metric == 'mapk':
                    if k > len(self.true_ranking):
                        logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
                    actual = [self.true_ranking[:k]]
                    pred = [self.ranking[:k]]
                    return mapk(actual, pred, k=k)
                elif metric == 'rbo':
                    return rank_biased_overlap(self.true_ranking, self.ranking, p=p)
                else:
                    raise ValueError(f"Metric {metric} not understood.")
            else:
                raise ValueError("True ranking not available for metric calculation.")
        else:
            raise ValueError("Ranking not estimated. Run 'fit' first.")

    def plot(self, caselabel="output"):
        if hasattr(self, 'ranking') & (self.true_ranking is not None):
            plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)
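A minimal usage sketch of the MCARank baseline defined above, on a made-up DataFrame of model outputs. The model names, outputs, and "true" ranking are hypothetical; the evaluator argument is unused by the 'equality' measure, so a placeholder is passed. A true ranking is supplied because fit() logs an RBO comparison at the end.

# Illustrative sketch only (not part of the commit): most-common-answer baseline on toy data.
import pandas as pd
from selfrank.algos.baseline import MCARank

outputs = pd.DataFrame({
    "m1": ["A", "B", "C", "D"],
    "m2": ["A", "B", "C", "A"],
    "m3": ["B", "B", "D", "A"],
})

ranker = MCARank(["m1", "m2", "m3"], evaluator=None, true_ranking=["m1", "m2", "m3"])
print(ranker.fit(outputs, measure="equality"))  # estimated ranking, best to worst
print(ranker.measure(metric="rbo"))             # agreement with the assumed true ranking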
selfrank/algos/greedy.py
ADDED
@@ -0,0 +1,568 @@
"""
Script for a greedy triplet scheme.

Assumptions:
- complete pairwise comparisons available, i.e. evaluations are cheap
"""

import pandas as pd
import numpy as np
from tqdm import tqdm
from selfrank.algos.metrics import mapk, rank_biased_overlap
from selfrank.algos.plots import plot_ranks
import logging
from typing import List, Callable, Optional
import random

logger = logging.getLogger(__name__)
tol = 0.001


class LLM_Model:
    def __init__(self, model_name, all_model_data):
        self.model_name = model_name

    def name(self):
        return self.model_name

    def __eq__(self, other):
        return self.name() == other.name()

    def __lt__(self, other):
        return self.name() < other.name()


class SelfRankGreedy:

    def __init__(self, MODELS: List, evaluator: Callable, true_ranking: Optional[List] = None, show_progress: Optional[bool] = False):
        self.MODELS = MODELS
        self.N = len(MODELS)
        self.evaluate = evaluator
        self.true_ranking = true_ranking
        self.show_progress = show_progress
        self.df = None
        self.DEBUG = False
        self.model_eval = None
        self.cnt = 0

    def getEvaluation(self, a, b, c, df, eval_arr, modelsList):
        '''
        Model c evaluates a against b.
        Checks whether the evaluation is already stored in eval_arr; if not, evaluates and stores it.
        '''
        idx_a = modelsList.index(a)
        idx_b = modelsList.index(b)
        idx_c = modelsList.index(c)
        val = eval_arr[idx_c, idx_a, idx_b]  # stores c evaluating a vs. b
        if val > -1:
            return val
        else:
            val = self.evaluate(a, b, c, df)
            eval_arr[idx_c, idx_a, idx_b] = val
            eval_arr[idx_c, idx_b, idx_a] = 1 - val
            return val

    def __evaluateModelTriplet(self, df, triplet, eval_arr, modelsList):
        model1 = triplet[0]
        model2 = triplet[1]
        model3 = triplet[2]
        res = np.array([0, 0, 0])
        m1_cmp_2_3 = self.getEvaluation(a=model2.name(), b=model3.name(), c=model1.name(), df=df, eval_arr=eval_arr, modelsList=modelsList)  #model1.compareModels(model2, model3)
        m2_cmp_1_3 = self.getEvaluation(a=model1.name(), b=model3.name(), c=model2.name(), df=df, eval_arr=eval_arr, modelsList=modelsList)  #model2.compareModels(model1, model3)
        m3_cmp_1_2 = self.getEvaluation(a=model1.name(), b=model2.name(), c=model3.name(), df=df, eval_arr=eval_arr, modelsList=modelsList)  #model3.compareModels(model1, model2)
        if m1_cmp_2_3 >= 0.5:
            res[1] += 1
        else:
            res[2] += 1

        if m2_cmp_1_3 >= 0.5:
            res[0] += 1
        else:
            res[2] += 1

        if m3_cmp_1_2 >= 0.5:
            res[0] += 1
        else:
            res[1] += 1

        #print(res)
        #print(res.tolist())
        zipped_pairs = zip(res.tolist(), triplet)
        z = [(x, y, x.name()) for y, x in sorted(zipped_pairs, reverse=True)]
        return z

    def __printNames(self, ll):
        print([i.name() for i in ll])

    def __evaluateModels(self, df, evaluators, modelsToBeEvaluated, eval_arr, modelsList):
        # rewritten method to allow usage with updated code
        # modelsToBeEvaluated can have 2 or 3 models only. evaluators will have only 1 model.
        # Use evaluators to rank and return the list of models in modelsToBeEvaluated.
        if len(evaluators) > 1:
            raise Exception
        if len(modelsToBeEvaluated) > 3 or len(modelsToBeEvaluated) < 2:
            raise Exception
        if len(modelsToBeEvaluated) == 2:
            r = self.getEvaluation(a=modelsToBeEvaluated[0].name(), b=modelsToBeEvaluated[1].name(), c=evaluators[0].name(), df=df, eval_arr=eval_arr, modelsList=modelsList)
            if r >= 0.5:
                return [modelsToBeEvaluated[0], modelsToBeEvaluated[1]]
            else:
                return [modelsToBeEvaluated[1], modelsToBeEvaluated[0]]
        if len(modelsToBeEvaluated) == 3:
            r01 = self.getEvaluation(a=modelsToBeEvaluated[0].name(), b=modelsToBeEvaluated[1].name(), c=evaluators[0].name(), df=df, eval_arr=eval_arr, modelsList=modelsList)
            r12 = self.getEvaluation(a=modelsToBeEvaluated[1].name(), b=modelsToBeEvaluated[2].name(), c=evaluators[0].name(), df=df, eval_arr=eval_arr, modelsList=modelsList)
            r02 = self.getEvaluation(a=modelsToBeEvaluated[0].name(), b=modelsToBeEvaluated[2].name(), c=evaluators[0].name(), df=df, eval_arr=eval_arr, modelsList=modelsList)
            res = np.array([0, 0, 0])
            if r01 >= 0.5:
                res[0] += 1
            else:
                res[1] += 1

            if r12 >= 0.5:
                res[1] += 1
            else:
                res[2] += 1

            if r02 >= 0.5:
                res[0] += 1
            else:
                res[2] += 1

            zipped_pairs = zip(res.tolist(), modelsToBeEvaluated)
            z = [x for y, x in sorted(zipped_pairs, reverse=True)]
            return z

    def __rankModels(self, df, eval_arr, modelsList, triplet, prev_model_ranking, unrankedModelList, rankedModelList, bottomModelList):

        if len(triplet) < 3:
            return [], list(triplet), []
        self.cnt = self.cnt + 1
        model_ranking = self.__evaluateModelTriplet(df, triplet, eval_arr, modelsList)
        if self.DEBUG:
            print("Cnt: ", self.cnt)
            print("\n\n\nFIRST")
            self.__printNames(triplet)
            self.__printNames(unrankedModelList)
            self.__printNames(rankedModelList)
            self.__printNames(bottomModelList)
            print(model_ranking)
            print(prev_model_ranking)
            print("END FIRST")

        first_rank = model_ranking[0][1]
        second_rank = model_ranking[1][1]
        third_rank = model_ranking[2][1]
        if first_rank == 2:  # first model is better than the other two

            if len(unrankedModelList) == 0 and len(bottomModelList) == 0:  # CASE 1
                # no more unranked models left to consider and none in bottomModels,
                # so add the models in rank order to rankedModelList
                if second_rank == 1 and third_rank == 0:
                    if self.DEBUG:
                        print('CASE 1a')
                    rankedModelList.extend([model_ranking[0][0], model_ranking[1][0], model_ranking[2][0]])
                elif second_rank == 0 and third_rank == 0:
                    if self.DEBUG:
                        print('CASE 1b')
                    rankedModelList.append(model_ranking[0][0])
                    # use current best model to rank the bottom 2 and add to rankedList in order
                    z = self.__evaluateModels(df, [rankedModelList[0]], [model_ranking[1][0], model_ranking[2][0]], eval_arr, modelsList)
                    rankedModelList.extend(z)
                else:
                    raise Exception("Error: Should not have occurred CASE 1")
                if self.DEBUG:
                    self.__printNames(rankedModelList)
                return [], rankedModelList, []

            if len(unrankedModelList) == 0 and len(bottomModelList) == 1:  # CASE 2
                # no more unranked models left to consider and only 1 model in bottomModelList
                if second_rank == 1 and third_rank == 0:
                    # so add the models in rank order to rankedModelList
                    if self.DEBUG:
                        print('CASE 2a')
                    rankedModelList.extend([model_ranking[0][0], model_ranking[1][0]])
                    #TODO Use top model in rankedModelList to rank the two models below and then add them according to ranking
                    z = self.__evaluateModels(df, [rankedModelList[0]], [model_ranking[2][0], bottomModelList[0]], eval_arr, modelsList)
                    rankedModelList.extend(z)
                    if self.DEBUG:
                        self.__printNames(rankedModelList)
                    return [], rankedModelList, []
                elif second_rank == 0 and third_rank == 0:
                    if self.DEBUG:
                        print('CASE 2b')

                    rankedModelList.append(model_ranking[0][0])
                    modelsToCompare = [model_ranking[1][0], model_ranking[2][0], bottomModelList[0]]

                    if self.DEBUG:
                        self.__printNames(tuple(modelsToCompare))
                        self.__printNames(rankedModelList)
                    return self.__rankModels(df, eval_arr, modelsList, tuple(modelsToCompare), model_ranking, [], rankedModelList, [])
                else:
                    raise Exception("Error: Should not have occurred CASE 2")

            if len(unrankedModelList) == 0 and len(bottomModelList) > 1:  # CASE 3
                # no more unranked models left to consider but there are at least 2 models in bottomModelList
                if second_rank == 1 and third_rank == 0:
                    if self.DEBUG:
                        print('CASE 3a')
                    rankedModelList.extend([model_ranking[0][0], model_ranking[1][0]])  # add top two models to ranked list
                    bottomModelList.append(model_ranking[2][0])  # add worst model to bottomModelList
                elif second_rank == 0 and third_rank == 0:
                    if self.DEBUG:
                        print('CASE 3b')
                    rankedModelList.append(model_ranking[0][0])  # add top model to ranked list
                    bottomModelList.extend([model_ranking[1][0], model_ranking[2][0]])  # add bottom two models to bottomModelList
                else:
                    raise Exception("Error: Should not have occurred CASE 3")

                modelsToCompare = random.sample(bottomModelList, 3)
                bottomModelList = [i for i in bottomModelList if i not in modelsToCompare]
                if self.DEBUG:
                    self.__printNames(tuple(modelsToCompare))
                    self.__printNames(bottomModelList)
                    self.__printNames(rankedModelList)
                    print([])
                return self.__rankModels(df, eval_arr, modelsList, tuple(modelsToCompare), model_ranking, bottomModelList, rankedModelList, [])

            # CASE 4: len(unrankedModelList) > 0

            #check the previous model ranking and model ranking. if either first or second ranked model previously is now the bottom ranked model,
            # move all bottom to unranked and call with new triple
            #if (prev_model_ranking is not None) and ((prev_model_ranking[0][0] == model_ranking[2][0]) or (prev_model_ranking[1][0] == model_ranking[2][0])):
            #    unrankedModelList.extend(bottomModelList)
            #    if self.DEBUG:
            #        print('Case 4a NEW ONE')
            #        self.__printNames(triplet)
            #        self.__printNames(unrankedModelList)
            #        self.__printNames(rankedModelList)
            #        self.__printNames([])
            #    return self.__rankModels(df, (triplet, None, unrankedModelList, rankedModelList, [])
            if second_rank == 1 and third_rank == 0:
                if self.DEBUG:
                    print('CASE 4a')
                bottomModelList.append(model_ranking[2][0])  # add worst model to bottomModelList

                newModel = random.sample(unrankedModelList, 1)
                unrankedModelList.remove(newModel[0])
                triplet = (model_ranking[0][0], model_ranking[1][0], newModel[0])

                if self.DEBUG:
                    self.__printNames(triplet)
                    self.__printNames(unrankedModelList)
                    self.__printNames(rankedModelList)
                    self.__printNames(bottomModelList)
                return self.__rankModels(df, eval_arr, modelsList, triplet, model_ranking, unrankedModelList, rankedModelList, bottomModelList)
            elif second_rank == 0 and third_rank == 0:

                # if unrankedModelList has 2 or more elements, put both 2nd and 3rd model into bottom; if unrankedModelList has only one,
                # then randomly choose one of the two and put in bottom
                if len(unrankedModelList) > 1:
                    if self.DEBUG:
                        print('CASE 4b')
                    bottomModelList.append(model_ranking[2][0])
                    bottomModelList.append(model_ranking[1][0])
                    newModels = random.sample(unrankedModelList, 2)
                    triplet = (model_ranking[0][0],) + tuple(newModels)
                    unrankedModelList.remove(newModels[0])
                    unrankedModelList.remove(newModels[1])

                    if self.DEBUG:
                        self.__printNames(triplet)
                        self.__printNames(unrankedModelList)
                        self.__printNames(rankedModelList)
                        self.__printNames(bottomModelList)
                    return self.__rankModels(df, eval_arr, modelsList, triplet, model_ranking, unrankedModelList, rankedModelList, bottomModelList)
                else:
                    if self.DEBUG:
                        print('CASE 4c')
                    #200, UR==1
                    #add third model to bottom. replace in tuple with one from unranked. and rank
                    #newModel = random.sample(unrankedModelList, 1)
                    #unrankedModelList.remove(newModel[0])
                    #bottomModelList.append(model_ranking[2][0]) # add third model to bottomModelList
                    #triplet = (model_ranking[0][0], model_ranking[1][0], newModel[0])

                    #add both 0s to bottom. Create tuple with 2, the one from UR and 1 from B. Call self.__rankModels(df, (triple,B,R,[])
                    newModel = random.sample(unrankedModelList, 1)
                    unrankedModelList.remove(newModel[0])
                    bottomModelList.append(model_ranking[2][0])  # add third model to bottomModelList
                    bottomModelList.append(model_ranking[1][0])  # add second model to bottomModelList
                    newBottomModel = random.sample(bottomModelList, 1)
                    bottomModelList.remove(newBottomModel[0])
                    triplet = (model_ranking[0][0], newModel[0], newBottomModel[0])
                    if self.DEBUG:
                        self.__printNames(triplet)
                        self.__printNames(unrankedModelList)
                        self.__printNames(rankedModelList)
                        self.__printNames(bottomModelList)
                    return self.__rankModels(df, eval_arr, modelsList, triplet, model_ranking, bottomModelList, rankedModelList, [])
            else:
                raise Exception("Error: Should not have occurred CASE 4")

        else:
            # some problem with ranking all three models
            if len(unrankedModelList) == 0 and len(bottomModelList) == 0:  # CASE 1
                #use top model from rankedlist to rank the three and append to ranked list in order
                if self.DEBUG:
                    print('CASE ELSE_1')
                z = self.__evaluateModels(df, [rankedModelList[0]], list(triplet), eval_arr, modelsList)
                if self.DEBUG:
                    self.__printNames(z)
                rankedModelList.extend(z)
                if self.DEBUG:
                    self.__printNames(rankedModelList)
                return [], rankedModelList, []

            if len(unrankedModelList) == 0 and len(bottomModelList) == 1:  # CASE 2
                if self.DEBUG:
                    print('CASE ELSE_2')

                #ALTERNATIVE
                ##use top model from rankedlist to rank the three and append to ranked list in order; THEN, add the sole model from bottom list
                if len(rankedModelList) > 0:
                    z = self.__evaluateModels(df, [rankedModelList[0]], list(triplet), eval_arr, modelsList)
                else:
                    z = list(triplet)
                if self.DEBUG:
                    self.__printNames(z)
                rankedModelList.extend(z)
                rankedModelList.append(bottomModelList[0])
                if self.DEBUG:
                    self.__printNames(rankedModelList)
                return [], rankedModelList, []

            if len(unrankedModelList) == 0 and len(bottomModelList) > 1:  # CASE 3
                # ranks are 1xx or 000
                if self.DEBUG:
                    print('CASE ELSE_3')

                ##use top model from rankedlist to rank the three and add top 2 to ranked list in order;
                if len(rankedModelList) > 0:
                    z = self.__evaluateModels(df, [rankedModelList[0]], list(triplet), eval_arr, modelsList)
                else:
                    z = list(triplet)
                if self.DEBUG:
                    self.__printNames(z)
                rankedModelList.append(z[0])
                rankedModelList.append(z[1])

                bottomModelList.append(z[2])
                #Sample 3 from bottom to create triple. call self.__rankModels(df, (triple, B, R, [])
                newModels = random.sample(bottomModelList, 3)
                for mod in newModels:
                    bottomModelList.remove(mod)
                if self.DEBUG:
                    self.__printNames(tuple(newModels))
                    self.__printNames(unrankedModelList)
                    self.__printNames(rankedModelList)
                    self.__printNames(bottomModelList)
                return self.__rankModels(df, eval_arr, modelsList, tuple(newModels), model_ranking, bottomModelList, rankedModelList, [])

            # CASE 4: len(unrankedModelList) > 0

            # if the three models are 1,1,1 or 0,0,0, i.e. indistinguishable

            #check the previous model ranking and model ranking. if either first or second ranked model previously is now the bottom ranked model,
            # move all bottom to unranked and call with new triple
            #if (prev_model_ranking is not None) and ((prev_model_ranking[0][0] == model_ranking[2][0]) or (prev_model_ranking[1][0] == model_ranking[2][0])):
            #    unrankedModelList.extend(bottomModelList)
            #    if self.DEBUG:
            #        print('Case ELSE_4 NEW ONE')
            #        self.__printNames(triplet)
            #        self.__printNames(unrankedModelList)
            #        self.__printNames(rankedModelList)
            #        self.__printNames([])
            #    return self.__rankModels(df, (triplet, None, unrankedModelList, rankedModelList, [])

            # choose one of the tuple models and add to unrankedList. Remove random model from unrankedList and add to tuple. rank again
            if first_rank == second_rank and first_rank == third_rank:
                if self.DEBUG:
                    print('CASE ELSE_4a')
                ##use top model from rankedlist to rank the three and add third one to bottomModelList;
                ##then create tuple with top 2 and one from unranked

                if len(rankedModelList) > 0:
                    z = self.__evaluateModels(df, [rankedModelList[0]], list(triplet), eval_arr, modelsList)
                else:
                    z = list(triplet)
                if self.DEBUG:
                    print('z: ', z)
                    self.__printNames(z)

                bottomModelList.append(z[2])
                newModel = random.sample(unrankedModelList, 1)
                unrankedModelList.remove(newModel[0])
                triplet = (z[0], z[1], newModel[0])
                if self.DEBUG:
                    print(1)
                    print('triplet:', triplet)
                    self.__printNames(triplet)
                    print(2)
                    self.__printNames(unrankedModelList)
                    print(3)
                    self.__printNames(rankedModelList)
                    print(4)
                    self.__printNames(bottomModelList)
                    print(5)

            else:  # there are one or two models with 0
                # if only 1, add to bottom and replace with one from unranked
                # if two are 0, then both replace with unranked if unranked has more than 1
                # otherwise randomly add one of the 0s to bottom and replace with unranked.
                if second_rank == 1:  # then only third is 0
                    if self.DEBUG:
                        print('CASE ELSE_4b')
                    newModel = random.sample(unrankedModelList, 1)
                    unrankedModelList.remove(newModel[0])

                    bottomModelList.append(model_ranking[2][0])
                    triplet = (model_ranking[0][0], model_ranking[1][0], newModel[0])
                else:  # both second and third are zero
                    if len(unrankedModelList) > 1:
                        if self.DEBUG:
                            print('CASE ELSE_4c')
                        bottomModelList.append(model_ranking[2][0])
                        bottomModelList.append(model_ranking[1][0])
                        newModels = random.sample(unrankedModelList, 2)
                        triplet = (model_ranking[0][0],) + tuple(newModels)
                        unrankedModelList.remove(newModels[0])
                        unrankedModelList.remove(newModels[1])
                    else:
                        if self.DEBUG:
                            print('CASE ELSE_4d')
                        #add third model to bottom. replace in tuple with one from unranked. and rank
                        #newModel = random.sample(unrankedModelList, 1)
                        #unrankedModelList.remove(newModel[0])
                        #bottomModelList.append(model_ranking[2][0]) # add third model to bottomModelList
                        #triplet = (model_ranking[0][0], model_ranking[1][0], newModel[0])

                        # UR==1, 100
                        #Add both 0s to Bottom. Create tuple from the 1, one from UR, and one from Bottom
                        #Call self.__rankModels(df, (triple, B, R, [])
                        bottomModelList.append(model_ranking[2][0])
                        bottomModelList.append(model_ranking[1][0])
                        newModels = random.sample(unrankedModelList, 1)
                        unrankedModelList.remove(newModels[0])
                        newBottomModels = random.sample(bottomModelList, 1)
                        bottomModelList.remove(newBottomModels[0])
                        triplet = (model_ranking[0][0], newModels[0], newBottomModels[0])
                        if self.DEBUG:
                            self.__printNames(triplet)
                            self.__printNames(unrankedModelList)
                            self.__printNames(rankedModelList)
                            self.__printNames(bottomModelList)
                        return self.__rankModels(df, eval_arr, modelsList, triplet, model_ranking, bottomModelList, rankedModelList, [])

            if self.DEBUG:
                self.__printNames(triplet)
                self.__printNames(unrankedModelList)
                self.__printNames(rankedModelList)
                self.__printNames(bottomModelList)
            return self.__rankModels(df, eval_arr, modelsList, triplet, model_ranking, unrankedModelList, rankedModelList, bottomModelList)

    def __printRanks(self, ll):
        print([{i.name(): r} for r, i in enumerate(ll)])

    def __estimate_rankings(self, df, numIter=1, modelSubset=None, numModels=None):
        rankedLists = []
        if modelSubset is not None:
            model_list = modelSubset
        elif numModels is not None:
            model_list = self.MODELS.copy()  #df.columns.tolist() #list(df['model'].unique())
            model_list = random.sample(model_list, numModels)
        else:
            model_list = self.MODELS.copy()  #df.columns.tolist() #list(df['model'].unique())

        nModels = len(model_list)
        self.model_eval = np.full((nModels, nModels, nModels), -1)

        for it in tqdm(range(numIter)):
            shuffled_list = model_list.copy()
            random.shuffle(shuffled_list)

            t = random.sample(shuffled_list, 3)
            u = [i for i in shuffled_list if i not in t]

            t = [LLM_Model(i, df) for i in t]
            u = [LLM_Model(i, df) for i in u]

            _, rankedList, _ = self.__rankModels(df, self.model_eval, model_list, tuple(t), None, u, [], [])
            rankedLists.append(rankedList)

        estimated_ranking_lists = []
        ranks = []
        for rl in rankedLists:
            estimated_ranking = {i.name(): r + 1 for r, i in enumerate(rl)}
            rank = [estimated_ranking[name] for name in model_list]  #sorted(model_list)]
            estimated_ranking_lists.append(estimated_ranking)
            ranks.append(rank)

        average_estimated_scores = sorted(zip(np.mean(np.array(ranks), axis=0), model_list))
        average_estimated_ranking = [mod for rnk, mod in average_estimated_scores]
        #average_scores = [rnk for rnk, mod in zipped]

        return model_list, estimated_ranking_lists, average_estimated_ranking, average_estimated_scores

    def fit(self, df: pd.DataFrame):
        """
        df: Dataframe where each row is a benchmark instance,
        and there is a column with the output for each Model
        """
        assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."

        #process the dataset
        self.df = df  #self.__process_dataset(df)
        # Build a pairwise preference matrix
        #if self.show_progress:
        #    pbar = tqdm(total=self.N**3, position=0, leave=False, desc="Evaluations")

        #if self.show_progress: pbar.update(1)

        # Estimate the ranks
        _, _, average_estimated_ranking, _ = self.__estimate_rankings(self.df, numIter=1)
        #logging.info(f"Iteration {iter}:{delta}")

        self.ranking = average_estimated_ranking

        logger.info(f"Estimated 'greedy' ranks (best to worst): {self.ranking}")

        return self.ranking  # Best to worst

    def measure(self, metric='rbo', k=5, p=0.95) -> float:
        """
        Report metrics related to self-rank
        """
        if metric not in ['rbo', 'mapk']:
            raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")

        if hasattr(self, 'ranking'):
            if self.true_ranking is not None:
                if metric == 'mapk':
                    if k > len(self.true_ranking):
                        logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
                    actual = [self.true_ranking[:k]]
                    pred = [self.ranking[:k]]
                    return mapk(actual, pred, k=k)
                elif metric == 'rbo':
                    return rank_biased_overlap(self.true_ranking, self.ranking, p=p)
                else:
                    raise ValueError(f"Metric {metric} not understood.")
            else:
                raise ValueError("True ranking not available for metric calculation.")
        else:
            raise ValueError("Ranking not estimated. Run 'fit' first.")

    def plot(self, caselabel="output"):
        if hasattr(self, 'ranking') & (self.true_ranking is not None):
            return plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)
selfrank/algos/iterative.py
ADDED
@@ -0,0 +1,137 @@
"""
Script for an iterative scheme.

Assumptions:
- complete pairwise comparisons available, i.e. evaluations are cheap
"""

import pandas as pd
import numpy as np
from tqdm import tqdm
from .metrics import mapk, rank_biased_overlap
from .plots import plot_ranks
import logging
from typing import List, Callable, Optional

logger = logging.getLogger(__name__)
tol = 0.001

class SelfRank:

    def __init__(self, MODELS: List, evaluator: Callable, true_ranking: Optional[List] = None, show_progress: Optional[bool] = False):
        self.MODELS = MODELS
        self.N = len(MODELS)
        self.evaluate = evaluator
        self.true_ranking = true_ranking
        self.show_progress = show_progress

    def fit(self, df: pd.DataFrame):
        """
        df: Dataframe where each row is a benchmark instance,
        and there is a column with the output for each Model
        """
        assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."

        # Build a pairwise preference matrix
        if self.show_progress:
            pbar = tqdm(total=self.N**3, position=0, leave=False, desc="Evaluations")

        y = np.empty((self.N, self.N, self.N))

        for i, a in enumerate(self.MODELS):
            for j, b in enumerate(self.MODELS):
                for k, c in enumerate(self.MODELS):  # Judge

                    # Some checks to limit evaluations
                    if a == b:
                        y[i, j, k] = 0.5
                        y[j, i, k] = 0.5

                    elif a == c:
                        y[i, j, k] = 1
                        y[j, i, k] = 0

                    elif b == c:
                        y[i, j, k] = 0
                        y[j, i, k] = 1

                    elif j > i:
                        y[i, j, k] = self.evaluate(a=a, b=b, c=c, df=df)
                        y[j, i, k] = 1 - y[i, j, k]  # complement in the other direction

                    if self.show_progress: pbar.update(1)

        # Estimate the ranks
        r = np.ones((self.N, ))
        iter = 0
        while True:

            # weighted mean over k
            m = np.einsum('ijk,i->ij', y, r) / self.N

            # Aggregate preferences using majority voting
            y_p = np.zeros_like(m)

            for i in np.arange(self.N):
                for j in np.arange(self.N):
                    if j > i:
                        if m[i, j] >= m[j, i]:
                            y_p[i, j] = 1.
                            y_p[j, i] = 0.
                        else:
                            y_p[i, j] = 0.
                            y_p[j, i] = 1.

            # update reputation score by wins
            r_k = y_p.sum(axis=1) / max(y_p.sum(axis=1))

            # termination if reputation score converges
            delta = np.sum(np.abs(r - r_k))
            logging.info(f"Iteration {iter}:{delta}")
            logging.info(f"Reputation score: {r}")
            if delta <= tol:
                break
            else:
                iter += 1
                r = r_k

        # Get ranked list from the reputation score
        idx = np.argsort(r_k)[::-1]
        self.ranking = np.array(self.MODELS)[idx].tolist()

        logger.info(f"Estimated ranks (best to worst): {self.ranking}")
        if self.true_ranking is not None:
            logger.info(f"True ranking: {self.true_ranking}")
            logger.info(f"RBO measure: {self.measure()}")
        return self.ranking  # Best to worst

    def measure(self, metric='rbo', k=5, p=0.95) -> float:
        """
        Report metric related to self-rank
        """
        if metric not in ['rbo', 'mapk']:
            raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")

        if hasattr(self, 'ranking'):
            if self.true_ranking is not None:
                if metric == 'mapk':
                    if k > len(self.true_ranking):
                        logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
                    actual = [self.true_ranking[:k]]
                    pred = [self.ranking[:k]]
                    return mapk(actual, pred, k=k)
                elif metric == 'rbo':
                    return rank_biased_overlap(self.true_ranking, self.ranking, p=p)
                else:
                    raise ValueError(f"Metric {metric} not understood.")
            else:
                raise ValueError("True ranking not available for metric calculation.")
        else:
            raise ValueError("Ranking not estimated. Run 'fit' first.")

    def plot(self, caselabel="output"):
        if hasattr(self, 'ranking') & (self.true_ranking is not None):
            return plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)
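A minimal usage sketch of the SelfRank class defined above, on toy data. The judge function and data are made up; in the app, the evaluator comes from selfrank.algos.triplet (equality or rouge). The evaluator is called as evaluate(a=..., b=..., c=..., df=...) and should return a value in [0, 1], where >= 0.5 means the judge c prefers a over b.

# Illustrative sketch only (not part of the commit): full triplet ranking with SelfRank.
import pandas as pd
from selfrank.algos.iterative import SelfRank

def judge(a, b, c, df):
    """Return 1.0 if judge c prefers a over b (by agreement with c's own answers)."""
    return float((df[a] == df[c]).sum() >= (df[b] == df[c]).sum())

outputs = pd.DataFrame({
    "m1": ["A", "B", "C", "D"],
    "m2": ["A", "B", "C", "A"],
    "m3": ["B", "B", "D", "A"],
})

ranker = SelfRank(["m1", "m2", "m3"], evaluator=judge, true_ranking=None)
print(ranker.fit(outputs))  # estimated ranking, best to worst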
selfrank/algos/metrics.py
ADDED
@@ -0,0 +1,107 @@
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average precision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
        A list of elements that are to be predicted (order doesn't matter)
    predicted : list
        A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
        The average precision at k over the input lists
    """
    if not actual:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # first condition checks whether it is a valid prediction
        # second condition checks if prediction is not repeated
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)


def mapk(actual: list[list], predicted: list[list], k: int = 10) -> float:
    """
    Computes the mean average precision at k.

    This function computes the mean average precision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
        A list of lists of elements that are to be predicted
        (order doesn't matter in the lists)
    predicted : list
        A list of lists of predicted elements
        (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
        The mean average precision at k over the input lists
    """
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)]).astype(float)


def rank_biased_overlap(l1, l2, p):
    """
    Returns RBO indefinite rank similarity metric, as described in:
    Webber, W., Moffat, A., & Zobel, J. (2010).
    A similarity measure for indefinite rankings.
    ACM Transactions on Information Systems.
    doi:10.1145/1852102.1852106.
    """
    sl, ll = sorted([(len(l1), l1), (len(l2), l2)])
    s, S = sl
    l, L = ll

    # Calculate the overlaps at ranks 1 through l
    # (the longer of the two lists)
    ss = set([])
    ls = set([])
    overs = {}
    for i in range(l):
        ls.add(L[i])
        if i < s:
            ss.add(S[i])
        X_d = len(ss.intersection(ls))
        d = i + 1
        overs[d] = float(X_d)

    # (1) \sum_{d=1}^l (X_d / d) * p^d
    sum1 = 0
    for i in range(l):
        d = i + 1
        sum1 += overs[d] / d * pow(p, d)
    X_s = overs[s]
    X_l = overs[l]

    # (2) \sum_{d=s+1}^l [(X_s (d - s)) / (sd)] * p^d
    sum2 = 0
    for i in range(s, l):
        d = i + 1
        sum2 += (X_s * (d - s) / (s * d)) * pow(p, d)

    # (3) [(X_l - X_s) / l + X_s / s] * p^l
    sum3 = ((X_l - X_s) / l + X_s / s) * pow(p, l)

    # Equation 32.
    rbo_ext = (1 - p) / p * (sum1 + sum2) + sum3
    return rbo_ext
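For reference, a small self-contained check of the two metrics on hand-made rankings. Note that `mapk` treats the true ranking as a set of relevant items, so a permutation of the same models still scores 1.0, while RBO is order-aware and penalises the swapped pair.

from selfrank.algos.metrics import mapk, rank_biased_overlap

true_ranking = ["m1", "m2", "m3", "m4"]
estimated    = ["m1", "m3", "m2", "m4"]   # m2 and m3 swapped

print(mapk([true_ranking], [estimated], k=4))                 # 1.0 (set-based)
print(rank_biased_overlap(true_ranking, estimated, p=0.95))   # below 1.0 (order-aware)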
selfrank/algos/pairwise.py
ADDED
@@ -0,0 +1,187 @@
import pandas as pd
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from typing import Callable, List, Optional
from genai import Credentials, Client
from genai.text.generation import TextGenerationParameters, TextGenerationReturnOptions
from langchain.prompts import PromptTemplate
from .metrics import mapk, rank_biased_overlap
from .plots import plot_ranks
import logging
import random

logger = logging.getLogger(__name__)

load_dotenv()

credentials = Credentials.from_env()
client = Client(credentials=credentials)

_INSTRUCTION = "Compare the two responses."
_RUBRIC = "Which is the better response?"
_PROMETHEUS_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of two responses strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, choose a better response between Response 1 and Response 2. You should refer to the score rubric.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (1 or 2)"
4. Please do not generate any other opening, closing, and explanations.
###Instruction:
{instruction}
###Response 1:
{response_1}
###Response 2:
{response_2}
###Score Rubric:
{rubric}
###Feedback:
"""

template = PromptTemplate.from_template(_PROMETHEUS_PROMPT)


class LLMJudge:
    """
    Competing method based on an LLM-Judge (Prometheus)
    """

    def __init__(
        self,
        MODELS: List,
        true_ranking: Optional[List] = None,
        show_progress: Optional[bool] = True,
    ):
        self.MODELS = MODELS
        self.N = len(MODELS)
        self.evaluate = prometheus
        self.true_ranking = true_ranking
        self.show_progress = show_progress

    def fit(self, df: pd.DataFrame):
        """
        df: Dataframe where each row is a benchmark instance,
        and there is a column with the output for each Model
        """

        assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."
        self.N = len(self.MODELS)

        y = np.empty((self.N, self.N))

        if self.show_progress:
            pbar = tqdm(total=self.N**2, position=0, leave=False, desc="Evaluations")

        for i, a in enumerate(self.MODELS):
            for j, b in enumerate(self.MODELS):
                if a == b:
                    y[i, j] = 0
                else:
                    y[i, j] = self.evaluate(client, format_instruction, a=a, b=b, df=df)

                if self.show_progress:
                    pbar.update(1)

        logger.debug(f"Win matrix:\n{y}")
        # Just aggregate based on win rates
        df = pd.DataFrame({'wins': y.sum(axis=1)}, index=self.MODELS)
        df = df.sort_values(by='wins', ascending=False)
        self.ranking = df.index.to_list()

        return self.ranking

    def measure(self, metric='rbo', k=5, p=0.95) -> float:
        """
        Report metric related to self-rank
        """
        if metric not in ['rbo', 'mapk']:
            raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")

        if hasattr(self, 'ranking'):
            if self.true_ranking is not None:
                if metric == 'mapk':
                    if k > len(self.true_ranking):
                        logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
                    actual = [self.true_ranking[:k]]
                    pred = [self.ranking[:k]]
                    return mapk(actual, pred, k=k)
                elif metric == 'rbo':
                    return rank_biased_overlap(self.true_ranking, self.ranking, p=p)
                else:
                    raise ValueError(f"Metric {metric} not understood.")
            else:
                raise ValueError("True ranking not available for metric calculation.")
        else:
            raise ValueError("Ranking not estimated. Run 'fit' first.")

    def plot(self, caselabel="output"):
        if hasattr(self, 'ranking') & (self.true_ranking is not None):
            plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)


def format_instruction(x, a, b):
    """instruction to score with Prometheus"""

    # Build the instruction
    response1 = f"{x[a]}"
    response2 = f"{x[b]}"

    instruction = _INSTRUCTION
    rubric = _RUBRIC

    instruction = template.format(
        instruction=instruction, response_1=response1, response_2=response2, rubric=rubric
    )
    return instruction


def prometheus(client: Client, formatter: Callable, a: str, b: str, df: pd.DataFrame) -> int:
    """
    Query the LLM-as-a-judge model Prometheus to compare responses from model "a" and model "b"

    client: is the `genai` client (using BAM).
    formatter: function that takes the model output and generates the Prometheus instruction
    parameters: BAM specific parameters.
    a: name of model `a` to be evaluated (column in `df` with responses)
    b: name of model `b` to be evaluated
    df: DataFrame with responses
    """

    parameters = TextGenerationParameters(
        max_new_tokens=500, return_options=TextGenerationReturnOptions(), random_seed=42
    )

    # Get the correct prompts
    inst = df.apply(formatter, axis=1, args=(a, b))
    adf = df.copy(deep=True)

    results = []
    for response in client.text.generation.create(
        model_id="kaist-ai/prometheus-8x7b-v2",
        inputs=inst.values.tolist(),
        execution_options={"ordered": True, 'concurrency_limit': 10},
        parameters=parameters,
    ):
        results.append(response.results[0])

    adf["generated_text"] = [r.generated_text for r in results]

    def _helper(x):
        try:
            return int(x.split("[RESULT]")[1])
        except (IndexError, ValueError) as e:
            return random.choice([0, 1])

    adf['A'] = adf["generated_text"].apply(_helper)

    n = adf.shape[0]
    a_wins = sum(adf['A'])
    b_wins = n - a_wins

    if a_wins >= b_wins:
        return 1
    else:
        return 0
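The judge's verdict is recovered from the generated text by splitting on the `[RESULT]` token. A standalone illustration of that parsing step, mirroring the nested `_helper` above (the example strings are made up):

import random

def parse_result(text: str) -> int:
    """Return the integer after [RESULT]; fall back to a random 0/1 vote
    when the judge output is malformed (same behaviour as _helper above)."""
    try:
        return int(text.split("[RESULT]")[1])
    except (IndexError, ValueError):
        return random.choice([0, 1])

print(parse_result("Feedback: Response 1 is more faithful. [RESULT] 1"))  # 1
print(parse_result("no verdict returned"))                                # 0 or 1 at random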
selfrank/algos/plots.py
ADDED
@@ -0,0 +1,119 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from typing import List

class bcolors:
    PURPLE = '\033[95m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    WARNING = '\033[93m'
    RED = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


def plot_ranks(r1: List, r2: List, r1_label: str, r2_label: str, output: str) -> plt.axes:
    """
    e.g.:
    df = rank_data(true_ranking, ranking, "actual", "predicted", "output")
    """

    items = list(set(r1 + r2))
    xs = []

    for i in items:
        for lbl, l in zip((r1_label, r2_label), (r1, r2)):
            try:
                x = l.index(i)
            except ValueError:
                x = np.nan

            xs.append({"item": i, "version": lbl, "rank": x + 1})

    df = pd.DataFrame(xs).pivot(index="item", columns="version", values="rank").T

    fig = plt.figure(figsize=(5, 10))
    bumpchart(
        df,
        show_rank_axis=False,
        scatter=True,
        ax=fig.gca(),
        holes=False,
        line_args={"linewidth": 5, "alpha": 0.5},
        scatter_args={"s": 100, "alpha": 0.8},
    )

    plt.savefig(f"{output}.png", dpi=150, bbox_inches="tight")
    return fig

def bumpchart(
    df,
    show_rank_axis=True,
    rank_axis_distance=1.1,
    ax=None,
    scatter=False,
    holes=False,
    line_args={},
    scatter_args={},
    hole_args={},
):
    if ax is None:
        left_yaxis = plt.gca()
    else:
        left_yaxis = ax

    # Creating the right axis.
    right_yaxis = left_yaxis.twinx()

    axes = [left_yaxis, right_yaxis]

    # Creating the far right axis if show_rank_axis is True
    if show_rank_axis:
        far_right_yaxis = left_yaxis.twinx()
        axes.append(far_right_yaxis)

    for col in df.columns:
        y = df[col]
        x = df.index.values
        # Plotting blank points on the right axis/axes
        # so that they line up with the left axis.
        for axis in axes[1:]:
            axis.plot(x, y, alpha=0)

        left_yaxis.plot(x, y, **line_args, solid_capstyle="round")

        # Adding scatter plots
        if scatter:
            left_yaxis.scatter(x, y, **scatter_args)

        # Adding see-through holes
        if holes:
            bg_color = left_yaxis.get_facecolor()
            left_yaxis.scatter(x, y, color=bg_color, **hole_args)

    # Number of lines
    lines = len(df.columns)

    y_ticks = [*range(1, lines + 1)]

    # Configuring the axes so that they line up well.
    for axis in axes:
        axis.invert_yaxis()
        axis.set_yticks(y_ticks)
        axis.set_ylim((lines + 0.5, 0.5))

    # Sorting the labels to match the ranks.
    left_labels = df.iloc[0].sort_values().index
    right_labels = df.iloc[-1].sort_values().index

    left_yaxis.set_yticklabels(left_labels)
    right_yaxis.set_yticklabels(right_labels)

    # Setting the position of the far right axis so that it doesn't overlap with the right axis
    if show_rank_axis:
        far_right_yaxis.spines["right"].set_position(("axes", rank_axis_distance))

    return axes
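A minimal call sketch for `plot_ranks`, assuming two rankings over the same model names (the rankings and output name below are made up; the figure is also written to `<output>.png`):

from selfrank.algos.plots import plot_ranks

actual    = ["m1", "m2", "m3", "m4"]
estimated = ["m1", "m3", "m2", "m4"]

# Draws a bump chart connecting each model's position in the two rankings
# and saves it as demo_ranks.png.
fig = plot_ranks(actual, estimated, "actual", "estimated", "demo_ranks")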
selfrank/algos/triplet.py
ADDED
@@ -0,0 +1,219 @@
import pandas as pd
import numpy as np
from rouge_score import rouge_scorer
from joblib import Parallel, delayed
#from transformers import AutoTokenizer, DebertaForSequenceClassification
#import torch
from tqdm import tqdm
import logging
from .plots import bcolors
import random

logger = logging.getLogger(__name__)

# Local only for now
#DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
DEVICE = 'cpu'

def call_counter(func):
    def helper(*args, **kwargs):
        helper.calls += 1
        return func(*args, **kwargs)
    helper.calls = 0
    return helper

# @call_counter
# def entailment(tokenizer: AutoTokenizer, model: DebertaForSequenceClassification, a: str, b: str, c: str, df: pd.DataFrame) -> float:
#     """
#     uses model c to evaluate a vs. b
#
#     Entailment based on natural language inference - binary outcomes version.
#     """
#
#     def __helper(x, h):
#
#         premise = x[c]
#         hypothesis = x[h]
#
#         formatted_text = f"{premise}{tokenizer.sep_token}{hypothesis}"
#         inputs = tokenizer(formatted_text, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
#
#         # Fetch class probabilities
#         with torch.no_grad():
#             predid = model(**inputs).logits.argmax(-1)
#             out = model.config.id2label[predid.item()]
#
#         if out == 'ENTAILMENT':
#             return 1
#         else:
#             return 0
#
#     a_ent = df.apply(__helper, args=(a,), axis=1)
#     b_ent = df.apply(__helper, args=(b,), axis=1)
#
#     if sum(a_ent) == sum(b_ent):
#         logger.info(f"Judge: {c}, {bcolors.PURPLE}{bcolors.BOLD}Model {a}: {sum(a_ent)}, Model {b}: {sum(b_ent)} {bcolors.ENDC} (of {len(df)}).")
#         return 0.5 # tied - in aggregate
#     elif sum(a_ent) > sum(b_ent):
#         logger.info(f"Judge: {c}, {bcolors.RED}{bcolors.BOLD}Model {a}: {sum(a_ent)}{bcolors.ENDC}, Model {b}: {sum(b_ent)} (of {len(df)}).")
#         return 1 # a wins - in aggregate
#     else:
#         logger.info(f"Judge: {c}, Model {a}: {sum(a_ent)}, {bcolors.RED}{bcolors.BOLD}Model {b}: {sum(b_ent)}{bcolors.ENDC} (of {len(df)}).")
#         return 0 # b wins

# @call_counter
# def entailment_p(tokenizer: AutoTokenizer, model: DebertaForSequenceClassification, a: str, b: str, c: str, df: pd.DataFrame) -> int:
#     """
#     uses model c to evaluate a vs. b
#
#     Entailment based on natural language inference - PROBABILITY version.
#     """
#
#     def chunks(lst, batch_size):
#         for i in range(0, len(lst), batch_size):
#             yield lst[i:i + batch_size]
#
#     def inference(ft):
#         inputs = tokenizer(ft, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
#
#         idx = model.config.label2id['ENTAILMENT']
#         # Fetch entailment probabilities
#         with torch.no_grad():
#             logits = model(**inputs).logits
#             p = torch.nn.functional.softmax(logits, dim=1).to("cpu").numpy()[:, idx]
#
#         return p.tolist()
#
#     # prepare inputs
#     premise = df[c]
#     formatted_text = (premise + tokenizer.sep_token + df[a]).to_list() + \
#                      (premise + tokenizer.sep_token + df[b]).to_list()
#
#     p = []
#     for i in chunks(formatted_text, 4):
#         p += inference(i)
#
#     # Compare entailment probs between model 'a' and 'b'
#     ent_a = p[:len(p)//2]
#     ent_b = p[len(p)//2:]
#
#     values = [1 if i >= j else 0 for i, j in zip(ent_a, ent_b)] # 1-> "a" wins
#
#     # Win percentage
#     if sum(values) >= (0.5 * len(values)):
#         return 1 # a wins
#     else:
#         return 0 # b wins

@call_counter
def equality(a: str, b: str, c: str, df: pd.DataFrame) -> int:
    """
    use model c to evaluate a vs. b

    simple heuristic as the answers are multiple choice, so use equality.
    """

    ties = df[a] == df[b]
    a_wins = sum((df[a] == df[c]) & ~(ties))
    b_wins = sum((df[b] == df[c]) & ~(ties))

    if a_wins >= b_wins:
        return 1
    else:
        return 0

@call_counter
def noisy_equality(a: str, b: str, c: str, df: pd.DataFrame, p: float) -> int:
    """
    use model c to evaluate a vs. b

    noisy version of equality - where evaluations are flipped independently with
    probability p (p=1 will always flip, p=0, will never)
    """

    perturb = lambda x: not x if (random.random() <= p) else x

    ties = (df[a] == df[b])
    a_w = (df[a] == df[c]).apply(perturb)
    b_w = (df[b] == df[c]).apply(perturb)

    a_wins = sum(a_w & ~(ties))
    b_wins = sum(b_w & ~(ties))

    if a_wins >= b_wins:
        return 1
    else:
        return 0


@call_counter
def rouge(a: str, b: str, c: str, df: pd.DataFrame) -> float:
    """
    Summarization metric ROUGE2 - discrete version
    """
    scorer = rouge_scorer.RougeScorer(["rouge2"], use_stemmer=True)

    def __helper(x) -> int:

        score_a = scorer.score(x[c], x[a])['rouge2'].fmeasure
        score_b = scorer.score(x[c], x[b])['rouge2'].fmeasure
        #logger.info(f"{score_a}, {score_b}")

        if score_a >= score_b:
            return 1 # a wins this instance
        else:
            return 0 # b wins

    outcomes = df.apply(__helper, axis=1)
    a_wins = sum(outcomes)
    b_wins = sum(outcomes == 0)

    if a_wins == b_wins:
        logger.info(f"Judge: {c}, {bcolors.PURPLE}{bcolors.BOLD}Model {a}: {a_wins}, Model {b}: {b_wins} {bcolors.ENDC} (of {len(df)}).")
        return 0.5 # tied overall
    elif a_wins > b_wins:
        logger.info(f"Judge: {c}, {bcolors.RED}{bcolors.BOLD}Model {a}: {a_wins}{bcolors.ENDC}, Model {b}: {b_wins} (of {len(df)}).")
        return 1 # a wins overall
    else:
        logger.info(f"Judge: {c}, Model {a}: {a_wins}, {bcolors.RED}{bcolors.BOLD}Model {b}: {b_wins}{bcolors.ENDC} (of {len(df)}).")
        return 0 # b wins


@call_counter
def rouge_avg(a: str, b: str, c: str, df: pd.DataFrame) -> float:
    """
    Summarization metric ROUGE2 - based on averages

    Following HELM returns the fmeasure
    https://github.com/stanford-crfm/helm/blob/9be35a339347a9f2ad5644d7b72aede57486e3d4/src/helm/benchmark/metrics/basic_metrics.py#L256
    """
    def __true_rouge(x, m, scorer):
        try:
            scores = scorer.score(x[c], x[m])
            value = scores["rouge2"].fmeasure
            return value
        except AttributeError:
            #print(x[c], x[m])
            return 0.0

    if a == b:
        return 0.5 # its a tie!
    if a == c:
        return 1. # a wins (as judge is the same)
    if b == c:
        return 0. # b wins as its also the judge

    scorer = rouge_scorer.RougeScorer(["rouge2"], use_stemmer=True)
    values = {}
    for m in [a, b]:
        values[m] = Parallel(n_jobs=-1, batch_size=128)(
            delayed(__true_rouge)(i, m, scorer) for _, i in df.iterrows()
        )

    # Compare average rouge score over entire benchmark
    if np.mean(values[a]) >= np.mean(values[b]):
        return 1. # a wins
    else:
        return 0. # b wins
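A small self-contained example of the `equality` triplet check on multiple-choice style answers (the answer strings below are invented for illustration):

import pandas as pd
from selfrank.algos.triplet import equality

# Three models answering four multiple-choice questions; column "c" acts as the judge.
df = pd.DataFrame({
    "a": ["A", "B", "C", "D"],
    "b": ["A", "C", "C", "A"],
    "c": ["A", "B", "C", "A"],
})

# Rows where a and b agree are ignored as ties; a matches the judge on Q2,
# b matches on Q4, so a_wins >= b_wins and the call returns 1.
print(equality(a="a", b="b", c="c", df=df))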
selfrank/algos/utils.py
ADDED
@@ -0,0 +1,29 @@
import numpy as np

def systematic_sampling(l: list, n: int) -> list:
    """
    l - (ordered) list to be sampled from
    n - number of samples to fetch

    returns a list of samples (far apart)
    """

    skip = len(l) / n
    s = np.random.uniform(0, skip)
    out = []
    for _ in range(n):
        out.append(l[np.floor(s).astype(int)])
        s += skip

    return out


def close_sampling(l: list, n: int) -> list:
    """
    returns a sampled list (close together)
    """

    w = np.floor(n/2 + 2).astype(int)
    s = np.floor(np.random.uniform(w, len(l) - w)).astype(int)
    subset = [l[i] for i in range(s-w, s+w)]
    return np.random.choice(subset, n, replace=False).tolist()
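For reference, a quick usage sketch of the two samplers (the list and sample size are arbitrary):

import numpy as np
from selfrank.algos.utils import systematic_sampling, close_sampling

np.random.seed(0)
items = list(range(100))

print(systematic_sampling(items, 5))  # e.g. 5 items spaced roughly len(items)/5 apart
print(close_sampling(items, 5))       # e.g. 5 items drawn from one small window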