File size: 4,027 Bytes
976ee8c
 
 
f2d5c0f
 
976ee8c
 
 
 
f2d5c0f
 
61f9bac
f2d5c0f
61f9bac
f2d5c0f
976ee8c
7983727
 
cb7f1d0
976ee8c
61f9bac
f2d5c0f
976ee8c
f2d5c0f
 
976ee8c
f2d5c0f
976ee8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2d5c0f
 
976ee8c
 
 
 
 
 
 
 
 
 
 
 
 
6fe41a3
f2d5c0f
976ee8c
 
f2d5c0f
976ee8c
 
c5259e1
 
 
 
 
 
976ee8c
 
 
 
 
 
 
 
 
 
 
 
 
 
24142d6
 
976ee8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487e952
976ee8c
 
 
 
 
61f9bac
f2d5c0f
74d76d7
0ba00fa
b3bc767
6fe41a3
0ba00fa
 
74d76d7
b3bc767
 
 
0ba00fa
b3bc767
6fe41a3
74d76d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487e952
74d76d7
 
 
 
 
 
 
f2d5c0f
da72949
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import glob
import json
import os
import random

import numpy as np
from flask import Flask, abort, render_template, request

app = Flask(__name__)


# Load the problem statements. Each entry is read for "question_title" here
# and for "difficulty" / "question_id" further down in this module.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform's locale default.
with open("problems.json", encoding="utf-8") as f:
    problems = json.load(f)
    problem_choices = [q["question_title"] for q in problems]

# ``random_idxs[i]`` is the index in the raw files of the problem displayed at
# position ``i``. Shuffling is currently disabled, so this is the identity
# permutation; to enable it, seed and shuffle ``random_idxs`` here
# (``random.seed(42); random.shuffle(random_idxs)``) before reordering.
random_idxs = list(range(len(problems)))
problems = [problems[idx] for idx in random_idxs]

# Per-model generations, keyed by model name. Each model maps to a sequence
# indexed by raw problem index whose entries carry a "pass1_list".
with open("all_outputs.json", encoding="utf-8") as f:
    all_outputs = json.load(f)
    all_models = list(all_outputs.keys())


num_questions_filtered = len(problems)

# raw problem index -> {model name -> mean of that model's pass1_list}.
all_correctness_by_problem = {
    idx: {model: np.mean(all_outputs[model][idx]["pass1_list"]) for model in all_models}
    for idx in random_idxs
}


def calculate_color(performance):
    """Return the CSS ``rgba(...)`` string used to shade a correctness cell.

    Args:
        performance: Numeric score compared against the 0.75 / 0.5 / 0.25
            thresholds (strict ``>`` comparisons).

    Returns:
        A fixed green for scores above 0.75, a green whose alpha is the score
        for (0.5, 0.75], a red-orange whose alpha is ``1 - score`` for
        (0.25, 0.5], and a fixed red for everything else (including values
        for which every comparison is false, such as NaN).
    """
    if performance > 0.75:
        return "rgba(0, 150, 0, 0.5)"
    if performance > 0.5:
        alpha = performance
        return f"rgba(50, 150, 0, {alpha})"
    if performance > 0.25:
        alpha = 1 - performance
        return f"rgba(150, 50, 0, {alpha})"
    return "rgba(150, 0, 0,  0.5)"


# One 4-tuple per problem, in display order:
#   (display position,
#    {model: {"correctness": percentage string, "correctness_color": rgba string}},
#    difficulty,
#    question_id)
# ``idx`` is the raw index used by the correctness table, whereas ``problems``
# has already been reordered into display order. Problem metadata is therefore
# looked up by display position (``trueidx``); indexing it with ``idx`` would
# attach the wrong problem to the scores whenever ``random_idxs`` is not the
# identity permutation.
all_evaluations_by_problem_colored = [
    (
        trueidx,
        {
            model: {
                "correctness": f"{all_correctness_by_problem[idx][model]*100:.1f}",
                "correctness_color": calculate_color(
                    all_correctness_by_problem[idx][model]
                ),
            }
            for model in all_models
        },
        problems[trueidx]["difficulty"],
        problems[trueidx]["question_id"],
    )
    for trueidx, idx in enumerate(random_idxs)
]

def _samples_for_view(row):
    """Zip one problem's parallel lists into a list of per-sample dicts."""
    return [
        {"code": code, "pass1": passed, "metadata": meta}
        for code, passed, meta in zip(
            row["code_list"], row["pass1_list"], row["metadata_list"]
        )
    ]


# model name -> list (one item per display position) of per-sample dicts with
# the keys "code", "pass1" and "metadata".
all_data_for_view_formatted = {
    model: [_samples_for_view(resp[idx]) for idx in random_idxs]
    for model, resp in all_outputs.items()
}


@app.route("/")
def home():
    """Render ``index.html`` with all model names and the per-problem evaluations."""
    # The model list is written to stdout on every request.
    print(all_models)
    context = {
        "models": all_models,
        "problems": all_evaluations_by_problem_colored,
    }
    return render_template("index.html", **context)


@app.route("/problem/<int:problem_idx>")
def problem(problem_idx):
    """Render ``problem.html`` for one problem, with the samples of every model.

    Args:
        problem_idx: Display position of the problem, taken from the URL.

    Returns:
        The rendered ``problem.html`` page.

    Raises:
        werkzeug.exceptions.NotFound: Through ``abort(404)`` when
            ``problem_idx`` does not refer to a known problem.
    """
    # The URL converter only guarantees an integer; an index past the end of
    # the data would raise IndexError below and reach the client as a 500.
    if not 0 <= problem_idx < len(all_evaluations_by_problem_colored):
        abort(404)

    entry = all_evaluations_by_problem_colored[problem_idx]
    data = {
        model: all_data_for_view_formatted[model][problem_idx] for model in all_models
    }

    return render_template(
        "problem.html",
        problem_idx=problem_idx,
        question_id=entry[3],
        evaluation=entry[1],
        models=all_models,
        question=problems[problem_idx],
        data=data,
    )


# Model names passed to the "/mini" and "/problem_mini" views. In the
# problem_mini view each name is used as a key into
# all_data_for_view_formatted (which is keyed by the model names found in
# all_outputs.json), so the spelling here has to match those keys exactly.
# Commented-out entries are excluded from the mini views.
mini_models = [
    # "DeepSeek-V2",
    "DeepSeek-V3",
    "DeepSeek-R1-Preview",
    # "DSCoder-33b-Ins",
    # "GPT-4-Turbo-2024-04-09",
    "GPT-4O-2024-05-13",
    "Claude-3.5-Sonnet-20240620",
    "Gemini-Flash-2.0-Thinking",
    # "Gemini-Exp-1206",
    # "Claude-3-Sonnet",
    "O1-2024-12-17 (N=1) (High)",
    "QwQ-32B-Preview (N=1)",
]


@app.route("/mini")
def mini():
    """Render ``index_mini.html`` with the ``mini_models`` subset and all problems."""
    context = {
        "models": mini_models,
        "problems": all_evaluations_by_problem_colored,
    }
    return render_template("index_mini.html", **context)


@app.route("/problem_mini/<int:problem_idx>")
def problem_mini(problem_idx):
    """Render ``problem_mini.html`` for one problem, limited to ``mini_models``.

    Args:
        problem_idx: Display position of the problem, taken from the URL.

    Returns:
        The rendered ``problem_mini.html`` page. The ``evaluation`` mapping
        passed to the template still contains every model, not only the
        ones in ``mini_models``.

    Raises:
        werkzeug.exceptions.NotFound: Through ``abort(404)`` when
            ``problem_idx`` does not refer to a known problem.
    """
    # The URL converter only guarantees an integer; an index past the end of
    # the data would raise IndexError below and reach the client as a 500.
    if not 0 <= problem_idx < len(all_evaluations_by_problem_colored):
        abort(404)

    entry = all_evaluations_by_problem_colored[problem_idx]
    data = {
        model: all_data_for_view_formatted[model][problem_idx] for model in mini_models
    }

    return render_template(
        "problem_mini.html",
        problem_idx=problem_idx,
        question_id=entry[3],
        evaluation=entry[1],
        models=mini_models,
        question=problems[problem_idx],
        data=data,
    )


# When the file is executed directly, start the Flask server listening on
# all network interfaces (0.0.0.0) at port 7860.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)