File size: 3,692 Bytes
3f62aec
 
 
f604782
 
3f62aec
 
 
 
f604782
 
4cbc1c1
f604782
4cbc1c1
f604782
3f62aec
 
9d8004e
3f62aec
4cbc1c1
f604782
3f62aec
f604782
 
3f62aec
f604782
3f62aec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f604782
 
3f62aec
 
 
 
 
 
 
 
 
 
 
 
 
f604782
3f62aec
 
f604782
3f62aec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee73652
 
3f62aec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cbc1c1
f604782
0f609e5
ab90864
0f609e5
ab90864
 
 
 
0f609e5
 
ab90864
 
 
 
0f609e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f604782
122ade4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import glob
import json
import os
import random

import numpy as np
from flask import Flask, abort, render_template, request

app = Flask(__name__)


# Load the problem set. JSON is UTF-8 by specification, so the encoding is
# given explicitly instead of relying on the platform default.
with open("problems.json", encoding="utf-8") as f:
    problems = json.load(f)
problem_choices = [q["question_title"] for q in problems]

# Build a random permutation of the original problem positions and reorder
# `problems` by it. After this point, problems[i] is the problem that was at
# original position random_idxs[i].
random_idxs = list(range(len(problems)))
random.shuffle(random_idxs)
problems = [problems[idx] for idx in random_idxs]

# Per-model outputs; the top-level keys are the model names.
with open("all_outputs.json", encoding="utf-8") as f:
    all_outputs = json.load(f)
all_models = list(all_outputs)


num_questions_filtered = len(problems)

# Mean of each model's "pass1_list" per problem. The outer key is the
# ORIGINAL (pre-shuffle) problem position, i.e. a value taken from
# random_idxs, not a position in the shuffled `problems` list.
all_correctness_by_problem = {
    idx: {model: np.mean(all_outputs[model][idx]["pass1_list"]) for model in all_models}
    for idx in random_idxs
}


def calculate_color(performance):
    """Return an ``rgba(...)`` colour string for a pass rate.

    The value is bucketed by fixed thresholds:

    * above 0.75: fixed green at alpha 0.5
    * above 0.5:  yellow-green, alpha equal to ``performance``
    * above 0.25: orange-red, alpha equal to ``1 - performance``
    * otherwise:  fixed red at alpha 0.5

    Args:
        performance: Numeric score compared against the thresholds above.

    Returns:
        The colour as a string.
    """
    # Guard clauses, checked from the highest bucket down.
    if performance > 0.75:
        return "rgba(0, 150, 0, 0.5)"
    if performance > 0.5:
        return f"rgba(50, 150, 0, {performance})"
    if performance > 0.25:
        return f"rgba(150, 50, 0, {1-performance})"
    # Everything else (including values that fail every comparison).
    return "rgba(150, 0, 0,  0.5)"


# One row per problem, in display (shuffled) order. Each row is a tuple of:
#   (display position,
#    {model: {"correctness": percentage string with one decimal,
#             "correctness_color": rgba string from calculate_color}},
#    difficulty of the problem)
all_evaluations_by_problem_colored = [
    (
        display_idx,
        {
            model: {
                "correctness": f"{all_correctness_by_problem[source_idx][model]*100:.1f}",
                "correctness_color": calculate_color(
                    all_correctness_by_problem[source_idx][model]
                ),
            }
            for model in all_models
        },
        # `problems` has already been reordered by random_idxs, so it is
        # indexed by display position. Indexing it with source_idx would
        # apply the permutation twice and pick another problem's difficulty.
        problems[display_idx]["difficulty"],
    )
    # source_idx is the original position, which is how
    # all_correctness_by_problem is keyed.
    for display_idx, source_idx in enumerate(random_idxs)
]

# For every model, a list in display (shuffled) order whose entries are the
# model's samples for that problem: one {"code", "pass1"} dict per pairing of
# the problem's "code_list" and "pass1_list" items.
all_data_for_view_formatted = {
    model: [
        [
            {"code": code, "pass1": passed}
            for code, passed in zip(
                resp[idx]["code_list"], resp[idx]["pass1_list"]
            )
        ]
        for idx in random_idxs
    ]
    for model, resp in all_outputs.items()
}


@app.route("/")
def home():
    """Render ``index.html`` with every model and the per-problem rows."""
    print(all_models)
    template_context = {
        "models": all_models,
        "problems": all_evaluations_by_problem_colored,
    }
    return render_template("index.html", **template_context)


@app.route("/problem/<int:problem_idx>")
def problem(problem_idx):
    """Render the detail page of one problem for all models.

    Args:
        problem_idx: Position of the problem in the shuffled display order
            (the first element of the rows passed to the index page).

    Returns:
        The rendered ``problem.html`` template.

    Raises:
        A 404 HTTP error (via ``abort``) when ``problem_idx`` does not refer
        to an existing problem, instead of failing with an ``IndexError``.
    """
    # Reject unknown positions up front so a bad URL yields 404, not 500.
    if not 0 <= problem_idx < len(problems):
        abort(404)

    # Per-model list of {"code", "pass1"} samples for this problem.
    data = {
        model: all_data_for_view_formatted[model][problem_idx] for model in all_models
    }
    # Element 1 of a row is the per-model correctness/colour mapping.
    evaluation = all_evaluations_by_problem_colored[problem_idx][1]
    question = problems[problem_idx]

    return render_template(
        "problem.html",
        problem_idx=problem_idx,
        evaluation=evaluation,
        models=all_models,
        question=question,
        data=data,
    )


# Model names shown by the "/mini" and "/problem_mini" views.
# Each active entry is used as a key into all_data_for_view_formatted, which
# is keyed by the top-level keys of all_outputs.json, so the names must match
# those keys exactly. Commented-out entries are left out of the mini views.
mini_models = [
    # "DeepSeek-V2",
    "DeepSeekCoder-V2",
    # "DSCoder-33b-Ins",
    "LLama3.1-70b-Ins",
    "LLama3.1-405b-Ins-FP8",
    # "GPT-4-Turbo-2024-04-09",
    "GPT-4O-2024-05-13",
    "Claude-3-Opus",
    # "Claude-3-Sonnet",
    "Gemini-Pro-1.5-August",
    "O1-Mini (N=1)",
    "O1-Preview (N=1)",
]


@app.route("/mini")
def mini():
    """Render ``index_mini.html`` restricted to the ``mini_models`` list."""
    template_context = {
        "models": mini_models,
        "problems": all_evaluations_by_problem_colored,
    }
    return render_template("index_mini.html", **template_context)


@app.route("/problem_mini/<int:problem_idx>")
def problem_mini(problem_idx):
    """Render the detail page of one problem for the ``mini_models`` list.

    Args:
        problem_idx: Position of the problem in the shuffled display order
            (the first element of the rows passed to the index page).

    Returns:
        The rendered ``problem_mini.html`` template.

    Raises:
        A 404 HTTP error (via ``abort``) when ``problem_idx`` does not refer
        to an existing problem, instead of failing with an ``IndexError``.
    """
    # Reject unknown positions up front so a bad URL yields 404, not 500.
    if not 0 <= problem_idx < len(problems):
        abort(404)

    # Per-model list of {"code", "pass1"} samples, limited to mini_models.
    data = {
        model: all_data_for_view_formatted[model][problem_idx] for model in mini_models
    }
    # Element 1 of a row is the per-model correctness/colour mapping; it
    # still contains every model, the template receives mini_models to pick.
    evaluation = all_evaluations_by_problem_colored[problem_idx][1]
    question = problems[problem_idx]

    return render_template(
        "problem_mini.html",
        problem_idx=problem_idx,
        evaluation=evaluation,
        models=mini_models,
        question=question,
        data=data,
    )


# When executed as a script, start the Flask server bound to 0.0.0.0
# (all network interfaces) on port 7860.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)