File size: 3,776 Bytes
5a7ab71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""Generate json file for webpage."""
import json
import os
import re

# Model names written to the "models" field of the output JSON. They are the
# same names used as keys of each record's "evaluations" and "scores" dicts
# ("vicuna" appears only under "answers").
models = ["alpaca", "llama", "gpt35", "bard"]


def read_jsonl(path: str, key: str = None):
    """Load a JSON Lines file, optionally indexed by one of its fields.

    Args:
        path: Path of the file to read; a leading ``~`` is expanded.
        key: Optional field name. When given, the records are sorted by this
            field and returned as a dict mapping the field's value to the
            record (a later duplicate value replaces an earlier one).

    Returns:
        A list of the parsed records in file order when ``key`` is ``None``,
        otherwise a dict keyed by ``record[key]`` in ascending key order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If ``key`` is given and a record does not contain it.
    """
    data = []
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(os.path.expanduser(path), encoding="utf-8") as f:
        for line in f:
            # Lines yielded by file iteration keep their trailing newline, so
            # a blank line is "\n" (truthy). Strip first so blank or
            # whitespace-only lines are really skipped instead of being
            # handed to json.loads.
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    if key is not None:
        data.sort(key=lambda x: x[key])
        data = {item[key]: item for item in data}
    return data


def trim_hanging_lines(s: str, n: int) -> str:
    """Drop the first ``n`` lines of ``s``, stripping whitespace as it goes.

    The text is stripped once up front and again after each removed line, so
    blank lines directly following a removed line are discarded with it.

    Args:
        s: The text to trim.
        n: Number of leading lines to remove.

    Returns:
        The remaining text, or ``""`` when ``s`` runs out of lines before
        ``n`` of them have been removed.
    """
    s = s.strip()
    for _ in range(n):
        # partition() yields an empty tail when no newline is left, whereas
        # split("\n", 1)[1] would raise IndexError on a too-short input.
        _head, _sep, s = s.partition("\n")
        s = s.strip()
    return s


if __name__ == "__main__":
    # Every table is indexed by question_id so the per-question lookups below
    # are plain dict accesses.
    questions = read_jsonl("table/question.jsonl", key="question_id")

    # Answer tables, one per model.
    answers = {
        "alpaca": read_jsonl(
            "table/answer/answer_alpaca-13b.jsonl", key="question_id"
        ),
        "bard": read_jsonl("table/answer/answer_bard.jsonl", key="question_id"),
        "gpt35": read_jsonl("table/answer/answer_gpt35.jsonl", key="question_id"),
        "llama": read_jsonl(
            "table/answer/answer_llama-13b.jsonl", key="question_id"
        ),
        "vicuna": read_jsonl(
            "table/answer/answer_vicuna-13b.jsonl", key="question_id"
        ),
    }

    # Review tables, one per "<model>_vicuna-13b" review file.
    reviews = {
        "alpaca": read_jsonl(
            "table/review/review_alpaca-13b_vicuna-13b.jsonl", key="question_id"
        ),
        "bard": read_jsonl(
            "table/review/review_bard_vicuna-13b.jsonl", key="question_id"
        ),
        "gpt35": read_jsonl(
            "table/review/review_gpt35_vicuna-13b.jsonl", key="question_id"
        ),
        "llama": read_jsonl(
            "table/review/review_llama-13b_vicuna-13b.jsonl", key="question_id"
        ),
    }

    # Key order of the nested objects in the emitted JSON.
    answer_keys = ("alpaca", "llama", "bard", "gpt35", "vicuna")
    review_keys = ("alpaca", "llama", "bard", "gpt35")

    records = []
    for qid, question in questions.items():
        record = {
            "id": qid,
            "category": question["category"],
            "question": question["text"],
            "answers": {name: answers[name][qid]["text"] for name in answer_keys},
            "evaluations": {
                name: reviews[name][qid]["text"] for name in review_keys
            },
            "scores": {name: reviews[name][qid]["score"] for name in review_keys},
        }

        # Tidy up the review texts.
        polished = {}
        for name, text in record["evaluations"].items():
            body = text.strip()
            # Discard the first line when it starts with a pair of numbers.
            first_line, _, remainder = body.partition("\n")
            if re.match(r"\d+[, ]+\d+", first_line):
                body = remainder
            # Render the assistant labels in bold (Markdown).
            body = body.replace("Assistant 1", "**Assistant 1**")
            polished[name] = body.replace("Assistant 2", "**Assistant 2**")

        record["evaluations"] = polished
        records.append(record)

    # Renumber the records (optional). The three steps run in sequence on each
    # record's own id, so one loop over the records is sufficient.
    for record in records:
        new_id = record["id"]

        # Step 1: ids up to 20 move up by 60, all others move down by 20.
        if new_id <= 20:
            new_id += 60
        else:
            new_id -= 20

        # Step 2: ids up to 50 move up by 10, ids in (50, 60] move down by 50.
        if new_id <= 50:
            new_id += 10
        elif 50 < new_id <= 60:
            new_id -= 50

        # Step 3: id 7 becomes 1, ids below 7 move up by one.
        if new_id == 7:
            new_id = 1
        elif new_id < 7:
            new_id += 1

        record["id"] = new_id

    records = sorted(records, key=lambda rec: rec["id"])

    # Write the result to disk.
    payload = {"questions": records, "models": models}
    with open("webpage/data.json", "w") as out:
        json.dump(payload, out, indent=2)