"""Generate json file for webpage."""
import json
import os
import re
# Model identifiers written under the "models" key of the generated JSON.
# They match the keys of each record's "evaluations" and "scores" objects.
models = ["alpaca", "llama", "gpt35", "bard"]
def read_jsonl(path: str, key: str = None):
    """Load a JSON Lines file, optionally indexing the records by a field.

    Args:
        path: File to read; a leading ``~`` is expanded to the home directory.
        key: Optional field name. When given, the records are sorted by that
            field and returned as a dict keyed by its value.

    Returns:
        A list of the parsed records in file order when ``key`` is ``None``;
        otherwise a dict mapping ``record[key]`` to the record, inserted in
        ascending key order. If several records share a key value, the one
        appearing last in the file is kept.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If ``key`` is given and a (dict) record does not contain it.
    """
    records = []
    with open(os.path.expanduser(path), encoding="utf-8") as f:
        for line in f:
            # Lines yielded by the file iterator keep their trailing newline,
            # so a blank line is "\n" (truthy). Strip before testing, otherwise
            # blank or whitespace-only lines reach json.loads and raise.
            line = line.strip()
            if not line:
                continue
            records.append(json.loads(line))
    if key is not None:
        records.sort(key=lambda item: item[key])
        return {item[key]: item for item in records}
    return records
def trim_hanging_lines(s: str, n: int) -> str:
    """Drop the first ``n`` lines of ``s``.

    The text is stripped of surrounding whitespace first, and again after each
    line is removed, so blank lines directly following a removed line are
    consumed as well.

    Args:
        s: Text to trim.
        n: Number of leading lines to remove.

    Returns:
        The remaining text, stripped. An empty string is returned when ``s``
        runs out of lines before ``n`` of them have been removed.
    """
    s = s.strip()
    for _ in range(n):
        # partition() yields an empty remainder when no newline is left,
        # where indexing the result of split() would raise IndexError.
        _, _, s = s.partition("\n")
        s = s.strip()
    return s
if __name__ == "__main__":
    # Every table is indexed by question_id so per-question lookups line up.
    questions = read_jsonl("table/question.jsonl", key="question_id")

    # Input files per model; dict order is the order the files are read.
    answer_paths = {
        "alpaca": "table/answer/answer_alpaca-13b.jsonl",
        "bard": "table/answer/answer_bard.jsonl",
        "gpt35": "table/answer/answer_gpt35.jsonl",
        "llama": "table/answer/answer_llama-13b.jsonl",
        "vicuna": "table/answer/answer_vicuna-13b.jsonl",
    }
    review_paths = {
        "alpaca": "table/review/review_alpaca-13b_vicuna-13b.jsonl",
        "bard": "table/review/review_bard_vicuna-13b.jsonl",
        "gpt35": "table/review/review_gpt35_vicuna-13b.jsonl",
        "llama": "table/review/review_llama-13b_vicuna-13b.jsonl",
    }
    answers = {
        name: read_jsonl(path, key="question_id")
        for name, path in answer_paths.items()
    }
    reviews = {
        name: read_jsonl(path, key="question_id")
        for name, path in review_paths.items()
    }

    # Key order of the nested objects in the emitted JSON.
    answer_order = ("alpaca", "llama", "bard", "gpt35", "vicuna")
    review_order = ("alpaca", "llama", "bard", "gpt35")

    records = []
    for qid, question in questions.items():
        entry = {
            "id": qid,
            "category": question["category"],
            "question": question["text"],
            "answers": {name: answers[name][qid]["text"] for name in answer_order},
            "evaluations": {
                name: reviews[name][qid]["text"] for name in review_order
            },
            "scores": {name: reviews[name][qid]["score"] for name in review_order},
        }

        # Tidy each review: drop a leading line that starts with a pair of
        # numbers, then bold the assistant labels.
        tidied = {}
        for name, review_text in entry["evaluations"].items():
            body_lines = review_text.strip().split("\n")
            if re.match(r"\d+[, ]+\d+", body_lines[0]):
                del body_lines[0]
            text = "\n".join(body_lines)
            text = text.replace("Assistant 1", "**Assistant 1**")
            text = text.replace("Assistant 2", "**Assistant 2**")
            tidied[name] = text
        entry["evaluations"] = tidied
        records.append(entry)

    # Optional renumbering of the records. Three remapping steps are applied
    # in sequence; each step depends only on the record's own id, so they are
    # chained per record.
    for entry in records:
        new_id = entry["id"]

        # Step 1: ids up to 20 move up by 60, all others move down by 20.
        if new_id <= 20:
            new_id = new_id + 60
        else:
            new_id = new_id - 20

        # Step 2: ids up to 50 move up by 10; ids in (50, 60] move down by 50.
        if new_id <= 50:
            new_id = new_id + 10
        elif 50 < new_id <= 60:
            new_id = new_id - 50

        # Step 3: id 7 becomes 1, and ids below 7 move up by one.
        if new_id == 7:
            new_id = 1
        elif new_id < 7:
            new_id = new_id + 1

        entry["id"] = new_id

    records.sort(key=lambda entry: entry["id"])

    # Emit the data file consumed by the webpage.
    payload = {"questions": records, "models": models}
    with open("webpage/data.json", "w") as out:
        json.dump(payload, out, indent=2)
 |