import json
import os
from argparse import ArgumentParser

import torch
from human_eval.evaluation import evaluate_functional_correctness
from tqdm import tqdm
from transformers import AutoTokenizer
from utils.dataset import HumanEvalDataset
from utils.utils import cleanup_code
from vllm import LLM, SamplingParams


class HumanEval:
    """
    HumanEval evaluation class.
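
    Typical usage (arguments are illustrative; see the __main__ block below):

        evaluator = HumanEval(data_root="HumanEval/data", log_dir="output/tmp/")
        evaluator.eval_model(args)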
    """

    def __init__(
        self,
        data_root,
        language="python",
        log_dir=None,
        issft=False,
        inference_increment=True,
        n_sample=1,
        k_sample=1,
    ):
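        # n_sample: completions generated per problem; k_sample: the k
        # reported as pass@k by _calculate_final_score.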
        self.data_root = data_root
        self.k = k_sample
        self.n_sample = n_sample
        self.language = language
        self.log_dir = log_dir
        self.sft = issft
        self.inference_increment = inference_increment
        os.makedirs(self.log_dir, exist_ok=True)

    @torch.no_grad()
    def eval_model(self, args):
        """
        Evaluate the model on HumanEval.
        """
        assert (
            self.log_dir is not None
        ), "log_dir should not be None when evaluating humaneval"
        dataset = HumanEvalDataset(
            self.data_root,
            sample_num=self.n_sample,
            language=self.language,
            issft=self.sft,
        )
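        # Each dataset item provides "prompt" (model input), "original_prompt"
        # (the unmodified problem), and "task_id", as used in the loop below.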
        model_name_or_path = args.model_path
        print("model", model_name_or_path)
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        print(
            "load tokenizer {} from {} over.".format(
                tokenizer.__class__, model_name_or_path
            )
        )

        llm = LLM(
            model=model_name_or_path,
            tensor_parallel_size=1,
            max_model_len=4096,
            trust_remote_code=True,
            enforce_eager=True,
        )
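        # temperature=0 selects greedy decoding in vLLM, so top_p below has
        # no practical effect; generation stops at the tokenizer's EOS token.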
        sampling_params = SamplingParams(
            temperature=0,
            max_tokens=1024,
            top_p=0.95,
            stop_token_ids=[tokenizer.eos_token_id],
        )
        prompts = [dataset[i]["prompt"].strip() for i in range(len(dataset))]
        outputs = llm.generate(prompts, sampling_params=sampling_params)
        assert len(dataset) == len(outputs), "dataset and outputs have different lengths."
        log_file = os.path.join(self.log_dir, f"{self.language}.json")
        with open(log_file, "w") as tmpfile:
            for i, output in enumerate(tqdm(outputs)):
                data = dataset[i]
                output = output.outputs[0].text
                output = cleanup_code(
                    output,
                    self.language,
                    "humaneval",
                    self.sft,
                    dataset.stopwords,
                )
                # In non-sft mode, prepend the original prompt so the
                # completion is evaluated together with its function
                # signature; in sft mode the output already stands alone.
                if not self.sft:
                    suffixprediction = data["original_prompt"] + "\n" + output
                else:
                    suffixprediction = output
                res = {
                    "task_id": data["task_id"],
                    "generation": suffixprediction,
                    "prompt": data["original_prompt"],
                }
                tmpfile.write(json.dumps(res) + "\n")

        # calculate the final score of pass@k
        self._calculate_final_score(log_file)
        return

    def _calculate_final_score(self, logfilepath):
        """
        Calculate the final score.
        """
        res = evaluate_functional_correctness(
            input_file=logfilepath,
            problem_file=os.path.join(
                self.data_root, f"humaneval-{self.language}.jsonl"
            ),
            tmp_dir=self.log_dir,
            language=self.language,
        )
        print("score is", res["pass@%d" % self.k])
        os.remove(logfilepath)
        return


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--logdir", type=str, default="")
    parser.add_argument(
        "--model_path",
        type=str,
        help="model name or path",
        default="/data0/pretrained-models/qwen2-7b",
    )

    parser.add_argument("--language", type=str, default="python")
    parser.add_argument(
        "--dataroot",
        type=str,
        default="HumanEval/data",
    )
    args = parser.parse_args()

    logdir = args.logdir or "output/tmp/"
    language = args.language

    evaluator = HumanEval(
        data_root=args.dataroot,
        log_dir=logdir,
        n_sample=1,
        language=language,
    )
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    evaluator.eval_model(args)
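
# Example invocation (script name and model path are illustrative):
#   python eval_humaneval.py --model_path /path/to/model \
#       --dataroot HumanEval/data --logdir output/humaneval/ --language python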