use evaluate metric for evaluation

example_script.py (+6 -5)
@@ -7,7 +7,7 @@ import pprint
 from tqdm import tqdm
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, set_seed
-from
+from evaluate import load
 
 def generate_prompt(sample):
     starter_code = None if len(sample["starter_code"]) == 0 else sample["starter_code"]
@@ -91,11 +91,12 @@ def main(args):
     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
     model = AutoModelForCausalLM.from_pretrained(args.model_ckpt)
     generations = make_generations(dataset, args, model, tokenizer)
-
-
-
+
+    metric = load("loubnabnl/apps_metric")
+    results = metric.compute(predictions=generations, level=args.difficulty, k_list=args.k_list, count_errors=args.count_errors, debug=args.debug)
+    print(results)
     with open(args.output_file, "w") as fp:
-        json.dump(
+        json.dump(results, fp)
 
 
 if __name__ == "__main__":
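With this change, evaluation goes through the evaluate library: the script loads the loubnabnl/apps_metric Space as a metric and passes the generations to metric.compute. Below is a minimal standalone sketch of the same call, assuming the evaluate library is installed; predictions is a list of candidate-solution lists (one inner list per APPS problem), and the concrete level, k_list, count_errors, and debug values are illustrative stand-ins for the script's CLI arguments (args.difficulty, args.k_list, and so on), not values from this commit.

from evaluate import load

# Load the APPS metric hosted as a Hugging Face Space.
metric = load("loubnabnl/apps_metric")

# One inner list of generated programs per APPS problem
# (a single toy candidate here, purely for illustration).
predictions = [
    ["def solve():\n    print(1)"],
]

# The keyword arguments mirror what the script forwards from its CLI args;
# the values chosen below are assumptions for this sketch.
results = metric.compute(
    predictions=predictions,
    level="introductory",
    k_list=[1],
    count_errors=True,
    debug=False,
)
print(results)  # pass@k scores for the requested k values

Note that compute runs each candidate program against the APPS test cases, so a real invocation downloads the dataset and can take a long time for large prediction sets.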