import re
import json

import jsonlines
from openai import OpenAI


def batch_eval(query_file, result1_file, result2_file, output_file_path):
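    """Build OpenAI Batch API requests that ask an LLM judge to compare two
    sets of answers to the same questions, write the requests to a JSONL
    file, and submit that file as a batch job."""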
    client = OpenAI()
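
    # Extract the questions, expected as lines of the form "- Question <n>: ...".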
    with open(query_file, "r") as f:
        data = f.read()
    queries = re.findall(r"- Question \d+: (.+)", data)
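
    # Load both result files; each is a JSON array of objects with a "result" field.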
    with open(result1_file, "r") as f:
        answers1 = json.load(f)
    answers1 = [i["result"] for i in answers1]

    with open(result2_file, "r") as f:
        answers2 = json.load(f)
    answers2 = [i["result"] for i in answers2]
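
    # Build one chat-completion request per (query, answer1, answer2) triple.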
    requests = []
    for i, (query, answer1, answer2) in enumerate(zip(queries, answers1, answers2)):
        sys_prompt = """
        ---Role---
        You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.
        """
| prompt = f""" | |
| You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**. | |
| - **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question? | |
| - **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question? | |
| - **Empowerment**: How well does the answer help the reader understand and make informed judgments about the topic? | |
| For each criterion, choose the better answer (either Answer 1 or Answer 2) and explain why. Then, select an overall winner based on these three categories. | |
| Here is the question: | |
| {query} | |
| Here are the two answers: | |
| **Answer 1:** | |
| {answer1} | |
| **Answer 2:** | |
| {answer2} | |
| Evaluate both answers using the three criteria listed above and provide detailed explanations for each criterion. | |
| Output your evaluation in the following JSON format: | |
| {{ | |
| "Comprehensiveness": {{ | |
| "Winner": "[Answer 1 or Answer 2]", | |
| "Explanation": "[Provide explanation here]" | |
| }}, | |
| "Empowerment": {{ | |
| "Winner": "[Answer 1 or Answer 2]", | |
| "Explanation": "[Provide explanation here]" | |
| }}, | |
| "Overall Winner": {{ | |
| "Winner": "[Answer 1 or Answer 2]", | |
| "Explanation": "[Summarize why this answer is the overall winner based on the three criteria]" | |
| }} | |
| }} | |
| """ | |
        request_data = {
            "custom_id": f"request-{i+1}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "messages": [
                    {"role": "system", "content": sys_prompt},
                    {"role": "user", "content": prompt},
                ],
            },
        }
        requests.append(request_data)
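
    # Write one JSON object per line, the format the Batch API expects.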
    with jsonlines.open(output_file_path, mode="w") as writer:
        for request in requests:
            writer.write(request)

    print(f"Batch API requests written to {output_file_path}")
    batch_input_file = client.files.create(
        file=open(output_file_path, "rb"), purpose="batch"
    )
    batch_input_file_id = batch_input_file.id

    batch = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"description": "nightly eval job"},
    )
    print(f"Batch {batch.id} has been created.")


if __name__ == "__main__":
    # Example invocation; the file paths are placeholders, not part of the
    # original script. Replace them with your own query and result files.
    batch_eval(
        "queries.txt",
        "result1.json",
        "result2.json",
        "batch_requests.jsonl",
    )