LastingBench / utils /llmjudge.py
kixx's picture
Upload 34 files
b1e25b1 verified
import logging
import os
def judge_answer_with_api(question, target, answer):
from openai import OpenAI
client = OpenAI(
base_url=os.environ.get("OPENAI_BASE_URL"),
api_key=os.environ.get("OPENAI_API_KEY")
)
prompt = (
"You will be given a question, a target answer (maybe a list of all possible answers), "
"and a generated answer. Please judge whether the generated answer is correct. "
"If it is correct, return 'True'. If it is incorrect, return 'False'.\n"
f"Question: {question}\n"
f"Target Answer: {target}\n"
f"Generated Answer: {answer}\n"
"Please return only 'True' or 'False', without any other text."
)
try:
response = client.chat.completions.create(
model="gpt-4o-mini-2024-07-18",
messages=[{"role": "user", "content": prompt}],
temperature=0,
max_tokens=1
)
except Exception as e:
logging.error("API call failed: %s", str(e))
return 0 # Return default value
try:
result = response.choices[0].message.content.strip()
except (AttributeError, IndexError) as e:
logging.error("Error parsing API response: %s", str(e))
return 0
if result == "True":
return 1
elif result == "False":
return 0
else:
logging.warning("Abnormal response format: %s", result)
return 0