import json
import os
import time
from copy import deepcopy
from pathlib import Path

import numpy as np
import openai
import requests
import yaml
from loguru import logger as eval_logger
from openai import OpenAI

NUM_SECONDS_TO_SLEEP = 0.5

FERRET_W_METRICS = ["gpt_eval_ferret_refer_desc", "gpt_eval_ferret_refer_reason", "gpt_eval_ferret_ground_conv"]

# rule.json maps each evaluation category to the judging prompt and role used
# when building the GPT review request below.
rule_dict = json.load(open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "rule.json"), "r"))

with open(Path(__file__).parent / "ferret.yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # Drop lines with the custom !function tag so yaml.safe_load can parse the rest.
        if "!function" not in line:
            safe_data.append(line)

    config = yaml.safe_load("".join(safe_data))

GPT_EVAL_MODEL_NAME = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")

API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
elif API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
    headers = {
        "api-key": API_KEY,
        "Content-Type": "application/json",
    }
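
# Illustrative environment setup (variable names taken from the os.getenv calls
# above; the values are placeholders):
#   export API_TYPE=openai
#   export MODEL_VERSION=gpt-4o-2024-11-20
#   export OPENAI_API_KEY=sk-...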


def get_eval(content: str, max_tokens: int, retries: int = 3):
    global headers

    messages = [
        {
            "role": "system",
            "content": "You are a helpful and precise assistant for checking the quality of the answer.",
        },
        {"role": "user", "content": content},
    ]

    payload = {
        "model": GPT_EVAL_MODEL_NAME,
        "messages": messages,
        "temperature": 0.2,
        "max_tokens": max_tokens,
    }

    for attempt in range(retries):
        try:
            response = requests.post(API_URL, headers=headers, json=payload)
            response.raise_for_status()
            response_data = response.json()

            content = response_data["choices"][0]["message"]["content"].strip()
            if content != "":
                return content, response_data["model"]
            # An empty review is not retried; fall through to the empty result below.
            break

        except Exception as e:
            eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
            if attempt < retries - 1:
                time.sleep(NUM_SECONDS_TO_SLEEP)
            else:
                eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
                return "", ""
    return "", ""


def parse_score(review):
    try:
        # The judge is expected to put the two scores on the first line, e.g. "8 7".
        score_pair = review.split("\n")[0].strip()
        score_pair = score_pair.replace(",", " ")
        sp = score_pair.split(" ")
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            eval_logger.debug(f"Cannot split: {review}. Returning [-1, -1]")
            return [-1, -1]
    except Exception as e:
        eval_logger.debug(f"Error: {e}. Returning [-1, -1]")
        return [-1, -1]
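
# parse_score example (illustrative): a review whose first line is "8 7" parses to
# [8.0, 7.0]; any other first-line format falls back to the [-1, -1] sentinel.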


def ferret_doc_to_visual(doc):
    return [doc["image"].convert("RGB")]


def ferret_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    if lmms_eval_specific_kwargs is None:
        lmms_eval_specific_kwargs = {}
    pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
    post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
    question = f"{pre_prompt}{doc['question']}{post_prompt}"
    return question
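
# ferret_doc_to_text example (illustrative): with the default empty pre_prompt and
# post_prompt, the prompt is just doc["question"]; task-specific wrappers, when
# configured, are prepended and appended verbatim.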


def ferret_process_results(doc, result):
    """
    Args:
        doc: an instance of the eval dataset
        result: [pred]
    Returns:
        a dictionary with key: metric name (e.g. gpt_eval_ferret_refer_desc), value: metric value
    """
    try:
        question = doc.get("question", "")
        ans1 = doc.get("gpt_answer", "")
        ans2 = result[0] if result else ""
        context = doc.get("context", [])
        context = "\n".join(context) if isinstance(context, list) else context
        category = doc.get("category", "")
        rule = rule_dict.get(category, {})
        prompt = rule.get("prompt", "")
        role = rule.get("role", "user")
        # Build the judge prompt: answer 1 is the GPT reference, answer 2 is the model prediction.
        content = f"[Context]\n{context}\n\n" f"[Question]\n{question}\n\n" f"[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n" f"[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n" f"[System]\n{prompt}\n\n"
        review, model_name = get_eval(content, 1024)
        scores = parse_score(review)
    except Exception as e:
        eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}")
        review = "Failed to Get a Proper Review."
        model_name = "Failed Request"
        scores = [-1, -1]

    metric = f"gpt_eval_ferret_{doc.get('category', 'all')}"
    category_review_dict = {
        "question": question,
        "ans1": ans1,
        "ans2": ans2,
        "context": context,
        "category": category,
        "review": review,
        "scores": scores,
        "eval_model": model_name,
    }

    non_category_review_dict = deepcopy(category_review_dict)
    non_category_review_dict["scores"] = [-999, -999]

    data_dict = {}
    for m in FERRET_W_METRICS:
        if m == metric:
            data_dict[m] = category_review_dict
        else:
            data_dict[m] = non_category_review_dict
    data_dict["gpt_eval_ferret_all"] = category_review_dict

    return data_dict
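
# Illustrative shape of the returned dict (placeholder values, assuming a
# refer_desc sample): the sample carries real scores only under its own category
# metric and under gpt_eval_ferret_all; the rest hold the [-999, -999] sentinel.
#   {
#       "gpt_eval_ferret_refer_desc": {"scores": [8.0, 7.0], ...},
#       "gpt_eval_ferret_refer_reason": {"scores": [-999, -999], ...},
#       "gpt_eval_ferret_ground_conv": {"scores": [-999, -999], ...},
#       "gpt_eval_ferret_all": {"scores": [8.0, 7.0], ...},
#   }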


def ferret_refer_desc_aggregation(results):
    return ferret_aggregation(results, "refer_desc")


def ferret_refer_reason_aggregation(results):
    return ferret_aggregation(results, "refer_reason")


def ferret_ground_conv_aggregation(results):
    return ferret_aggregation(results, "ground_conv")


def ferret_all_aggregation(results):
    return ferret_aggregation(results, "all")


def ferret_aggregation(results, category):
    try:
        scores = []
        for result in results:
            # Ignore the [-999, -999] sentinel entries produced for samples from other categories.
            if -999 in result["scores"]:
                continue
            scores.append(result["scores"])

        stats = np.asarray(scores).mean(0).tolist()
        stats = [round(x, 3) for x in stats]

        # stats[0] is the mean reference (GPT) score, stats[1] the mean model score;
        # report the model score as a percentage of the reference.
        return round(stats[1] / stats[0] * 100, 1)
    except Exception as e:
        eval_logger.info(f"Error in ferret_aggregation: {e}, and in category: {category}")
        return None