import pandas as pd
import openai
import streamlit as st
from fuzzywuzzy import fuzz
from collections import Counter
from nltk.stem import PorterStemmer
from ast import literal_eval
from typing import Union, List, Dict

from my_model.config import evaluation_config as config


class KBVQAEvaluator:
"""
A class to evaluate Knowledge-Based Visual Question Answering (KB-VQA) models.
    This class provides methods for syntactic evaluation of the KB-VQA model (exact match
    and VQA scores, with optional fuzzy matching) and for semantic evaluation via GPT-4
    ratings. The evaluation results can be saved to an Excel file for further analysis.
Attributes:
data_path (str): Path to the evaluation data.
use_fuzzy (bool): Flag to determine if fuzzy matching should be used.
stemmer (PorterStemmer): Instance of PorterStemmer for stemming answers.
scores_df (pd.DataFrame): DataFrame containing scores.
df (pd.DataFrame): Main DataFrame containing evaluation data.
vqa_scores (Dict[str, float]): Dictionary to store VQA scores for different model configurations.
exact_match_scores (Dict[str, float]): Dictionary to store exact match scores for different model configurations.
fuzzy_threshold (int): Threshold for fuzzy matching score.
openai_api_key (str): API key for OpenAI GPT-4.
model_names (List[str]): List of model names to be evaluated.
model_configurations (List[str]): List of model configurations to be evaluated.
gpt4_seed (int): Seed for GPT-4 evaluation.
gpt4_max_tokens (int): Maximum tokens for GPT-4 responses.
gpt4_temperature (float): Temperature setting for GPT-4 responses.
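
    Example (illustrative sketch; assumes the paths and keys in `evaluation_config`
    point to valid evaluation data and, for semantic evaluation, an OpenAI API key):
        >>> evaluator = KBVQAEvaluator()
        >>> evaluator.syntactic_evaluation()
        >>> evaluator.save_results("results")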
"""
    def __init__(self) -> None:
"""
Initialize the KBVQAEvaluator with the dataset and configuration settings.
Reads data from the specified paths in the configuration and initializes
various attributes required for evaluation.
"""
self.data_path = config.EVALUATION_DATA_PATH
self.use_fuzzy = config.USE_FUZZY
self.stemmer = PorterStemmer()
self.scores_df = pd.read_excel(self.data_path, sheet_name="Scores")
self.df = pd.read_excel(self.data_path, sheet_name="Main Data")
self.vqa_scores = {}
self.exact_match_scores = {}
self.fuzzy_threshold = config.FUZZY_SCORE
self.openai_api_key = config.OPENAI_API_KEY
self.model_names = config.MODEL_NAMES
self.model_configurations = config.MODEL_CONFIGURATIONS # ['caption+detic', 'caption+yolov5', 'only_caption', 'only_detic', 'only_yolov5']
self.gpt4_seed = config.GPT4_SEED
self.gpt4_max_tokens = config.GPT4_MAX_TOKENS
self.gpt4_temperature = config.GPT4_TEMPERATURE
def stem_answers(self, answers: Union[str, List[str]]) -> Union[str, List[str]]:
"""
Apply Porter Stemmer to either a single string or a list of strings.
Args:
answers (Union[str, List[str]]): A single answer string or a list of answer strings.
Returns:
Union[str, List[str]]: Stemmed version of the input string or list of strings.
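
        Example (illustrative; `evaluator` is a KBVQAEvaluator instance and the exact
        stems depend on NLTK's PorterStemmer):
            >>> evaluator.stem_answers(["playing cards", "played card"])
            ['play card', 'play card']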
"""
if isinstance(answers, list):
return [" ".join(self.stemmer.stem(word.strip()) for word in answer.split()) for answer in answers]
else:
words = answers.split()
return " ".join(self.stemmer.stem(word.strip()) for word in words)
def calculate_vqa_score(self, ground_truths: List[str], model_answer: str) -> float:
"""
Calculate VQA score based on the number of matching answers, with optional fuzzy matching.
Args:
ground_truths (List[str]): List of ground truth answers.
model_answer (str): Model's answer to be evaluated.
Returns:
float: VQA score based on the number of matches.
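
        Example (illustrative; exact-match mode, i.e. use_fuzzy is False):
            >>> evaluator.calculate_vqa_score(['cat', 'cat', 'dog', 'cat'], 'dog')
            0.3333333333333333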
"""
if self.use_fuzzy:
fuzzy_matches = sum(fuzz.partial_ratio(model_answer, gt) >= self.fuzzy_threshold for gt in ground_truths)
return min(fuzzy_matches / 3, 1)
else:
count = Counter(ground_truths)
return min(count.get(model_answer, 0) / 3, 1)
def calculate_exact_match_score(self, ground_truths: List[str], model_answer: str) -> int:
"""
Calculate Exact Match score, with optional fuzzy matching.
Args:
ground_truths (List[str]): List of ground truth answers.
model_answer (str): Model's answer to be evaluated.
Returns:
int: Exact match score (1 if there is a match, 0 otherwise).
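
        Example (illustrative; exact-match mode, i.e. use_fuzzy is False):
            >>> evaluator.calculate_exact_match_score(['cat', 'dog'], 'bird')
            0
            >>> evaluator.calculate_exact_match_score(['cat', 'dog'], 'cat')
            1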
"""
if self.use_fuzzy:
return int(any(fuzz.partial_ratio(model_answer, gt) >= self.fuzzy_threshold for gt in ground_truths))
else:
return int(model_answer in ground_truths)
def syntactic_evaluation(self) -> None:
"""
Process the DataFrame: stem answers, calculate scores, and store results.
Returns:
None.
"""
self.df['raw_answers_stemmed'] = self.df['raw_answers'].apply(literal_eval).apply(self.stem_answers)
        for name in self.model_names:
            # Use `model_config` as the loop variable to avoid shadowing the imported `config` module.
            for model_config in self.model_configurations:
                full_config = f'{name}_{model_config}'
                self.df[f'{full_config}_stemmed'] = self.df[full_config].apply(self.stem_answers)
                self.df[f'vqa_score_{full_config}'] = self.df.apply(lambda x: self.calculate_vqa_score(x['raw_answers_stemmed'], x[f'{full_config}_stemmed']), axis=1)
                self.df[f'exact_match_score_{full_config}'] = self.df.apply(lambda x: self.calculate_exact_match_score(x['raw_answers_stemmed'], x[f'{full_config}_stemmed']), axis=1)
                self.vqa_scores[full_config] = round(self.df[f'vqa_score_{full_config}'].mean() * 100, 2)
                self.exact_match_scores[full_config] = round(self.df[f'exact_match_score_{full_config}'].mean() * 100, 2)
def create_GPT4_messages_template(self, question: str, ground_truths: List[str], model_answer: str) -> List[dict]:
"""
Create a message list for the GPT-4 API call based on the question, ground truths, and model answer.
Args:
question (str): The question being evaluated.
ground_truths (List[str]): List of ground truth answers.
model_answer (str): Model's answer to be evaluated.
Returns:
List[dict]: Messages formatted for GPT-4 API call.
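
        Example (illustrative; the question and answers are made up):
            >>> msgs = evaluator.create_GPT4_messages_template(
            ...     "What animal is shown?", ["cat", "kitten"], "a cat")
            >>> [m["role"] for m in msgs]
            ['system', 'user']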
"""
system_message = {
"role": "system",
"content": """You are an AI trained to evaluate the equivalence of AI-generated answers to a set of ground truth answers for a given question. Upon reviewing a model's answer, determine if it matches the ground truths. Use the following rating system: 1 if you find that the model answer matches more than 25% of the ground truth answers, 2 if you find that the model answer matches only less than 25% of the ground truth answers, and 3 if the model answer is incorrect. Respond in the format below for easy parsing:
Rating: {1/2/3}
"""
}
user_message = {
"role": "user",
"content": f"Question : {question}\nGround Truth: {ground_truths}\nModel's Response: {model_answer}"
}
return [system_message, user_message]
def semantic_evaluation(self) -> None:
"""
Perform semantic evaluation using GPT-4 for each model configuration.
Returns:
None.
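
        Example (illustrative; GPT-4 is prompted to reply in the form "Rating: {1/2/3}",
        which is parsed along these lines):
            >>> int("Rating: 2".split(":")[1].strip())
            2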
"""
        openai.api_key = self.openai_api_key
        # Only the two main model configurations ('caption+detic', 'caption+yolov5') are evaluated
        # semantically; the ablation configurations are skipped due to the API cost involved.
        model_configurations_for_semantic_evaluation = self.model_configurations[:2]
        for name in self.model_names:
            for model_config in model_configurations_for_semantic_evaluation:
                # Iterate over rows and send one rating request per answer.
                for index, row in self.df.iterrows():
                    # row['raw_answers'] holds a stringified list; [1:-1] drops the surrounding brackets for the prompt.
                    messages = self.create_GPT4_messages_template(row['question'], row['raw_answers'][1:-1], row[f'{name}_{model_config}'])
                    response = openai.ChatCompletion.create(model="gpt-4", messages=messages, max_tokens=self.gpt4_max_tokens, temperature=self.gpt4_temperature, seed=self.gpt4_seed)
                    evaluation = response["choices"][0]["message"]["content"]
                    rating = int(evaluation.split('\n')[0].split(":")[1].strip())  # parses "Rating: N"
                    # Key the rating column by model name and configuration, matching the naming used elsewhere.
                    self.df.at[index, f'gpt4_rating_{name}_{model_config}'] = rating
def save_results(self, save_filename: str) -> None:
"""
Save the evaluation results to an Excel file.
Args:
save_filename (str): The filename to save the results.
"""
# Create a DataFrame for the scores
scores_data = {
'Model Configuration': list(self.vqa_scores.keys()),
'VQA Score': list(self.vqa_scores.values()),
'Exact Match Score': list(self.exact_match_scores.values())
}
scores_df = pd.DataFrame(scores_data)
        # Write the main data and the scores to separate sheets of a single Excel workbook
with pd.ExcelWriter(save_filename+'.xlsx', engine='openpyxl', mode='w') as writer:
self.df.to_excel(writer, sheet_name='Main Data', index=False)
scores_df.to_excel(writer, sheet_name='Scores', index=False)
def run_evaluation(save: bool = False, save_filename: str = "results") -> None:
"""
Run the full evaluation process using KBVQAEvaluator and save the results to an Excel file.
Args:
save (bool): Whether to save the results to an Excel file. Defaults to False.
save_filename (str): The filename to save the results if save is True. Defaults to "results".
Returns:
None.
"""
# Instantiate the evaluator
evaluator = KBVQAEvaluator()
# Run syntactic evaluation
evaluator.syntactic_evaluation()
    # Run semantic evaluation (GPT-4 based; can be cost-intensive)
evaluator.semantic_evaluation()
if save:
# Save results
evaluator.save_results(save_filename)
# Call run_evaluation() to execute the evaluation process
if __name__ == "__main__":
    # run_evaluation(save=True, save_filename="results")
    pass