m7mdal7aj committed
Commit 57086c5
1 Parent(s): cb8ebbd

Update my_model/results/evaluation.py

Files changed (1)
  1. my_model/results/evaluation.py +16 -0
my_model/results/evaluation.py CHANGED
@@ -31,6 +31,7 @@ class KBVQAEvaluator:
         gpt4_max_tokens (int): Maximum tokens for GPT-4 responses.
         gpt4_temperature (float): Temperature setting for GPT-4 responses.
     """
+

     def __init__(self) -> None:
         """
@@ -55,6 +56,7 @@ class KBVQAEvaluator:
         self.gpt4_max_tokens = config.GPT4_MAX_TOKENS
         self.gpt4_temperature = config.GPT4_TEMPERATURE

+
     def stem_answers(self, answers: Union[str, List[str]]) -> Union[str, List[str]]:
         """
         Apply Porter Stemmer to either a single string or a list of strings.
@@ -72,6 +74,7 @@ class KBVQAEvaluator:
         words = answers.split()
         return " ".join(self.stemmer.stem(word.strip()) for word in words)

+
     def calculate_vqa_score(self, ground_truths: List[str], model_answer: str) -> float:
         """
         Calculate VQA score based on the number of matching answers, with optional fuzzy matching.
@@ -91,6 +94,7 @@ class KBVQAEvaluator:
         count = Counter(ground_truths)
         return min(count.get(model_answer, 0) / 3, 1)

+
     def calculate_exact_match_score(self, ground_truths: List[str], model_answer: str) -> int:
         """
         Calculate Exact Match score, with optional fuzzy matching.
@@ -108,10 +112,13 @@ class KBVQAEvaluator:
         else:
             return int(model_answer in ground_truths)

+
     def syntactic_evaluation(self) -> None:
         """
         Process the DataFrame: stem answers, calculate scores, and store results.

+        Returns:
+            None.
         """

         self.df['raw_answers_stemmed'] = self.df['raw_answers'].apply(literal_eval).apply(self.stem_answers)
@@ -127,6 +134,7 @@ class KBVQAEvaluator:
         self.vqa_scores[full_config] = round(self.df[f'vqa_score_{full_config}'].mean()*100, 2)
         self.exact_match_scores[full_config] = round(self.df[f'exact_match_score_{full_config}'].mean()*100, 2)

+
     def create_GPT4_messages_template(self, question: str, ground_truths: List[str], model_answer: str) -> List[dict]:
         """
         Create a message list for the GPT-4 API call based on the question, ground truths, and model answer.
@@ -158,6 +166,9 @@ class KBVQAEvaluator:
     def semantic_evaluation(self) -> None:
         """
         Perform semantic evaluation using GPT-4 for each model configuration.
+
+        Returns:
+            None.
         """
         openai.api_key = self.openai_api_key
         model_configurations_for_semantic_evaluation = self.model_configurations[:2]  # considering only the main model configs ['caption+detic', 'caption+yolov5'] without ablation, due to the cost involved
@@ -192,6 +203,8 @@ class KBVQAEvaluator:
             self.df.to_excel(writer, sheet_name='Main Data', index=False)
             scores_df.to_excel(writer, sheet_name='Scores', index=False)

+
+
 def run_evaluation(save: bool = False, save_filename: str = "results") -> None:
     """
     Run the full evaluation process using KBVQAEvaluator and save the results to an Excel file.
@@ -199,6 +212,9 @@ def run_evaluation(save: bool = False, save_filename: str = "results") -> None:
     Args:
         save (bool): Whether to save the results to an Excel file. Defaults to False.
         save_filename (str): The filename to save the results if save is True. Defaults to "results".
+
+    Returns:
+        None.
     """

     # Instantiate the evaluator
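For context, calculate_vqa_score (unchanged by this commit) implements the standard VQA soft-accuracy rule: an answer earns full credit once at least 3 annotators gave it, which is exactly what min(count / 3, 1) computes. A minimal standalone sketch of the same rule, mirroring the method body above:

from collections import Counter

def vqa_score(ground_truths, model_answer):
    # Full credit once 3 of the (typically 10) annotators agree; partial credit below that.
    count = Counter(ground_truths)
    return min(count.get(model_answer, 0) / 3, 1)

print(vqa_score(["cat", "cat", "cat", "dog"], "cat"))  # 1.0 (3 matches)
print(vqa_score(["cat", "cat", "dog"], "cat"))         # 0.67 (2 matches)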
 
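Before either score is computed, stem_answers normalizes answers with NLTK's Porter stemmer, so inflectional variants ("running" vs. "run") still match. The same normalization in isolation (a sketch assuming the nltk package; the class keeps its stemmer in self.stemmer):

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stem_text(text: str) -> str:
    # Mirrors the single-string branch of stem_answers: stem each whitespace-separated token.
    return " ".join(stemmer.stem(word.strip()) for word in text.split())

print(stem_text("running dogs"))  # -> "run dog"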
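create_GPT4_messages_template returns a message list in the OpenAI chat format, i.e. a list of {"role": ..., "content": ...} dicts. The actual prompt text lives in the class; only the shape is shown here, with illustrative wording that is not the real prompt:

def make_messages(question: str, ground_truths: list, model_answer: str) -> list:
    # Hypothetical wording -- only the structure matches what the chat API expects.
    return [
        {"role": "system", "content": "You judge whether a VQA answer matches the ground truths."},
        {"role": "user", "content": f"Question: {question}\nGround truths: {ground_truths}\nModel answer: {model_answer}"},
    ]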
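With this change applied, the entry point is unchanged. A sketch of how it is invoked, assuming the repo's package layout matches the file path above (note that semantic_evaluation requires a valid OpenAI API key and, per the inline comment, only the two main configs are scored with GPT-4 to limit cost):

from my_model.results.evaluation import run_evaluation

# Runs syntactic (stemmed VQA / exact-match) scoring plus GPT-4 semantic scoring,
# then writes the per-sample data and aggregate scores to an Excel workbook.
run_evaluation(save=True, save_filename="results")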