import os
import re
import pytz
import datetime
import pandas as pd
from openai import OpenAI
import matplotlib.pyplot as plt
class FreshQAEvaluator:
def __init__(self):
self.openai_client = OpenAI()
self.openai_model_name = "gpt-4o-mini"
self.current_date = datetime.datetime.now(pytz.timezone("America/Los_Angeles")).strftime("%B %d, %Y")
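        # FreshQA grading is date-sensitive; "today" is pinned to US Pacific
        # time once at construction so every prompt in a run sees the same date.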
self.prefix = (
"Please evaluate the response to a question under strict evaluation, where"
" no hallucinations, outdated information, or ill-formed answers are"
" allowed. Please credit the response only if it provides a confident and"
" definitive answer, or the correct answer can be obviously inferred from"
" the response. The primary or final answer when standing alone must be"
" accurate. Any additional information that is provided must not contradict"
" the primary answer or reshape one's perception of it. For false-premise"
" questions, the response must point out the presence of a false premise to"
" receive credit. For answers that involve names of entities (e.g.,"
" people), complete names or commonly recognized names are expected."
" Regarding numerical answers, approximate numbers are generally not"
" accepted unless explicitly included in the ground-truth answers. A"
" response that contains any hallucination, no matter how minor, will not"
" receive credit. Furthermore, when the response indicates that the"
" information might be outdated, we accept it only if it is evident that"
" the knowledge has not changed (e.g., through common sense or well-known"
" facts)."
)
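        # Five few-shot grading demos spanning the rubric: a credited valid
        # answer, a credited false-premise rebuttal, and three uncredited
        # responses (missed false premise, answer only implied, wrong
        # standalone final answer).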
self.demo_examples = [
{
"question": "How old is the world's oldest verified living person?",
"correct_answers": [
"116 years old",
"116",
],
"response": (
f"As of today {self.current_date}, the most up-to-date and relevant"
" information regarding this query is as follows. The oldest"
" verified living person is Maria Branyas Morera, who was born"
" on March 4, 1907, making her 116 years old."
),
"comment": (
"This is a valid question. The primary answer in the response"
" (116 years old) is accurate, and all the provided information"
" is accurate and up-to-date. Thus, the response is credited."
),
"evaluation": "correct",
},
{
"question": "When did the UK adopt the Euro?",
"correct_answers": ["The United Kingdom has never adopted the Euro."],
"response": (
"The UK has never adopted the Euro as its official currency. The"
" country has retained the British pound sterling (GBP) as its"
" currency."
),
"comment": (
"The question contains a false premise. The response debunks the"
" question's false premise (i.e., it points out that the UK has"
" never adopted the Euro), and all the provided information is"
" accurate and up-to-date. Thus, the response is credited."
),
"evaluation": "correct",
},
{
"question": (
"How much longer will Brittney Griner spend in the Russian prison?"
),
"correct_answers": [
"She was released in December 2022 as part of a prisoner swap."
],
"response": (
"I'm sorry, but I have no information to suggest that Brittney"
" Griner is currently in a Russian prison. As far as I know, she"
" is a professional basketball player who currently plays for the"
" Phoenix Mercury in the Women's National Basketball Association"
" (WNBA). If you have any more information or context, please let"
" me know and I'll do my best to help."
),
"comment": (
"The question contains a false premise. The response fails to"
" debunk the question's false premise by not explicitly stating"
" that Brittney Griner was already released. Additionally, the"
" response lacks a confident and definitive answer. Thus, the"
" response is not credited."
),
"evaluation": "incorrect",
},
{
"question": (
"Which language has the third largest population by number of"
" native speakers?"
),
"correct_answers": ["English"],
"response": "1. Mandarin 2. Spanish 3. English",
"comment": (
"This is a valid question. While the correct answer (English) can"
" be inferred from the response, the response lacks an explicit"
" statement, which is not desirable."
" Thus, the response is not credited."
),
"evaluation": "incorrect",
},
{
"question": (
"Has Virginia Woolf's novel about the Ramsay family entered the"
" public domain in the United States?"
),
"correct_answers": ["Yes"],
"response": (
"1927's To the Lighthouse is in the public domain in the United"
" States. The final answer is To the Lighthouse."
),
"comment": (
"This is a valid question. While the primary information in the"
" response (1927's To the Lighthouse is in the public domain in"
" the United States) is accurate, the final answer when standing"
' alone is inaccurate (it should be "Yes" instead of "To the'
' Lighthouse"). Thus, the response is not credited.'
),
"evaluation": "incorrect",
},
]
        self.demo_questions = [ex["question"] for ex in self.demo_examples]
        self.demo_evaluation_template = (
            "\ncorrect answer(s): {correct_answers}"
            "\nresponse: {response}"
            "\ncomment: {comment}"
            "\nevaluation: {evaluation}"
        )
        self.evaluation_template = (
            "\ncorrect answer(s): {correct_answers}"
            "\nresponse: {response}"
            "\ncomment: "
        )
        # Render the few-shot demo evaluations once at construction time
        self.demo_evaluations = [
            self.demo_evaluation_template.format(
                correct_answers=" | ".join(ex["correct_answers"]),
                response=ex["response"],
                comment=ex["comment"],
                evaluation=ex["evaluation"],
            )
            for ex in self.demo_examples
        ]
self.df = pd.DataFrame()
    def call_openai_api(self, prompt, temperature, max_tokens, chat_completions=True):
        """
        Call the OpenAI API to generate a response to the given prompt.
        """
        # If chat completions are enabled, use the chat completions endpoint
        if chat_completions:
response = self.openai_client.chat.completions.create(
model=self.openai_model_name,
temperature=temperature,
max_tokens=max_tokens,
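                # A scripted date exchange below anchors the model's notion of
                # "today" to self.current_date before the actual prompt.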
messages=[
{
"role": "system",
"content": (
"You are a helpful assistant. Respond as concisely as"
f" possible. Knowledge cutoff: {self.current_date}."
),
},
{"role": "user", "content": "What's today's date?"},
{
"role": "assistant",
"content": f"Today is {self.current_date} in Pacific Standard Time.",
},
{"role": "user", "content": prompt},
],
)
return response.choices[0].message.content
# If chat completions are disabled, use the completions endpoint
else:
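            # NOTE: chat-only models such as gpt-4o-mini are not available on
            # the legacy completions endpoint; this branch assumes a
            # completions-capable model is configured instead.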
response = self.openai_client.completions.create(
model=self.openai_model_name,
temperature=temperature,
max_tokens=max_tokens,
prompt=prompt,
)
return response.choices[0].text
    def call_fresheval(self, prefix, question, evaluation):
        """
        Build the FreshEval few-shot prompt and have the OpenAI model grade
        the response embedded in `evaluation`.
        """
        # Set the parameters for the OpenAI API
        temperature = 0.0
        max_tokens = 256
        chat_completions = True
        # Prepend each demo evaluation with its question
        demo_prompts = []
        for q, e in zip(self.demo_questions, self.demo_evaluations):
            demo_prompts.append(f"\n\n\nquestion: {q}{e}")
# Generate the fresh evaluation prompt
fresheval_demo = "".join(demo_prompts).strip()
fresheval_question = f"\n\n\nquestion: {question}{evaluation}"
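        # The final prompt is the rubric prefix, the five worked demos, and the
        # target question with its comment left blank for the model to complete.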
# Call the OpenAI API to generate responses
fresh_eval = prefix + "\n\n\n" + fresheval_demo + fresheval_question
answer = self.call_openai_api(fresh_eval, temperature, max_tokens, chat_completions)
return answer
def extract_ratings(self, response):
"""
Extract the rating from the evaluation response.
"""
        # Rate the evaluation 0 if it contains any of the phrases
        # "incorrect", "not correct", or "not credited"; otherwise 1
pattern = re.compile(
r"\b(?:incorrect|not\s+correct|not\s+credited)\b", re.IGNORECASE
)
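        # e.g. "Thus, the response is not credited." -> 0
        #      "Thus, the response is credited."     -> 1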
if pattern.search(response):
return 0
else:
return 1
def freshqa_piechart(self, result, fig_path: str = "", save: bool = False):
"""
Plot a pie chart of the true and false answers on FreshQA.
Parameters
----------
result : dict
The evaluation result.
        fig_path : str, optional
            The directory in which to save the figure.
        save : bool, optional
            Whether to save the figure, by default False.
"""
        # Shares of correct and incorrect answers
        sizes = [result["accuracy"], 1 - result["accuracy"]]
        labels = ["True Answer", "False Answer"]
        colors = [(0, 1, 0, 0.5), (1, 0, 0, 0.5)]  # Green and red with 50% transparency
# Create a new figure
fig, ax = plt.subplots()
# Plot pie chart
plt.pie(sizes, labels=labels, colors=colors, autopct="%1.1f%%", startangle=140)
plt.title("Performance on FreshQA Dataset")
# Equal aspect ratio ensures that pie is drawn as a circle
plt.axis("equal")
if save:
# Save the figure
plt.tight_layout()
plt.savefig(os.path.join(fig_path, "freshqa_piechart.pdf"), format="pdf")
plt.savefig(os.path.join(fig_path, "freshqa_piechart.png"), format="png")
# Return the figure
return fig
def evaluate_freshqa(self, llm_responses):
"""
Evaluate the responses generated by the LLM on FreshQA questions.
"""
llm_responses = pd.DataFrame(llm_responses)
raw_evals = []
preds = []
        for _, row in llm_responses.iterrows():
evaluation = self.evaluation_template.format(
correct_answers=row["reference_answer"],
response=row["response"],
)
            fresheval = self.call_fresheval(
                self.prefix,
                row["question"],
                evaluation,
            )
evaluation_rating = self.extract_ratings(fresheval)
raw_evals.append(evaluation)
preds.append(evaluation_rating)
        # Compute the accuracy (fraction of responses judged correct)
        accuracy = sum(preds) / len(preds) if preds else 0.0
        result = {"accuracy": accuracy}
        return result, raw_evals, preds
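

# Illustrative usage sketch (not part of the original module): the record
# below is a hypothetical stand-in for real FreshQA questions, reference
# answers, and model responses. Running this requires a valid OPENAI_API_KEY
# in the environment, since evaluation calls the OpenAI API.
if __name__ == "__main__":
    evaluator = FreshQAEvaluator()
    llm_responses = [
        {
            "question": "When did the UK adopt the Euro?",
            "reference_answer": "The United Kingdom has never adopted the Euro.",
            "response": (
                "The UK has never adopted the Euro; it has kept the pound"
                " sterling as its currency."
            ),
        },
    ]
    result, raw_evals, preds = evaluator.evaluate_freshqa(llm_responses)
    print(result)  # e.g. {"accuracy": 1.0}
    evaluator.freshqa_piechart(result, fig_path=".", save=False)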