Final_Assignment_AWorld

Sleeping

App Files Files Community

Final_Assignment_AWorld / examples /gaia /run.py

Duibonduil

Upload 9 files

3a235a9 verified 4 months ago

raw

history blame

8.53 kB

	import argparse
	import json
	import logging
	import os
	import re
	import traceback
	from typing import Any, Dict, List

	from dotenv import load_dotenv

	from aworld.config.conf import AgentConfig, TaskConfig
	from aworld.agents.llm_agent import Agent
	from aworld.core.task import Task
	from aworld.runner import Runners
	from examples.gaia.prompt import system_prompt
	from examples.gaia.utils import (
	add_file_path,
	load_dataset_meta,
	question_scorer,
	report_results,
	)

	# Load .env before the environment is consulted: LOG_FILE_PATH is read right
	# below, at import time, i.e. before the load_dotenv() call under the
	# __main__ guard has had a chance to run. That later call is expected to be
	# a harmless repeat.
	load_dotenv()

	# Create the log directory if it doesn't exist. The fallback is the same one
	# setup_logging() uses, so an unset LOG_FILE_PATH no longer raises TypeError
	# here; exist_ok=True avoids the check-then-create race.
	_log_dir = os.getenv("LOG_FILE_PATH", "run_super_agent.log")
	os.makedirs(_log_dir, exist_ok=True)

	# Command-line interface. Parsed at import time because setup_logging() and
	# the __main__ block read the module-level `args`.
	parser = argparse.ArgumentParser()
	parser.add_argument(
	    "--start",
	    type=int,
	    default=0,
	    help="Start index of the dataset",
	)
	parser.add_argument(
	    "--end",
	    type=int,
	    default=20,
	    help="End index of the dataset",
	)
	parser.add_argument(
	    "--q",
	    type=str,
	    help="Question Index, e.g., 0-0-0-0-0. Highest priority: override other arguments if provided.",
	)
	parser.add_argument(
	    "--skip",
	    action="store_true",
	    help="Skip the question if it has been processed before.",
	)
	parser.add_argument(
	    "--split",
	    type=str,
	    default="validation",
	    help="Split of the dataset, e.g., validation, test",
	)
	parser.add_argument(
	    "--blacklist_file_path",
	    type=str,
	    nargs="?",
	    help="Blacklist file path, e.g., blacklist.txt",
	)
	args = parser.parse_args()


	def setup_logging():
	    """Attach an INFO-level, UTF-8, append-mode file handler to the root logger.

	    The log file is placed in the directory named by the ``LOG_FILE_PATH``
	    environment variable (falling back to ``run_super_agent.log``). It is
	    named ``super_agent_<q>.log`` when ``--q`` was given on the command line,
	    otherwise ``super_agent_<start>_<end>.log``.

	    Reads the module-level ``args`` namespace; returns nothing.
	    """
	    root_logger = logging.getLogger()
	    root_logger.setLevel(logging.INFO)

	    log_dir = os.getenv("LOG_FILE_PATH", "run_super_agent.log")
	    # FileHandler opens the file as soon as it is constructed, so the
	    # directory has to exist first -- including the fallback one.
	    os.makedirs(log_dir, exist_ok=True)

	    log_file_name = (
	        f"super_agent_{args.q}.log"
	        if args.q
	        else f"super_agent_{args.start}_{args.end}.log"
	    )
	    file_handler = logging.FileHandler(
	        os.path.join(log_dir, log_file_name),
	        mode="a",
	        encoding="utf-8",
	    )
	    file_handler.setLevel(logging.INFO)
	    file_handler.setFormatter(
	        logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
	    )

	    root_logger.addHandler(file_handler)


	# Pattern used to pull the final answer out of the agent response.
	# NOTE(review): `.` does not match newlines here, so an answer that spans
	# several lines is treated as "no answer found" -- confirm that is intended.
	_ANSWER_RE = re.compile(r"<answer>(.*?)</answer>")


	def _load_results(results_path: str) -> List[Dict[str, Any]]:
	    """Return the result records checkpointed at *results_path*, or ``[]``."""
	    if not os.path.exists(results_path):
	        return []
	    with open(results_path, "r", encoding="utf-8") as results_f:
	        return json.load(results_f)


	def _load_blacklist(blacklist_file_path) -> set:
	    """Return the blacklisted task ids (one per line), or an empty set."""
	    if blacklist_file_path and os.path.exists(blacklist_file_path):
	        with open(blacklist_file_path, "r", encoding="utf-8") as f:
	            return set(f.read().splitlines())
	    return set()  # no file given, or the file doesn't exist


	def _should_bypass(record, results, blacklist, skip_processed) -> bool:
	    """Tell whether *record* must not be (re-)run in range mode.

	    A question is bypassed when it is blacklisted, when it already has a
	    correct result, when it already has any result and its Level is 3, or
	    when ``--skip`` is set and it has been processed before at all.
	    """
	    task_id = record["task_id"]
	    if task_id in blacklist:
	        return True
	    prior = [r for r in results if r["task_id"] == task_id]
	    if not prior:
	        return False
	    if skip_processed:
	        return True
	    return record["Level"] == 3 or any(r["is_correct"] for r in prior)


	def _find_result_index(results, task_id):
	    """Return the index of the first record for *task_id*, or ``None``."""
	    return next(
	        (idx for idx, r in enumerate(results) if r["task_id"] == task_id),
	        None,
	    )


	def main():
	    """Run the GAIA agent over the selected questions and checkpoint results.

	    Questions are selected by ``--q`` (a single exact ``task_id``) or, when
	    ``--q`` is absent, by the ``--start``/``--end`` slice of the split. Each
	    answered question is scored and upserted into ``results``; whatever has
	    been collected is written to ``<LOG_FILE_PATH>/results.json`` on exit,
	    including after Ctrl-C.
	    """
	    load_dotenv()
	    setup_logging()

	    # Same fallback as setup_logging(), so an unset LOG_FILE_PATH cannot
	    # raise TypeError while building the path (notably in `finally`).
	    log_dir = os.getenv("LOG_FILE_PATH", "run_super_agent.log")
	    results_path = os.path.join(log_dir, "results.json")

	    gaia_dataset_path = os.getenv("GAIA_DATASET_PATH", "./gaia_dataset")
	    full_dataset = load_dataset_meta(gaia_dataset_path, split=args.split)
	    logging.info(f"Total questions: {len(full_dataset)}")

	    agent_config = AgentConfig(
	        llm_provider="openai",
	        llm_model_name=os.getenv("LLM_MODEL_NAME", "gpt-4o"),
	        llm_api_key=os.getenv("LLM_API_KEY", "your_openai_api_key"),
	        llm_base_url=os.getenv("LLM_BASE_URL", "your_openai_base_url"),
	    )
	    super_agent = Agent(
	        conf=agent_config,
	        name="gaia_super_agent",
	        system_prompt=system_prompt,
	        mcp_servers=[
	            "e2b-server",
	            # "filesystem",
	            "terminal-controller",
	            "excel",
	            "calculator",
	            "ms-playwright",
	            "audio_server",
	            "image_server",
	            "video_server",
	            "search_server",
	            "download_server",
	            "document_server",
	            # "browser_server",
	            "youtube_server",
	            "reasoning_server",
	        ],
	    )

	    # Resume from the checkpoint file, if any.
	    results = _load_results(results_path)
	    blacklist = _load_blacklist(args.blacklist_file_path)

	    try:
	        # --q (one exact task_id) overrides the --start/--end slice.
	        if args.q is not None:
	            dataset_slice = [
	                record for record in full_dataset if record["task_id"] == args.q
	            ]
	        else:
	            dataset_slice = full_dataset[args.start : args.end]

	        for i, dataset_i in enumerate(dataset_slice):
	            task_id = dataset_i["task_id"]

	            # Blacklist / already-done filters apply only to range mode;
	            # an explicit --q always re-runs the question.
	            if args.q is None and _should_bypass(
	                dataset_i, results, blacklist, args.skip
	            ):
	                continue

	            try:
	                logging.info(f"Start to process: {dataset_i['task_id']}")
	                logging.info(f"Detail: {dataset_i}")
	                logging.info(f"Question: {dataset_i['Question']}")
	                logging.info(f"Level: {dataset_i['Level']}")
	                logging.info(f"Tools: {dataset_i['Annotator Metadata']['Tools']}")

	                question = add_file_path(
	                    dataset_i, file_path=gaia_dataset_path, split=args.split
	                )["Question"]

	                task = Task(input=question, agent=super_agent, conf=TaskConfig())
	                result = Runners.sync_run_task(task=task)

	                # A missing answer is handled like an untagged one instead
	                # of blowing up inside the regex engine.
	                raw_answer = result[task.id].get("answer")
	                match = _ANSWER_RE.search(raw_answer or "")
	                if match is None:
	                    logging.warning(
	                        f"No <answer> tag found in response for task_id: {task_id}"
	                    )
	                    continue

	                answer = match.group(1)
	                expected = dataset_i["Final answer"]
	                logging.info(f"Agent answer: {answer}")
	                logging.info(f"Correct answer: {expected}")

	                # Score once and reuse the verdict for logging and storage.
	                is_correct = question_scorer(answer, expected)
	                if is_correct:
	                    logging.info(f"Question {i} Correct!")
	                else:
	                    logging.info("Incorrect!")

	                new_result = {
	                    "task_id": task_id,
	                    "level": dataset_i["Level"],
	                    "question": question,
	                    "answer": expected,
	                    "response": answer,
	                    "is_correct": is_correct,
	                }

	                # Upsert: replace the record for this task_id if present.
	                existing_index = _find_result_index(results, task_id)
	                if existing_index is not None:
	                    results[existing_index] = new_result
	                    logging.info(
	                        f"Updated existing record for task_id: {dataset_i['task_id']}"
	                    )
	                else:
	                    results.append(new_result)
	                    logging.info(
	                        f"Added new record for task_id: {dataset_i['task_id']}"
	                    )

	            except Exception:
	                # One failing question must not abort the whole run.
	                logging.error(f"Error processing {i}: {traceback.format_exc()}")
	    except KeyboardInterrupt:
	        logging.warning("Interrupted by user; saving results collected so far.")
	    finally:
	        # Persist first so a failure while reporting cannot lose the run.
	        with open(results_path, "w", encoding="utf-8") as f:
	            json.dump(results, f, indent=4, ensure_ascii=False)
	        report_results(results)


	if __name__ == "__main__":
	    main()