Final_Assignment_Template

Sleeping

App Files Files Community

Final_Assignment_Template / agents /video_agent.py

altozachmo

Attempt with wikipedia parsing tools

ca6fbc3 4 months ago

raw

history blame contribute delete

2.62 kB

	from io import BytesIO
	from time import sleep
	import os
	import sys

	# Add the parent directory to the Python path so modules can be found
	sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

	import helium
	from dotenv import load_dotenv
	from PIL import Image
	from selenium import webdriver

	from smolagents import CodeAgent
	from smolagents.agents import ActionStep
	from agents.agent import MyAgent
	from prompts.helium import HELIUM_PROMPT

	# Load variables from a local .env file before anything reads the environment.
	load_dotenv()

	# Chrome launch flags, applied in the order listed.
	chrome_options = webdriver.ChromeOptions()
	for _chrome_flag in (
	    "--force-device-scale-factor=1",
	    "--window-size=1000,1350",
	    "--disable-pdf-viewer",
	    "--window-position=0,0",
	):
	    chrome_options.add_argument(_chrome_flag)

	# Start a visible (non-headless) Chrome session through helium.
	driver = helium.start_chrome(headless=False, options=chrome_options)


	def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
	    """Attach a browser screenshot and the current URL to ``memory_step``.

	    Waits one second, then asks helium for its active driver. If there is
	    no driver the step is left untouched. Otherwise the function:

	    * sets ``observations_images`` to ``None`` on every ``ActionStep`` in
	      ``agent.memory.steps`` whose ``step_number`` is at most the current
	      step number minus two;
	    * stores a copy of a fresh screenshot in
	      ``memory_step.observations_images``;
	    * appends a ``"Current url: ..."`` line to ``memory_step.observations``
	      (or sets it, when ``observations`` is ``None``).

	    Args:
	        memory_step: The step being recorded; mutated in place.
	        agent: The agent whose ``memory.steps`` are pruned of old images.
	    """
	    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot

	    # Named distinctly so the module-level ``driver`` is not shadowed.
	    active_driver = helium.get_driver()
	    if active_driver is None:
	        # Nothing to capture or query. Returning here also keeps the URL
	        # update below from dereferencing ``None``.
	        return

	    current_step = memory_step.step_number
	    # Remove previous screenshots for lean processing.
	    for earlier_step in agent.memory.steps:
	        if not isinstance(earlier_step, ActionStep):
	            continue
	        if earlier_step.step_number <= current_step - 2:
	            earlier_step.observations_images = None

	    png_bytes = active_driver.get_screenshot_as_png()
	    image = Image.open(BytesIO(png_bytes))
	    print(f"Captured a browser screenshot: {image.size} pixels")
	    # Create a copy to ensure it persists
	    memory_step.observations_images = [image.copy()]

	    # Update observations with current URL
	    url_info = f"Current url: {active_driver.current_url}"
	    if memory_step.observations is None:
	        memory_step.observations = url_info
	    else:
	        memory_step.observations = memory_step.observations + "\n" + url_info


	# Build the project's agent wrapper. The Gemini API key is read from the
	# environment, helium is added to the authorized imports, and
	# save_screenshot is registered as a step callback.
	# NOTE(review): everything below runs at import time, including the agent
	# run itself; confirm this module is only ever executed as a script.
	video_agent = MyAgent(
	api_key=os.getenv("GEMINI_API_KEY"),
	temperature=0.0,
	add_base_tools=False,
	additional_authorized_imports=["helium"],
	step_callbacks=[save_screenshot],
	max_steps=20,
	verbosity_level=2,
	)

	# Run a star-import of helium inside the wrapped agent's Python executor,
	# presumably so agent-generated code can call helium functions unqualified.
	video_agent.agent.python_executor("from helium import *", video_agent.agent.state)


	# Task given to the agent; HELIUM_PROMPT is appended to it before the call.
	search_request = """
	Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
	"""

	# Call the agent on the combined prompt and print whatever it returns.
	agent_output = video_agent(search_request + HELIUM_PROMPT)
	print("Final output:")
	print(agent_output)