# altozachmo's picture
# Attempt with wikipedia parsing tools
# ca6fbc3
from io import BytesIO
from time import sleep
import os
import sys
# Add the parent directory to the Python path so modules can be found
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from smolagents import CodeAgent
from smolagents.agents import ActionStep
from agents.agent import MyAgent
from prompts.helium import HELIUM_PROMPT
# Load environment configuration via python-dotenv before anything reads it.
load_dotenv()

# Chrome launch flags: fixed scale factor, fixed window size and position,
# and the built-in PDF viewer switched off. They are applied in this order.
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in (
    "--force-device-scale-factor=1",
    "--window-size=1000,1350",
    "--disable-pdf-viewer",
    "--window-position=0,0",
):
    chrome_options.add_argument(_chrome_flag)

# Start a visible (non-headless) Chrome session through Helium.
driver = helium.start_chrome(headless=False, options=chrome_options)
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    """Attach a fresh browser screenshot and the current URL to an agent step.

    Intended to be used as an agent step callback. After a one-second pause
    it clears the screenshots stored on steps that are two or more steps
    older than ``memory_step``, stores a new screenshot on ``memory_step``
    and appends a ``Current url: ...`` line to its observations.

    Args:
        memory_step: The step to annotate; its ``observations_images`` and
            ``observations`` attributes are updated in place.
        agent: The agent whose ``memory.steps`` are scanned for old
            screenshots to drop.

    Returns:
        None. If Helium reports no active driver, nothing is modified.
    """
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    if driver is None:
        # Without a browser there is nothing to capture, and reading
        # ``driver.current_url`` below would raise AttributeError.
        return

    current_step = memory_step.step_number

    # Remove previous screenshots for lean processing: only the current and
    # the immediately preceding step keep their images.
    for previous_step in agent.memory.steps:
        if (
            isinstance(previous_step, ActionStep)
            and previous_step.step_number <= current_step - 2
        ):
            previous_step.observations_images = None

    png_bytes = driver.get_screenshot_as_png()
    image = Image.open(BytesIO(png_bytes))
    print(f"Captured a browser screenshot: {image.size} pixels")
    # Create a copy to ensure it persists
    memory_step.observations_images = [image.copy()]

    # Update observations with current URL
    url_info = f"Current url: {driver.current_url}"
    if memory_step.observations is None:
        memory_step.observations = url_info
    else:
        memory_step.observations = memory_step.observations + "\n" + url_info
# Build the browsing agent. It may import ``helium`` in the code it runs, and
# ``save_screenshot`` is registered as a step callback.
gemini_api_key = os.environ.get("GEMINI_API_KEY")
video_agent = MyAgent(
    api_key=gemini_api_key,
    temperature=0.0,
    add_base_tools=False,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=20,
    verbosity_level=2,
)

# Run the Helium star-import inside the agent's Python executor up front
# (presumably so generated code can call Helium functions unqualified).
inner_agent = video_agent.agent
inner_agent.python_executor("from helium import *", inner_agent.state)

# The task text starts and ends with a newline, then gets the Helium usage
# instructions appended.
search_request = (
    "\n"
    "Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence "
    'containing the word "1992" that mentions a construction accident.'
    "\n"
)
full_prompt = search_request + HELIUM_PROMPT

agent_output = video_agent(full_prompt)

print("Final output:")
print(agent_output)