# Browser-agent demo: drives Chrome through Helium and records a screenshot after each agent step.
from io import BytesIO |
|
from time import sleep |
|
import os |
|
import sys |
|
|
|
|
|
# Put the parent of this script's directory on the import path; presumably this
# is what lets the project-local ``agents`` and ``prompts`` imports below resolve.
_script_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.dirname(_script_dir))
|
|
|
import helium |
|
from dotenv import load_dotenv |
|
from PIL import Image |
|
from selenium import webdriver |
|
|
|
from smolagents import CodeAgent |
|
from smolagents.agents import ActionStep |
|
from agents.agent import MyAgent |
|
from prompts.helium import HELIUM_PROMPT |
|
|
|
# Load environment variables via python-dotenv (GEMINI_API_KEY is read from the
# environment further down in this script).
load_dotenv()

# Chrome command-line switches, applied in this order: fixed device scale
# factor, window size, PDF viewer disabled, window position.
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in (
    "--force-device-scale-factor=1",
    "--window-size=1000,1350",
    "--disable-pdf-viewer",
    "--window-position=0,0",
):
    chrome_options.add_argument(_chrome_flag)

# Start a visible (non-headless) Chrome session through Helium.
driver = helium.start_chrome(headless=False, options=chrome_options)
|
|
|
|
|
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    """Step callback: attach a browser screenshot and the current URL to ``memory_step``.

    Args:
        memory_step: The step being recorded. Its ``observations_images`` is
            replaced with a one-element list holding the new screenshot, and
            ``"Current url: ..."`` is stored in (or appended on a new line to)
            its ``observations``.
        agent: The running agent. Every ``ActionStep`` in ``agent.memory.steps``
            whose ``step_number`` is at least two behind ``memory_step`` has its
            ``observations_images`` reset to ``None``.

    Returns:
        None. When Helium reports no active driver the function returns after
        the initial pause without touching ``memory_step`` or ``agent``.
    """
    # Pause before capturing; presumably gives the page time to settle.
    sleep(1.0)

    browser = helium.get_driver()
    if browser is None:
        # Nothing to capture. Returning here also keeps the URL lookup below
        # from dereferencing None.
        return

    # Discard screenshots held by steps two or more behind the current one.
    cutoff = memory_step.step_number - 2
    for earlier_step in agent.memory.steps:
        if isinstance(earlier_step, ActionStep) and earlier_step.step_number <= cutoff:
            earlier_step.observations_images = None

    png_bytes = browser.get_screenshot_as_png()
    image = Image.open(BytesIO(png_bytes))
    print(f"Captured a browser screenshot: {image.size} pixels")
    # Store a copy rather than the image object opened on the in-memory buffer.
    memory_step.observations_images = [image.copy()]

    # Record the page URL, keeping any observations already present.
    url_info = f"Current url: {browser.current_url}"
    if memory_step.observations is None:
        memory_step.observations = url_info
    else:
        memory_step.observations = memory_step.observations + "\n" + url_info
|
|
|
|
|
# API key comes from the environment; os.getenv yields None when it is unset.
_gemini_api_key = os.getenv("GEMINI_API_KEY")

# Build the agent wrapper with the screenshot callback registered for each step
# and ``helium`` added to the authorized imports.
video_agent = MyAgent(
    api_key=_gemini_api_key,
    temperature=0.0,
    add_base_tools=False,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=20,
    verbosity_level=2,
)

# Run the Helium star-import once in the wrapped agent's Python executor,
# passing the agent's state.
_inner_agent = video_agent.agent
_inner_agent.python_executor("from helium import *", _inner_agent.state)
|
|
|
|
|
# Task given to the agent; the Helium usage instructions are appended below.
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

# Run the agent on the task plus HELIUM_PROMPT, then print its answer.
_full_prompt = search_request + HELIUM_PROMPT
agent_output = video_agent(_full_prompt)
print("Final output:", agent_output, sep="\n")
|
|