| | import asyncio |
| | import json |
| | import os |
| | import base64 |
| | import nest_asyncio |
| | from io import BytesIO |
| | import pandas as pd |
| | from playwright.async_api import async_playwright |
| | from openai import OpenAI |
| | from PIL import Image |
| | from tabulate import tabulate |
| | from IPython.display import display, HTML, Markdown |
| | from pydantic import BaseModel |
| | import streamlit as st |
| | from helper import get_openai_api_key, visualizeCourses |
| |
|
| | |
# Allow re-entrant event loops (required when asyncio.run is called from
# environments that already run a loop, e.g. Jupyter/Streamlit).
nest_asyncio.apply()

# SECURITY: never hard-code API keys in source — the previous revision embedded
# a live-looking secret. Read the key via the project helper (backed by the
# environment) instead.
client = OpenAI(api_key=get_openai_api_key())
| |
|
class WebScraperAgent:
    """Thin async wrapper around Playwright for fetching a page's HTML
    and taking screenshots with a headless Chromium browser."""

    def __init__(self):
        # All handles are lazily created by init_browser(); None until then.
        self.playwright = None
        self.browser = None
        self.page = None

    async def init_browser(self):
        """Start Playwright and open a fresh headless Chromium page."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True)
        self.page = await self.browser.new_page()

    async def scrape_content(self, url):
        """Navigate to *url* and return the page's HTML content.

        Re-initialises the browser if the page was never opened or has
        been closed.
        """
        if not self.page or self.page.is_closed():
            await self.init_browser()
        await self.page.goto(url, wait_until="load")
        # Give client-side rendering a moment to settle before reading HTML.
        await self.page.wait_for_timeout(2000)
        return await self.page.content()

    async def take_screenshot(self, path="screenshot.png"):
        """Save a full-page PNG screenshot to *path* and return the path."""
        await self.page.screenshot(path=path, full_page=True)
        return path

    async def screenshot_buffer(self):
        """Return a PNG screenshot of the current viewport as raw bytes."""
        return await self.page.screenshot(type="png", full_page=False)

    async def close(self):
        """Release browser resources.

        Safe to call even when init_browser() never ran (handles are None);
        the previous version raised AttributeError in that case, which could
        mask the original error during cleanup.
        """
        if self.browser is not None:
            await self.browser.close()
            self.browser = None
        if self.playwright is not None:
            await self.playwright.stop()
            self.playwright = None
        self.page = None
| |
|
| |
|
| | |
class DeeplearningCourse(BaseModel):
    # Structured record for one course extracted from the deeplearning.ai
    # courses page; populated from the LLM's JSON output.
    title: str  # course title
    description: str  # short course description
    presenter: list[str]  # one or more presenter/instructor names
    imageUrl: str  # URL of the course thumbnail image
    courseURL: str  # link to the course detail page
| |
|
class DeeplearningCourseList(BaseModel):
    # Top-level container matching the JSON object the LLM is asked to
    # return: {"courses": [...]}.
    courses: list[DeeplearningCourse]
| |
|
| | |
async def process_with_llm(html, instructions):
    """Extract structured course data from raw HTML using the LLM.

    Args:
        html: Page HTML (truncated to 150k chars to fit the context window).
        instructions: Free-form extraction instructions for the model.

    Returns:
        DeeplearningCourseList parsed from the model's JSON output.

    Raises:
        ValueError: if the model's output cannot be parsed/validated.
    """
    # BUG FIX: the module-level `client` is the *synchronous* OpenAI client, so
    # its .create() result is a ChatCompletion, not an awaitable — the previous
    # `await` here raised TypeError. (Switch to AsyncOpenAI to make this call
    # truly non-blocking.)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": f"""
                You are an expert web scraping agent. Your task is to:
                Extract relevant information from this HTML to JSON
                following these instructions:
                {instructions}

                Extract the title, description, presenter,
                the image URL and course URL for each course from deeplearning.ai

                Return ONLY valid JSON.
                """,
            },
            {
                "role": "user",
                # Truncate to keep the request within the model's context limit.
                "content": html[:150000],
            },
        ],
        temperature=0.1,
    )

    content = response.choices[0].message.content
    # Models often wrap JSON in markdown fences despite "ONLY valid JSON";
    # strip ```/```json fences before parsing.
    cleaned = content.strip()
    if cleaned.startswith("```"):
        first_newline = cleaned.find("\n")
        cleaned = cleaned[first_newline + 1:] if first_newline != -1 else ""
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3].rstrip()
    try:
        json_obj = json.loads(cleaned)
        return DeeplearningCourseList(**json_obj)
    except Exception as e:
        raise ValueError(f"Parsing failed: {e}\nRaw output:\n{content}")
| |
|
| | |
async def webscraper(target_url, instructions):
    """Scrape *target_url*, screenshot it, and extract course data via the LLM.

    Returns:
        (DeeplearningCourseList, screenshot_bytes) on success,
        (None, None) on any failure (the error is shown in the Streamlit UI).
    """
    scraper = WebScraperAgent()
    try:
        st.info("Extracting HTML Content...")
        html_content = await scraper.scrape_content(target_url)

        st.info("Taking Screenshot...")
        screenshot = await scraper.screenshot_buffer()

        st.info("Processing with LLM...")
        result = await process_with_llm(html_content, instructions)
        return result, screenshot
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None, None
    finally:
        # BUG FIX: only close if the browser actually started; otherwise
        # close() dereferences None handles and the resulting AttributeError
        # in `finally` would replace the original exception.
        if scraper.browser is not None:
            await scraper.close()
| |
|
| |
|
| | |
def main():
    """Streamlit entry point: scrape deeplearning.ai courses on button press."""
    target_url = "https://www.deeplearning.ai/courses"
    instructions = "Get all the courses."

    st.title("AI Web Browser Agent (Hugging Face + Streamlit)")

    # Nothing to do until the user clicks the button.
    if not st.button("Start Scraping"):
        return

    result, screenshot = asyncio.run(webscraper(target_url, instructions))

    if not result:
        st.error("Failed to extract course data.")
        return

    st.success("Successfully extracted course data!")
    visualizeCourses(
        result=result,
        screenshot=screenshot,
        target_url=target_url,
        instructions=instructions,
        base_url="https://deeplearning.ai",
    )


if __name__ == "__main__":
    main()
| |
|