| | import asyncio |
| | import json |
| | import os |
| | import base64 |
| | import nest_asyncio |
| | from io import BytesIO |
| | import pandas as pd |
| | from playwright.async_api import async_playwright |
| | from openai import OpenAI |
| | from PIL import Image |
| | from tabulate import tabulate |
| | from IPython.display import display, HTML, Markdown |
| | from pydantic import BaseModel |
| | import streamlit as st |
| | from helper import get_openai_api_key, visualizeCourses |
| |
|
| | |
# Allow re-entrant event loops (required when asyncio.run is called from
# environments that already run a loop, e.g. Jupyter/Streamlit).
nest_asyncio.apply()

# SECURITY: never hard-code API keys in source — the previous revision embedded
# a live-looking secret. Read the key via the project helper (backed by the
# environment) instead.
client = OpenAI(api_key=get_openai_api_key())
| |
|
class WebScraperAgent:
    """Thin async wrapper around Playwright for fetching a page's HTML
    and taking screenshots with a headless Chromium browser."""

    def __init__(self):
        # All handles are lazily created by init_browser(); None until then.
        self.playwright = None
        self.browser = None
        self.page = None

    async def init_browser(self):
        """Start Playwright and open a fresh headless Chromium page."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True)
        self.page = await self.browser.new_page()

    async def scrape_content(self, url):
        """Navigate to *url* and return the page's HTML content.

        Re-initialises the browser if the page was never opened or has
        been closed.
        """
        if not self.page or self.page.is_closed():
            await self.init_browser()
        await self.page.goto(url, wait_until="load")
        # Give client-side rendering a moment to settle before reading HTML.
        await self.page.wait_for_timeout(2000)
        return await self.page.content()

    async def take_screenshot(self, path="screenshot.png"):
        """Save a full-page PNG screenshot to *path* and return the path."""
        await self.page.screenshot(path=path, full_page=True)
        return path

    async def screenshot_buffer(self):
        """Return a PNG screenshot of the current viewport as raw bytes."""
        return await self.page.screenshot(type="png", full_page=False)

    async def close(self):
        """Release browser resources.

        Safe to call even when init_browser() never ran (handles are None);
        the previous version raised AttributeError in that case, which could
        mask the original error during cleanup.
        """
        if self.browser is not None:
            await self.browser.close()
            self.browser = None
        if self.playwright is not None:
            await self.playwright.stop()
            self.playwright = None
        self.page = None
| |
|
| |
|
| | |
class DeeplearningCourse(BaseModel):
    # Structured record for one course extracted from the deeplearning.ai
    # courses page; populated from the LLM's JSON output.
    title: str  # course title
    description: str  # short course description
    presenter: list[str]  # one or more presenter/instructor names
    imageUrl: str  # URL of the course thumbnail image
    courseURL: str  # link to the course detail page
| |
|
class DeeplearningCourseList(BaseModel):
    # Top-level container matching the JSON object the LLM is asked to
    # return: {"courses": [...]}.
    courses: list[DeeplearningCourse]
| |
|
| | |
async def process_with_llm(html, instructions):
    """Extract structured course data from raw HTML using the LLM.

    Args:
        html: Page HTML (truncated to 150k chars to fit the context window).
        instructions: Free-form extraction instructions for the model.

    Returns:
        DeeplearningCourseList parsed from the model's JSON output.

    Raises:
        ValueError: if the model's output cannot be parsed/validated.
    """
    # BUG FIX: the module-level `client` is the *synchronous* OpenAI client, so
    # its .create() result is a ChatCompletion, not an awaitable — the previous
    # `await` here raised TypeError. (Switch to AsyncOpenAI to make this call
    # truly non-blocking.)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": f"""
                You are an expert web scraping agent. Your task is to:
                Extract relevant information from this HTML to JSON
                following these instructions:
                {instructions}

                Extract the title, description, presenter,
                the image URL and course URL for each course from deeplearning.ai

                Return ONLY valid JSON.
                """,
            },
            {
                "role": "user",
                # Truncate to keep the request within the model's context limit.
                "content": html[:150000],
            },
        ],
        temperature=0.1,
    )

    content = response.choices[0].message.content
    # Models often wrap JSON in markdown fences despite "ONLY valid JSON";
    # strip ```/```json fences before parsing.
    cleaned = content.strip()
    if cleaned.startswith("```"):
        first_newline = cleaned.find("\n")
        cleaned = cleaned[first_newline + 1:] if first_newline != -1 else ""
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3].rstrip()
    try:
        json_obj = json.loads(cleaned)
        return DeeplearningCourseList(**json_obj)
    except Exception as e:
        raise ValueError(f"Parsing failed: {e}\nRaw output:\n{content}")
| |
|
| | |
async def webscraper(target_url, instructions):
    """Scrape *target_url*, screenshot it, and extract course data via the LLM.

    Returns:
        (DeeplearningCourseList, screenshot_bytes) on success,
        (None, None) on any failure (the error is shown in the Streamlit UI).
    """
    scraper = WebScraperAgent()
    try:
        st.info("Extracting HTML Content...")
        html_content = await scraper.scrape_content(target_url)

        st.info("Taking Screenshot...")
        screenshot = await scraper.screenshot_buffer()

        st.info("Processing with LLM...")
        result = await process_with_llm(html_content, instructions)
        return result, screenshot
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None, None
    finally:
        # BUG FIX: only close if the browser actually started; otherwise
        # close() dereferences None handles and the resulting AttributeError
        # in `finally` would replace the original exception.
        if scraper.browser is not None:
            await scraper.close()
| |
|
| |
|
| | |
def main():
    """Streamlit entry point: scrape deeplearning.ai courses on button press."""
    target_url = "https://www.deeplearning.ai/courses"
    instructions = "Get all the courses."

    st.title("AI Web Browser Agent (Hugging Face + Streamlit)")

    # Nothing to do until the user clicks the button.
    if not st.button("Start Scraping"):
        return

    result, screenshot = asyncio.run(webscraper(target_url, instructions))

    if not result:
        st.error("Failed to extract course data.")
        return

    st.success("Successfully extracted course data!")
    visualizeCourses(
        result=result,
        screenshot=screenshot,
        target_url=target_url,
        instructions=instructions,
        base_url="https://deeplearning.ai",
    )


if __name__ == "__main__":
    main()
| |
|