import requests
from bs4 import BeautifulSoup
import gradio as gr
import os
from openai import OpenAI
import undetected_chromedriver as uc
from selenium.webdriver.chrome.options import Options

# Initialize OpenAI client securely
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
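# Note: OPENAI_API_KEY must be available in the environment (e.g., exported
# locally or configured as a deployment secret) before the app starts.
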
def fetch_with_requests(url):
    """
    Fetches webpage content using requests with browser-like headers.
    Returns extracted text if successful, or raises an exception so the
    caller can fall back to Selenium.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
    }
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 403:
        raise Exception("403 Forbidden - Switching to Selenium")
    response.raise_for_status()  # Any other HTTP error also triggers the fallback

    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find_all("p")
    text_content = "\n".join([p.get_text() for p in paragraphs if p.get_text().strip()])
    return text_content if text_content else "No readable content found."

def fetch_with_selenium(url):
    """
    Uses Selenium with an undetected Chrome driver to scrape JavaScript-heavy
    or bot-protected pages.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = uc.Chrome(options=chrome_options)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        driver.quit()  # Always release the browser, even if the page load fails

    soup = BeautifulSoup(html, "html.parser")
    paragraphs = soup.find_all("p")
    text_content = "\n".join([p.get_text() for p in paragraphs if p.get_text().strip()])
    return text_content if text_content else "No readable content found (even with Selenium)."
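# Note: this path assumes a Chrome/Chromium binary is available in the runtime
# environment; undetected-chromedriver fetches and patches a matching
# chromedriver automatically.
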
def scrape_and_summarize(url):
    """
    Scrapes the given website URL and summarizes its content using GPT-4o-mini.
    Tries `requests` first and falls back to Selenium if that fails.
    """
    try:
        # Attempt with requests first
        text_content = fetch_with_requests(url)
    except Exception:
        # If blocked or the request fails, fall back to Selenium
        try:
            text_content = fetch_with_selenium(url)
        except Exception as selenium_error:
            return f"Failed both requests and Selenium: {selenium_error}"

    # Limit content to 4000 characters for better summarization
    text_content = text_content[:4000]

    # Call OpenAI GPT-4o-mini for summarization
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that summarizes webpage content."},
            {"role": "user", "content": f"Summarize the following webpage content:\n\n{text_content}"}
        ],
        response_format={"type": "text"},
        temperature=1,
        max_completion_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    summary = response.choices[0].message.content  # Extract the generated summary
    return summary
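# Quick local check (illustrative; requires network access and a valid API key):
#     print(scrape_and_summarize("https://example.com"))
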
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Web Page Summarizer")
    gr.Markdown("Enter a website URL to get a summary of its content.")
    url_input = gr.Textbox(label="Website URL", placeholder="https://example.com")
    output = gr.Textbox(label="Summary", interactive=False)
    submit_button = gr.Button("Summarize")
    submit_button.click(scrape_and_summarize, inputs=[url_input], outputs=[output])

# Launch Gradio App
if __name__ == "__main__":
    demo.launch()
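# Expected third-party dependencies (a sketch inferred from the imports above;
# pin exact versions in requirements.txt as needed):
#   requests, beautifulsoup4, gradio, openai, selenium, undetected-chromedriver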