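"""
Web Page Summarizer: a small Gradio app that scrapes a URL (requests first,
Selenium as a fallback) and summarizes the extracted text with GPT-4o-mini.
"""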
import os

import requests
from bs4 import BeautifulSoup
import gradio as gr
from openai import OpenAI
import undetected_chromedriver as uc
# Initialize OpenAI client securely
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
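# Assumes OPENAI_API_KEY is set in the environment (for a Hugging Face Space,
# add it as a repository secret); the OpenAI client errors out if it is missing.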
def fetch_with_requests(url):
    """
    Fetches webpage content using requests with browser-like headers.
    Returns extracted text if successful, or raises an error for fallback.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
    }
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 403:
        raise Exception("403 Forbidden - switching to Selenium")
    response.raise_for_status()  # surface other HTTP errors to the caller
    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find_all("p")
    text_content = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())
    return text_content if text_content else "No readable content found."
def fetch_with_selenium(url):
    """
    Uses Selenium with an undetected Chrome driver to scrape JavaScript-heavy
    or bot-protected pages.
    """
    chrome_options = uc.ChromeOptions()  # use uc's own options class, not selenium's
    chrome_options.add_argument("--headless=new")  # run without a visible window
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = uc.Chrome(options=chrome_options)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        driver.quit()  # always release the browser, even if the page load fails
    soup = BeautifulSoup(html, "html.parser")
    paragraphs = soup.find_all("p")
    text_content = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())
    return text_content if text_content else "No readable content found (even with Selenium)."
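# Note: undetected-chromedriver patches ChromeDriver to avoid common
# bot-detection fingerprints, but it still needs a Chrome/Chromium binary in
# the runtime (on a Hugging Face Space, typically installed via packages.txt;
# that is an assumption about the deployment, not something this file sets up).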
def scrape_and_summarize(url):
    """
    Scrapes the given website URL and summarizes its content using GPT-4o-mini.
    Tries `requests` first, falls back to Selenium if needed.
    """
    try:
        # Attempt the lightweight requests path first
        text_content = fetch_with_requests(url)
    except Exception:
        # If blocked (e.g., 403) or otherwise failing, fall back to Selenium
        try:
            text_content = fetch_with_selenium(url)
        except Exception as selenium_error:
            return f"Failed with both requests and Selenium: {selenium_error}"

    # Truncate to 4000 characters to keep the prompt within a modest budget
    text_content = text_content[:4000]

    # Call OpenAI GPT-4o-mini for summarization
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that summarizes webpage content."},
            {"role": "user", "content": f"Summarize the following webpage content:\n\n{text_content}"},
        ],
        response_format={"type": "text"},
        temperature=1,
        max_completion_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    summary = response.choices[0].message.content  # extract the reply text
    return summary
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Web Page Summarizer")
    gr.Markdown("Enter a website URL to get a summary of its content.")
    url_input = gr.Textbox(label="Website URL", placeholder="https://example.com")
    output = gr.Textbox(label="Summary", interactive=False)
    submit_button = gr.Button("Summarize")
    submit_button.click(scrape_and_summarize, inputs=[url_input], outputs=[output])

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()
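# To run locally (assumes a Chrome/Chromium install for the Selenium path):
#   export OPENAI_API_KEY=sk-...
#   python app.py   # Gradio serves on http://127.0.0.1:7860 by default
#
# Likely dependencies (a sketch, not pinned): requests, beautifulsoup4,
# gradio, openai, undetected-chromedriver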