import requests
from bs4 import BeautifulSoup
import gradio as gr
import os
from openai import OpenAI
import undetected_chromedriver as uc
from selenium.webdriver.chrome.options import Options

# Initialize OpenAI client securely
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
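# Note: OPENAI_API_KEY must be available in the environment (e.g., exported
# locally or configured as a deployment secret) before the app starts.
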
def fetch_with_requests(url):
    """
    Fetches webpage content using requests with browser-like headers.
    Returns extracted text if successful, or raises an exception so the
    caller can fall back to Selenium.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
    }
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 403:
        raise Exception("403 Forbidden - Switching to Selenium")
    response.raise_for_status()  # Any other HTTP error also triggers the fallback

    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find_all("p")
    text_content = "\n".join([p.get_text() for p in paragraphs if p.get_text().strip()])
    return text_content if text_content else "No readable content found."

def fetch_with_selenium(url):
    """
    Uses Selenium with an undetected Chrome driver to scrape JavaScript-heavy
    or bot-protected pages.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = uc.Chrome(options=chrome_options)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        driver.quit()  # Always release the browser, even if the page load fails

    soup = BeautifulSoup(html, "html.parser")
    paragraphs = soup.find_all("p")
    text_content = "\n".join([p.get_text() for p in paragraphs if p.get_text().strip()])
    return text_content if text_content else "No readable content found (even with Selenium)."
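# Note: this path assumes a Chrome/Chromium binary is available in the runtime
# environment; undetected-chromedriver fetches and patches a matching
# chromedriver automatically.
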
def scrape_and_summarize(url):
    """
    Scrapes the given website URL and summarizes its content using GPT-4o-mini.
    Tries `requests` first and falls back to Selenium if that fails.
    """
    try:
        # Attempt with requests first
        text_content = fetch_with_requests(url)
    except Exception:
        # If blocked or the request fails, fall back to Selenium
        try:
            text_content = fetch_with_selenium(url)
        except Exception as selenium_error:
            return f"Failed both requests and Selenium: {selenium_error}"

    # Limit content to 4000 characters for better summarization
    text_content = text_content[:4000]

    # Call OpenAI GPT-4o-mini for summarization
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that summarizes webpage content."},
            {"role": "user", "content": f"Summarize the following webpage content:\n\n{text_content}"}
        ],
        response_format={"type": "text"},
        temperature=1,
        max_completion_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    summary = response.choices[0].message.content  # Extract the generated summary
    return summary
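# Quick local check (illustrative; requires network access and a valid API key):
#     print(scrape_and_summarize("https://example.com"))
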
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Web Page Summarizer")
    gr.Markdown("Enter a website URL to get a summary of its content.")
    url_input = gr.Textbox(label="Website URL", placeholder="https://example.com")
    output = gr.Textbox(label="Summary", interactive=False)
    submit_button = gr.Button("Summarize")
    submit_button.click(scrape_and_summarize, inputs=[url_input], outputs=[output])

# Launch Gradio App
if __name__ == "__main__":
    demo.launch()
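# Expected third-party dependencies (a sketch inferred from the imports above;
# pin exact versions in requirements.txt as needed):
#   requests, beautifulsoup4, gradio, openai, selenium, undetected-chromedriver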