Scraper / app.py
z4hid's picture
Update app.py
f987b64 verified
import gradio as gr
import requests
import os
import tempfile
from typing import Tuple
JINA_API_KEY = os.getenv('JINA_API_KEY')
def web_scraper(url: str) -> str:
"""
Scrape the content of a given URL using the Jina API.
Args:
url (str): The URL to scrape.
Returns:
str: The scraped content in markdown format.
"""
headers = {
'Authorization': f'Bearer {JINA_API_KEY}',
'X-Locale': 'en-US',
'X-Return-Format': 'text',
'X-With-Generated-Alt': 'true',
'X-With-Links-Summary': 'true'
}
scrape_pattern = f'https://r.jina.ai/{url}'
response = requests.get(scrape_pattern, headers=headers)
return response.text
def scrape_and_display(url: str) -> Tuple[str, tempfile._TemporaryFileWrapper]:
"""
Scrape the content of a given URL and prepare it for display and download.
Args:
url (str): The URL to scrape.
Returns:
Tuple[str, tempfile._TemporaryFileWrapper]: A tuple containing the scraped content and a temporary file for download.
"""
scraped_content = web_scraper(url)
# Create a temporary file
temp_file = tempfile.NamedTemporaryFile(mode="w+", suffix=".md", delete=False)
temp_file.write(scraped_content)
temp_file.flush()
return scraped_content, temp_file.name
def create_gradio_interface() -> gr.Interface:
"""
Create and configure the Gradio interface for the web scraper.
Returns:
gr.Interface: The configured Gradio interface.
"""
return gr.Interface(
fn=scrape_and_display,
inputs=gr.Textbox(label="Enter URL to scrape"),
outputs=[
gr.Markdown(label="Scraped Content"),
gr.File(label="Download Markdown")
],
title="Web Scraper",
description="Enter a URL to scrape and view the content in markdown format. You can also download the markdown file.",
examples=[["https://www.robots.ox.ac.uk/~vgg/data/flowers/102/categories.html"]],
allow_flagging="never"
)
if __name__ == "__main__":
iface = create_gradio_interface()
iface.launch()