File size: 2,150 Bytes
f8db296 f987b64 f8db296 f987b64 f8db296 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import gradio as gr
import requests
import os
import tempfile
from typing import Tuple
# API key for the Jina endpoint, read once at import time; None when the
# environment variable is not set.
JINA_API_KEY = os.environ.get("JINA_API_KEY")
def web_scraper(url: str, timeout: float = 30.0) -> str:
    """
    Scrape the content of a given URL using the Jina API.

    Args:
        url (str): The URL to scrape. Surrounding whitespace is ignored.
        timeout (float): Seconds to wait for the Jina endpoint before
            giving up. Defaults to 30 seconds.

    Returns:
        str: The body of the Jina response. The request asks for the
            'text' return format. HTTP error responses are not raised;
            their body is returned unchanged.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure or timeout).
    """
    headers = {
        'X-Locale': 'en-US',
        'X-Return-Format': 'text',
        'X-With-Generated-Alt': 'true',
        'X-With-Links-Summary': 'true'
    }
    # Only send credentials when a key is configured; otherwise the header
    # would carry the literal string "Bearer None".
    if JINA_API_KEY:
        headers['Authorization'] = f'Bearer {JINA_API_KEY}'

    # The target URL is appended verbatim to the reader endpoint.
    scrape_pattern = f'https://r.jina.ai/{url.strip()}'

    # Without an explicit timeout, requests may block indefinitely on an
    # unresponsive server.
    response = requests.get(scrape_pattern, headers=headers, timeout=timeout)
    return response.text
def scrape_and_display(url: str) -> Tuple[str, str]:
    """
    Scrape the content of a given URL and prepare it for display and download.

    Args:
        url (str): The URL to scrape.

    Returns:
        Tuple[str, str]: The scraped content, and the filesystem path of a
            temporary ".md" file containing that same content.
    """
    scraped_content = web_scraper(url)

    # delete=False keeps the file on disk after the handle is closed, so the
    # returned path stays valid for the caller. The `with` block guarantees
    # the handle is flushed and closed instead of being leaked, and UTF-8 is
    # set explicitly so non-ASCII content does not depend on the locale.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".md", delete=False, encoding="utf-8"
    ) as temp_file:
        temp_file.write(scraped_content)

    return scraped_content, temp_file.name
def create_gradio_interface() -> gr.Interface:
    """
    Build the Gradio front end for the web scraper.

    Returns:
        gr.Interface: An interface that passes the entered URL to
            scrape_and_display and shows its two results: the content in a
            Markdown component and the file in a File component.
    """
    # Components are created in the same order as they appear in the UI.
    url_box = gr.Textbox(label="Enter URL to scrape")
    content_view = gr.Markdown(label="Scraped Content")
    download_slot = gr.File(label="Download Markdown")

    interface = gr.Interface(
        fn=scrape_and_display,
        inputs=url_box,
        outputs=[content_view, download_slot],
        title="Web Scraper",
        description="Enter a URL to scrape and view the content in markdown format. You can also download the markdown file.",
        examples=[["https://www.robots.ox.ac.uk/~vgg/data/flowers/102/categories.html"]],
        allow_flagging="never",
    )
    return interface
if __name__ == "__main__":
    # Build the interface and start the Gradio server only when this file is
    # executed as a script, not when it is imported.
    iface = create_gradio_interface()
    iface.launch()