File size: 2,150 Bytes
f8db296
 
 
 
f987b64
f8db296
 
f987b64
f8db296
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import gradio as gr
import requests
import os
import tempfile

from typing import Tuple



# API key for the Jina API, read once at import time; None when the
# JINA_API_KEY environment variable is not set.
JINA_API_KEY = os.environ.get('JINA_API_KEY')

def web_scraper(url: str, timeout: float = 30.0) -> str:
    """
    Scrape the content of a given URL using the Jina API.

    Args:
        url (str): The URL to scrape. Surrounding whitespace is ignored.
        timeout (float): Seconds to wait for the Jina API before giving up.
            Defaults to 30 seconds.

    Returns:
        str: The body of the Jina API response. The request asks for the
            'text' return format. The body is returned as-is, whatever the
            HTTP status code of the response.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    headers = {
        'X-Locale': 'en-US',
        'X-Return-Format': 'text',
        'X-With-Generated-Alt': 'true',
        'X-With-Links-Summary': 'true',
    }
    # Only authenticate when a key is configured; otherwise the header would
    # literally read "Bearer None".
    if JINA_API_KEY:
        headers['Authorization'] = f'Bearer {JINA_API_KEY}'

    # The target URL is appended verbatim to the reader endpoint.
    scrape_pattern = f'https://r.jina.ai/{url.strip()}'

    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(scrape_pattern, headers=headers, timeout=timeout)
    return response.text

def scrape_and_display(url: str) -> Tuple[str, str]:
    """
    Scrape the content of a given URL and prepare it for display and download.

    Args:
        url (str): The URL to scrape.

    Returns:
        Tuple[str, str]: A tuple containing the scraped content and the path
            of a temporary ``.md`` file holding that same content.
    """
    scraped_content = web_scraper(url)

    # delete=False keeps the file on disk after it is closed, so the returned
    # path remains valid; this function does not remove the file afterwards.
    # The context manager closes the handle (flushing the data) instead of
    # leaking it, and the explicit encoding keeps non-ASCII content writable
    # regardless of the platform's default encoding.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".md", delete=False, encoding="utf-8"
    ) as temp_file:
        temp_file.write(scraped_content)

    return scraped_content, temp_file.name

def create_gradio_interface() -> gr.Interface:
    """
    Build the Gradio interface that exposes the web scraper.

    The interface takes a single URL textbox as input and produces two
    outputs: a markdown view of the scraped content and a downloadable file.

    Returns:
        gr.Interface: The configured Gradio interface, not yet launched.
    """
    # Input component: the URL entered by the user.
    url_box = gr.Textbox(label="Enter URL to scrape")

    # Output components, in the order scrape_and_display returns its values.
    content_view = gr.Markdown(label="Scraped Content")
    download_file = gr.File(label="Download Markdown")

    # One example row holding a single URL for the single input.
    sample_rows = [["https://www.robots.ox.ac.uk/~vgg/data/flowers/102/categories.html"]]

    return gr.Interface(
        fn=scrape_and_display,
        inputs=url_box,
        outputs=[content_view, download_file],
        title="Web Scraper",
        description="Enter a URL to scrape and view the content in markdown format. You can also download the markdown file.",
        examples=sample_rows,
        allow_flagging="never",
    )

if __name__ == "__main__":
    # Script entry point: build the interface and start serving it.
    create_gradio_interface().launch()