'''
# Web Scraping
[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
'''

import os
import requests
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from transformers import pipeline
import torch

# Function to validate URLs
def validator(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
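# Illustrative check: validator("https://example.com") returns True, while a bare
# relative path like "docs/page.html" returns False and is later joined with the base URL.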

def finder(url, soup, media_type):
    files = []
    # Find text
    if media_type == "text":
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
        for tag in text_tags:
            for element in soup.find_all(tag):
                files.append(element.get_text())
    # Find links
    else:
        for link in soup.find_all('a'):
            file = link.get('href')
            if file and media_type in file:
                file_url = file
                if not validator(file_url):
                    file_url = urljoin(url, file_url)  # resolve relative links against the page URL
                files.append(file_url)
    return files
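# Note: for any media_type other than "text" (e.g. ".pdf" or ".jpg"), the string is
# matched as a substring of each href, and relative links are resolved against the page URL.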

def summarize_long_text(text, chunk_size=1024):
    # Initialize the summarization pipeline (downloads the default model on first use)
    summarizer = pipeline('summarization')

    # Tokenize the text into words
    words = text.split()

    # Split the words into chunks of the specified size
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

    # Summarize each chunk; truncation=True guards against chunks that exceed
    # the model's maximum input length in tokens
    summarized_chunks = [
        summarizer(chunk, max_length=1024, min_length=50, do_sample=False, truncation=True)[0]['summary_text']
        for chunk in chunks
    ]

    # Combine the summarized chunks into the final summary
    final_summary = ' '.join(summarized_chunks)

    return final_summary
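# Illustrative use (the default summarization model is downloaded on first call):
#   short_summary = summarize_long_text(long_article_text)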

def scrapper(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # Let HTTP errors (403, 404, ...) propagate so checker() can report the status code
        raise
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")

    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect the page text and write it to the text folder
    text_content = finder(url, soup, 'text')
    os.makedirs('text', exist_ok=True)
    full_text = ''
    if text_content:
        with open('text/content.txt', 'w', encoding='utf-8') as text_file:
            for line in text_content:
                text_file.write(line + '\n')
                full_text += line + ' '

    # Summarize the collected text
    summary = summarize_long_text(full_text)

    return summary
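# Illustrative call (requires network access and the summarization model):
#   summary = scrapper("https://example.com")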

def checker(url):
    if not url:
        raise Exception("URL cannot be empty.")
    if not url.startswith("https://"):
        raise Exception("The URL must begin with https://")

    try:
        summary_text = scrapper(url)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            raise Exception("HTTP Error: Forbidden. Access to the URL is forbidden.")
        else:
            raise Exception(f"HTTP Error: {e.response.status_code}")
    except TypeError as e:
        raise Exception(f"TypeError: {str(e)}")
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")

    if not summary_text:
        raise Exception("Found no text.")

    print(f"Returning summarized text from {url} ...")

    return summary_text

with gr.Blocks(theme="dwancin/theme") as app:
    title = gr.Markdown('''# Web Scraping 🕵️''')
    description = gr.Markdown('''Get the summarized text from your desired webpages with just a few clicks.''')
    with gr.Row():
        with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
            )

            submit_button = gr.Button(
                "Submit",
                variant="primary",
                interactive=True,
            )

        with gr.Column(scale=2):
            summary_output = gr.Textbox(
                label="Summary",
                elem_id="summary-text",
                size="lg",
                show_label=False,
                readonly=True,
            )
    
    submit_button.click(
        checker, 
        inputs=[url_name], 
        outputs=[summary_output],
    )

app.launch()
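# Run locally with `python app.py`; Gradio serves the interface on http://127.0.0.1:7860 by default.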