'''
# Web Scraping
[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
'''
import os
import requests
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from transformers import pipeline, AutoTokenizer

# Function to validate URLs
def validator(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

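# Collect content of the requested media type from a parsed page.
# Only plain text is handled here, taken from paragraph and heading tags.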
def finder(url, soup, media_type):
    files = []
    # Find text
    if media_type == "text":
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
        for tag in text_tags:
            for element in soup.find_all(tag):
                files.append(element.get_text())
    return files

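# Summarize text of arbitrary length: tokenize it, split the tokens into
# fixed-size chunks, summarize each chunk, and join the partial summaries.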
def summarize_long_text(text, model_name="facebook/bart-large-cnn", max_chunk_tokens=500):
    # Initialize the summarization pipeline
    summarizer = pipeline('summarization', model=model_name)

    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenize the text
    tokens = tokenizer.encode(text)

    # Split the tokens into chunks of the specified size
    chunks = [tokens[i:i + max_chunk_tokens] for i in range(0, len(tokens), max_chunk_tokens)]

    # Summarize each chunk and combine the results
    final_summary = ''
    for chunk in chunks:
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        summary = summarizer(chunk_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
        final_summary += ' ' + summary

    return final_summary.strip()

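# Download a page, extract its text, save it to text/content.txt, and return a summary.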
def scrapper(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # Re-raise HTTP errors unchanged so checker() can report the status code
        raise
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")
    soup = BeautifulSoup(response.content, 'html.parser')
    # Add text files to the text folder
    text_content = finder(url, soup, 'text')
    os.makedirs('text', exist_ok=True)
    full_text = ' '.join(text_content) # Join the text content into a single string

    # Save the full text to a file
    with open('text/content.txt', 'w', encoding='utf-8') as text_file:
        text_file.write(full_text)

    # Summarize the text
    summary = summarize_long_text(full_text)

    return summary

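# Gradio callback: validate the URL, run the scraper, and turn failures into
# exceptions that Gradio surfaces to the user.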
def checker(url):
    if not url:
        raise Exception("URL cannot be empty.")
    if not url.startswith("https://"):
        raise Exception("The URL must begin with https://")

    try:
        summary_text = scrapper(url)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            raise Exception("HTTP Error: Forbidden. Access to the URL is forbidden.")
        else:
            raise Exception(f"HTTP Error: {e.response.status_code}")
    except TypeError as e:
        raise Exception(f"TypeError: {str(e)}")
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")

    if not summary_text:
        raise Exception("Found no text.")

    print(f"Returning summarized text from {url} ...")

    return summary_text

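# Build the UI: a URL input panel on the left and the summary output on the right.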
with gr.Blocks(theme="dwancin/theme") as app:
    title = gr.Markdown('''# Web Scraping 🕵️''')
    description = gr.Markdown('''Get the summarized text from your desired webpages with just a few clicks.''')
    with gr.Row():
        with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
            )

            submit_button = gr.Button(
                "Submit",
                variant="primary",
                interactive=True,
            )

        with gr.Column(scale=2):
            summary_output = gr.Textbox(
                label="Summary",
                elem_id="summary-text",
                show_label=False,
                interactive=False,
            )
    
    submit_button.click(
        checker, 
        inputs=[url_name], 
        outputs=[summary_output],
    )

app.launch()