Commit ac6e8fe · Update app.py
Parent(s): 286803f

app.py CHANGED
@@ -13,149 +13,83 @@ def validator(url):
     parsed = urlparse(url)
     return bool(parsed.netloc) and bool(parsed.scheme)

-
-# Function to find files on webpage
 def finder(url, soup, media_type):
     files = []
-
-
-
-    tags = ['jpg', 'jpeg', 'png', 'svg', 'gif', 'webp', 'tiff', 'psd', 'eps', 'ai', 'indd', 'raw']
-    for tag in soup.find_all('img'):
-        file = tag.get('src')
-        if any(tag in file for tag in tags):
-            file_url = file
-            if not validator(file_url):
-                file_url = urljoin(url, file_url)
-            files.append(file_url)
-
-    # find text
-    elif media_type == "text":
-        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong', 'pdf', 'txt', 'doc', 'rtf', 'docx']
+    # Find text
+    if media_type == "text":
+        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
         for tag in text_tags:
             for element in soup.find_all(tag):
                 files.append(element.get_text())
-
-    # find links
+    # Find links
     else:
         for link in soup.find_all('a'):
             file = link.get('href')
-            if media_type in file:
+            if file and media_type in file:
                 file_url = file
-                if not validator(file_url):
+                if not validator(file_url):  # Assuming 'validator' is a function defined elsewhere
                     file_url = urljoin(url, file_url)
                 files.append(file_url)
-
     return files

-
-# Function to download the files
-def downloader(urls, folder_name):
-    os.makedirs(folder_name, exist_ok=True)
-    for i, url in enumerate(urls):
-        response = requests.get(url, stream=True)
-        file_extension = url.split(".")[-1].split("&")[0]
-        url_hash = hashlib.md5(url.encode()).hexdigest()
-        unique_id = str(uuid.uuid4())[:8]
-        file_name = f'{url_hash}-{unique_id}.{file_extension}'
-        file_name = file_name[:255]
-        file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)
-        with open(f'{folder_name}/{file_name}', 'wb') as out_file:
-            out_file.write(response.content)
-        print(f"Downloaded file: {file_name}")
-
-
-# Function to create zip file
-def zipper(folder_name):
-    if os.listdir(folder_name):
-        with zipfile.ZipFile(f'{folder_name}.zip', 'w') as zipf:
-            for file in os.listdir(folder_name):
-                zipf.write(f'{folder_name}/{file}')
-        return f'{folder_name}.zip'
-    else:
-        return ""
-
-
-# Function to access website
-def scrapper(url, images=False, text=False):
+def scrapper(url):
     try:
         response = requests.get(url, timeout=10)
         response.raise_for_status()
-    except (requests.exceptions.RequestException, ValueError):
-        raise
-        return None
-    soup = BeautifulSoup(response.content, 'html.parser')
+    except (requests.exceptions.RequestException, ValueError) as e:
+        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")
+        return None

-
-    if images:
-        shutil.rmtree('images', ignore_errors=True)
-    if text:
-        shutil.rmtree('text', ignore_errors=True)
-
-    # Add images to the image folder
-    if images:
-        image_urls = finder(url, soup, 'image')
-        os.makedirs('images', exist_ok=True)
-        if image_urls:
-            downloader(image_urls, 'images')
-        else:
-            raise gr.Error("Found no images.")
+    soup = BeautifulSoup(response.content, 'html.parser')

     # Add text files to the text folder
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    text_content = finder(url, soup, 'text')
+    os.makedirs('text', exist_ok=True)
+    full_text = ''
+    if text_content:
+        with open('text/content.txt', 'w') as text_file:
+            for line in text_content:
+                text_file.write(line + '\n')
+                full_text += line + ' '
+
+    # Initialize the summarization pipeline
+    summarizer = pipeline('summarization')
+
+    # Summarize the content
+    summary = summarizer(full_text, max_length=200, min_length=50, do_sample=False)
+
+    # Extract the summary text
+    summary_text = summary[0]['summary_text']
+    return summary_text
+
+def checker(url):
     if not url:
-        raise
+        raise Exception("URL cannot be empty.")
     if not url.startswith("https://"):
-        raise
-
-        raise gr.Error("At least one media type must be selected.")
+        raise Exception("The URL must begin with https://")
+
     try:
-
+        summary_text = scrapper(url)
     except requests.exceptions.HTTPError as e:
         if e.response.status_code == 403:
-            raise
+            raise Exception("HTTP Error: Forbidden. Access to the URL is forbidden.")
         else:
-            raise
+            raise Exception(f"HTTP Error: {e.response.status_code}")
     except TypeError as e:
-        raise
-    except (requests.exceptions.RequestException, ValueError):
-        raise
-    files = []
-    if "Text" in media_types and not text_file:
-        raise gr.Error("Found no text.")
-    if "Images" in media_types and not image_file:
-        raise gr.Error("Found no images.")
-    if image_file:
-        files.append(image_file)
-    if text_file:
-        files.append(text_file)
+        raise Exception(f"TypeError: {str(e)}")
+    except (requests.exceptions.RequestException, ValueError) as e:
+        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")

-
+    if not summary_text:
+        raise Exception("Found no text.")

-
+    print(f"Returning summarized text from {url} ...")
+
+    return summary_text

-# Gradio Interface
 with gr.Blocks(theme="dwancin/theme") as app:
-    title = gr.Markdown('''# Web
-    description = gr.Markdown('''Get
+    title = gr.Markdown('''# Web Scraping 🕵️''')
+    description = gr.Markdown('''Get the summarized text from your desired webpages with just a few clicks.''')
     with gr.Row():
         with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
             url_name = gr.Textbox(

@@ -164,12 +98,6 @@ with gr.Blocks(theme="dwancin/theme") as app:
                 label="Website",
             )

-            media_types = gr.CheckboxGroup(
-                ["Images", "Text"],
-                value="Images",
-                label="Media types",
-            )
-
             submit_button = gr.Button(
                 "Submit",
                 variant="primary",

@@ -177,17 +105,18 @@ with gr.Blocks(theme="dwancin/theme") as app:
             )

         with gr.Column(scale=2):
-
-                label="
-                elem_id="
+            summary_output = gr.Textbox(
+                label="Summary",
+                elem_id="summary-text",
                 size="lg",
                 show_label=False,
+                readonly=True,
             )

     submit_button.click(
         checker,
-        inputs=[url_name
-        outputs=[
+        inputs=[url_name],
+        outputs=[summary_output],
     )

-app.launch()
+app.launch()
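
For reference, a minimal sketch of how the updated finder() behaves, using the finder() and validator() definitions shown in the diff (and the bs4/urllib.parse imports already at the top of app.py). The HTML snippet and URLs below are made up purely for illustration:

    from bs4 import BeautifulSoup

    # A hypothetical page: one paragraph, one heading, one relative PDF link.
    html = "<p>Hello world.</p><h1>Title</h1><a href='/files/report.pdf'>report</a>"
    soup = BeautifulSoup(html, "html.parser")

    # media_type "text" walks text_tags and collects the text of each element.
    print(finder("https://example.com", soup, "text"))
    # ['Hello world.', 'Title']

    # Any other media_type falls through to the link branch: hrefs containing
    # that string are kept, and relative ones are resolved with urljoin().
    print(finder("https://example.com", soup, "pdf"))
    # ['https://example.com/files/report.pdf']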
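
The new scrapper() leans on the Hugging Face transformers summarization pipeline. A standalone sketch of just that step, with a stand-in input text and illustrative length limits (the default summarization model is downloaded on first use):

    from transformers import pipeline

    # Load the default summarization pipeline.
    summarizer = pipeline('summarization')

    # Stand-in for the text scraped from a page.
    full_text = (
        "Web scraping is the practice of programmatically downloading pages and "
        "extracting the parts you care about, such as text, links, or images. "
        "Libraries like requests and BeautifulSoup handle fetching and parsing, "
        "while a summarization model can condense the extracted text."
    )

    # Same call shape as in scrapper(); very long pages can exceed the model's
    # input limit and may need to be truncated or chunked first.
    summary = summarizer(full_text, max_length=60, min_length=20, do_sample=False)
    print(summary[0]['summary_text'])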