Spaces:

JMah
/

Alt_Text_Generator

Runtime error

App Files Files Community

john.mah committed on Nov 11, 2022

Commit

ca5f8bd

•

1 Parent(s): 5e76e2d

Add App file

Browse files

Files changed (1) hide show

App.py +59 -0

App.py ADDED Viewed

	@@ -0,0 +1,59 @@

+### WEBSCRAPER FUNCTION
+from bs4 import BeautifulSoup
+import requests
+import shutil
+import csv
+from urllib.parse import urljoin, urlparse
+def webscraper(image, URL):
+    alt_text_list = []
+    image_count = 0
+    if "http" in URL:
+        try:
+            html_page = requests.get(URL)
+        except:
+            print("Exception! Likely a bad URL!")
+        # Parse the DOM structure
+        soup = BeautifulSoup(html_page.content, 'html.parser')
+        # Grab all the images on the page where src and alt are available
+        images = soup.findAll('img', {"src":True, "alt":True})
+        # Take the img tags and download the images via the src property
+        for index, image in enumerate(images):
+            src_url = image.attrs['src']
+            # As a safeguard make sure all the img tags are jpg or png
+            if "jpg" in src_url or "png" in src_url:
+                if "https" not in src_url and src_url[0:1] != "//":
+                    src_url = urljoin(URL, src_url)
+                elif "https" not in src_url:
+                    src_url = "https:" + src_url
+                # Request to download (should be 200 status)
+                try:
+                    image_request = requests.get(src_url, stream=True)
+                except:
+                    print("Exception! Likely due to timeout!")
+                    continue
+                if image_request.status_code == 200:
+                    # Increment image download count
+                    image_count += 1
+                    save_name = "images/" + str(image_count) + ".jpg"
+                    # Adding alt text to list
+                    alt_text = image.attrs['alt']
+                    alt_text_list.append({'img_number':str(image_count), 'alt_text': alt_text})
+                    with open(save_name, 'wb') as f:
+                        image_request.raw.decode_content = True
+                        shutil.copyfileobj(image_request.raw, f)
+    return alt_text_list
+import gradio as gr
+demo = gr.Interface(fn=webscraper,
+                    inputs=["image", "text"],
+                    outputs=["gallery", "text"])
+demo.launch()