### WEBSCRAPER FUNCTION
import os
import shutil
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def webscraper(image, URL):
    alt_text_list = []
    saved_images = []
    image_count = 0

    if URL.startswith("http"):
        try:
            html_page = requests.get(URL, timeout=10)
        except requests.RequestException:
            print("Exception! Likely a bad URL!")
            return saved_images, alt_text_list

        # Parse the DOM structure
        soup = BeautifulSoup(html_page.content, "html.parser")

        # Grab all the images on the page where src and alt are available
        images = soup.find_all("img", {"src": True, "alt": True})

        # Make sure the download directory exists
        os.makedirs("images", exist_ok=True)

        # Take the img tags and download the images via the src property
        for img_tag in images:
            src_url = img_tag.attrs["src"]

            # As a safeguard, only keep jpg or png sources
            if "jpg" in src_url or "png" in src_url:
                if src_url.startswith("//"):
                    # Protocol-relative URL: add the scheme
                    src_url = "https:" + src_url
                elif not src_url.startswith("http"):
                    # Relative URL: resolve it against the page URL
                    src_url = urljoin(URL, src_url)

                # Request to download (should be 200 status)
                try:
                    image_request = requests.get(src_url, stream=True, timeout=10)
                except requests.RequestException:
                    print("Exception! Likely due to timeout!")
                    continue

                if image_request.status_code == 200:
                    # Increment image download count
                    image_count += 1
                    save_name = "images/" + str(image_count) + ".jpg"

                    # Add the alt text to the list
                    alt_text = img_tag.attrs["alt"]
                    alt_text_list.append(
                        {"img_number": str(image_count), "alt_text": alt_text}
                    )

                    # Stream the image bytes to disk
                    with open(save_name, "wb") as f:
                        image_request.raw.decode_content = True
                        shutil.copyfileobj(image_request.raw, f)
                    saved_images.append(save_name)

    # Two return values to match the "gallery" and "text" outputs below
    return saved_images, alt_text_list


import gradio as gr

demo = gr.Interface(fn=webscraper, inputs=["image", "text"], outputs=["gallery", "text"])
demo.launch()
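# --- Optional direct check (a minimal sketch, not part of the original demo) ---
# webscraper() can also be exercised from plain Python, without the Gradio UI.
# The page URL below is a placeholder; point it at a real page and run this
# instead of demo.launch(), which blocks in a script:
#
#     downloaded, alt_texts = webscraper(None, "https://example.com/gallery")
#     print(f"Downloaded {len(downloaded)} images")
#     for entry in alt_texts:
#         print(entry["img_number"], "->", entry["alt_text"])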