### WEBSCRAPER FUNCTION
import os
import shutil
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def webscraper(image, URL):
    alt_text_list = []
    saved_images = []
    image_count = 0

    if URL.startswith("http"):
        try:
            html_page = requests.get(URL, timeout=10)
        except requests.RequestException:
            print("Exception! Likely a bad URL!")
            return saved_images, alt_text_list

        # Parse the DOM structure
        soup = BeautifulSoup(html_page.content, "html.parser")

        # Grab all the images on the page where src and alt are available
        images = soup.find_all("img", {"src": True, "alt": True})

        # Make sure the download directory exists
        os.makedirs("images", exist_ok=True)

        # Take the img tags and download the images via the src property
        for img_tag in images:
            src_url = img_tag.attrs["src"]

            # As a safeguard, only keep jpg or png sources
            if "jpg" in src_url or "png" in src_url:
                if src_url.startswith("//"):
                    # Protocol-relative URL: add the scheme
                    src_url = "https:" + src_url
                elif not src_url.startswith("http"):
                    # Relative URL: resolve it against the page URL
                    src_url = urljoin(URL, src_url)

                # Request to download (should be 200 status)
                try:
                    image_request = requests.get(src_url, stream=True, timeout=10)
                except requests.RequestException:
                    print("Exception! Likely due to timeout!")
                    continue

                if image_request.status_code == 200:
                    # Increment image download count
                    image_count += 1
                    save_name = "images/" + str(image_count) + ".jpg"

                    # Add the alt text to the list
                    alt_text = img_tag.attrs["alt"]
                    alt_text_list.append(
                        {"img_number": str(image_count), "alt_text": alt_text}
                    )

                    # Stream the image bytes to disk
                    with open(save_name, "wb") as f:
                        image_request.raw.decode_content = True
                        shutil.copyfileobj(image_request.raw, f)
                    saved_images.append(save_name)

    # Two return values to match the "gallery" and "text" outputs below
    return saved_images, alt_text_list


import gradio as gr

demo = gr.Interface(fn=webscraper, inputs=["image", "text"], outputs=["gallery", "text"])
demo.launch()
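# --- Optional direct check (a minimal sketch, not part of the original demo) ---
# webscraper() can also be exercised from plain Python, without the Gradio UI.
# The page URL below is a placeholder; point it at a real page and run this
# instead of demo.launch(), which blocks in a script:
#
#     downloaded, alt_texts = webscraper(None, "https://example.com/gallery")
#     print(f"Downloaded {len(downloaded)} images")
#     for entry in alt_texts:
#         print(entry["img_number"], "->", entry["alt_text"])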