john.mah committed on
Commit
ca5f8bd
1 Parent(s): 5e76e2d

Add App file

Browse files
Files changed (1) hide show
  1. App.py +59 -0
App.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### WEBSCRAPER FUNCTION
2
+ from bs4 import BeautifulSoup
3
+ import requests
4
+ import shutil
5
+ import csv
6
+ from urllib.parse import urljoin, urlparse
7
+
8
def webscraper(image, URL):
    """Download the JPG/PNG images of a web page and collect their alt text.

    The page at ``URL`` is fetched and parsed, every ``<img>`` tag that has
    both a ``src`` and an ``alt`` attribute is inspected, and each image whose
    source contains ``"jpg"`` or ``"png"`` is downloaded to
    ``images/<n>.jpg`` (``n`` counts successful downloads, starting at 1).

    Args:
        image: Unused. Kept so the signature stays compatible with callers
            that pass two positional inputs.
        URL: Address of the page to scrape. Anything that is not an
            ``http``/``https`` URL string yields an empty result.

    Returns:
        A list of ``{'img_number': str, 'alt_text': str}`` dicts, one per
        image that was downloaded successfully, in page order. The list is
        empty when the URL is unusable or the page cannot be fetched.
    """
    # Function-scope import: the file's import block does not provide os.
    import os

    # requests has no default timeout; without one a stalled server would
    # block this call forever.
    request_timeout = 10

    alt_text_list = []

    # Reject None / non-string / scheme-less input up front instead of
    # letting it fail later inside requests.
    page_url = URL.strip() if isinstance(URL, str) else ""
    if urlparse(page_url).scheme not in ("http", "https"):
        return alt_text_list

    try:
        html_page = requests.get(page_url, timeout=request_timeout)
    except requests.RequestException:
        print("Exception! Likely a bad URL!")
        # Nothing to parse: bail out rather than touching an unbound name.
        return alt_text_list

    # Parse the DOM structure
    soup = BeautifulSoup(html_page.content, 'html.parser')

    # Grab all the images on the page where src and alt are available
    img_tags = soup.find_all('img', {"src": True, "alt": True})

    # Make sure the download target exists before the first write.
    os.makedirs("images", exist_ok=True)

    image_count = 0
    for img_tag in img_tags:
        src_url = img_tag.attrs['src']

        # As a safeguard make sure all the img tags are jpg or png
        if "jpg" not in src_url and "png" not in src_url:
            continue

        # urljoin leaves absolute URLs untouched and resolves both relative
        # and protocol-relative ("//host/path") sources against the page URL.
        src_url = urljoin(page_url, src_url)

        # Request to download (should be 200 status)
        try:
            image_request = requests.get(
                src_url, stream=True, timeout=request_timeout
            )
        except requests.RequestException:
            print("Exception! Likely due to timeout!")
            continue

        try:
            if image_request.status_code != 200:
                continue

            # Increment image download count
            image_count += 1
            # NOTE: files are always named *.jpg, even for PNG sources.
            save_name = "images/" + str(image_count) + ".jpg"

            # Adding alt text to list
            alt_text_list.append({
                'img_number': str(image_count),
                'alt_text': img_tag.attrs['alt'],
            })

            with open(save_name, 'wb') as f:
                # Have urllib3 undo any gzip/deflate transfer encoding so the
                # bytes written to disk are the actual image.
                image_request.raw.decode_content = True
                shutil.copyfileobj(image_request.raw, f)
        finally:
            # Streamed responses hold their connection until closed.
            image_request.close()

    return alt_text_list
52
+
53
+ import gradio as gr
54
+
55
# Gradio front end: an image input and a text input are passed to
# webscraper(image, URL); a gallery and a text component are declared for
# its output.
# NOTE(review): webscraper returns a single list while two output components
# are declared here -- Gradio presumably expects one return value per output
# component; confirm which side should change.
_input_components = ["image", "text"]
_output_components = ["gallery", "text"]

demo = gr.Interface(
    fn=webscraper,
    inputs=_input_components,
    outputs=_output_components,
)

demo.launch()