claytonsamples committed
Commit ac6e8fe · 1 Parent(s): 286803f

Update app.py

Files changed (1)
  1. app.py +54 -125
app.py CHANGED
@@ -13,149 +13,83 @@ def validator(url):
     parsed = urlparse(url)
     return bool(parsed.netloc) and bool(parsed.scheme)
 
-
-# Function to find files on webpage
 def finder(url, soup, media_type):
     files = []
-
-    # find image files
-    if media_type == "image":
-        tags = ['jpg', 'jpeg', 'png', 'svg', 'gif', 'webp', 'tiff', 'psd', 'eps', 'ai', 'indd', 'raw']
-        for tag in soup.find_all('img'):
-            file = tag.get('src')
-            if any(tag in file for tag in tags):
-                file_url = file
-                if not validator(file_url):
-                    file_url = urljoin(url, file_url)
-                files.append(file_url)
-
-    # find text
-    elif media_type == "text":
-        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong', 'pdf', 'txt', 'doc', 'rtf', 'docx']
+    # Find text
+    if media_type == "text":
+        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
         for tag in text_tags:
             for element in soup.find_all(tag):
                 files.append(element.get_text())
-
-    # find links
+    # Find links
     else:
         for link in soup.find_all('a'):
             file = link.get('href')
-            if media_type in file:
+            if file and media_type in file:
                 file_url = file
-                if not validator(file_url):
+                if not validator(file_url):  # Assuming 'validator' is a function defined elsewhere
                     file_url = urljoin(url, file_url)
                 files.append(file_url)
-
     return files
 
-
-# Function to download the files
-def downloader(urls, folder_name):
-    os.makedirs(folder_name, exist_ok=True)
-    for i, url in enumerate(urls):
-        response = requests.get(url, stream=True)
-        file_extension = url.split(".")[-1].split("&")[0]
-        url_hash = hashlib.md5(url.encode()).hexdigest()
-        unique_id = str(uuid.uuid4())[:8]
-        file_name = f'{url_hash}-{unique_id}.{file_extension}'
-        file_name = file_name[:255]
-        file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)
-        with open(f'{folder_name}/{file_name}', 'wb') as out_file:
-            out_file.write(response.content)
-        print(f"Downloaded file: {file_name}")
-
-
-# Function to create zip file
-def zipper(folder_name):
-    if os.listdir(folder_name):
-        with zipfile.ZipFile(f'{folder_name}.zip', 'w') as zipf:
-            for file in os.listdir(folder_name):
-                zipf.write(f'{folder_name}/{file}')
-        return f'{folder_name}.zip'
-    else:
-        return ""
-
-
-# Function to access website
-def scrapper(url, images=False, text=False):
+def scrapper(url):
     try:
         response = requests.get(url, timeout=10)
         response.raise_for_status()
-    except (requests.exceptions.RequestException, ValueError):
-        raise gr.Error(f"Unable to access URL: {url}")
-        return None, None
-    soup = BeautifulSoup(response.content, 'html.parser')
+    except (requests.exceptions.RequestException, ValueError) as e:
+        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")
+        return None
 
-    # Clear all the previews folder data
-    if images:
-        shutil.rmtree('images', ignore_errors=True)
-    if text:
-        shutil.rmtree('text', ignore_errors=True)
-
-    # Add images to the image folder
-    if images:
-        image_urls = finder(url, soup, 'image')
-        os.makedirs('images', exist_ok=True)
-        if image_urls:
-            downloader(image_urls, 'images')
-        else:
-            raise gr.Error("Found no images.")
+    soup = BeautifulSoup(response.content, 'html.parser')
 
     # Add text files to the text folder
-    if text:
-        text_content = finder(url, soup, 'text')
-        os.makedirs('text', exist_ok=True)
-        if text_content:
-            with open('text/content.txt', 'w') as text_file:
-                for line in text_content:
-                    text_file.write(line + '\n')
-
-    # Output folder(s) as zip files
-    images_zip_file, text_zip_file = None, None
-    if images and os.path.exists('images') and os.listdir('images'):
-        images_zip_file = zipper('images')
-    if text and os.path.exists('text') and os.listdir('text'):
-        text_zip_file = zipper('text')
-    return images_zip_file, text_zip_file
-
-
-# Function to find requests errors
-def checker(url, media_types):
+    text_content = finder(url, soup, 'text')
+    os.makedirs('text', exist_ok=True)
+    full_text = ''
+    if text_content:
+        with open('text/content.txt', 'w') as text_file:
+            for line in text_content:
+                text_file.write(line + '\n')
+                full_text += line + ' '
+
+    # Initialize the summarization pipeline
+    summarizer = pipeline('summarization')
+
+    # Summarize the content
+    summary = summarizer(full_text, max_length=200, min_length=50, do_sample=False)
+
+    # Extract the summary text
+    summary_text = summary[0]['summary_text']
+    return summary_text
+
+def checker(url):
     if not url:
-        raise gr.Error("URL cannot be empty.")
+        raise Exception("URL cannot be empty.")
     if not url.startswith("https://"):
-        raise gr.Error("The URL must begin with https://")
-    if not media_types:
-        raise gr.Error("At least one media type must be selected.")
+        raise Exception("The URL must begin with https://")
+
     try:
-        image_file, text_file = scrapper(url, "Images" in media_types, "Text" in media_types)
+        summary_text = scrapper(url)
     except requests.exceptions.HTTPError as e:
         if e.response.status_code == 403:
-            raise gr.Error("HTTP Error: Forbidden. Access to the URL is forbidden.")
+            raise Exception("HTTP Error: Forbidden. Access to the URL is forbidden.")
         else:
-            raise gr.Error(f"HTTP Error: {e.response.status_code}")
+            raise Exception(f"HTTP Error: {e.response.status_code}")
     except TypeError as e:
-        raise gr.Error(f"TypeError: {str(e)}")
-    except (requests.exceptions.RequestException, ValueError):
-        raise gr.Error(f"Unable to access URL: {url}")
-    files = []
-    if "Text" in media_types and not text_file:
-        raise gr.Error("Found no text.")
-    if "Images" in media_types and not image_file:
-        raise gr.Error("Found no images.")
-    if image_file:
-        files.append(image_file)
-    if text_file:
-        files.append(text_file)
+        raise Exception(f"TypeError: {str(e)}")
+    except (requests.exceptions.RequestException, ValueError) as e:
+        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}")
 
-    print(f"Returning downloaded files from {url} in {files} ...")
+    if not summary_text:
+        raise Exception("Found no text.")
 
-    return files
+    print(f"Returning summarized text from {url} ...")
+
+    return summary_text
 
-# Gradio Interface
 with gr.Blocks(theme="dwancin/theme") as app:
-    title = gr.Markdown('''# Web Scrapping 🕵️''')
-    description = gr.Markdown('''Get all media files from your desired webpages with just a few clicks.''')
+    title = gr.Markdown('''# Web Scraping 🕵️''')
+    description = gr.Markdown('''Get the summarized text from your desired webpages with just a few clicks.''')
     with gr.Row():
         with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
             url_name = gr.Textbox(
@@ -164,12 +98,6 @@ with gr.Blocks(theme="dwancin/theme") as app:
                 label="Website",
             )
 
-            media_types = gr.CheckboxGroup(
-                ["Images", "Text"],
-                value="Images",
-                label="Media types",
-            )
-
             submit_button = gr.Button(
                 "Submit",
                 variant="primary",
@@ -177,17 +105,18 @@ with gr.Blocks(theme="dwancin/theme") as app:
             )
 
         with gr.Column(scale=2):
-            output_files = gr.Files(
-                label="Output",
-                elem_id="file-list",
+            summary_output = gr.Textbox(
+                label="Summary",
+                elem_id="summary-text",
                 size="lg",
                 show_label=False,
+                readonly=True,
             )
 
         submit_button.click(
             checker,
-            inputs=[url_name, media_types],
-            outputs=[output_files],
+            inputs=[url_name],
+            outputs=[summary_output],
         )
 
-app.launch()
+app.launch()
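
For context on the change above: the new scrapper() hands the page text collected by finder() to the Hugging Face transformers summarization pipeline instead of zipping media files. A minimal standalone sketch of that step, assuming transformers and a backend such as PyTorch are installed; the sample text and variable names below are illustrative, not taken from app.py:

from transformers import pipeline

# Create the summarization pipeline once; the first call downloads the default checkpoint.
summarizer = pipeline('summarization')

# Page text gathered tag by tag would be joined into one string, as scrapper() does with full_text.
page_text = (
    "Gradio lets you build and share machine learning demos directly from Python. "
    "A Blocks app wires inputs such as a Textbox to a callback and displays its output. "
) * 4

# Same call shape as in scrapper(): bounded output length, deterministic decoding.
result = summarizer(page_text, max_length=200, min_length=50, do_sample=False)
print(result[0]['summary_text'])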