awacke1 committed on
Commit
e154fc1
•
1 Parent(s): 8839acc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -83
app.py CHANGED
@@ -30,11 +30,6 @@ if not os.path.exists("history.json"):
30
  with open("history.json", "w") as f:
31
  json.dump({}, f)
32
 
33
- import os
34
- import base64
35
- import zipfile
36
- import streamlit as st
37
-
38
  def zip_subdirs(start_dir):
39
  for subdir, dirs, files in os.walk(start_dir):
40
  if subdir != start_dir: # Skip the root directory
@@ -55,7 +50,6 @@ def get_zip_download_link(zip_file):
55
  link_name = os.path.basename(zip_file)
56
  href = f'<a href="data:file/zip;base64,{b64}" download="{link_name}">Download: {link_name}</a>'
57
  return href
58
-
59
 
60
  @st.cache_resource
61
  def create_zip_of_files(files):
@@ -73,8 +67,6 @@ def get_zip_download_link(zip_file):
73
  href = f'<a href="data:application/zip;base64,{b64}" download="{zip_file}">Download All</a>'
74
  return href
75
 
76
-
77
-
78
  def download_file(url, local_filename):
79
  if url.startswith('http://') or url.startswith('https://'):
80
  try:
@@ -91,15 +83,12 @@ def download_html_and_files(url, subdir):
91
  html_content = requests.get(url).text
92
  soup = BeautifulSoup(html_content, 'html.parser')
93
  base_url = urllib.parse.urlunparse(urllib.parse.urlparse(url)._replace(path='', params='', query='', fragment=''))
94
-
95
  for link in soup.find_all('a'):
96
  file_url = urllib.parse.urljoin(base_url, link.get('href'))
97
  local_filename = os.path.join(subdir, urllib.parse.urlparse(file_url).path.split('/')[-1])
98
-
99
  if not local_filename.endswith('/') and local_filename != subdir:
100
  link['href'] = local_filename
101
  download_file(file_url, local_filename)
102
-
103
  with open(os.path.join(subdir, "index.html"), "w") as file:
104
  file.write(str(soup))
105
 
@@ -110,34 +99,27 @@ def list_files(directory_path='.'):
110
  def file_editor(file_path):
111
  st.write(f"Editing File: {os.path.basename(file_path)}")
112
  file_content = ""
113
-
114
  with open(file_path, "r") as f:
115
  file_content = f.read()
116
-
117
  file_content = st.text_area("Edit the file content:", value=file_content, height=250)
118
-
119
  if st.button("💾 Save"):
120
  with open(file_path, "w") as f:
121
  f.write(file_content)
122
  st.success(f"File '{os.path.basename(file_path)}' saved!")
123
 
124
-
125
  def show_file_operations(file_path, sequence_number):
126
- #st.write(f"File: {os.path.basename(file_path)}")
127
  unique_key = hashlib.md5(file_path.encode()).hexdigest()
128
  file_content = ""
129
-
130
  col01, col02, col1, col2, col3 = st.columns(5)
131
  with col01:
132
  st.write(os.path.basename(file_path))
133
  with col1:
134
  edit_key = f"edit_{unique_key}_{sequence_number}"
135
- #if st.button(f"✏️ Edit", key=edit_key):
136
- #with open(file_path, "r") as f:
137
- # file_content = f.read()
138
- #text_area_key = f"text_area_{unique_key}_{sequence_number}"
139
- #file_content = st.text_area("Edit the file content:", value=file_content, height=250, key=text_area_key)
140
-
141
  with col2:
142
  save_key = f"save_{unique_key}_{sequence_number}"
143
  if st.button(f"💾 Save", key=save_key):
@@ -145,21 +127,17 @@ def show_file_operations(file_path, sequence_number):
145
  with open(file_path, "w") as f:
146
  f.write(file_content)
147
  st.success(f"File saved!")
148
-
149
  with col3:
150
  delete_key = f"delete_{unique_key}_{sequence_number}"
151
  if st.button(f"🗑️ Delete", key=delete_key):
152
  os.remove(file_path)
153
  st.markdown(f"File deleted!")
154
 
155
-
156
  file_sequence_numbers = {}
157
 
158
-
159
  def show_file_content(file_path):
160
  _, file_extension = os.path.splitext(file_path)
161
  try:
162
-
163
  if file_extension in ['.png', '.jpg', '.jpeg']:
164
  image_url = file_path.replace('File:','').replace('/','')
165
  st.write('Image URL:' + image_url)
@@ -176,15 +154,9 @@ def show_file_content(file_path):
176
  elif file_extension in ['.html', '.txt']:
177
  with open(file_path, "r") as file:
178
  st.markdown(file.read(), unsafe_allow_html=True)
179
- #elif file_extension in ['.pdf']:
180
- #pdf_file = open(file_path, "rb")
181
- #base64_pdf = base64.b64encode(pdf_file.read()).decode('utf-8')
182
- #pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="700" height="1000" type="application/pdf"></iframe>'
183
- #st.markdown(pdf_display, unsafe_allow_html=True)
184
  except Exception as e:
185
  st.error(f"Error reading file {file_path}: {e}")
186
 
187
-
188
  def show_download_links(subdir):
189
  global file_sequence_numbers
190
  for file in list_files(subdir):
@@ -194,7 +166,6 @@ def show_download_links(subdir):
194
  else:
195
  file_sequence_numbers[file_path] += 1
196
  sequence_number = file_sequence_numbers[file_path]
197
-
198
  if os.path.isfile(file_path):
199
  st.markdown(file_path) # Display file path
200
  show_file_content(file_path) # Display file content based on type
@@ -210,9 +181,7 @@ def show_download_links_backup(subdir):
210
  else:
211
  file_sequence_numbers[file_path] += 1
212
  sequence_number = file_sequence_numbers[file_path]
213
-
214
  if os.path.isfile(file_path):
215
- #st.markdown(get_download_link(file_path), unsafe_allow_html=True)
216
  st.markdown(file_path, unsafe_allow_html=True) # faster than encapsulating file into base64 download link
217
  show_file_operations(file_path, sequence_number)
218
  else:
@@ -224,26 +193,14 @@ def get_download_link(file):
224
  b64 = base64.b64encode(bytes).decode()
225
  href = f'<a href="data:file/octet-stream;base64,{b64}" download=\'{os.path.basename(file)}\'>Download: {os.path.basename(file)}</a>'
226
  return href
227
-
228
-
229
-
230
  def main():
231
  st.sidebar.title('🌐 Web Datasets Bulk Downloader')
232
-
233
-
234
-
235
- # Check for query parameters for file editing
236
- #query_params = st.query_params()
237
  query_params = st.experimental_get_query_params()
238
-
239
  file_to_edit = query_params.get('file_to_edit', [None])[0]
240
-
241
  if file_to_edit and os.path.exists(file_to_edit):
242
  file_editor(file_to_edit)
243
  else:
244
- # Selecting URL input method
245
-
246
- # Selecting URL input method
247
  url_input_method = st.sidebar.radio("Choose URL Input Method", ["Enter URL", "Select from List"], index=1)
248
  url = ""
249
  if url_input_method == "Enter URL":
@@ -252,15 +209,6 @@ def main():
252
  selected_site = st.sidebar.selectbox("Select a Website", list(URLS.keys()), index=0)
253
  url = URLS[selected_site]
254
 
255
-
256
- #url_input_method = st.sidebar.radio("Choose URL Input Method", ["Enter URL", "Select from List"])
257
- #url = ""
258
- #if url_input_method == "Enter URL":
259
- # url = st.sidebar.text_input('Please enter a Web URL to bulk download text and files')
260
- #else:
261
- # selected_site = st.sidebar.selectbox("Select a Website", list(URLS.keys()))
262
- # url = URLS[selected_site]
263
-
264
  # Reading or creating history.json
265
  if not os.path.exists("history.json"):
266
  with open("history.json", "w") as f:
@@ -282,7 +230,6 @@ def main():
282
  with open("history.json", "w") as f:
283
  json.dump(history, f)
284
 
285
-
286
  if st.sidebar.button('📥 Get All the Content', help="Download content from the selected URL"):
287
  download_html_and_files(url, history[url])
288
  show_download_links(history[url])
@@ -291,23 +238,11 @@ def main():
291
  for subdir in history.values():
292
  show_download_links(subdir)
293
 
294
-
295
- # Button for downloading content
296
- #if st.sidebar.button('📥 Get All the Content'):
297
- # download_html_and_files(url, history[url])
298
- # show_download_links(history[url])
299
-
300
- # Button for showing download links
301
- #if st.sidebar.button('📂 Show Download Links'):
302
- # for subdir in history.values():
303
- # show_download_links(subdir)
304
 
305
  if st.sidebar.button("🗑 Delete All", help="Delete all downloaded content"):
306
- #if st.sidebar.button("🗑 Delete All"):
307
  # Clear history file
308
  with open("history.json", "w") as f:
309
  json.dump({}, f)
310
-
311
  # Delete all files in subdirectories
312
  for subdir in glob.glob('*'):
313
  if os.path.isdir(subdir) and subdir not in EXCLUDED_FILES:
@@ -316,29 +251,18 @@ def main():
316
  os.remove(file_path)
317
  st.write(f"Deleted: {file_path}")
318
  os.rmdir(subdir) # Remove the empty directory
319
-
320
  st.experimental_rerun()
321
-
322
  if st.sidebar.button("⬇️ Download All", help="Download all files in a zip"):
323
  start_directory = '.' # Current directory
324
  for zip_file in zip_subdirs(start_directory):
325
  st.sidebar.markdown(zip_file, unsafe_allow_html=True)
326
  st.sidebar.markdown(get_zip_download_link(zip_file), unsafe_allow_html=True)
327
- #if st.sidebar.button("⬇️ Download All"):
328
- # start_directory = '.' # Current directory
329
- # for zip_file in zip_subdirs(start_directory):
330
- # st.sidebar.markdown(get_zip_download_link(zip_file), unsafe_allow_html=True)
331
-
332
- # Expander for showing URL history and download links
333
  with st.expander("URL History and Downloaded Files"):
334
  try:
335
  for url, subdir in history.items():
336
  st.markdown(f"#### {url}")
337
- # show_download_links(subdir)
338
  except:
339
  print('url history is empty')
340
- # Update each time to show files we have
341
- #for subdir in history.values():
342
- # show_download_links(subdir)
343
  if __name__ == "__main__":
344
  main()
 
30
  with open("history.json", "w") as f:
31
  json.dump({}, f)
32
 
 
 
 
 
 
33
  def zip_subdirs(start_dir):
34
  for subdir, dirs, files in os.walk(start_dir):
35
  if subdir != start_dir: # Skip the root directory
 
50
  link_name = os.path.basename(zip_file)
51
  href = f'<a href="data:file/zip;base64,{b64}" download="{link_name}">Download: {link_name}</a>'
52
  return href
 
53
 
54
  @st.cache_resource
55
  def create_zip_of_files(files):
 
67
  href = f'<a href="data:application/zip;base64,{b64}" download="{zip_file}">Download All</a>'
68
  return href
69
 
 
 
70
  def download_file(url, local_filename):
71
  if url.startswith('http://') or url.startswith('https://'):
72
  try:
 
83
  html_content = requests.get(url).text
84
  soup = BeautifulSoup(html_content, 'html.parser')
85
  base_url = urllib.parse.urlunparse(urllib.parse.urlparse(url)._replace(path='', params='', query='', fragment=''))
 
86
  for link in soup.find_all('a'):
87
  file_url = urllib.parse.urljoin(base_url, link.get('href'))
88
  local_filename = os.path.join(subdir, urllib.parse.urlparse(file_url).path.split('/')[-1])
 
89
  if not local_filename.endswith('/') and local_filename != subdir:
90
  link['href'] = local_filename
91
  download_file(file_url, local_filename)
 
92
  with open(os.path.join(subdir, "index.html"), "w") as file:
93
  file.write(str(soup))
94
 
 
99
  def file_editor(file_path):
100
  st.write(f"Editing File: {os.path.basename(file_path)}")
101
  file_content = ""
 
102
  with open(file_path, "r") as f:
103
  file_content = f.read()
 
104
  file_content = st.text_area("Edit the file content:", value=file_content, height=250)
 
105
  if st.button("💾 Save"):
106
  with open(file_path, "w") as f:
107
  f.write(file_content)
108
  st.success(f"File '{os.path.basename(file_path)}' saved!")
109
 
 
110
  def show_file_operations(file_path, sequence_number):
 
111
  unique_key = hashlib.md5(file_path.encode()).hexdigest()
112
  file_content = ""
 
113
  col01, col02, col1, col2, col3 = st.columns(5)
114
  with col01:
115
  st.write(os.path.basename(file_path))
116
  with col1:
117
  edit_key = f"edit_{unique_key}_{sequence_number}"
118
+ if st.button(f"✏️ Edit", key=edit_key):
119
+ with open(file_path, "r") as f:
120
+ file_content = f.read()
121
+ text_area_key = f"text_area_{unique_key}_{sequence_number}"
122
+ file_content = st.text_area("Edit the file content:", value=file_content, height=250, key=text_area_key)
 
123
  with col2:
124
  save_key = f"save_{unique_key}_{sequence_number}"
125
  if st.button(f"💾 Save", key=save_key):
 
127
  with open(file_path, "w") as f:
128
  f.write(file_content)
129
  st.success(f"File saved!")
 
130
  with col3:
131
  delete_key = f"delete_{unique_key}_{sequence_number}"
132
  if st.button(f"🗑️ Delete", key=delete_key):
133
  os.remove(file_path)
134
  st.markdown(f"File deleted!")
135
 
 
136
  file_sequence_numbers = {}
137
 
 
138
  def show_file_content(file_path):
139
  _, file_extension = os.path.splitext(file_path)
140
  try:
 
141
  if file_extension in ['.png', '.jpg', '.jpeg']:
142
  image_url = file_path.replace('File:','').replace('/','')
143
  st.write('Image URL:' + image_url)
 
154
  elif file_extension in ['.html', '.txt']:
155
  with open(file_path, "r") as file:
156
  st.markdown(file.read(), unsafe_allow_html=True)
 
 
 
 
 
157
  except Exception as e:
158
  st.error(f"Error reading file {file_path}: {e}")
159
 
 
160
  def show_download_links(subdir):
161
  global file_sequence_numbers
162
  for file in list_files(subdir):
 
166
  else:
167
  file_sequence_numbers[file_path] += 1
168
  sequence_number = file_sequence_numbers[file_path]
 
169
  if os.path.isfile(file_path):
170
  st.markdown(file_path) # Display file path
171
  show_file_content(file_path) # Display file content based on type
 
181
  else:
182
  file_sequence_numbers[file_path] += 1
183
  sequence_number = file_sequence_numbers[file_path]
 
184
  if os.path.isfile(file_path):
 
185
  st.markdown(file_path, unsafe_allow_html=True) # faster than encapsulating file into base64 download link
186
  show_file_operations(file_path, sequence_number)
187
  else:
 
193
  b64 = base64.b64encode(bytes).decode()
194
  href = f'<a href="data:file/octet-stream;base64,{b64}" download=\'{os.path.basename(file)}\'>Download: {os.path.basename(file)}</a>'
195
  return href
196
+
 
 
197
  def main():
198
  st.sidebar.title('🌐 Web Datasets Bulk Downloader')
 
 
 
 
 
199
  query_params = st.experimental_get_query_params()
 
200
  file_to_edit = query_params.get('file_to_edit', [None])[0]
 
201
  if file_to_edit and os.path.exists(file_to_edit):
202
  file_editor(file_to_edit)
203
  else:
 
 
 
204
  url_input_method = st.sidebar.radio("Choose URL Input Method", ["Enter URL", "Select from List"], index=1)
205
  url = ""
206
  if url_input_method == "Enter URL":
 
209
  selected_site = st.sidebar.selectbox("Select a Website", list(URLS.keys()), index=0)
210
  url = URLS[selected_site]
211
 
 
 
 
 
 
 
 
 
 
212
  # Reading or creating history.json
213
  if not os.path.exists("history.json"):
214
  with open("history.json", "w") as f:
 
230
  with open("history.json", "w") as f:
231
  json.dump(history, f)
232
 
 
233
  if st.sidebar.button('📥 Get All the Content', help="Download content from the selected URL"):
234
  download_html_and_files(url, history[url])
235
  show_download_links(history[url])
 
238
  for subdir in history.values():
239
  show_download_links(subdir)
240
 
 
 
 
 
 
 
 
 
 
 
241
 
242
  if st.sidebar.button("🗑 Delete All", help="Delete all downloaded content"):
 
243
  # Clear history file
244
  with open("history.json", "w") as f:
245
  json.dump({}, f)
 
246
  # Delete all files in subdirectories
247
  for subdir in glob.glob('*'):
248
  if os.path.isdir(subdir) and subdir not in EXCLUDED_FILES:
 
251
  os.remove(file_path)
252
  st.write(f"Deleted: {file_path}")
253
  os.rmdir(subdir) # Remove the empty directory
 
254
  st.experimental_rerun()
 
255
  if st.sidebar.button("⬇️ Download All", help="Download all files in a zip"):
256
  start_directory = '.' # Current directory
257
  for zip_file in zip_subdirs(start_directory):
258
  st.sidebar.markdown(zip_file, unsafe_allow_html=True)
259
  st.sidebar.markdown(get_zip_download_link(zip_file), unsafe_allow_html=True)
 
 
 
 
 
 
260
  with st.expander("URL History and Downloaded Files"):
261
  try:
262
  for url, subdir in history.items():
263
  st.markdown(f"#### {url}")
 
264
  except:
265
  print('url history is empty')
266
+
 
 
267
  if __name__ == "__main__":
268
  main()