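"""Streamlit app for bulk-downloading a web page and the files it links to.

Given a URL (typed in or picked from a preset list), the app saves the page and
its linked files into a per-URL subdirectory, lets you browse, edit, zip, and
delete the downloads, and records visited URLs in history.json.
"""
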
import streamlit as st
import requests
import os
import urllib.parse
import base64
import hashlib
import json
import uuid
import glob
import zipfile
from bs4 import BeautifulSoup
from PIL import Image

EXCLUDED_FILES = ['app.py', 'requirements.txt', 'pre-requirements.txt', 'packages.txt', 'README.md', '.gitattributes', 'backup.py', 'Dockerfile']
FILE_EMOJIS = {
    ".jpg": "🖼️", ".jpeg": "🖼️", ".png": "🖼️",
    ".html": "📄", ".htm": "📄", ".txt": "📄",
    ".pdf": "📄", ".doc": "📄", ".docx": "📄",
    ".xls": "📊", ".xlsx": "📊", ".ppt": "📊", ".pptx": "📊",
}

URLS = {
    "Lumiere": "https://lumiere-video.github.io/",
    "ChatDev": "https://github.com/OpenBMB/ChatDev",
    "Autogen": "https://microsoft.github.io/autogen/",
    "Mixtral": "https://github.com/open-compass/MixtralKit",
    "World Health Organization": "https://www.who.int/",
    "CMS - Centers for Medicare & Medicaid Services": "https://www.cms.gov/",
    "Mayo Clinic": "https://www.mayoclinic.org/",
    "MedlinePlus": "https://medlineplus.gov/",
    "Healthline": "https://www.healthline.com/",
    "CDC - Centers for Disease Control and Prevention": "https://www.cdc.gov/",
    "UHCProvider - UHC Provider": "https://www.uhcprovider.com/",
    "Johns Hopkins Medicine": "https://www.hopkinsmedicine.org/"
}

# Make sure the URL history file exists before the app reads it.
if not os.path.exists("history.json"):
    with open("history.json", "w") as f:
        json.dump({}, f)

def zip_subdirs(start_dir):
    """Zip each subdirectory under start_dir and yield the path of each zip."""
    for subdir, dirs, files in os.walk(start_dir):
        if subdir != start_dir:
            zip_filename = os.path.join(start_dir, subdir.split(os.sep)[-1] + '.zip')
            all_file_summary = ""
            with zipfile.ZipFile(zip_filename, 'w') as zipf:
                for file in files:
                    file_path = os.path.join(subdir, file)
                    zipf.write(file_path, os.path.relpath(file_path, start_dir))
                    all_file_summary += f"Added: {file_path}\n"
                st.write(all_file_summary)
            yield zip_filename

@st.cache_resource
def get_zip_download_link(zip_file):
    """Return an HTML download link (base64 data URI) for a zip file."""
    with open(zip_file, 'rb') as f:
        data = f.read()
    b64 = base64.b64encode(data).decode()
    link_name = os.path.basename(zip_file)
    return f'<a href="data:application/zip;base64,{b64}" download="{link_name}">Download: {link_name}</a>'


@st.cache_resource
def create_zip_of_files(files):
    """Zip the given files into a single archive and return its filename."""
    zip_name = "all_files.zip"
    with zipfile.ZipFile(zip_name, 'w') as zipf:
        for file in files:
            zipf.write(file)
    return zip_name

def download_file(url, local_filename):
    """Stream a remote file to local_filename; return the path, or None on failure."""
    if url.startswith('http://') or url.startswith('https://'):
        try:
            with requests.get(url, stream=True) as r:
                r.raise_for_status()
                with open(local_filename, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            return local_filename
        except requests.exceptions.HTTPError as err:
            print(f"HTTP error occurred: {err}")
    return None

def download_html_and_files(url, subdir):
    """Download a page's HTML plus every file its <a> tags link to into subdir."""
    html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, 'html.parser')
    base_url = urllib.parse.urlunparse(urllib.parse.urlparse(url)._replace(path='', params='', query='', fragment=''))
    file_urls = {}
    for link in soup.find_all('a'):
        href = link.get('href')
        if not href:
            continue
        file_url = urllib.parse.urljoin(base_url, href)
        local_filename = os.path.join(subdir, urllib.parse.urlparse(file_url).path.split('/')[-1])
        if not local_filename.endswith('/') and local_filename != subdir:
            link['href'] = local_filename
            if download_file(file_url, local_filename):
                file_urls[local_filename] = file_url
    with open(os.path.join(subdir, "index.html"), "w") as file:
        file.write(str(soup))
    return file_urls

def list_files(directory_path='.'):
    """List plain files in directory_path, excluding the app's own files."""
    files = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))]
    return [f for f in files if f not in EXCLUDED_FILES]

def file_editor(file_path):
    """Render a simple text editor for file_path with a save button."""
    st.write(f"Editing File: {os.path.basename(file_path)}")
    file_content = ""
    with open(file_path, "r") as f:
        file_content = f.read()
    file_content = st.text_area("Edit the file content:", value=file_content, height=250)
    if st.button("💾 Save"):
        with open(file_path, "w") as f:
            f.write(file_content)
        st.success(f"File '{os.path.basename(file_path)}' saved!")

def show_file_operations(file_path, sequence_number):
    """Show per-file edit, save, and delete controls with unique widget keys."""
    unique_key = hashlib.md5(file_path.encode()).hexdigest()
    file_content = ""
    col01, col02, col1, col2, col3 = st.columns(5)
    with col01:
        st.write(os.path.basename(file_path))
    with col1:
        edit_key = f"edit_{unique_key}_{sequence_number}"
        if st.button("✏️ Edit", key=edit_key):
            with open(file_path, "r") as f:
                file_content = f.read()
            text_area_key = f"text_area_{unique_key}_{sequence_number}"
            file_content = st.text_area("Edit the file content:", value=file_content, height=250, key=text_area_key)
    with col2:
        save_key = f"save_{unique_key}_{sequence_number}"
        if st.button("💾 Save", key=save_key):
            if file_content:
                with open(file_path, "w") as f:
                    f.write(file_content)
                st.success("File saved!")
    with col3:
        delete_key = f"delete_{unique_key}_{sequence_number}"
        if st.button("🗑️ Delete", key=delete_key):
            os.remove(file_path)
            st.markdown("File deleted!")

def show_file_content(file_path, original_url=''):
    """Render a downloaded file inline: images as markdown, HTML/text as markup, others as links."""
    _, file_extension = os.path.splitext(file_path)
    emoji = FILE_EMOJIS.get(file_extension, "📄")
    try:
        if file_extension in ['.png', '.jpg', '.jpeg']:
            image_url = file_path.replace('File:', '').replace('/', '')
            markdown_link = f"{emoji} [![Image]({image_url})]({original_url})"
            st.markdown(markdown_link, unsafe_allow_html=True)
        elif file_extension in ['.html', '.htm', '.txt']:
            with open(file_path, "r") as file:
                st.markdown(f"{emoji} [{os.path.basename(file_path)}]({original_url})", unsafe_allow_html=True)
                st.markdown(file.read(), unsafe_allow_html=True)
        else:
            st.markdown(f"{emoji} [{os.path.basename(file_path)}]({original_url})", unsafe_allow_html=True)
    except Exception as e:
        st.error(f"Error reading file {file_path}: {e}")

# Tracks how many times each file has been rendered, so widget keys stay unique.
file_sequence_numbers = {}

def show_download_links(subdir):
    """Show the contents of every downloaded file in subdir."""
    global file_sequence_numbers
    for file in list_files(subdir):
        file_path = os.path.join(subdir, file)
        if file_path not in file_sequence_numbers:
            file_sequence_numbers[file_path] = 1
        else:
            file_sequence_numbers[file_path] += 1
        sequence_number = file_sequence_numbers[file_path]
        if os.path.isfile(file_path):
            st.markdown(file_path)
            show_file_content(file_path)
        else:
            st.write(f"File not found: {file}")

def show_download_links_backup(subdir):
    """Like show_download_links, but with per-file edit/save/delete controls."""
    global file_sequence_numbers
    for file in list_files(subdir):
        file_path = os.path.join(subdir, file)
        if file_path not in file_sequence_numbers:
            file_sequence_numbers[file_path] = 1
        else:
            file_sequence_numbers[file_path] += 1
        sequence_number = file_sequence_numbers[file_path]
        if os.path.isfile(file_path):
            st.markdown(file_path, unsafe_allow_html=True)
            show_file_operations(file_path, sequence_number)
        else:
            st.write(f"File not found: {file}")

def get_download_link(file):
    """Return an HTML download link (base64 data URI) for an arbitrary file."""
    with open(file, "rb") as f:
        data = f.read()
    b64 = base64.b64encode(data).decode()
    return f'<a href="data:file/octet-stream;base64,{b64}" download=\'{os.path.basename(file)}\'>Download: {os.path.basename(file)}</a>'

def main():
    st.sidebar.title('🌐 Web Datasets Bulk Downloader')
    query_params = st.experimental_get_query_params()
    file_to_edit = query_params.get('file_to_edit', [None])[0]
    if file_to_edit and os.path.exists(file_to_edit):
        file_editor(file_to_edit)
    else:
        url_input_method = st.sidebar.radio("Choose URL Input Method", ["Enter URL", "Select from List"], index=1)
        url = ""
        if url_input_method == "Enter URL":
            url = st.sidebar.text_input('Please enter a Web URL to bulk download text and files')
        else:
            selected_site = st.sidebar.selectbox("Select a Website", list(URLS.keys()), index=0)
            url = URLS[selected_site]
        if not os.path.exists("history.json"):
            with open("history.json", "w") as f:
                json.dump({}, f)
        with open("history.json", "r") as f:
            try:
                history = json.load(f)
            except json.JSONDecodeError:
                # Corrupt or empty history file: start with a fresh history.
                history = {}

        if url:
            subdir = hashlib.md5(url.encode()).hexdigest()
            if not os.path.exists(subdir):
                os.makedirs(subdir)
            if url not in history:
                history[url] = subdir
                with open("history.json", "w") as f:
                    json.dump(history, f)

        if st.sidebar.button('📥 Get All the Content', help="Download content from the selected URL"):
            file_urls = download_html_and_files(url, history[url])
            for file in list_files(history[url]):
                file_path = os.path.join(history[url], file)
                original_url = file_urls.get(file_path, "#")
                show_file_content(file_path, original_url)

        if st.sidebar.button('📋 Show Download Links', help="Show all available download links"):
            for subdir in history.values():
                show_download_links(subdir)

        if st.sidebar.button("🗑 Delete All", help="Delete all downloaded content"):
            # Reset the download history.
            with open("history.json", "w") as f:
                json.dump({}, f)
            # Remove every downloaded subdirectory and its files.
            for subdir in glob.glob('*'):
                if os.path.isdir(subdir) and subdir not in EXCLUDED_FILES:
                    for file in os.listdir(subdir):
                        file_path = os.path.join(subdir, file)
                        os.remove(file_path)
                        st.write(f"Deleted: {file_path}")
                    os.rmdir(subdir)
            st.experimental_rerun()

        if st.sidebar.button("⬇️ Download All", help="Download all files in a zip"):
            start_directory = '.'
            for zip_file in zip_subdirs(start_directory):
                st.sidebar.markdown(zip_file, unsafe_allow_html=True)
                st.sidebar.markdown(get_zip_download_link(zip_file), unsafe_allow_html=True)

        with st.expander("URL History and Downloaded Files"):
            for url, subdir in history.items():
                st.markdown(f"#### {url}")

if __name__ == "__main__":
    main()