code-completion/scraping/files/naive_download.py

import requests
import os
from urllib.parse import urlparse
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential

@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
def download_file(url, session, download_folder):
    try:
        # Download the file from the URL
        response = session.get(url)
        response.raise_for_status()  # Check for errors

        # Derive a file name from the URL path. The indices assume a specific
        # URL layout (the script does not say which); e.g. a path of
        # owner/repo/branch/utils/helpers.py splits into
        # ["owner", "repo", "branch", "utils", "helpers.py"] and yields
        # "owner_repo_utils.py".
        path_segments = urlparse(url).path.strip("/").split("/")
        file_name = f"{path_segments[0]}_{path_segments[1]}_{path_segments[3]}.py"

        # Ensure the file name is unique within the download folder
        original_file_name = file_name
        counter = 1
        while os.path.exists(os.path.join(download_folder, file_name)):
            # If the file name exists, append a counter to make it unique
            base, ext = os.path.splitext(original_file_name)
            file_name = f"{base}_{counter}{ext}"
            counter += 1

        # Write the file to the download folder
        with open(os.path.join(download_folder, file_name), "wb") as file:
            file.write(response.content)
        return f"Downloaded {file_name}"
    except requests.exceptions.HTTPError as e:
        # A 404 is a dead link, not a transient failure: report it and move on
        # instead of retrying; any other HTTP error is re-raised for tenacity.
        if e.response.status_code == 404:
            return f"404 Not Found: {url}"
        raise
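
# A minimal streaming variant (a sketch, not part of the original script):
# for large files, requests can write the body to disk in chunks rather than
# holding response.content in memory. The destination path and the
# 8192-byte chunk size are illustrative choices.
def download_file_streaming(url, session, dest_path):
    with session.get(url, stream=True) as response:
        response.raise_for_status()
        with open(dest_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)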

def download_files_linearly(urls, download_folder, start_line=0):
    # Ensure the download folder exists
    os.makedirs(download_folder, exist_ok=True)
    with requests.Session() as session:
        # Slice past already-processed lines; initial/total keep the progress
        # bar aligned with positions in the full URL list
        for url in tqdm(urls[start_line:], initial=start_line, total=len(urls)):
            result = download_file(url, session, download_folder)
            print(result)
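
# A minimal concurrent variant (a sketch, not part of the original script):
# a thread pool runs the same per-URL helper in parallel. Since
# requests.Session is not guaranteed to be thread-safe, the requests module
# itself (which exposes the same .get interface) is passed in place of a
# shared session. The worker count of 8 is an arbitrary illustration, and the
# uniqueness check inside download_file becomes racy under concurrency.
def download_files_concurrently(urls, download_folder, max_workers=8):
    from concurrent.futures import ThreadPoolExecutor, as_completed

    os.makedirs(download_folder, exist_ok=True)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [
            pool.submit(download_file, url, requests, download_folder)
            for url in urls
        ]
        for future in tqdm(as_completed(futures), total=len(futures)):
            print(future.result())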

def read_urls_from_file(file_name):
    # Read URLs from a text file and return them as a list
    with open(file_name, "r") as file:
        # Strip newline characters from each line
        return [line.strip() for line in file]
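
# A possible resume helper (a sketch, not part of the original script):
# each successful download writes exactly one file, so counting the files
# already present approximates the hard-coded start_line used below. It
# undercounts when a URL returned 404 (nothing is written), so treat the
# result as an estimate rather than an exact resume point.
def estimate_start_line(download_folder):
    if not os.path.isdir(download_folder):
        return 0
    return len(os.listdir(download_folder))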

if __name__ == "__main__":
    # Path to the text file containing URLs
    urls_file = "python_files.txt"
    # Folder to save the downloaded files
    download_folder = "downloaded_files"
    # Read the URLs from the file
    urls = read_urls_from_file(urls_file)
    # Line to start downloading from (hard-coded resume point)
    start_line = 123791
    # Download files linearly and save them to the folder
    download_files_linearly(urls, download_folder, start_line)