import os
import random
from concurrent.futures import ThreadPoolExecutor

import requests
from tqdm import tqdm

# Path to the text file listing the URLs to measure, one URL per line.
file_path = "python_files.txt"
def get_file_size(url, timeout=10):
    """Return the size in bytes that the server reports for *url*.

    Issues a HEAD request (following redirects) and reads the
    ``Content-Length`` response header, so the file body is never downloaded.

    Args:
        url: The URL to probe.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The reported size in bytes, or 0 when the request fails, the server
        answers with an HTTP error status, or the header is missing or not an
        integer.
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
        # Without this, the Content-Length of a 404/500 error page would be
        # counted as if it were the size of the requested file.
        response.raise_for_status()
        # A missing header falls back to 0; a malformed one raises ValueError.
        size = int(response.headers.get("Content-Length", 0))
    except (requests.RequestException, ValueError):
        # Best effort: an unreachable or misbehaving URL contributes nothing.
        return 0
    return size
def calculate_total_size(file_path, sample_size=5000, max_workers=None):
    """Sum the reported sizes of a random sample of the URLs listed in a file.

    The file is read line by line; blank lines are ignored and surrounding
    whitespace is stripped. The URLs are shuffled and at most *sample_size*
    of them are probed concurrently with ``get_file_size`` while a progress
    bar is shown. The result therefore covers only the sampled URLs when the
    file lists more than *sample_size* entries.

    Args:
        file_path: Path to a text file containing one URL per line.
        sample_size: Maximum number of URLs to probe (non-negative), or
            ``None`` to probe every URL in the file.
        max_workers: Thread-pool size; ``None`` keeps the
            ``ThreadPoolExecutor`` default.

    Returns:
        Total size in bytes of the probed URLs. Returns 0 if the file does
        not exist (after printing a message) or contains no URLs.
    """
    # Try to open the file directly instead of checking for existence first,
    # so the file cannot disappear between the check and the open.
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            stripped_lines = (line.strip() for line in file)
            urls = [url for url in stripped_lines if url]
    except FileNotFoundError:
        print("File not found!")
        return 0

    # Shuffle before truncating so the probed subset is a random sample.
    random.shuffle(urls)
    urls = urls[:sample_size]
    if not urls:
        # Nothing to probe: skip creating a thread pool and progress bar.
        return 0

    # The requests are I/O-bound, so threads overlap the network waits.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        sizes = tqdm(
            executor.map(get_file_size, urls),
            total=len(urls),
            desc="Processing URLs.",
        )
        # Consume the results while the executor is still open.
        return sum(sizes)
# Calculate the total size in bytes and print it converted to units of
# 1024 * 1024 bytes.
total_size = calculate_total_size(file_path)
print(f"Total size of all files: {total_size / (1024 * 1024):.2f} MB")