import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse
import time
import zipfile
import tempfile
import shutil
import gradio as gr
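
# Third-party dependencies assumed by this app (typically declared in a
# separate requirements.txt on Hugging Face Spaces): requests, beautifulsoup4,
# gradio.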


def extract_detail_page_links(url, headers):
    """
    Extract all detail page links from the main listing page.

    Args:
        url: Main page URL
        headers: Request headers

    Returns:
        list of detail page URLs
    """
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    detail_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Look for detail page patterns (adjust pattern as needed)
        if 'Details.aspx' in href or 'PUB_ID=' in href:
            full_url = urljoin(url, href)
            if full_url not in detail_links:
                detail_links.append(full_url)
    return detail_links
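
# A minimal standalone sketch of the helper above (illustrative only; the
# listing URL matches the placeholder used in the Gradio UI further down):
#
#   headers = {"User-Agent": "Mozilla/5.0"}
#   links = extract_detail_page_links(
#       "https://armypubs.army.mil/ProductMaps/PubForm/AR.aspx", headers
#   )
#   print(f"Found {len(links)} detail pages")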


def extract_pdf_links_from_page(url, headers):
    """
    Extract PDF links from a single page.

    Args:
        url: Page URL to scrape
        headers: Request headers

    Returns:
        list of PDF URLs
    """
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        pdf_links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            if '.pdf' in href.lower():
                full_url = urljoin(url, href)
                if full_url not in pdf_links:
                    pdf_links.append(full_url)
        return pdf_links
    except Exception as e:
        print(f"Error extracting PDFs from {url}: {str(e)}")
        return []


def download_pdfs_from_page(url, progress=gr.Progress()):
    """
    Download all PDFs from a webpage by navigating through detail pages.

    Args:
        url: The main webpage URL to scrape
        progress: Gradio progress tracker

    Returns:
        tuple of (zip_file_path, summary_message)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        # Step 1: Extract detail page links from main page
        progress(0, desc="Fetching main page and extracting detail links...")
        detail_page_links = extract_detail_page_links(url, headers)

        if len(detail_page_links) == 0:
            return None, "❌ No detail page links found on the main page."

        progress(0.1, desc=f"Found {len(detail_page_links)} detail pages to process")

        # Step 2: Visit each detail page and collect PDF links
        all_pdf_links = []
        for idx, detail_url in enumerate(detail_page_links, 1):
            progress(0.1 + (0.3 * idx / len(detail_page_links)),
                     desc=f"[{idx}/{len(detail_page_links)}] Scanning detail page...")
            pdf_links = extract_pdf_links_from_page(detail_url, headers)
            all_pdf_links.extend(pdf_links)
            # Be polite - small delay between page requests
            time.sleep(0.5)

        # Remove duplicates
        all_pdf_links = list(set(all_pdf_links))

        if len(all_pdf_links) == 0:
            return None, f"❌ No PDF links found across {len(detail_page_links)} detail pages."

        progress(0.4, desc=f"Found {len(all_pdf_links)} unique PDFs to download")

        # Step 3: Create temporary directory for downloads
        temp_dir = tempfile.mkdtemp()

        # Step 4: Download each PDF
        successful = 0
        failed = 0
        failed_urls = []

        for idx, pdf_url in enumerate(all_pdf_links, 1):
            # Derive the filename outside the try block so it is always
            # defined for error reporting; fall back to a generic name
            # when the URL path has no basename.
            filename = os.path.basename(urlparse(pdf_url).path) or f"document_{idx}.pdf"
            filepath = os.path.join(temp_dir, filename)

            try:
                # Skip if file already exists
                if os.path.exists(filepath):
                    progress(0.4 + (0.5 * idx / len(all_pdf_links)),
                             desc=f"[{idx}/{len(all_pdf_links)}] Skipping: {filename}")
                    successful += 1
                    continue

                progress(0.4 + (0.5 * idx / len(all_pdf_links)),
                         desc=f"[{idx}/{len(all_pdf_links)}] Downloading: {filename}")

                # Download PDF
                pdf_response = requests.get(pdf_url, headers=headers, timeout=60)
                pdf_response.raise_for_status()

                # Verify it's actually a PDF (the content-type header may carry
                # parameters such as a charset, so use a substring check)
                content_type = pdf_response.headers.get('content-type', '').lower()
                if 'application/pdf' not in content_type and 'application/octet-stream' not in content_type:
                    failed += 1
                    failed_urls.append(f"{filename}: Not a valid PDF file")
                    continue

                # Save PDF
                with open(filepath, 'wb') as f:
                    f.write(pdf_response.content)

                successful += 1
                time.sleep(1)  # Be polite

            except Exception as e:
                failed += 1
                failed_urls.append(f"{filename}: {str(e)}")
                continue

        # Step 5: Generate summary
        summary = f"""
✅ **Download Complete!**

📊 **Summary:**
- Detail pages scanned: {len(detail_page_links)}
- Total PDFs found: {len(all_pdf_links)}
- Successfully downloaded: {successful}
- Failed: {failed}
"""

        if failed > 0:
            summary += "\n\n⚠️ **Failed Downloads:**\n"
            for fail in failed_urls[:10]:
                summary += f"- {fail}\n"
            if len(failed_urls) > 10:
                summary += f"- ... and {len(failed_urls) - 10} more\n"

        # Step 6: Create zip file
        progress(0.9, desc="Creating zip file...")
        zip_path = os.path.join(tempfile.gettempdir(), f"pdfs_{int(time.time())}.zip")

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(temp_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    zipf.write(file_path, arcname=file)

        # Clean up
        shutil.rmtree(temp_dir)

        progress(1.0, desc="Complete!")
        return zip_path, summary

    except requests.exceptions.RequestException as e:
        return None, f"❌ Error fetching webpage: {str(e)}"
    except Exception as e:
        return None, f"❌ Unexpected error: {str(e)}"


# Create Gradio interface
def create_interface():
    with gr.Blocks(title="PDF Downloader", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 📥 Two-Level PDF Downloader

            Download all PDFs from webpages with intermediate detail pages!

            **Instructions:**
            1. Enter the URL of the main listing page
            2. Click "Download PDFs"
            3. The tool will navigate through all detail pages
            4. Download your ZIP file with all PDFs
            """
        )

        with gr.Row():
            with gr.Column():
                url_input = gr.Textbox(
                    label="Main Page URL",
                    placeholder="https://armypubs.army.mil/ProductMaps/PubForm/AR.aspx",
                    lines=1
                )
                download_btn = gr.Button("📥 Download PDFs", variant="primary", size="lg")

        with gr.Row():
            with gr.Column():
                output_file = gr.File(label="Download ZIP")
                summary_output = gr.Markdown(label="Summary")

        download_btn.click(
            fn=download_pdfs_from_page,
            inputs=[url_input],
            outputs=[output_file, summary_output]
        )

        gr.Markdown(
            """
            ---
            ### 💡 Features:
            - **Two-level navigation**: Scans main page → visits detail pages → downloads PDFs
            - **Duplicate removal**: Ensures each PDF is downloaded only once
            - **Polite scraping**: Includes delays between requests
            - **Error handling**: Continues even if some downloads fail
            - **Progress tracking**: Real-time updates on scanning and downloading
            """
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=True)
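
# For quick testing without the web UI, the scraping pipeline can be driven
# directly (a sketch, not part of the app's documented interface); passing a
# no-op progress callable avoids depending on a live Gradio event:
#
#   zip_path, summary = download_pdfs_from_page(
#       "https://armypubs.army.mil/ProductMaps/PubForm/AR.aspx",  # example URL
#       progress=lambda *args, **kwargs: None,                    # no-op stub
#   )
#   print(summary)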