# irs-manuals / download_data.py
# amanda103's picture
# Upload 5 files
# 0043c9e
# raw
# history blame
# 1.5 kB
import sys
import urllib
import requests
from bs4 import BeautifulSoup
import re
import zipfile
def get_zip_urls(
    base="https://www.irs.gov/downloads/irm", start_page=1, max_page=74, timeout=30
):
    """Collect links to ``.zip`` files from a range of paginated listing pages.

    Each page ``{base}?page=N`` for ``N`` from ``start_page`` to ``max_page``
    (inclusive) is fetched and parsed, and every ``<a>`` element whose
    ``href`` ends in ``.zip`` is recorded in page order.

    Args:
        base: URL of the listing, without the ``page`` query parameter.
        start_page: First page number to fetch.
        max_page: Last page number to fetch (inclusive).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of zip URLs, resolved against the page they were found on.
        Empty when ``max_page < start_page``.

    Raises:
        requests.HTTPError: If a listing page responds with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Imported locally: the module-level ``import urllib`` does not by itself
    # guarantee that the ``urllib.parse`` submodule is loaded.
    from urllib.parse import urljoin

    # Compile once instead of once per page.
    zip_href = re.compile(r"\.zip$")

    urls = []
    for page_num in range(start_page, max_page + 1):
        url = f"{base}?page={page_num}"
        response = requests.get(url, timeout=timeout)
        # Fail loudly rather than silently scraping an error page for links.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Resolve relative hrefs against the page URL so every result can be
        # downloaded directly; absolute hrefs pass through unchanged.
        urls.extend(
            urljoin(url, link["href"]) for link in soup.find_all("a", href=zip_href)
        )
    return urls
def download_and_unzip(urls, unzip_dir, limit=10):
    """Download zip archives and extract only their PDF members.

    Each archive is saved in the current working directory under the last
    ``/``-separated segment of its URL (and is left there afterwards). Every
    member whose name ends in ``.pdf`` (case-insensitive) is then extracted
    into ``unzip_dir``, keeping its path inside the archive.

    Args:
        urls: Sequence of zip archive URLs.
        unzip_dir: Directory that receives the extracted PDF files.
        limit: Maximum number of URLs to process, taken from the front of
            ``urls``. Defaults to 10, the cap this function has always
            applied; pass ``None`` to process every URL.
    """
    # Import the submodule explicitly: a bare ``import urllib`` only exposes
    # ``urllib.request`` if some other module happened to import it first.
    import urllib.request

    for zip_url in urls[:limit]:
        filename = zip_url.split("/")[-1]
        urllib.request.urlretrieve(zip_url, filename)
        with zipfile.ZipFile(filename, "r") as zip_ref:
            for file_info in zip_ref.infolist():
                # Skip everything that is not a PDF (directories, text, ...).
                if file_info.filename.lower().endswith(".pdf"):
                    zip_ref.extract(file_info, unzip_dir)
if __name__ == "__main__":
    # Command line: BASE_URL START_PAGE MAX_PAGE PDF_DIR
    # Check the argument count up front so a missing argument produces a
    # usage message instead of a bare IndexError traceback. Extra trailing
    # arguments are still ignored, as before.
    if len(sys.argv) < 5:
        sys.exit(f"usage: {sys.argv[0]} BASE_URL START_PAGE MAX_PAGE PDF_DIR")

    base_url = sys.argv[1]
    page_start = int(sys.argv[2])
    page_max = int(sys.argv[3])
    pdf_dir = sys.argv[4]

    print(f"Grabbing zip urls from {base_url}")
    zip_urls = get_zip_urls(base_url, page_start, page_max)
    print(
        f"Found {len(zip_urls)} zip urls, downloading and unzipping pdfs into {pdf_dir}"
    )
    download_and_unzip(zip_urls, pdf_dir)
    print("Finished unzipping")