# patent_downloader.py  (from quick-spin, by DrishtiSharma; rev 2375a67)
#
# Automates downloading patent PDFs from Google Patents.
import os
import tempfile
from typing import List, Optional, Union
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
class PatentDownloader:
    """
    A class to automate downloading patent PDFs from Google Patents.

    Workflow per patent: fetch the patent's HTML page, locate the direct
    PDF URL (via the ``citation_pdf_url`` meta tag, with an ``<a href>``
    fallback), then download and save the PDF.
    """

    base_url = "https://patents.google.com/patent"

    # Google Patents tends to reject/throttle the default python-requests
    # User-Agent, so send a browser-like one on every request.
    _headers = {"User-Agent": "Mozilla/5.0 (compatible; PatentDownloader/1.0)"}

    # Seconds before a request is abandoned; without this, a stalled
    # server would hang the download loop indefinitely.
    _timeout = 30

    def __init__(self, verbose: bool = False):
        """
        Initialize the downloader.

        Parameters
        ----------
        verbose : bool
            If True, print detailed debug information.
        """
        self.verbose = verbose

    def download(self, patents: Union[str, List[str]], output_path: Optional[str] = None) -> List[str]:
        """
        Download single or multiple patent PDFs.

        Failures for individual patents are reported but do not abort the
        batch: the returned list contains only the successful downloads.

        Parameters
        ----------
        patents : str or List[str]
            Single patent number or a list of patent numbers.
        output_path : Optional[str]
            Directory to save the PDFs. Defaults to a temporary directory.

        Returns
        -------
        List[str]
            List of paths to the downloaded PDFs.
        """
        if isinstance(patents, str):
            patents = [patents]

        # Use a temporary directory if no output path is provided.
        output_dir = output_path or tempfile.gettempdir()
        os.makedirs(output_dir, exist_ok=True)

        downloaded_files = []
        for i, patent in enumerate(patents):
            try:
                if self.verbose:
                    print(f"🔍 Downloading {i+1}/{len(patents)}: {patent}")
                file_path = self._download_single_pdf(patent, output_dir)
                downloaded_files.append(file_path)
                print(f"✅ Successfully downloaded: {file_path}")
            except Exception as e:
                # Best-effort batch semantics: report and continue.
                print(f"❌ Failed to download {patent}: {e}")
        return downloaded_files

    def _download_single_pdf(self, patent_number: str, output_dir: str) -> str:
        """
        Download a single patent PDF.

        Parameters
        ----------
        patent_number : str
            The patent number (e.g., "US8676427B1").
        output_dir : str
            Directory to save the PDF.

        Returns
        -------
        str
            Path to the downloaded PDF file.

        Raises
        ------
        Exception
            If the patent page or the PDF cannot be fetched, or no PDF
            link is present on the page.
        """
        # Construct the Google Patents URL ("/en" selects the English page).
        patent_url = f"{self.base_url}/{patent_number}/en"
        if self.verbose:
            print(f"Fetching patent page: {patent_url}")

        # Fetch the HTML content of the patent page.
        response = requests.get(patent_url, headers=self._headers, timeout=self._timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch patent page for {patent_number}. HTTP Status: {response.status_code}")

        # Parse the HTML content and extract the PDF link. The page URL is
        # passed so relative fallback links can be resolved to absolute ones.
        soup = BeautifulSoup(response.content, "html.parser")
        pdf_url = self._extract_pdf_link(soup, base_url=patent_url)
        if not pdf_url:
            raise Exception(f"No PDF link found for patent {patent_number}.")
        if self.verbose:
            print(f"Found PDF link: {pdf_url}")

        # Download the PDF file.
        pdf_response = requests.get(pdf_url, headers=self._headers, timeout=self._timeout)
        if pdf_response.status_code != 200:
            raise Exception(f"Failed to download PDF for {patent_number}. HTTP Status: {pdf_response.status_code}")

        # Save the PDF to the specified output directory.
        file_path = os.path.join(output_dir, f"{patent_number}.pdf")
        with open(file_path, "wb") as pdf_file:
            pdf_file.write(pdf_response.content)
        return file_path

    @staticmethod
    def _extract_pdf_link(soup: BeautifulSoup, base_url: Optional[str] = None) -> Optional[str]:
        """
        Extract the PDF link from the page's metadata.

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed HTML content of the patent page.
        base_url : Optional[str]
            URL of the page the HTML came from. When given, relative
            fallback ``<a href>`` links are resolved against it; the
            ``citation_pdf_url`` meta tag is already absolute.

        Returns
        -------
        Optional[str]
            The direct PDF link if found, otherwise None.
        """
        # Preferred source: the 'citation_pdf_url' meta tag, which Google
        # Patents emits with a direct, absolute PDF URL.
        pdf_meta = soup.find("meta", {"name": "citation_pdf_url"})
        if pdf_meta and pdf_meta.get("content"):
            return pdf_meta["content"]

        # Fallback: first <a> tag whose href ends in '.pdf'. Such hrefs may
        # be relative, so resolve against the page URL when available.
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if href.endswith(".pdf"):
                return urljoin(base_url, href) if base_url else href
        return None