# Patent PDF downloader: fetches patent documents from Google Patents.
from typing import List, Union, Optional
import os
import requests
import tempfile
from bs4 import BeautifulSoup
class PatentDownloader:
    """
    A class to automate downloading patent PDFs from Google Patents.

    For each patent number, the downloader fetches the patent's HTML page,
    extracts the direct PDF link from the page metadata, and saves the PDF
    to disk.
    """

    # Base URL of Google Patents; "/<patent_number>/en" is appended to form
    # the full page URL.
    base_url = "https://patents.google.com/patent"

    # Seconds to wait for each HTTP request. Without a timeout,
    # requests.get() can block indefinitely on a stalled connection.
    request_timeout = 30

    def __init__(self, verbose: bool = False):
        """
        Initialize the downloader.

        Parameters
        ----------
        verbose : bool
            If True, print detailed debug information.
        """
        self.verbose = verbose

    def download(self, patents: Union[str, List[str]], output_path: Optional[str] = None) -> List[str]:
        """
        Download single or multiple patent PDFs.

        Parameters
        ----------
        patents : str or List[str]
            Single patent number or a list of patent numbers.
        output_path : Optional[str]
            Directory to save the PDFs. Defaults to a temporary directory.

        Returns
        -------
        List[str]
            List of paths to the successfully downloaded PDFs. Patents
            that fail to download are reported on stdout and skipped.
        """
        if isinstance(patents, str):
            patents = [patents]
        # Use a temporary directory if no output path is provided.
        output_dir = output_path or tempfile.gettempdir()
        os.makedirs(output_dir, exist_ok=True)
        downloaded_files = []
        for i, patent in enumerate(patents):
            try:
                if self.verbose:
                    print(f"🔍 Downloading {i+1}/{len(patents)}: {patent}")
                file_path = self._download_single_pdf(patent, output_dir)
                downloaded_files.append(file_path)
                print(f"✅ Successfully downloaded: {file_path}")
            except Exception as e:
                # Best-effort batch behavior: report the failure and keep
                # going with the remaining patents rather than aborting.
                print(f"❌ Failed to download {patent}: {e}")
        return downloaded_files

    def _download_single_pdf(self, patent_number: str, output_dir: str) -> str:
        """
        Download a single patent PDF.

        Parameters
        ----------
        patent_number : str
            The patent number (e.g., "US8676427B1").
        output_dir : str
            Directory to save the PDF.

        Returns
        -------
        str
            Path to the downloaded PDF file.

        Raises
        ------
        Exception
            If the patent page or the PDF itself cannot be fetched, or if
            no PDF link is present on the page.
        """
        # Construct the Google Patents URL for the English-language page.
        patent_url = f"{self.base_url}/{patent_number}/en"
        if self.verbose:
            print(f"Fetching patent page: {patent_url}")
        # Fetch the HTML content of the patent page (bounded by a timeout).
        response = requests.get(patent_url, timeout=self.request_timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch patent page for {patent_number}. HTTP Status: {response.status_code}")
        # Parse the HTML content and extract the PDF link.
        soup = BeautifulSoup(response.content, "html.parser")
        pdf_url = self._extract_pdf_link(soup)
        if not pdf_url:
            raise Exception(f"No PDF link found for patent {patent_number}.")
        if self.verbose:
            print(f"Found PDF link: {pdf_url}")
        # Download the PDF file.
        pdf_response = requests.get(pdf_url, timeout=self.request_timeout)
        if pdf_response.status_code != 200:
            raise Exception(f"Failed to download PDF for {patent_number}. HTTP Status: {pdf_response.status_code}")
        # Save the PDF to the specified output directory.
        file_path = os.path.join(output_dir, f"{patent_number}.pdf")
        with open(file_path, "wb") as pdf_file:
            pdf_file.write(pdf_response.content)
        return file_path

    @staticmethod
    def _extract_pdf_link(soup: "BeautifulSoup") -> Optional[str]:
        """
        Extract the PDF link from the page's metadata.

        Declared as a staticmethod: the original definition omitted
        ``self`` while being called as ``self._extract_pdf_link(soup)``,
        which raised ``TypeError`` at runtime.

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed HTML content of the patent page.

        Returns
        -------
        Optional[str]
            The direct PDF link if found, otherwise None.
        """
        # Preferred source: Google Patents exposes the direct PDF URL in
        # the 'citation_pdf_url' meta tag.
        pdf_meta = soup.find("meta", {"name": "citation_pdf_url"})
        if pdf_meta and pdf_meta.get("content"):
            return pdf_meta["content"]
        # Fallback: search for any <a> tag whose href ends in '.pdf'.
        # NOTE(review): such hrefs may be site-relative; the caller passes
        # the value straight to requests.get, so an absolute URL is assumed
        # here — confirm against real pages before relying on the fallback.
        pdf_links = [a['href'] for a in soup.find_all("a", href=True) if a['href'].endswith(".pdf")]
        if pdf_links:
            return pdf_links[0]  # Return the first matching PDF link
        return None