python_project_explainer / github_repo_downloader.py
lafifi-24's picture
i
933c2fa
import io
import os
from pathlib import Path
from typing import Callable, Dict, Optional
from urllib.parse import urlparse
from structlog import get_logger
import requests
import zipfile
# Module-level structlog logger, bound to this module's name; used by
# GitHubRepoDownloader for progress and error reporting.
logger = get_logger(__name__)
class GitHubRepoDownloader:
    """Download one branch of a GitHub repository as a ZIP and read its files.

    The archive is fetched once and stored under ``cache_dir``; later calls
    reuse the cached ZIP. File contents are read straight out of the archive
    without extracting it to disk.
    """

    # Seconds to wait on GitHub before giving up. Without an explicit timeout
    # ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30
    # Chunk size (bytes) used when streaming the archive to disk.
    DOWNLOAD_CHUNK_SIZE = 64 * 1024
    # Hosts accepted as "GitHub" (compared against the parsed hostname).
    _GITHUB_HOSTS = frozenset({"github.com", "www.github.com"})

    def __init__(self, repo_url: str, branch: str = "main", cache_dir: str = ".cache"):
        """
        Initialize downloader with a GitHub repo URL.

        Args:
            repo_url: Full GitHub repo URL (e.g., https://github.com/owner/repo)
            branch: Branch name to download (default: main)
            cache_dir: Directory to cache downloaded files

        Raises:
            ValueError: If the URL is not a valid GitHub repository URL or the
                branch does not exist.
            requests.HTTPError: If the GitHub API answers with another error.
        """
        self.owner, self.repo = self._parse_repo_url(repo_url)
        self.branch = branch
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested cache directory (e.g. "var/cache/repos")
        # can be created in one go.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self._validate_branch()

    def _parse_repo_url(self, repo_url: str) -> tuple[str, str]:
        """Extract owner and repo name from a GitHub URL.

        Accepts URLs with or without a scheme, with an optional trailing
        slash or ``.git`` suffix, and with extra path segments after the
        repository (e.g. ``/tree/main``), which are ignored.

        Args:
            repo_url: URL of the repository.

        Returns:
            ``(owner, repo)`` tuple.

        Raises:
            ValueError: If the host is not GitHub or the path does not contain
                both an owner and a repository name.
        """
        candidate = repo_url.strip().rstrip('/')
        # urlparse only recognises the host when a scheme is present, so add
        # one for inputs such as "github.com/owner/repo".
        if '://' not in candidate:
            candidate = f"https://{candidate}"

        parsed = urlparse(candidate)
        # Compare the real hostname: a substring test would also accept
        # hosts like "github.com.evil.example".
        if (parsed.hostname or '').lower() not in self._GITHUB_HOSTS:
            message = f"Not a GitHub URL: {repo_url}"
            logger.error(message)
            raise ValueError(message)

        segments = [segment for segment in parsed.path.split('/') if segment]
        # Only strip ".git" as a suffix of the repo segment; removing it
        # anywhere would mangle names such as "user.github.io".
        repo = segments[1].removesuffix('.git') if len(segments) >= 2 else ''
        if not repo:
            message = f"Invalid GitHub URL format: {repo_url}"
            logger.error(message)
            raise ValueError(message)

        return segments[0], repo

    def _validate_branch(self) -> None:
        """Validate that the branch exists in the repository.

        Raises:
            ValueError: If the GitHub API reports 404 for the branch.
            requests.HTTPError: For any other unsuccessful API response.
        """
        url = f"https://api.github.com/repos/{self.owner}/{self.repo}/branches/{self.branch}"
        logger.info(f"Validating branch: {self.branch}")
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code == 404:
            message = f"Branch '{self.branch}' not found in {self.owner}/{self.repo}"
            logger.error(message)
            raise ValueError(message)
        response.raise_for_status()

    def _get_cache_path(self) -> Path:
        """Get the cache file path for this repo.

        Returns:
            Path of the cached ZIP, always directly inside ``cache_dir``.
        """
        # Branch names may contain "/" (e.g. "feature/x"); flatten it so the
        # file name never points into a non-existent subdirectory.
        safe_branch = self.branch.replace('/', '_')
        return self.cache_dir / f"{self.owner}_{self.repo}_{safe_branch}.zip"

    def _download_zip(self) -> Path:
        """Download repo ZIP to cache (or reuse the cached copy).

        Returns:
            Path of the ZIP archive on disk.

        Raises:
            requests.HTTPError: If GitHub answers with an error status.
        """
        cache_path = self._get_cache_path()

        # Return cached file if exists
        if cache_path.exists():
            logger.info(f"Using cached file: {cache_path}")
            return cache_path

        # Download ZIP
        url = f"https://github.com/{self.owner}/{self.repo}/archive/refs/heads/{self.branch}.zip"
        logger.info(f"Downloading {self.owner}/{self.repo} (branch: {self.branch})...")

        # Stream into a sibling ".part" file and rename once complete, so an
        # interrupted download never leaves a truncated ZIP that would later
        # be mistaken for a valid cache entry.
        partial_path = cache_path.with_name(f"{cache_path.name}.part")
        try:
            with requests.get(url, stream=True, timeout=self.REQUEST_TIMEOUT) as response:
                response.raise_for_status()
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=self.DOWNLOAD_CHUNK_SIZE):
                        f.write(chunk)
            partial_path.replace(cache_path)
        finally:
            # No-op after a successful rename; cleans up after a failure.
            partial_path.unlink(missing_ok=True)

        logger.info(f"Saved to cache: {cache_path}")
        return cache_path

    def read_files(self, file_filter: Optional[Callable[[str], bool]] = None) -> Dict[str, str]:
        """
        Read files from the repo without extracting.

        Args:
            file_filter: Optional function to filter files (e.g., lambda path: path.endswith('.py'))

        Returns:
            Dictionary mapping file paths to their contents. Paths are
            relative to the repository root; contents are decoded as UTF-8
            with undecodable bytes dropped.
        """
        cache_path = self._download_zip()
        files_content: Dict[str, str] = {}

        with zipfile.ZipFile(cache_path) as zip_file:
            for filename in zip_file.namelist():
                # Directory entries carry no content.
                if filename.endswith('/'):
                    continue

                # Remove root folder (format: repo-branch/path/to/file)
                clean_path = filename.partition('/')[2]
                if not clean_path:
                    continue

                # Apply filter
                if file_filter and not file_filter(clean_path):
                    continue

                logger.info(f"Reading: {clean_path}")
                try:
                    with zip_file.open(filename) as file:
                        files_content[clean_path] = file.read().decode('utf-8', errors='ignore')
                except Exception as e:
                    # Deliberately best-effort: one unreadable member is
                    # logged and skipped rather than aborting the whole read.
                    logger.exception(f"⚠️ Error reading {clean_path}: {e}")

        return files_content