""" github.py Functions for retrieving information from GitHub profiles and repositories. """ # import re import json import logging import base64 from typing import List, Dict from pathlib import Path from datetime import datetime import requests # pylint: disable=broad-exception-caught def get_github_repositories(username: str) -> list: """ Retrieve public repositories from a GitHub profile URL. Args: username (str): GitHub username (e.g., username) Returns: dict: List containing dictionaries of repository information Example: [ { "name": "repo-name", "description": "Repository description", "language": "Python", "stars": 10, "forks": 2, "updated_at": "2024-01-01T00:00:00Z", "html_url": "https://github.com/user/repo", "topics": ["python", "api"], "readme": "# Project Title\n\nProject description..." } ] """ logger = logging.getLogger(f'{__name__}.get_github_repositories') try: logger.info("Fetching repositories for GitHub user: %s", username) # Get repositories repositories = _get_user_repositories(username) if repositories: repositories = _process_repository_data(repositories) # Save results to JSON file try: github_repos_dir = Path(__file__).parent.parent / "data" / "github_repos" github_repos_dir.mkdir(parents=True, exist_ok=True) # Create timestamped filename timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_file = github_repos_dir / f"github_repos_{timestamp}.json" with open(output_file, 'w', encoding='utf-8') as f: json.dump(repositories, f, indent=2, ensure_ascii=False) except Exception as save_error: logger.warning("Failed to save GitHub repositories to file: %s", str(save_error)) except Exception as e: logger.error("Error retrieving GitHub repositories: %s", str(e)) return None return repositories def _get_user_repositories(username: str) -> Dict: """ Get user's public repositories from GitHub API. Args: username (str): GitHub username Returns: dict: API response with repositories """ logger = logging.getLogger(f'{__name__}._get_user_repositories') try: # Get repositories with pagination all_repos = [] page = 1 per_page = 100 # Maximum allowed by GitHub API while True: url = f"https://api.github.com/users/{username}/repos" params = { "type": "public", "sort": "updated", "direction": "desc", "per_page": per_page, "page": page } headers = { "Accept": "application/vnd.github.v3+json", "User-Agent": "Resumate-App/1.0" } response = requests.get(url, headers=headers, params=params, timeout=10) if response.status_code != 200: logger.error("GitHub API error: %s", response.status_code) return None repos = response.json() if not repos: # No more repositories break all_repos.extend(repos) # If we got less than per_page, we've reached the end if len(repos) < per_page: break page += 1 # Safety limit to prevent infinite loops if page > 10: # Max 1000 repos break return all_repos except requests.RequestException as e: logger.error("Network error fetching repositories: %s", str(e)) # If we have some repos, return them if len(all_repos) > 0: logger.info("Returning partial repository data due to error") return all_repos else: logger.error("No repositories found and network error occurred") return None def _process_repository_data(repos: List[Dict]) -> List[Dict]: """ Process and clean repository data for easier consumption. 
def _process_repository_data(repos: List[Dict]) -> List[Dict]:
    """
    Process and clean repository data for easier consumption.

    Args:
        repos (List[Dict]): Raw repository data from the GitHub API

    Returns:
        List[Dict]: Processed repository data
    """
    logger = logging.getLogger(f'{__name__}._process_repository_data')

    processed = []
    for repo in repos:
        # Skip forks unless they have attracted stars of their own
        if repo.get("fork", False) and repo.get("stargazers_count", 0) == 0:
            continue

        try:
            processed_repo = {
                "name": repo.get("name", ""),
                "description": repo.get("description", ""),
                "language": repo.get("language", ""),
                "stars": repo.get("stargazers_count", 0),
                "forks": repo.get("forks_count", 0),
                "updated_at": repo.get("updated_at", ""),
                "created_at": repo.get("created_at", ""),
                "html_url": repo.get("html_url", ""),
                "topics": repo.get("topics", []),
                "size": repo.get("size", 0)
            }

            # Get README content for the repository
            repo_url = repo.get("html_url", "")
            if repo_url:
                processed_repo["readme"] = get_repository_readme(repo_url)
            else:
                processed_repo["readme"] = ""

            processed.append(processed_repo)
        except Exception as e:
            logger.error("Error processing repository data: %s", str(e))
            continue

    return processed


def get_repository_readme(repo_url: str) -> str:
    """
    Get the full text content of a repository's README file.

    Args:
        repo_url (str): GitHub repository URL (e.g., "https://github.com/owner/repo")

    Returns:
        str: README file content as text, or an empty string if not found or on error.

    Example:
        >>> readme_content = get_repository_readme("https://github.com/owner/repo")
        >>> print(readme_content[:100])
        # My Project

        This is a sample project that does...
    """
    logger = logging.getLogger(f'{__name__}.get_repository_readme')

    try:
        # Extract owner and repo name from the URL
        if not repo_url.startswith("https://github.com/"):
            logger.error("Invalid GitHub URL format: %s", repo_url)
            return ""

        # Remove any trailing slash and split into owner/repo
        repo_url = repo_url.rstrip("/")
        parts = repo_url.replace("https://github.com/", "").split("/")
        if len(parts) != 2:
            logger.error("Invalid GitHub URL format, expected owner/repo: %s", repo_url)
            return ""

        owner, repo = parts
        logger.info("Fetching README for repository: %s/%s", owner, repo)

        # GitHub API endpoint for the README
        api_url = f"https://api.github.com/repos/{owner}/{repo}/readme"
        headers = {
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "Resumate-App/1.0"
        }

        response = requests.get(api_url, headers=headers, timeout=10)

        if response.status_code == 404:
            logger.info("No README file found for repository: %s/%s", owner, repo)
            return ""
        if response.status_code != 200:
            logger.error("GitHub API error fetching README: %s", response.status_code)
            return ""

        readme_data = response.json()

        # README content is base64 encoded
        if "content" not in readme_data:
            logger.error("README API response missing content field")
            return ""

        # Strip the whitespace/newlines the API inserts into the encoded string,
        # then decode
        encoded_content = readme_data["content"].replace("\n", "").replace(" ", "")

        try:
            decoded_content = base64.b64decode(encoded_content).decode('utf-8')
            logger.info(
                "Successfully retrieved README content (%d characters)",
                len(decoded_content)
            )
            return decoded_content
        except Exception as decode_error:
            logger.error("Error decoding README content: %s", str(decode_error))
            return ""

    except requests.RequestException as e:
        logger.error("Network error fetching README: %s", str(e))
        return ""
    except Exception as e:
        logger.error("Error retrieving README: %s", str(e))
        return ""
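

# Example usage (an illustrative sketch; "octocat" is a placeholder username,
# not one assumed by the module). Running the file directly prints one summary
# line per repository using the processed fields defined above.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    results = get_github_repositories("octocat")
    for item in results or []:
        print(f"{item['name']} [{item['language'] or 'n/a'}] "
              f"- {item['stars']} stars, {item['forks']} forks")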