"""
Project Importer - Standalone module for importing projects from various sources

This module provides functionality to import projects from:
- HuggingFace Spaces
- HuggingFace Models
- GitHub Repositories

No Gradio dependency required - pure Python implementation.
"""

import os
import re
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlparse

import requests
from huggingface_hub import HfApi, list_repo_files


# Serverless-inference snippets keyed by HuggingFace pipeline tag.  They are
# ``str.format`` templates: ``{model_id}`` is substituted and literal braces
# are doubled.  Kept at module level so they are built once, not per call.
_INFERENCE_TEMPLATES: Dict[str, str] = {
    "text-generation": '''import os
from huggingface_hub import InferenceClient

client = InferenceClient(api_key=os.environ["HF_TOKEN"])

completion = client.chat.completions.create(
    model="{model_id}",
    messages=[
        {{"role": "user", "content": "What is the capital of France?"}}
    ],
)

print(completion.choices[0].message)''',
    "text-to-image": '''import os
from huggingface_hub import InferenceClient

client = InferenceClient(api_key=os.environ["HF_TOKEN"])

# output is a PIL.Image object
image = client.text_to_image(
    "Astronaut riding a horse",
    model="{model_id}",
)

# Save the image
image.save("output.png")''',
    "automatic-speech-recognition": '''import os
from huggingface_hub import InferenceClient

client = InferenceClient(api_key=os.environ["HF_TOKEN"])

with open("audio.mp3", "rb") as f:
    audio_data = f.read()

result = client.automatic_speech_recognition(
    audio_data,
    model="{model_id}",
)

print(result)''',
    "text-to-speech": '''import os
from huggingface_hub import InferenceClient

client = InferenceClient(api_key=os.environ["HF_TOKEN"])

audio = client.text_to_speech(
    "Hello world",
    model="{model_id}",
)

# Save the audio
with open("output.mp3", "wb") as f:
    f.write(audio)''',
}


class ProjectImporter:
    """Main class for importing projects from various sources"""

    # Upper bound on the number of files fetched from a single Space.
    MAX_SPACE_FILES = 50

    # File suffixes considered "source-like" when importing a whole Space.
    SPACE_FILE_EXTENSIONS = (
        '.py', '.js', '.html', '.css', '.json', '.txt', '.yml', '.yaml',
        '.toml', '.cfg', '.ini', '.sh', '.md',
    )

    # Hosts recognised as HuggingFace / GitHub (sub-domains are accepted too).
    HF_HOSTS = ("huggingface.co", "hf.co")
    GITHUB_HOSTS = ("github.com",)

    def __init__(self, hf_token: Optional[str] = None):
        """
        Initialize the ProjectImporter.

        Args:
            hf_token: Optional HuggingFace token for authenticated requests.
                Falls back to the ``HF_TOKEN`` environment variable.
        """
        self.hf_token = hf_token or os.environ.get("HF_TOKEN")
        self.api = HfApi(token=self.hf_token)

    # ==================== Public API ====================

    def import_from_url(self, url: str, prefer_local: bool = False) -> Dict[str, Any]:
        """
        Import a project from any supported URL.

        Args:
            url: URL to import from (HF Space, HF Model, or GitHub)
            prefer_local: Only used for HF Model URLs; forwarded to
                ``import_model`` so local (README) code is preferred over
                serverless inference code.

        Returns:
            Dictionary containing:
                - status: "success" or "error"
                - message: Human-readable description of the outcome
                - code: Extracted code content ("" on error)
                - language: Detected language/framework
                - url: URL of the imported project
                - metadata: Additional metadata
        """
        if not url or not url.strip():
            return self._result("error", "Please provide a valid URL", url)

        # Parse URL to determine source type
        kind, meta = self._parse_url(url)

        if kind == "hf_space":
            return self.import_space(meta["username"], meta["project"])
        if kind == "hf_model":
            return self.import_model(meta["repo_id"], prefer_local=prefer_local)
        if kind == "github":
            return self.import_github_repo(meta["owner"], meta["repo"])

        return self._result(
            "error",
            "Unsupported URL format. Supported: HF Spaces, HF Models, GitHub repos",
            url,
        )

    def import_space(self, username: str, project_name: str) -> Dict[str, Any]:
        """
        Import a HuggingFace Space.

        Tries, in order: the transformers.js layout (static SDK only), all
        source-like files of the Space, and finally a single main file.

        Args:
            username: HuggingFace username
            project_name: Space name

        Returns:
            Dictionary with imported project data (see ``import_from_url``).
            Failures are reported via ``status == "error"``, never raised.
        """
        space_id = f"{username}/{project_name}"
        space_url = f"https://huggingface.co/spaces/{space_id}"

        try:
            sdk = self.api.space_info(space_id).sdk

            # Detect if this is a transformers.js space
            if sdk == "static" and self._is_transformers_js_space(username, project_name):
                code, files = self._fetch_transformers_js_files(username, project_name)
                return self._result(
                    "success",
                    f"Successfully imported transformers.js space: {space_id}",
                    space_url,
                    code=code,
                    language="transformers.js",
                    metadata={"sdk": "static", "type": "transformers.js", "files": files},
                )

            # Handle multi-file spaces
            files = self._fetch_all_space_files(username, project_name, sdk)
            if files:
                return self._result(
                    "success",
                    f"Successfully imported space: {space_id}",
                    space_url,
                    code=self._format_multi_file_content(files, username, project_name, sdk),
                    language=self._detect_language_from_sdk(sdk),
                    metadata={"sdk": sdk, "files": list(files.keys())},
                )

            # Fallback to single file
            main_file, content = self._fetch_main_file(username, project_name, sdk)
            if content:
                return self._result(
                    "success",
                    f"Successfully imported space: {space_id}",
                    space_url,
                    code=content,
                    language=self._detect_language_from_sdk(sdk),
                    metadata={"sdk": sdk, "main_file": main_file},
                )

            return self._result(
                "error",
                f"Could not find main file in space {space_id}",
                space_url,
                metadata={"sdk": sdk},
            )
        except Exception as e:
            # Public boundary: convert any hub/network failure into an error result.
            return self._result("error", f"Failed to import space: {e}", space_url)

    def import_model(self, model_id: str, prefer_local: bool = False) -> Dict[str, Any]:
        """
        Import a HuggingFace Model.

        Args:
            model_id: HuggingFace model ID (e.g., "meta-llama/Llama-2-7b")
            prefer_local: If True, prefer local inference code over serverless

        Returns:
            Dictionary with imported model data (see ``import_from_url``).
            Failures are reported via ``status == "error"``, never raised.
        """
        model_url = f"https://huggingface.co/{model_id}"

        try:
            # Get model info
            model_info = self.api.model_info(model_id)
            pipeline_tag = getattr(model_info, "pipeline_tag", None)

            # Try to get inference provider code
            inference_code = self._generate_inference_code(model_id, pipeline_tag)

            # Try to get transformers/diffusers code from README (best effort:
            # a missing or unparsable README must not fail the whole import).
            readme_code = None
            try:
                readme = self._fetch_hf_model_readme(model_id)
                if readme:
                    _, readme_code = self._extract_code_from_markdown(readme)
            except Exception:
                readme_code = None

            # Determine which code to return
            if inference_code and readme_code:
                code_type = "local" if prefer_local else "inference"
                return self._result(
                    "success",
                    f"Successfully imported model: {model_id} ({code_type} code)",
                    model_url,
                    code=readme_code if prefer_local else inference_code,
                    language="python",
                    metadata={
                        "pipeline_tag": pipeline_tag,
                        "code_type": code_type,
                        "has_alternatives": True,
                        "inference_code": inference_code,
                        "local_code": readme_code,
                    },
                )
            if inference_code:
                return self._result(
                    "success",
                    f"Successfully imported model: {model_id} (inference code)",
                    model_url,
                    code=inference_code,
                    language="python",
                    metadata={"pipeline_tag": pipeline_tag, "code_type": "inference"},
                )
            if readme_code:
                return self._result(
                    "success",
                    f"Successfully imported model: {model_id} (local code)",
                    model_url,
                    code=readme_code,
                    language="python",
                    metadata={"pipeline_tag": pipeline_tag, "code_type": "local"},
                )

            return self._result(
                "error",
                f"No code found for model: {model_id}",
                model_url,
                language="python",
                metadata={"pipeline_tag": pipeline_tag},
            )
        except Exception as e:
            # Public boundary: convert any hub/network failure into an error result.
            return self._result(
                "error", f"Failed to import model: {e}", model_url, language="python"
            )

    def import_github_repo(self, owner: str, repo: str) -> Dict[str, Any]:
        """
        Import a GitHub repository.

        Only the repository README is fetched; the most relevant fenced code
        block found in it is returned as the imported code.

        Args:
            owner: GitHub username/organization
            repo: Repository name

        Returns:
            Dictionary with imported repository data (see ``import_from_url``).
            Failures are reported via ``status == "error"``, never raised.
        """
        repo_url = f"https://github.com/{owner}/{repo}"

        try:
            readme = self._fetch_github_readme(owner, repo)
            if not readme:
                return self._result(
                    "error",
                    f"Could not fetch README from {owner}/{repo}",
                    repo_url,
                    language="python",
                )

            lang, code = self._extract_code_from_markdown(readme)
            if not code:
                return self._result(
                    "error",
                    f"No relevant code found in README of {owner}/{repo}",
                    repo_url,
                    language="python",
                )

            return self._result(
                "success",
                f"Successfully imported code from {owner}/{repo}",
                repo_url,
                code=code,
                language=lang or "python",
                metadata={"source": "github", "detected_language": lang},
            )
        except Exception as e:
            return self._result(
                "error", f"Failed to import repository: {e}", repo_url, language="python"
            )

    # ==================== Private Helper Methods ====================

    @staticmethod
    def _result(
        status: str,
        message: str,
        url: Optional[str],
        code: str = "",
        language: str = "unknown",
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Build the result dictionary shared by every import_* method."""
        return {
            "status": status,
            "message": message,
            "code": code,
            "language": language,
            "url": url,
            "metadata": {} if metadata is None else metadata,
        }

    @staticmethod
    def _host_matches(host: str, domains: Tuple[str, ...]) -> bool:
        """Return True if *host* equals one of *domains* or is a sub-domain of it."""
        return any(host == domain or host.endswith(f".{domain}") for domain in domains)

    def _download_text(self, repo_id: str, filename: str, repo_type: str) -> str:
        """Download one file from the hub and return its UTF-8 text (raises on failure)."""
        local_path = self.api.hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type=repo_type,
        )
        with open(local_path, 'r', encoding='utf-8') as f:
            return f.read()

    def _parse_url(self, url: str) -> Tuple[str, Optional[Dict]]:
        """
        Parse URL and detect source type.

        Returns:
            ``(kind, meta)`` where kind is "hf_space", "hf_model", "github"
            or "unknown" (in which case meta is None).
        """
        try:
            candidate = url.strip()
            # urlparse() puts everything into .path when the scheme is
            # missing, so "huggingface.co/user/model" would never match.
            if "://" not in candidate:
                candidate = f"https://{candidate}"
            parsed = urlparse(candidate)
            # .hostname drops port/userinfo, unlike .netloc.
            host = (parsed.hostname or "").lower()
            parts = [segment for segment in (parsed.path or "").split("/") if segment]
        except (AttributeError, TypeError, ValueError):
            return "unknown", None

        if self._host_matches(host, self.HF_HOSTS):
            # HuggingFace Spaces
            if parts and parts[0] == "spaces":
                if len(parts) >= 3:
                    return "hf_space", {"username": parts[1], "project": parts[2]}
            # HuggingFace Model
            elif parts and parts[0] != "datasets" and len(parts) >= 2:
                return "hf_model", {"repo_id": f"{parts[0]}/{parts[1]}"}

        # GitHub Repository
        if self._host_matches(host, self.GITHUB_HOSTS) and len(parts) >= 2:
            repo = parts[1]
            # Clone URLs end in ".git", which is not part of the repo name.
            if repo.endswith(".git") and len(repo) > len(".git"):
                repo = repo[:-len(".git")]
            return "github", {"owner": parts[0], "repo": repo}

        return "unknown", None

    def _is_transformers_js_space(self, username: str, project_name: str) -> bool:
        """Check if space is a transformers.js app (has index.html, index.js and style.css)"""
        try:
            files = list_repo_files(
                repo_id=f"{username}/{project_name}",
                repo_type="space",
                token=self.hf_token,
            )
        except Exception:
            # Listing failed (network, auth, missing repo): treat as "not transformers.js".
            return False

        required = ('index.html', 'index.js', 'style.css')
        return all(any(name in f for f in files) for name in required)

    def _fetch_transformers_js_files(self, username: str, project_name: str) -> Tuple[str, Dict]:
        """
        Fetch transformers.js files and combine them.

        Returns:
            ``(combined, files)``: the three files joined as "=== name ==="
            sections, and a dict mapping file name to content ("" for any
            file that could not be downloaded).
        """
        repo_id = f"{username}/{project_name}"
        file_names = ['index.html', 'index.js', 'style.css']

        files = {}
        for file_name in file_names:
            try:
                files[file_name] = self._download_text(repo_id, file_name, "space")
            except Exception:
                # Best effort: a missing file becomes an empty section.
                files[file_name] = ""

        # Combine files
        combined = "\n\n".join(f"=== {name} ===\n{files[name]}" for name in file_names)
        return combined, files

    def _fetch_all_space_files(self, username: str, project_name: str, sdk: str) -> Optional[Dict[str, str]]:
        """
        Fetch all relevant files from a space.

        Returns:
            Mapping of repo-relative file name to text content, or None if
            nothing could be listed or downloaded.
        """
        space_id = f"{username}/{project_name}"
        try:
            files = list_repo_files(repo_id=space_id, repo_type="space", token=self.hf_token)
        except Exception:
            return None

        # Filter files, then limit how many are downloaded.
        relevant_files = [
            f for f in files
            if f.endswith(self.SPACE_FILE_EXTENSIONS)
            and not f.startswith(('.', '__pycache__'))
        ][:self.MAX_SPACE_FILES]

        # Fetch file contents
        file_contents = {}
        for file in relevant_files:
            try:
                file_contents[file] = self._download_text(space_id, file, "space")
            except Exception:
                # Skip files that cannot be downloaded or decoded as UTF-8.
                continue

        return file_contents if file_contents else None

    def _format_multi_file_content(self, files: Dict[str, str], username: str, project_name: str, sdk: str) -> str:
        """Format multi-file content as a header followed by "=== name ===" sections"""
        header = (
            "IMPORTED PROJECT FROM HUGGING FACE SPACE\n"
            "==============================================\n"
            "\n"
            f"Space: {username}/{project_name}\n"
            f"SDK: {sdk}\n"
            f"Files: {len(files)}\n"
            "\n"
        )
        file_sections = [f"=== {filename} ===\n{content}" for filename, content in files.items()]
        return header + "\n\n".join(file_sections)

    def _fetch_main_file(self, username: str, project_name: str, sdk: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Fetch main file from space.

        Returns:
            ``(file_name, content)`` for the first candidate that downloads,
            or ``(None, None)`` if none does.
        """
        repo_id = f"{username}/{project_name}"
        for file_pattern in self._get_file_patterns_for_sdk(sdk):
            try:
                return file_pattern, self._download_text(repo_id, file_pattern, "space")
            except Exception:
                # Candidate not present (or unreadable): try the next one.
                continue
        return None, None

    def _get_file_patterns_for_sdk(self, sdk: str) -> List[str]:
        """Get file patterns to try based on SDK"""
        patterns = {
            "static": ["index.html"],
            "gradio": ["app.py", "main.py", "gradio_app.py"],
            "streamlit": [
                "streamlit_app.py", "src/streamlit_app.py",
                "app.py", "src/app.py",
                "main.py", "src/main.py",
                "Home.py", "src/Home.py",
            ],
        }
        return patterns.get(sdk, ["app.py", "main.py", "index.html"])

    def _detect_language_from_sdk(self, sdk: str) -> str:
        """Detect language/framework from SDK (defaults to "python")"""
        sdk_map = {
            "gradio": "gradio",
            "streamlit": "streamlit",
            "static": "html",
            "docker": "docker",
        }
        return sdk_map.get(sdk, "python")

    def _generate_inference_code(self, model_id: str, pipeline_tag: Optional[str]) -> Optional[str]:
        """
        Generate inference provider code based on pipeline tag.

        Returns:
            A ready-to-run snippet for *model_id*, or None when the pipeline
            tag is missing or has no template.
        """
        if not pipeline_tag:
            return None
        template = _INFERENCE_TEMPLATES.get(pipeline_tag)
        if template is None:
            return None
        return template.format(model_id=model_id)

    def _fetch_hf_model_readme(self, repo_id: str) -> Optional[str]:
        """Fetch README from HuggingFace model (None if unavailable)"""
        try:
            return self._download_text(repo_id, "README.md", "model")
        except Exception:
            return None

    def _fetch_github_readme(self, owner: str, repo: str) -> Optional[str]:
        """Fetch README from GitHub repository (None if no candidate URL succeeds)"""
        urls = [
            f"https://raw.githubusercontent.com/{owner}/{repo}/{ref}/README.md"
            for ref in ("HEAD", "main", "master")
        ]

        for url in urls:
            try:
                resp = requests.get(url, timeout=10)
            except requests.RequestException:
                continue
            if resp.status_code == 200 and resp.text:
                return resp.text

        return None

    @staticmethod
    def _score_block(code: str) -> int:
        """Score a code block: +1 per ML keyword present, plus up to +5 for length."""
        keywords = [
            "from transformers", "import transformers", "pipeline(",
            "AutoModel", "AutoTokenizer", "text-generation",
            "from diffusers", "import diffusers", "DiffusionPipeline",
            "StableDiffusion", "from gradio", "import gradio",
        ]
        score = sum(1 for kw in keywords if kw in code)
        score += min(len(code) // 200, 5)
        return score

    def _extract_code_from_markdown(self, markdown: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Extract relevant code from markdown.

        Returns:
            ``(language, code)`` of the highest-scoring fenced block that
            mentions an import or an ML library, or ``(None, None)``.
            The language defaults to "python" when the fence has no tag.
        """
        if not markdown:
            return None, None

        # Find all code blocks
        code_blocks = []
        for match in re.finditer(r"```([\w+-]+)?\s*\n([\s\S]*?)```", markdown, re.IGNORECASE):
            lang = (match.group(1) or "").lower()
            code = match.group(2) or ""
            code_blocks.append((lang, code.strip()))

        # Filter to blocks that look like real usage code
        markers = ["transformers", "diffusers", "pipeline(", "gradio", "import"]
        relevant = [cb for cb in code_blocks if any(kw in cb[1] for kw in markers)]
        if not relevant:
            return None, None

        # max() keeps the first block among equal scores, matching a stable
        # descending sort.
        best_lang, best_code = max(relevant, key=lambda cb: self._score_block(cb[1]))
        return best_lang or "python", best_code


# ==================== CLI Interface ====================

def main():
    """CLI interface for project importer"""
    import argparse

    parser = argparse.ArgumentParser(
        description="Import projects from HuggingFace Spaces, Models, or GitHub repos"
    )
    parser.add_argument("url", help="URL to import from")
    parser.add_argument("-o", "--output", help="Output file to save code", default=None)
    parser.add_argument("--prefer-local", action="store_true",
                        help="Prefer local inference code over serverless (for models)")
    parser.add_argument("--token", help="HuggingFace token", default=None)

    args = parser.parse_args()

    # Initialize importer
    importer = ProjectImporter(hf_token=args.token)

    # Import project
    print(f"Importing from: {args.url}")
    print("-" * 60)

    # --prefer-local must be forwarded, otherwise the flag has no effect.
    result = importer.import_from_url(args.url, prefer_local=args.prefer_local)

    # Print results
    print(f"Status: {result['status']}")
    print(f"Message: {result['message']}")
    print(f"Language: {result['language']}")
    print(f"URL: {result['url']}")

    if result.get('metadata'):
        print(f"Metadata: {result['metadata']}")

    print("-" * 60)

    if result['code']:
        if args.output:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(result['code'])
            print(f"Code saved to: {args.output}")
        else:
            print("Code:")
            print("=" * 60)
            print(result['code'])
            print("=" * 60)
    else:
        print("No code to display")


if __name__ == "__main__":
    main()