akseljoonas HF Staff commited on
Commit
ccbe2d2
·
1 Parent(s): b70fed7

poc github tools

Browse files
agent/core/agent_loop.py CHANGED
@@ -25,9 +25,15 @@ def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
25
  args = tool_args.get("args", {})
26
  # Sometimes LLM passes args as string instead of dict
27
  if isinstance(args, str):
28
- return False, f"Tool call error: 'args' must be a JSON object, not a string. You passed: {repr(args)}"
 
 
 
29
  if not isinstance(args, dict) and args is not None:
30
- return False, f"Tool call error: 'args' must be a JSON object. You passed type: {type(args).__name__}"
 
 
 
31
  return True, None
32
 
33
 
@@ -38,8 +44,6 @@ def _needs_approval(tool_name: str, tool_args: dict) -> bool:
38
  if not args_valid:
39
  return False
40
 
41
- args = tool_args.get("args", {})
42
-
43
  if tool_name == "hf_jobs":
44
  # Check if it's a run or uv operation
45
  operation = tool_args.get("operation", "")
 
25
  args = tool_args.get("args", {})
26
  # Sometimes LLM passes args as string instead of dict
27
  if isinstance(args, str):
28
+ return (
29
+ False,
30
+ f"Tool call error: 'args' must be a JSON object, not a string. You passed: {repr(args)}",
31
+ )
32
  if not isinstance(args, dict) and args is not None:
33
+ return (
34
+ False,
35
+ f"Tool call error: 'args' must be a JSON object. You passed type: {type(args).__name__}",
36
+ )
37
  return True, None
38
 
39
 
 
44
  if not args_valid:
45
  return False
46
 
 
 
47
  if tool_name == "hf_jobs":
48
  # Check if it's a run or uv operation
49
  operation = tool_args.get("operation", "")
agent/core/tools.py CHANGED
@@ -19,6 +19,13 @@ from agent.tools.docs_tools import (
19
  explore_hf_docs_handler,
20
  hf_docs_fetch_handler,
21
  )
 
 
 
 
 
 
 
22
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
23
  from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
24
  from agent.tools.private_hf_repo_tools import (
@@ -224,7 +231,7 @@ class ToolRouter:
224
  def create_builtin_tools() -> list[ToolSpec]:
225
  """Create built-in tool specifications"""
226
  print(
227
- f"Creating built-in tools: {EXPLORE_HF_DOCS_TOOL_SPEC['name']}, {HF_DOCS_FETCH_TOOL_SPEC['name']}, {PLAN_TOOL_SPEC['name']}, {HF_JOBS_TOOL_SPEC['name']}, {PRIVATE_HF_REPO_TOOL_SPEC['name']}, {UTILS_TOOL_SPEC['name']}"
228
  )
229
  # in order of importance
230
  return [
@@ -266,4 +273,29 @@ def create_builtin_tools() -> list[ToolSpec]:
266
  parameters=UTILS_TOOL_SPEC["parameters"],
267
  handler=utils_handler,
268
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  ]
 
19
  explore_hf_docs_handler,
20
  hf_docs_fetch_handler,
21
  )
22
+ from agent.tools.github_find_examples import (
23
+ FIND_EXAMPLES_TOOL_SPEC,
24
+ find_examples_handler,
25
+ )
26
+ from agent.tools.github_list_repos import LIST_REPOS_TOOL_SPEC, list_repos_handler
27
+ from agent.tools.github_read_file import READ_FILE_TOOL_SPEC, read_file_handler
28
+ from agent.tools.github_search_code import SEARCH_CODE_TOOL_SPEC, search_code_handler
29
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
30
  from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
31
  from agent.tools.private_hf_repo_tools import (
 
231
  def create_builtin_tools() -> list[ToolSpec]:
232
  """Create built-in tool specifications"""
233
  print(
234
+ f"Creating built-in tools: {EXPLORE_HF_DOCS_TOOL_SPEC['name']}, {HF_DOCS_FETCH_TOOL_SPEC['name']}, {PLAN_TOOL_SPEC['name']}, {HF_JOBS_TOOL_SPEC['name']}, {PRIVATE_HF_REPO_TOOL_SPEC['name']}, {UTILS_TOOL_SPEC['name']}, {FIND_EXAMPLES_TOOL_SPEC['name']}, {READ_FILE_TOOL_SPEC['name']}, {LIST_REPOS_TOOL_SPEC['name']}, {SEARCH_CODE_TOOL_SPEC['name']}"
235
  )
236
  # in order of importance
237
  return [
 
273
  parameters=UTILS_TOOL_SPEC["parameters"],
274
  handler=utils_handler,
275
  ),
276
+ # GitHub tools - 4 separate tools
277
+ ToolSpec(
278
+ name=FIND_EXAMPLES_TOOL_SPEC["name"],
279
+ description=FIND_EXAMPLES_TOOL_SPEC["description"],
280
+ parameters=FIND_EXAMPLES_TOOL_SPEC["parameters"],
281
+ handler=find_examples_handler,
282
+ ),
283
+ ToolSpec(
284
+ name=READ_FILE_TOOL_SPEC["name"],
285
+ description=READ_FILE_TOOL_SPEC["description"],
286
+ parameters=READ_FILE_TOOL_SPEC["parameters"],
287
+ handler=read_file_handler,
288
+ ),
289
+ ToolSpec(
290
+ name=LIST_REPOS_TOOL_SPEC["name"],
291
+ description=LIST_REPOS_TOOL_SPEC["description"],
292
+ parameters=LIST_REPOS_TOOL_SPEC["parameters"],
293
+ handler=list_repos_handler,
294
+ ),
295
+ ToolSpec(
296
+ name=SEARCH_CODE_TOOL_SPEC["name"],
297
+ description=SEARCH_CODE_TOOL_SPEC["description"],
298
+ parameters=SEARCH_CODE_TOOL_SPEC["parameters"],
299
+ handler=search_code_handler,
300
+ ),
301
  ]
agent/main.py CHANGED
@@ -222,11 +222,15 @@ async def event_listener(
222
 
223
  # Build repo URL
224
  type_path = "" if repo_type == "model" else f"{repo_type}s"
225
- repo_url = f"https://huggingface.co/{type_path}/{repo_id}".replace("//", "/")
 
 
 
 
226
 
227
  print(f"Repository: {repo_id}")
228
  print(f"Type: {repo_type}")
229
- print(f"Private: Yes")
230
  print(f"URL: {repo_url}")
231
 
232
  # Show file preview for upload_file operation
@@ -237,9 +241,9 @@ async def event_listener(
237
 
238
  if isinstance(file_content, str):
239
  # Calculate metrics
240
- all_lines = file_content.split('\n')
241
  line_count = len(all_lines)
242
- size_bytes = len(file_content.encode('utf-8'))
243
  size_kb = size_bytes / 1024
244
  size_mb = size_kb / 1024
245
 
@@ -251,8 +255,10 @@ async def event_listener(
251
 
252
  # Show preview
253
  preview_lines = all_lines[:5]
254
- preview = '\n'.join(preview_lines)
255
- print(f"Content preview (first 5 lines):\n{preview}")
 
 
256
  if len(all_lines) > 5:
257
  print("...")
258
 
 
222
 
223
  # Build repo URL
224
  type_path = "" if repo_type == "model" else f"{repo_type}s"
225
+ repo_url = (
226
+ f"https://huggingface.co/{type_path}/{repo_id}".replace(
227
+ "//", "/"
228
+ )
229
+ )
230
 
231
  print(f"Repository: {repo_id}")
232
  print(f"Type: {repo_type}")
233
+ print("Private: Yes")
234
  print(f"URL: {repo_url}")
235
 
236
  # Show file preview for upload_file operation
 
241
 
242
  if isinstance(file_content, str):
243
  # Calculate metrics
244
+ all_lines = file_content.split("\n")
245
  line_count = len(all_lines)
246
+ size_bytes = len(file_content.encode("utf-8"))
247
  size_kb = size_bytes / 1024
248
  size_mb = size_kb / 1024
249
 
 
255
 
256
  # Show preview
257
  preview_lines = all_lines[:5]
258
+ preview = "\n".join(preview_lines)
259
+ print(
260
+ f"Content preview (first 5 lines):\n{preview}"
261
+ )
262
  if len(all_lines) > 5:
263
  print("...")
264
 
agent/tools/__init__.py CHANGED
@@ -2,6 +2,26 @@
2
  Hugging Face tools for the agent
3
  """
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
6
  from agent.tools.types import ToolResult
7
 
@@ -10,4 +30,16 @@ __all__ = [
10
  "HF_JOBS_TOOL_SPEC",
11
  "hf_jobs_handler",
12
  "HfJobsTool",
 
 
 
 
 
 
 
 
 
 
 
 
13
  ]
 
2
  Hugging Face tools for the agent
3
  """
4
 
5
+ from agent.tools.github_find_examples import (
6
+ FIND_EXAMPLES_TOOL_SPEC,
7
+ FindExamplesTool,
8
+ find_examples_handler,
9
+ )
10
+ from agent.tools.github_list_repos import (
11
+ LIST_REPOS_TOOL_SPEC,
12
+ ListReposTool,
13
+ list_repos_handler,
14
+ )
15
+ from agent.tools.github_read_file import (
16
+ READ_FILE_TOOL_SPEC,
17
+ ReadFileTool,
18
+ read_file_handler,
19
+ )
20
+ from agent.tools.github_search_code import (
21
+ SEARCH_CODE_TOOL_SPEC,
22
+ SearchCodeTool,
23
+ search_code_handler,
24
+ )
25
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
26
  from agent.tools.types import ToolResult
27
 
 
30
  "HF_JOBS_TOOL_SPEC",
31
  "hf_jobs_handler",
32
  "HfJobsTool",
33
+ "FIND_EXAMPLES_TOOL_SPEC",
34
+ "find_examples_handler",
35
+ "FindExamplesTool",
36
+ "READ_FILE_TOOL_SPEC",
37
+ "read_file_handler",
38
+ "ReadFileTool",
39
+ "LIST_REPOS_TOOL_SPEC",
40
+ "list_repos_handler",
41
+ "ListReposTool",
42
+ "SEARCH_CODE_TOOL_SPEC",
43
+ "search_code_handler",
44
+ "SearchCodeTool",
45
  ]
agent/tools/github_find_examples.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GitHub Find Examples Tool
3
+
4
+ Finds examples, guides, and tutorials for a library using deterministic queries and heuristics.
5
+ """
6
+
7
+ import asyncio
8
+ import math
9
+ import os
10
+ from dataclasses import asdict, dataclass
11
+ from datetime import datetime, timedelta
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ try:
15
+ import requests
16
+ except ImportError:
17
+ raise ImportError(
18
+ "requests library is required. Install with: pip install requests"
19
+ )
20
+
21
+ from agent.tools.types import ToolResult
22
+
23
+
24
@dataclass
class Example:
    """A single candidate example file plus its relevance metadata.

    Produced by the search/scoring pipeline; ``score`` and ``reason`` explain
    why the ranker chose this file.
    """

    repo: str          # "owner/name"
    path: str          # path within the repo
    ref: str           # blob SHA the search hit referred to
    url: str           # html_url for humans
    score: float       # heuristic relevance score
    reason: str        # comma-joined scoring reasons
    repo_stars: int
    repo_updated: str  # ISO timestamp from the repo metadata (may be "")
    file_size: int     # bytes

    def to_dict(self):
        """Return this example as a plain dict."""
        return asdict(self)


class GitHubAPIError(Exception):
    """Raised when GitHub API returns an error."""


# Path-based scoring weights.  Directory patterns come in plural/singular
# pairs with equal weight, so they are generated from one table; README
# files are the strongest signal and are listed first (scoring stops at
# the first matching pattern).
_DIR_SCORES = {
    "docs": 80,
    "examples": 90,
    "notebooks": 70,
    "tutorials": 85,
    "guides": 85,
    "tests": 40,
    "demos": 75,
    "samples": 75,
}

PATH_SCORES = {"README.md": 100, "readme.md": 100}
for _plural, _points in _DIR_SCORES.items():
    PATH_SCORES[f"{_plural}/"] = _points
    PATH_SCORES[f"{_plural[:-1]}/"] = _points

# Content-based scoring keywords, matched against search text fragments.
CONTENT_KEYWORDS = {
    'if __name__ == "__main__"': 50,
    "if __name__ == '__main__'": 50,
    "quickstart": 60,
    "quick start": 60,
    "getting started": 60,
    "tutorial": 50,
    "example usage": 55,
    "usage example": 55,
    "how to use": 45,
    "basic example": 50,
    "simple example": 50,
}

# File extension preferences: docs and notebooks beat plain source files.
PREFERRED_EXTENSIONS = {
    ".py": 10,
    ".ipynb": 15,
    ".md": 20,
    ".rst": 10,
    ".js": 10,
    ".ts": 10,
    ".go": 10,
    ".java": 10,
    ".cpp": 10,
    ".c": 10,
}
98
+
99
+
100
def _get_github_token() -> str:
    """Return the GitHub API token from the GITHUB_TOKEN env var.

    Raises:
        GitHubAPIError: if the variable is unset or empty.
    """
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        return token
    raise GitHubAPIError(
        "GITHUB_TOKEN environment variable is required. "
        "Set it with: export GITHUB_TOKEN=your_token_here"
    )
109
+
110
+
111
def _execute_search(query: str, token: str, limit: int = 20) -> List[Dict[str, Any]]:
    """Run a GitHub code-search query and return up to ``limit`` raw hits.

    Paginates through ``/search/code``, keeping repo/path/sha/url/size and
    the text-match fragments later used for content scoring.  Network or
    JSON-decoding failures end pagination early and the partial results are
    returned (best effort; never raises).

    Args:
        query: GitHub code-search query string.
        token: Bearer token for the API.
        limit: Maximum number of items to return.

    Returns:
        List of dicts with keys repo, path, sha, url, size, text_matches.
    """
    headers = {
        "Accept": "application/vnd.github.text-match+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    results: List[Dict[str, Any]] = []
    page = 1
    per_page = min(100, limit)  # API max page size is 100

    try:
        while len(results) < limit:
            params = {"q": query, "per_page": per_page, "page": page}
            response = requests.get(
                "https://api.github.com/search/code",
                headers=headers,
                params=params,
                timeout=30,
            )
            if response.status_code != 200:
                break  # rate limit / bad query: keep whatever we have

            items = response.json().get("items", [])
            if not items:
                break

            for item in items:
                results.append(
                    {
                        "repo": item.get("repository", {}).get("full_name", ""),
                        "path": item.get("path", ""),
                        "sha": item.get("sha", ""),
                        "url": item.get("html_url", ""),
                        "size": item.get("size", 0),
                        "text_matches": item.get("text_matches", []),
                    }
                )

            # Stop when we have enough or the API ran out of pages.
            if len(results) >= limit or len(items) < per_page:
                break
            page += 1
    except (requests.RequestException, ValueError):
        # Was `except Exception: pass`, which also hid programming errors;
        # only network and JSON-decoding failures stay best-effort now.
        pass

    return results[:limit]
159
+
160
+
161
def _fetch_repo_metadata(repos: List[str], token: str) -> Dict[str, Dict[str, Any]]:
    """Fetch stars / last-update / description for each repo, best effort.

    Repos whose lookup fails (network error, bad JSON, non-200 status) are
    simply omitted from the result.

    Args:
        repos: "owner/name" repository identifiers.
        token: GitHub bearer token.

    Returns:
        Mapping of repo name -> {"stars", "updated_at", "description"}.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    metadata: Dict[str, Dict[str, Any]] = {}

    for repo in repos:
        try:
            response = requests.get(
                f"https://api.github.com/repos/{repo}", headers=headers, timeout=10
            )
            if response.status_code == 200:
                data = response.json()
                metadata[repo] = {
                    "stars": data.get("stargazers_count", 0),
                    "updated_at": data.get("updated_at", ""),
                    "description": data.get("description", ""),
                }
        except (requests.RequestException, ValueError):
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit; keep only network/JSON failures silent.
            continue

    return metadata
187
+
188
+
189
def _score_and_rank(
    results: List[Dict[str, Any]], library: str, token: str
) -> List[Example]:
    """Score raw search hits with path/content/repo heuristics and rank them.

    Args:
        results: Raw hits from ``_execute_search`` (repo/path/sha/url/size/
            text_matches keys).
        library: Library name; currently unused by the scorer but kept for
            interface compatibility with callers.
        token: GitHub token, used to fetch repo star/update metadata.

    Returns:
        ``Example`` objects sorted by descending relevance score.
    """
    repos = list(set(r["repo"] for r in results))
    repo_metadata = _fetch_repo_metadata(repos, token)

    scored_examples = []

    for result in results:
        repo = result["repo"]
        path = result["path"]

        score = 0.0
        reasons = []

        # Path-based scoring: the first matching directory pattern wins.
        path_lower = path.lower()
        for pattern, points in PATH_SCORES.items():
            if pattern.lower() in path_lower:
                score += points
                reasons.append(f"in {pattern}")
                break

        # File extension scoring: first matching suffix wins.
        for ext, points in PREFERRED_EXTENSIONS.items():
            if path_lower.endswith(ext):
                score += points
                break

        # Content-based scoring over the search text-match fragments.
        text_content = ""
        for match in result.get("text_matches", []):
            text_content += match.get("fragment", "").lower() + " "

        for keyword, points in CONTENT_KEYWORDS.items():
            if keyword.lower() in text_content:
                score += points
                reasons.append(f"contains '{keyword}'")

        # Repo-based scoring: log-scaled star count plus a recency bonus.
        metadata = repo_metadata.get(repo, {})
        stars = metadata.get("stars", 0)
        updated = metadata.get("updated_at", "")

        if stars > 0:
            score += math.log10(stars + 1) * 10

        if updated:
            try:
                updated_date = datetime.fromisoformat(updated.replace("Z", "+00:00"))
                if datetime.now(updated_date.tzinfo) - updated_date < timedelta(
                    days=180
                ):
                    score += 20
                    reasons.append("recently updated")
            except ValueError:
                # Was a bare `except:`; only a malformed timestamp should
                # skip the recency bonus, not unrelated errors.
                pass

        # Filename quality: descriptive names suggest canonical examples.
        filename = path.split("/")[-1].lower()
        if any(
            word in filename
            for word in ["example", "tutorial", "guide", "quickstart", "demo"]
        ):
            score += 30
            reasons.append("descriptive filename")

        # Size penalty: very large files (>100 KB) are rarely good examples.
        if result["size"] > 100000:
            score *= 0.5
            reasons.append("large file")

        scored_examples.append(
            Example(
                repo=repo,
                path=path,
                ref=result["sha"],
                url=result["url"],
                score=score,
                reason=", ".join(reasons) if reasons else "matches library",
                repo_stars=stars,
                repo_updated=updated,
                file_size=result["size"],
            )
        )

    scored_examples.sort(key=lambda x: x.score, reverse=True)
    return scored_examples
280
+
281
+
282
def _search_by_path(
    library: str, org: str, repo_scope: Optional[str], token: str
) -> List[Dict[str, Any]]:
    """Search for the library inside example/docs/tutorial-style directories."""
    scope = f"repo:{org}/{repo_scope}" if repo_scope else f"org:{org}"
    directories = [
        "examples/",
        "example/",
        "docs/",
        "tutorials/",
        "notebooks/",
        "guides/",
    ]

    hits: List[Dict[str, Any]] = []
    for directory in directories:
        hits.extend(
            _execute_search(f"{scope} {library} path:{directory}", token, limit=20)
        )
    return hits


def _search_by_content(
    library: str, org: str, repo_scope: Optional[str], token: str
) -> List[Dict[str, Any]]:
    """Search for the library next to tutorial/quickstart-style phrases."""
    scope = f"repo:{org}/{repo_scope}" if repo_scope else f"org:{org}"
    phrases = ["if __name__", "quickstart", "tutorial", "usage example"]

    hits: List[Dict[str, Any]] = []
    for phrase in phrases:
        hits.extend(_execute_search(f"{scope} {library} {phrase}", token, limit=15))
    return hits


def _search_readmes(
    library: str, org: str, repo_scope: Optional[str], token: str
) -> List[Dict[str, Any]]:
    """Search for mentions of the library in README files."""
    scope = f"repo:{org}/{repo_scope}" if repo_scope else f"org:{org}"
    return _execute_search(f"{scope} {library} filename:README", token, limit=20)
340
+
341
+
342
def find_examples(
    library: str,
    org: str = "huggingface",
    repo_scope: Optional[str] = None,
    max_results: int = 10,
) -> List[Example]:
    """Find examples, guides, and tutorials for a library using deterministic queries.

    Runs a fixed playbook of code searches (example/docs directories,
    tutorial-style phrases, READMEs), deduplicates the hits, then scores
    them with the module's path/content/repo heuristics.

    Args:
        library: Library name to search for (e.g., "transformers", "torch")
        org: GitHub organization to search in (default: "huggingface")
        repo_scope: Optional specific repository (e.g., "transformers")
        max_results: Maximum number of results to return (default: 10)

    Returns:
        List of Example objects, ranked by relevance score

    Raises:
        GitHubAPIError: if no GITHUB_TOKEN is configured.
    """
    token = _get_github_token()

    hits: List[Dict[str, Any]] = []
    hits.extend(_search_by_path(library, org, repo_scope, token))
    hits.extend(_search_by_content(library, org, repo_scope, token))
    hits.extend(_search_readmes(library, org, repo_scope, token))

    # Deduplicate by (repo, path), keeping the first occurrence in order.
    unique: Dict[tuple, Dict[str, Any]] = {}
    for hit in hits:
        unique.setdefault((hit["repo"], hit["path"]), hit)

    ranked = _score_and_rank(list(unique.values()), library, token)
    return ranked[:max_results]


async def _async_call(func, *args, **kwargs):
    """Run a blocking callable on a worker thread and await its result."""
    return await asyncio.to_thread(func, *args, **kwargs)


def _format_examples_table(examples: List[Example]) -> str:
    """Render ranked examples as a markdown table (paths/reasons clipped)."""
    if not examples:
        return "No examples found."

    rows = [
        "| Rank | File | Score | Stars | Reason |",
        "|------|------|-------|-------|--------|",
    ]

    for rank, ex in enumerate(examples, 1):
        location = f"{ex.repo}/{ex.path}"
        if len(location) > 60:
            location = location[:57] + "..."
        why = ex.reason if len(ex.reason) < 40 else ex.reason[:37] + "..."
        rows.append(
            f"| {rank} | {location} | {ex.score:.1f} | {ex.repo_stars:,} | {why} |"
        )

    return "\n".join(rows)
410
+
411
+
412
class FindExamplesTool:
    """Tool wrapper exposing ``find_examples`` to the agent runtime."""

    @staticmethod
    def _error(message: str) -> ToolResult:
        """Build the standard error payload."""
        return {
            "formatted": message,
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    async def execute(self, params: Dict[str, Any]) -> ToolResult:
        """Run find_examples with ``params`` and format the outcome."""
        library = params.get("library")
        if not library:
            return self._error("Error: 'library' parameter is required")

        org = params.get("org", "huggingface")
        repo_scope = params.get("repo_scope")
        max_results = params.get("max_results", 10)

        try:
            examples = await _async_call(
                find_examples,
                library=library,
                org=org,
                repo_scope=repo_scope,
                max_results=max_results,
            )

            if not examples:
                return {
                    "formatted": f"No examples found for '{library}' in {org}",
                    "totalResults": 0,
                    "resultsShared": 0,
                }

            table = _format_examples_table(examples)
            parts = [
                f"**Found {len(examples)} examples for '{library}' in {org}:**\n\n{table}"
            ]

            # Point the agent at the top hits and the tool that reads them.
            parts.append("\n\n**Top examples (use read_file to view):**\n")
            for i, ex in enumerate(examples[:3], 1):
                parts.append(f"{i}. [{ex.repo}/{ex.path}]({ex.url})\n")
                parts.append(
                    f"   Use: read_file(repo='{ex.repo}', path='{ex.path}')\n"
                )

            return {
                "formatted": "".join(parts),
                "totalResults": len(examples),
                "resultsShared": len(examples),
            }

        except GitHubAPIError as e:
            return self._error(f"GitHub API Error: {str(e)}")
        except Exception as e:
            return self._error(f"Error: {str(e)}")
475
+
476
+
477
# Tool specification handed to the router; description text is the prompt
# the model sees, so it is kept verbatim.
_FIND_EXAMPLES_DESCRIPTION = (
    "Find examples, guides, and tutorials for a library using deterministic queries and heuristics.\n\n"
    "Uses best practices retrieval without semantic search:\n"
    "- Prefers README.md, docs/**, examples/**, notebooks/**, tests/**\n"
    "- Prefers files with if __name__ == '__main__', 'quickstart', 'tutorial', 'usage'\n"
    "- Prefers repos with higher stars and more recent updates\n\n"
    "Returns a ranked list of canonical example files.\n\n"
    "Examples:\n"
    "- Find transformers examples: {'library': 'transformers', 'org': 'huggingface', 'max_results': 5}\n"
    "- Find torch examples in specific repo: {'library': 'torch', 'org': 'pytorch', 'repo_scope': 'examples'}\n\n"
    "Use read_file tool to view the content of returned files.\n\n"
)

_FIND_EXAMPLES_PARAMETERS = {
    "type": "object",
    "properties": {
        "library": {
            "type": "string",
            "description": "Library name to search for (e.g., 'transformers', 'torch', 'react')",
        },
        "org": {
            "type": "string",
            "description": "GitHub organization to search in (default: 'huggingface')",
        },
        "repo_scope": {
            "type": "string",
            "description": "Optional specific repository to search within",
        },
        "max_results": {
            "type": "integer",
            "description": "Maximum number of results to return (default: 10)",
        },
    },
    "required": ["library"],
}

FIND_EXAMPLES_TOOL_SPEC = {
    "name": "find_examples",
    "description": _FIND_EXAMPLES_DESCRIPTION,
    "parameters": _FIND_EXAMPLES_PARAMETERS,
}


async def find_examples_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Handler for the agent tool router: returns (message, success)."""
    try:
        outcome = await FindExamplesTool().execute(arguments)
        return outcome["formatted"], not outcome.get("isError", False)
    except Exception as e:
        return f"Error executing find_examples: {str(e)}", False
agent/tools/github_list_repos.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GitHub List Repos Tool
3
+
4
+ Lists repositories for a user or organization with sorting options.
5
+ """
6
+
7
+ import asyncio
8
+ import os
9
+ from dataclasses import asdict, dataclass
10
+ from typing import Any, Dict, List, Literal, Optional
11
+
12
+ try:
13
+ import requests
14
+ except ImportError:
15
+ raise ImportError(
16
+ "requests library is required. Install with: pip install requests"
17
+ )
18
+
19
+ from agent.tools.types import ToolResult
20
+
21
+
22
@dataclass
class Repository:
    """Metadata for one GitHub repository, as returned by the Search API."""

    id: int
    name: str
    full_name: str  # "owner/name"
    description: Optional[str]
    html_url: str
    language: Optional[str]
    stars: int
    forks: int
    open_issues: int
    private: bool
    fork: bool
    archived: bool
    default_branch: str
    created_at: Optional[str] = None  # ISO timestamp (may be absent)
    updated_at: Optional[str] = None  # ISO timestamp (may be absent)
    topics: Optional[List[str]] = None

    def to_dict(self):
        """Return this repository as a plain dict."""
        return asdict(self)


class GitHubAPIError(Exception):
    """Raised when GitHub API returns an error."""


def _get_github_token() -> str:
    """Return the GitHub API token from the GITHUB_TOKEN env var.

    Raises:
        GitHubAPIError: if the variable is unset or empty.
    """
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        return token
    raise GitHubAPIError(
        "GITHUB_TOKEN environment variable is required. "
        "Set it with: export GITHUB_TOKEN=your_token_here"
    )
62
+
63
+
64
def _fetch_repositories(
    query: str, sort: str, order: str, limit: Optional[int], token: str
) -> List[Repository]:
    """Page through the GitHub repository-search API for ``query``.

    Stops at ``limit`` results (when given), at the end of the result set,
    or at the API's hard cap of 1000 search results.  Network or JSON
    failures end pagination early; whatever was gathered is returned.

    Args:
        query: Search qualifier, e.g. "org:huggingface".
        sort: Sort field accepted by the API (stars/forks/updated/created).
        order: "asc" or "desc".
        limit: Optional maximum number of repositories.
        token: GitHub bearer token.

    Returns:
        List of Repository objects.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    all_repos: List[Repository] = []
    page = 1
    per_page = min(100, limit) if limit else 100  # API max page size is 100

    while True:
        params = {
            "q": query,
            "sort": sort,
            "order": order,
            "page": page,
            "per_page": per_page,
        }

        try:
            response = requests.get(
                "https://api.github.com/search/repositories",
                headers=headers,
                params=params,
                timeout=30,
            )
            if response.status_code != 200:
                break

            data = response.json()
            items = data.get("items", [])
            if not items:
                break

            for item in items:
                all_repos.append(
                    Repository(
                        id=item.get("id"),
                        name=item.get("name"),
                        full_name=item.get("full_name"),
                        description=item.get("description"),
                        html_url=item.get("html_url"),
                        language=item.get("language"),
                        stars=item.get("stargazers_count", 0),
                        forks=item.get("forks_count", 0),
                        open_issues=item.get("open_issues_count", 0),
                        private=item.get("private", False),
                        fork=item.get("fork", False),
                        archived=item.get("archived", False),
                        default_branch=item.get("default_branch", "main"),
                        created_at=item.get("created_at"),
                        updated_at=item.get("updated_at"),
                        topics=item.get("topics", []),
                    )
                )

            if limit and len(all_repos) >= limit:
                all_repos = all_repos[:limit]
                break

            if len(all_repos) >= data.get("total_count", 0):
                break

            # GitHub's search API never serves past the first 1000 results.
            if page * per_page >= 1000:
                break

            page += 1

        except (requests.exceptions.RequestException, ValueError):
            # ValueError added: a malformed JSON body from response.json()
            # previously escaped the handler and crashed the whole listing.
            break

    return all_repos
139
+
140
+
141
def list_repos(
    owner: str,
    owner_type: Literal["user", "org"] = "org",
    sort: Literal["stars", "forks", "updated", "created"] = "stars",
    order: Literal["asc", "desc"] = "desc",
    limit: Optional[int] = None,
) -> List[Repository]:
    """List repositories for a user or organization via the GitHub Search API.

    Args:
        owner: GitHub username or organization name
        owner_type: Whether the owner is a "user" or "org" (default: "org")
        sort: Sort field - "stars", "forks", "updated", or "created" (default: "stars")
        order: Sort order - "asc" or "desc" (default: "desc")
        limit: Maximum number of repositories to return (default: no limit)

    Returns:
        List of Repository objects

    Raises:
        GitHubAPIError: if no GITHUB_TOKEN is configured.
    """
    token = _get_github_token()
    qualifier = "org" if owner_type == "org" else "user"
    return _fetch_repositories(
        query=f"{qualifier}:{owner}",
        sort=sort,
        order=order,
        limit=limit,
        token=token,
    )


async def _async_call(func, *args, **kwargs):
    """Run a blocking callable on a worker thread and await its result."""
    return await asyncio.to_thread(func, *args, **kwargs)


def _format_repos_table(repos: List[Repository]) -> str:
    """Render repositories as a markdown table (descriptions clipped to 50 chars)."""
    if not repos:
        return "No repositories found."

    rows = [
        "| Repo | Stars | Forks | Language | Description |",
        "|------|-------|-------|----------|-------------|",
    ]

    for repo in repos:
        summary = repo.description or "N/A"
        if len(summary) > 50:
            summary = summary[:47] + "..."
        rows.append(
            f"| {repo.full_name} | {repo.stars:,} | {repo.forks:,} "
            f"| {repo.language or 'N/A'} | {summary} |"
        )

    return "\n".join(rows)
203
+
204
+
205
class ListReposTool:
    """Tool wrapper exposing ``list_repos`` to the agent runtime."""

    @staticmethod
    def _error(message: str) -> ToolResult:
        """Build the standard error payload."""
        return {
            "formatted": message,
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    async def execute(self, params: Dict[str, Any]) -> ToolResult:
        """Run list_repos with ``params`` and format the outcome."""
        owner = params.get("owner")
        if not owner:
            return self._error("Error: 'owner' parameter is required")

        owner_type = params.get("owner_type", "org")
        sort = params.get("sort", "stars")
        order = params.get("order", "desc")
        limit = params.get("limit")

        try:
            repos = await _async_call(
                list_repos,
                owner=owner,
                owner_type=owner_type,
                sort=sort,
                order=order,
                limit=limit,
            )

            if not repos:
                return {
                    "formatted": f"No repositories found for {owner}",
                    "totalResults": 0,
                    "resultsShared": 0,
                }

            table = _format_repos_table(repos)
            parts = [
                f"**Found {len(repos)} repositories for {owner} (sorted by {sort}, {order}):**\n\n{table}"
            ]

            # Link the top hits directly for the agent.
            parts.append("\n\n**Top repositories:**\n")
            for i, repo in enumerate(repos[:5], 1):
                parts.append(
                    f"{i}. [{repo.full_name}]({repo.html_url}) - ⭐ {repo.stars:,}\n"
                )

            return {
                "formatted": "".join(parts),
                "totalResults": len(repos),
                "resultsShared": len(repos),
            }

        except GitHubAPIError as e:
            return self._error(f"GitHub API Error: {str(e)}")
        except Exception as e:
            return self._error(f"Error: {str(e)}")
271
+
272
+
273
# Tool specification handed to the router; description text is the prompt
# the model sees, so it is kept verbatim.
_LIST_REPOS_DESCRIPTION = (
    "List repositories for a user or organization with sorting options.\n\n"
    "Backed by GitHub Search API: https://api.github.com/search/repositories?q=org:huggingface&sort=stars&order=desc\n\n"
    "Examples:\n"
    "- Top 10 starred repos: {'owner': 'huggingface', 'sort': 'stars', 'limit': 10}\n"
    "- Recently updated: {'owner': 'microsoft', 'sort': 'updated', 'order': 'desc', 'limit': 5}\n"
    "- User repos: {'owner': 'torvalds', 'owner_type': 'user', 'sort': 'stars'}\n"
    "- All repos: {'owner': 'pytorch', 'sort': 'forks'}\n\n"
)

_LIST_REPOS_PARAMETERS = {
    "type": "object",
    "properties": {
        "owner": {
            "type": "string",
            "description": "GitHub username or organization name (e.g., 'huggingface', 'torvalds')",
        },
        "owner_type": {
            "type": "string",
            "enum": ["user", "org"],
            "description": "Whether the owner is a 'user' or 'org' (default: 'org')",
        },
        "sort": {
            "type": "string",
            "enum": ["stars", "forks", "updated", "created"],
            "description": "Sort field: 'stars', 'forks', 'updated', or 'created' (default: 'stars')",
        },
        "order": {
            "type": "string",
            "enum": ["asc", "desc"],
            "description": "Sort order: 'asc' or 'desc' (default: 'desc')",
        },
        "limit": {
            "type": "integer",
            "description": "Maximum number of repositories to return (default: no limit, returns all)",
        },
    },
    "required": ["owner"],
}

LIST_REPOS_TOOL_SPEC = {
    "name": "list_repos",
    "description": _LIST_REPOS_DESCRIPTION,
    "parameters": _LIST_REPOS_PARAMETERS,
}


async def list_repos_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Handler for the agent tool router: returns (message, success)."""
    try:
        outcome = await ListReposTool().execute(arguments)
        return outcome["formatted"], not outcome.get("isError", False)
    except Exception as e:
        return f"Error executing list_repos: {str(e)}", False
agent/tools/github_read_file.py ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GitHub Read File Tool
3
+
4
+ Reads file contents from a GitHub repository with line range support.
5
+ """
6
+
7
+ import asyncio
8
+ import base64
9
+ import os
10
+ from dataclasses import asdict, dataclass
11
+ from typing import Any, Dict, Optional, Tuple
12
+
13
+ try:
14
+ import requests
15
+ except ImportError:
16
+ raise ImportError(
17
+ "requests library is required. Install with: pip install requests"
18
+ )
19
+
20
+ from agent.tools.types import ToolResult
21
+
22
+
23
@dataclass
class FileContents:
    """File contents with metadata."""

    content: str  # the selected (possibly line-sliced) text, not the whole file
    sha: str  # file SHA reported by the contents API
    path: str  # path within the repository as requested
    size: int  # full file size in bytes (not the sliced portion)
    last_modified: Optional[str]  # commit date of the last commit touching the file, if lookup succeeded
    last_commit_sha: Optional[str]  # SHA of that commit, if lookup succeeded
    line_start: int  # 1-indexed, inclusive
    line_end: int  # 1-indexed, inclusive
    total_lines: int  # line count of the full file
    truncated: bool  # True when the default 300-line cap was applied
    message: Optional[str] = None  # human-readable truncation notice, if any

    def to_dict(self):
        """Return the record as a plain dict (via dataclasses.asdict)."""
        return asdict(self)
41
+
42
+
43
class GitHubAPIError(Exception):
    """Raised when GitHub API returns an error."""
47
+
48
+
49
+ def _get_github_token() -> str:
50
+ """Get GitHub token from environment."""
51
+ token = os.environ.get("GITHUB_TOKEN")
52
+ if not token:
53
+ raise GitHubAPIError(
54
+ "GITHUB_TOKEN environment variable is required. "
55
+ "Set it with: export GITHUB_TOKEN=your_token_here"
56
+ )
57
+ return token
58
+
59
+
60
def _fetch_raw_content(owner: str, repo: str, path: str, ref: str, token: str) -> str:
    """Download a file's raw text via the contents API raw media type.

    Used as a fallback when the JSON contents response carries no inline
    base64 payload.

    Raises:
        GitHubAPIError: on any non-200 response.
    """
    response = requests.get(
        f"https://api.github.com/repos/{owner}/{repo}/contents/{path}",
        headers={
            "Accept": "application/vnd.github.raw",
            "X-GitHub-Api-Version": "2022-11-28",
            "Authorization": f"Bearer {token}",
        },
        params={"ref": ref},
        timeout=30,
    )
    if response.status_code == 200:
        return response.text
    raise GitHubAPIError(
        f"Failed to fetch raw content: HTTP {response.status_code}"
    )
79
+
80
+
81
def _get_last_commit_info(
    owner: str, repo: str, path: str, ref: Optional[str], token: str
) -> Tuple[Optional[str], Optional[str]]:
    """Best-effort lookup of the most recent commit touching *path*.

    Returns:
        (commit_date, commit_sha); both are None when the lookup fails —
        callers treat this as optional metadata, so no exception is raised.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    url = f"https://api.github.com/repos/{owner}/{repo}/commits"
    params = {"path": path, "per_page": 1}

    if ref and ref != "HEAD":
        params["sha"] = ref

    try:
        response = requests.get(url, headers=headers, params=params, timeout=30)

        if response.status_code == 200:
            commits = response.json()
            if commits:
                commit = commits[0]
                commit_sha = commit.get("sha")
                commit_date = commit.get("commit", {}).get("committer", {}).get("date")
                return commit_date, commit_sha

    # Narrowed from a bare `except:` — only network and JSON-decoding
    # failures are expected here; a bare except would also swallow
    # KeyboardInterrupt/SystemExit and genuine bugs.
    except (requests.exceptions.RequestException, ValueError):
        pass

    return None, None
112
+
113
+
114
def _fetch_file_contents(
    owner: str,
    repo: str,
    path: str,
    ref: str,
    line_start: Optional[int],
    line_end: Optional[int],
    token: str,
) -> FileContents:
    """Fetch file contents from the GitHub contents API.

    Decodes the inline base64 payload (falling back to a raw download when
    the API omits content, as it does for large files), attaches best-effort
    last-commit metadata, and slices the requested line range. When no range
    is given, files longer than 300 lines are truncated to the first 300.

    Raises:
        GitHubAPIError: missing file, non-file path, API error, or
            connectivity failure.
        ValueError: when line_start > line_end after clamping.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
    params = {}

    if ref and ref != "HEAD":
        params["ref"] = ref

    try:
        response = requests.get(url, headers=headers, params=params, timeout=30)

        if response.status_code == 404:
            raise GitHubAPIError(
                f"File not found: {path} in {owner}/{repo} (ref: {ref})"
            )

        if response.status_code != 200:
            error_msg = f"GitHub API error (status {response.status_code})"
            # Surface GitHub's own message when the error body is JSON.
            try:
                error_data = response.json()
                if "message" in error_data:
                    error_msg += f": {error_data['message']}"
            except ValueError:
                # Non-JSON error body; keep the generic message. (Narrowed
                # from a bare `except:`, which would swallow interrupts.)
                pass
            raise GitHubAPIError(error_msg)

        data = response.json()

        if data.get("type") != "file":
            raise GitHubAPIError(
                f"Path {path} is not a file (type: {data.get('type')})"
            )

        file_sha = data.get("sha")
        file_size = data.get("size", 0)

        # Decode content; GitHub wraps the base64 payload with whitespace.
        content_b64 = data.get("content", "")
        if content_b64:
            content_b64 = content_b64.replace("\n", "").replace(" ", "")
            content = base64.b64decode(content_b64).decode("utf-8", errors="replace")
        elif file_size == 0:
            # Genuinely empty file: no need for a second (raw) request.
            content = ""
        else:
            # Large files come back without inline content; fetch raw text.
            content = _fetch_raw_content(owner, repo, path, ref or "HEAD", token)

    except requests.exceptions.RequestException as e:
        raise GitHubAPIError(f"Failed to connect to GitHub API: {e}") from e

    # Last-commit metadata is best-effort; None values are acceptable.
    last_modified, last_commit_sha = _get_last_commit_info(
        owner, repo, path, ref, token
    )

    # Slice the requested line range (1-indexed, inclusive bounds).
    lines = content.split("\n")
    total_lines = len(lines)

    truncated = False
    message = None

    if line_start is None and line_end is None:
        if total_lines > 300:
            # No explicit range: cap the default output at 300 lines.
            line_start = 1
            line_end = 300
            truncated = True
            message = (
                f"File has {total_lines} lines. Returned only the first 300 lines. "
                f"To view more, use the line_start and line_end parameters."
            )
        else:
            line_start = 1
            line_end = total_lines
    else:
        if line_start is None:
            line_start = 1
        if line_end is None:
            line_end = total_lines

    # Clamp out-of-bounds values before validating the ordering.
    if line_start < 1:
        line_start = 1
    if line_end > total_lines:
        line_end = total_lines
    if line_start > line_end:
        raise ValueError(
            f"line_start ({line_start}) cannot be greater than line_end ({line_end})"
        )

    selected_lines = lines[line_start - 1 : line_end]
    selected_content = "\n".join(selected_lines)

    return FileContents(
        content=selected_content,
        sha=file_sha,
        path=path,
        size=file_size,
        last_modified=last_modified,
        last_commit_sha=last_commit_sha,
        line_start=line_start,
        line_end=line_end,
        total_lines=total_lines,
        truncated=truncated,
        message=message,
    )
230
+
231
+
232
def read_file(
    repo: str,
    path: str,
    ref: str = "HEAD",
    line_start: Optional[int] = None,
    line_end: Optional[int] = None,
) -> FileContents:
    """Read a file from a GitHub repository.

    Returns raw file text plus metadata (commit SHA, last modified). When
    the file exceeds 300 lines and no range is given, only the first 300
    lines are returned together with an explanatory message.

    Args:
        repo: "owner/repo" slug, e.g. "huggingface/transformers".
        path: File path inside the repository, e.g. "README.md".
        ref: Branch name, tag, or commit SHA (default "HEAD").
        line_start: First line to return (1-indexed, inclusive).
        line_end: Last line to return (1-indexed, inclusive).

    Returns:
        FileContents with the (possibly truncated) text and metadata.

    Raises:
        ValueError: when *repo* is not an "owner/repo" slug.
    """
    if "/" not in repo:
        raise ValueError("repo must be in format 'owner/repo'")

    owner, repo_name = repo.split("/", 1)

    return _fetch_file_contents(
        owner=owner,
        repo=repo_name,
        path=path,
        ref=ref,
        line_start=line_start,
        line_end=line_end,
        token=_get_github_token(),
    )
271
+
272
+
273
+ async def _async_call(func, *args, **kwargs):
274
+ """Wrap synchronous calls for async context."""
275
+ return await asyncio.to_thread(func, *args, **kwargs)
276
+
277
+
278
class ReadFileTool:
    """Agent-facing wrapper that reads GitHub files and formats a report."""

    async def execute(self, params: Dict[str, Any]) -> ToolResult:
        """Validate params, read the file off-thread, and format the result."""
        repo = params.get("repo")
        path = params.get("path")

        if not (repo and path):
            return {
                "formatted": "Error: 'repo' and 'path' parameters are required",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }

        try:
            contents = await _async_call(
                read_file,
                repo=repo,
                path=path,
                ref=params.get("ref", "HEAD"),
                line_start=params.get("line_start"),
                line_end=params.get("line_end"),
            )

            # Metadata header, optional notices, then the fenced file body.
            pieces = [
                f"**File: {contents.path}**\n",
                f"**Repo: {repo}**\n",
                f"**Lines:** {contents.line_start}-{contents.line_end} of {contents.total_lines}\n",
                f"**SHA:** {contents.sha}\n",
            ]
            if contents.last_modified:
                pieces.append(f"**Last modified:** {contents.last_modified}\n")
            if contents.message:
                pieces.append(f"\n⚠️ {contents.message}\n")
            pieces.append(f"\n```\n{contents.content}\n```")

            return {
                "formatted": "".join(pieces),
                "totalResults": 1,
                "resultsShared": 1,
            }

        except GitHubAPIError as e:
            return {
                "formatted": f"GitHub API Error: {str(e)}",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }
        except Exception as e:
            return {
                "formatted": f"Error: {str(e)}",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }
341
+
342
+
343
# Tool specification registered with the agent tool router; the description
# (with its inline examples) is shown to the LLM verbatim.
READ_FILE_TOOL_SPEC = {
    "name": "read_file",
    "description": (
        "Read file contents from a GitHub repository.\n\n"
        "Returns raw file text plus metadata (commit SHA, last modified).\n"
        "If file is more than 300 lines, returns only the first 300 lines and includes line_start and line_end indexes.\n"
        "Use line_start and line_end parameters to view specific line ranges.\n\n"
        "Examples:\n"
        "- Read README: {'repo': 'huggingface/transformers', 'path': 'README.md'}\n"
        "- Read specific lines: {'repo': 'huggingface/transformers', 'path': 'src/transformers/__init__.py', 'line_start': 1, 'line_end': 50}\n"
        "- Read from branch: {'repo': 'torvalds/linux', 'path': 'MAINTAINERS', 'ref': 'master', 'line_start': 1, 'line_end': 20}\n\n"
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "repo": {
                "type": "string",
                "description": "Repository in format 'owner/repo' (e.g., 'huggingface/transformers')",
            },
            "path": {
                "type": "string",
                "description": "Path to file in repository (e.g., 'README.md', 'src/main.py')",
            },
            "ref": {
                "type": "string",
                "description": "Git reference: branch name, tag, or commit SHA (default: 'HEAD')",
            },
            "line_start": {
                "type": "integer",
                "description": "Starting line number (1-indexed, inclusive). Use to read specific range.",
            },
            "line_end": {
                "type": "integer",
                "description": "Ending line number (1-indexed, inclusive). Use to read specific range.",
            },
        },
        "required": ["repo", "path"],
    },
}
383
+
384
+
385
async def read_file_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Adapt ReadFileTool.execute to the router's (text, success) contract."""
    try:
        outcome = await ReadFileTool().execute(arguments)
        return outcome["formatted"], not outcome.get("isError", False)
    except Exception as e:
        return f"Error executing read_file: {str(e)}", False
agent/tools/github_search_code.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GitHub Search Code Tool
3
+
4
+ Searches code across GitHub with glob filtering and line-level results.
5
+ """
6
+
7
+ import asyncio
8
+ import fnmatch
9
+ import os
10
+ import re
11
+ from dataclasses import asdict, dataclass
12
+ from typing import Any, Dict, List, Optional, Tuple
13
+
14
+ try:
15
+ import requests
16
+ except ImportError:
17
+ raise ImportError(
18
+ "requests library is required. Install with: pip install requests"
19
+ )
20
+
21
+ from agent.tools.types import ToolResult
22
+
23
+
24
@dataclass
class CodeMatch:
    """A code match with location information."""

    repo: str  # "owner/name" of the repository containing the match
    path: str  # file path within the repository
    ref: str  # SHA reported by the search API for the matched file
    line_start: int  # estimated — the search API does not report real positions
    line_end: int  # estimated (count of non-blank fragment lines)
    snippet: str  # matched code fragment (whitespace-stripped)

    def to_dict(self):
        """Return the record as a plain dict (via dataclasses.asdict)."""
        return asdict(self)
37
+
38
+
39
class GitHubAPIError(Exception):
    """Raised when GitHub API returns an error."""
43
+
44
+
45
+ def _get_github_token() -> str:
46
+ """Get GitHub token from environment."""
47
+ token = os.environ.get("GITHUB_TOKEN")
48
+ if not token:
49
+ raise GitHubAPIError(
50
+ "GITHUB_TOKEN environment variable is required. "
51
+ "Set it with: export GITHUB_TOKEN=your_token_here"
52
+ )
53
+ return token
54
+
55
+
56
+ def _build_github_query(
57
+ query: str, repo_glob: Optional[str], path_glob: Optional[str], regex: bool
58
+ ) -> str:
59
+ """Build GitHub search query string from parameters."""
60
+ parts = []
61
+
62
+ if regex:
63
+ parts.append(f"/{query}/")
64
+ else:
65
+ if " " in query:
66
+ parts.append(f'"{query}"')
67
+ else:
68
+ parts.append(query)
69
+
70
+ if repo_glob:
71
+ if "/" in repo_glob:
72
+ parts.append(f"repo:{repo_glob}")
73
+ else:
74
+ parts.append(f"user:{repo_glob}")
75
+
76
+ if path_glob:
77
+ if "*" not in path_glob and "?" not in path_glob:
78
+ parts.append(f"path:{path_glob}")
79
+ elif path_glob.startswith("*."):
80
+ ext = path_glob[2:]
81
+ parts.append(f"extension:{ext}")
82
+ elif "/" not in path_glob and "*" in path_glob:
83
+ parts.append(f"filename:{path_glob}")
84
+ else:
85
+ if "." in path_glob:
86
+ ext_match = re.search(r"\*\.(\w+)", path_glob)
87
+ if ext_match:
88
+ parts.append(f"extension:{ext_match.group(1)}")
89
+
90
+ return " ".join(parts)
91
+
92
+
93
def _fetch_code_search_results(
    query: str, token: str, max_results: int
) -> List[Dict[str, Any]]:
    """Page through the GitHub code-search API and collect raw result items.

    Requests the text-match media type so fragments are included.

    Raises:
        GitHubAPIError: when the very first page cannot be fetched (bad
            query, auth failure, rate limit). Failures on later pages
            return the partial results collected so far instead — the
            previous behavior of silently returning an empty list made
            rate-limit errors indistinguishable from "no matches".
    """
    headers = {
        "Accept": "application/vnd.github.text-match+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }
    url = "https://api.github.com/search/code"

    all_items: List[Dict[str, Any]] = []
    page = 1
    per_page = min(100, max_results)

    while len(all_items) < max_results:
        params = {
            "q": query,
            "page": page,
            "per_page": per_page,
        }

        try:
            response = requests.get(url, headers=headers, params=params, timeout=30)

            if response.status_code != 200:
                if page == 1:
                    raise GitHubAPIError(
                        f"GitHub code search failed (status {response.status_code})"
                    )
                break

            data = response.json()

        except requests.exceptions.RequestException as e:
            if page == 1:
                raise GitHubAPIError(f"Failed to connect to GitHub API: {e}") from e
            break

        items = data.get("items", [])
        if not items:
            break

        all_items.extend(items)

        # Stop once everything the API reports as available is collected.
        if len(all_items) >= data.get("total_count", 0):
            break

        page += 1

    return all_items[:max_results]
139
+
140
+
141
+ def _glob_match(text: str, pattern: str) -> bool:
142
+ """Check if text matches glob pattern, supporting ** for multi-level paths."""
143
+ if "**" in pattern:
144
+ regex_pattern = pattern.replace("**", "<<<DOUBLESTAR>>>")
145
+ regex_pattern = fnmatch.translate(regex_pattern)
146
+ regex_pattern = regex_pattern.replace("<<<DOUBLESTAR>>>", ".*")
147
+ return re.match(regex_pattern, text) is not None
148
+ else:
149
+ return fnmatch.fnmatch(text, pattern)
150
+
151
+
152
+ def _estimate_line_numbers(fragment: str) -> Tuple[int, int]:
153
+ """Estimate line numbers from a code fragment."""
154
+ lines = fragment.split("\n")
155
+ line_count = len([line for line in lines if line.strip()])
156
+ return 1, line_count
157
+
158
+
159
def _parse_results_to_matches(
    raw_results: List[Dict[str, Any]],
    repo_glob: Optional[str],
    path_glob: Optional[str],
) -> List[CodeMatch]:
    """Convert raw search items into CodeMatch records, re-applying globs.

    The API qualifiers are coarser than the caller's glob patterns, so the
    repo/path filters are enforced again client-side here.
    """
    matches: List[CodeMatch] = []

    for item in raw_results:
        repo_name = item.get("repository", {}).get("full_name", "unknown/unknown")
        file_path = item.get("path", "")
        sha = item.get("sha", "unknown")

        if repo_glob and not _glob_match(repo_name, repo_glob):
            continue
        if path_glob and not _glob_match(file_path, path_glob):
            continue

        text_matches = item.get("text_matches", [])

        if not text_matches:
            # Match confirmed by the API but no fragment was returned.
            matches.append(
                CodeMatch(
                    repo=repo_name,
                    path=file_path,
                    ref=sha,
                    line_start=1,
                    line_end=1,
                    snippet="<match found, but snippet not available>",
                )
            )
            continue

        for text_match in text_matches:
            fragment = text_match.get("fragment", "")
            start, end = _estimate_line_numbers(fragment)
            matches.append(
                CodeMatch(
                    repo=repo_name,
                    path=file_path,
                    ref=sha,
                    line_start=start,
                    line_end=end,
                    snippet=fragment.strip(),
                )
            )

    return matches
206
+
207
+
208
def search_code(
    query: str,
    repo_glob: Optional[str] = None,
    path_glob: Optional[str] = None,
    regex: bool = False,
    max_results: int = 100,
) -> List[CodeMatch]:
    """Search GitHub code and return line-level matches.

    Args:
        query: Term or pattern to look for.
        repo_glob: Optional glob restricting repositories (e.g. "huggingface/*").
        path_glob: Optional glob restricting file paths (e.g. "*.py").
        regex: Treat *query* as a regular expression when True.
        max_results: Upper bound on returned matches (default 100).

    Returns:
        CodeMatch records carrying repo, path, ref, line range, and snippet.
    """
    token = _get_github_token()
    gh_query = _build_github_query(query, repo_glob, path_glob, regex)
    raw_items = _fetch_code_search_results(gh_query, token, max_results)
    return _parse_results_to_matches(raw_items, repo_glob, path_glob)
237
+
238
+
239
+ async def _async_call(func, *args, **kwargs):
240
+ """Wrap synchronous calls for async context."""
241
+ return await asyncio.to_thread(func, *args, **kwargs)
242
+
243
+
244
+ def _format_code_matches(matches: List[CodeMatch]) -> str:
245
+ """Format code matches."""
246
+ if not matches:
247
+ return "No matches found."
248
+
249
+ lines = []
250
+ for i, match in enumerate(matches, 1):
251
+ lines.append(f"**{i}. {match.repo}/{match.path}:{match.line_start}**")
252
+ lines.append("```")
253
+ # Show first 5 lines of snippet
254
+ snippet_lines = match.snippet.split("\n")[:5]
255
+ lines.extend(snippet_lines)
256
+ if len(match.snippet.split("\n")) > 5:
257
+ lines.append("...")
258
+ lines.append("```")
259
+ lines.append("")
260
+
261
+ return "\n".join(lines)
262
+
263
+
264
class SearchCodeTool:
    """Agent-facing wrapper around GitHub code search."""

    async def execute(self, params: Dict[str, Any]) -> ToolResult:
        """Validate params, run the search off-thread, and format results."""
        query = params.get("query")
        if not query:
            return {
                "formatted": "Error: 'query' parameter is required",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }

        try:
            matches = await _async_call(
                search_code,
                query=query,
                repo_glob=params.get("repo_glob"),
                path_glob=params.get("path_glob"),
                regex=params.get("regex", False),
                max_results=params.get("max_results", 100),
            )

            if not matches:
                return {
                    "formatted": "No matches found",
                    "totalResults": 0,
                    "resultsShared": 0,
                }

            # Formatted list plus a ready-to-use read_file hint for the
            # top match (matches is guaranteed non-empty here).
            top = matches[0]
            response = (
                f"**Found {len(matches)} code matches:**\n\n"
                f"{_format_code_matches(matches)}"
                "\n**To view full file, use:**\n"
                f"read_file(repo='{top.repo}', path='{top.path}')"
            )

            return {
                "formatted": response,
                "totalResults": len(matches),
                "resultsShared": min(len(matches), 10),
            }

        except GitHubAPIError as e:
            return {
                "formatted": f"GitHub API Error: {str(e)}",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }
        except Exception as e:
            return {
                "formatted": f"Error: {str(e)}",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }
331
+
332
+
333
# Tool specification registered with the agent tool router; the description
# (with its inline examples) is shown to the LLM verbatim.
SEARCH_CODE_TOOL_SPEC = {
    "name": "search_code",
    "description": (
        "Search code across GitHub with glob filtering and line-level results.\n\n"
        "Returns: repo, path, ref, line_start, line_end, snippet\n\n"
        "Examples:\n"
        "- Search Python functions: {'query': 'def train', 'path_glob': '*.py', 'repo_glob': 'huggingface/*'}\n"
        "- Search TODO comments: {'query': 'TODO', 'repo_glob': 'github/*', 'max_results': 10}\n"
        "- Regex search: {'query': r'func Test\\w+', 'path_glob': '*.go', 'regex': True}\n"
        "- Search in specific repo: {'query': 'HfApi', 'repo_glob': 'huggingface/huggingface_hub', 'path_glob': '*.py'}\n\n"
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search term or pattern to find in code",
            },
            "repo_glob": {
                "type": "string",
                "description": "Glob pattern to filter repositories (e.g., 'github/*', 'facebook/react')",
            },
            "path_glob": {
                "type": "string",
                "description": "Glob pattern to filter file paths (e.g., '*.py', 'src/**/*.js', 'test_*.py')",
            },
            "regex": {
                "type": "boolean",
                "description": "Treat query as regular expression (default: false)",
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of results to return (default: 100)",
            },
        },
        "required": ["query"],
    },
}
372
+
373
+
374
async def search_code_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Adapt SearchCodeTool.execute to the router's (text, success) contract."""
    try:
        outcome = await SearchCodeTool().execute(arguments)
        return outcome["formatted"], not outcome.get("isError", False)
    except Exception as e:
        return f"Error executing search_code: {str(e)}", False
agent/tools/jobs_tool.py CHANGED
@@ -40,6 +40,20 @@ GPU_FLAVORS = [
40
  "h100",
41
  "h100x8",
42
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  SPECIALIZED_FLAVORS = ["inf2x6"]
44
  ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS
45
 
@@ -741,12 +755,12 @@ HF_JOBS_TOOL_SPEC = {
741
  "1. **Python mode:** Provide 'script' + 'dependencies' → auto-handles pip install\n"
742
  "2. **Docker mode:** Provide 'image' + 'command' → full control\n"
743
  "(script and command are mutually exclusive)\n\n"
744
- "## Hardware:\n"
745
- "CPU: cpu-basic (default), cpu-upgrade, cpu-performance, cpu-xl\n"
746
- "GPU: t4-small, t4-medium, l4x1, a10g-small, a10g-large, a100-large, h100\n\n"
747
  "## Examples:\n\n"
748
  "**Fine-tune LLM and push to Hub:**\n"
749
- "{'operation': 'run', 'script': 'from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer\\nmodel = AutoModelForCausalLM.from_pretrained(\"gpt2\")\\n# ... training code ...\\nmodel.push_to_hub(\"user-name/my-finetuned-model\")', 'dependencies': ['transformers', 'torch', 'datasets'], 'hardware_flavor': 'a10g-large', 'timeout': '4h', 'env': {'CUSTOM_VAR': 'value'}}\n\n"
750
  "**Generate dataset daily and upload:**\n"
751
  "{'operation': 'scheduled run', 'script': 'from datasets import Dataset\\nimport pandas as pd\\n# scrape/generate data\\ndf = pd.DataFrame(data)\\nds = Dataset.from_pandas(df)\\nds.push_to_hub(\"user-name/daily-dataset\")', 'dependencies': ['datasets', 'pandas'], 'schedule': '@daily'}\n\n"
752
  "**Run custom training with Docker:**\n"
@@ -807,7 +821,7 @@ HF_JOBS_TOOL_SPEC = {
807
  # Hardware and environment
808
  "hardware_flavor": {
809
  "type": "string",
810
- "description": "Hardware type. CPU: cpu-basic (default), cpu-upgrade, cpu-performance, cpu-xl. GPU: t4-small, t4-medium, l4x1, a10g-small, a10g-large, a100-large, h100. Use with 'run'/'scheduled run'.",
811
  },
812
  "timeout": {
813
  "type": "string",
 
40
  "h100",
41
  "h100x8",
42
  ]
43
+
44
# Detailed specs for display (vCPU/RAM/GPU VRAM). These strings are
# interpolated into the HF_JOBS_TOOL_SPEC description shown to the LLM;
# keep them in sync with CPU_FLAVORS / GPU_FLAVORS above.
CPU_FLAVORS_DESC = (
    "cpu-basic(2vCPU/16GB), cpu-upgrade(8vCPU/32GB), cpu-performance, cpu-xl"
)
GPU_FLAVORS_DESC = (
    "t4-small(4vCPU/15GB/GPU 16GB), t4-medium(8vCPU/30GB/GPU 16GB), "
    "l4x1(8vCPU/30GB/GPU 24GB), l4x4(48vCPU/186GB/GPU 96GB), "
    "l40sx1(8vCPU/62GB/GPU 48GB), l40sx4(48vCPU/382GB/GPU 192GB), l40sx8(192vCPU/1534GB/GPU 384GB), "
    "a10g-small(4vCPU/14GB/GPU 24GB), a10g-large(12vCPU/46GB/GPU 24GB), "
    "a10g-largex2(24vCPU/92GB/GPU 48GB), a10g-largex4(48vCPU/184GB/GPU 96GB), "
    "a100-large(12vCPU/142GB/GPU 80GB), h100(23vCPU/240GB/GPU 80GB), h100x8(184vCPU/1920GB/GPU 640GB), "
    "zero-a10g(dynamic alloc)"
)
57
  SPECIALIZED_FLAVORS = ["inf2x6"]
58
  ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS
59
 
 
755
  "1. **Python mode:** Provide 'script' + 'dependencies' → auto-handles pip install\n"
756
  "2. **Docker mode:** Provide 'image' + 'command' → full control\n"
757
  "(script and command are mutually exclusive)\n\n"
758
+ "## Available Hardware (vCPU/RAM/GPU):\n"
759
+ f"CPU: {CPU_FLAVORS_DESC}\n"
760
+ f"GPU: {GPU_FLAVORS_DESC}\n"
761
  "## Examples:\n\n"
762
  "**Fine-tune LLM and push to Hub:**\n"
763
+ "{'operation': 'run', 'script': 'from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer\\nmodel = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen3-4B-Thinking-2507\")\\n# ... training code ...\\nmodel.push_to_hub(\"user-name/my-finetuned-model\")', 'dependencies': ['transformers', 'torch', 'datasets'], 'hardware_flavor': 'a10g-large', 'timeout': '4h', 'env': {'CUSTOM_VAR': 'value'}}\n\n"
764
  "**Generate dataset daily and upload:**\n"
765
  "{'operation': 'scheduled run', 'script': 'from datasets import Dataset\\nimport pandas as pd\\n# scrape/generate data\\ndf = pd.DataFrame(data)\\nds = Dataset.from_pandas(df)\\nds.push_to_hub(\"user-name/daily-dataset\")', 'dependencies': ['datasets', 'pandas'], 'schedule': '@daily'}\n\n"
766
  "**Run custom training with Docker:**\n"
 
821
  # Hardware and environment
822
  "hardware_flavor": {
823
  "type": "string",
824
+ "description": f"Hardware type. Available CPU flavors: {CPU_FLAVORS}. Available GPU flavors: {GPU_FLAVORS}. Use with 'run'/'scheduled run'.",
825
  },
826
  "timeout": {
827
  "type": "string",