ml-agent / agent /tools /github_read_file.py
akseljoonas's picture
akseljoonas HF Staff
feat: use hf_agent tool descriptions, hardware flavors, and default env
c45ebae
raw
history blame
10 kB
"""
GitHub Read File Tool - Read file contents from any GitHub repository with line range support
Fetch exact file contents with metadata, supporting line ranges for efficient reading.
"""
import base64
import json
import os
from typing import Any, Dict, Optional
import nbformat
import requests
from nbconvert import MarkdownExporter
from nbconvert.preprocessors import ClearOutputPreprocessor, TagRemovePreprocessor
from agent.tools.types import ToolResult
def _convert_ipynb_to_markdown(content: str) -> str:
"""
Convert Jupyter notebook JSON to LLM-friendly Markdown.
Args:
content: Raw notebook JSON string
Returns:
Converted Markdown string
"""
try:
# Parse notebook JSON
nb_dict = json.loads(content)
# Normalize cell sources (can be string or list of strings)
if "cells" in nb_dict:
for cell in nb_dict["cells"]:
if "source" in cell and isinstance(cell["source"], list):
cell["source"] = "".join(cell["source"])
# Read notebook with explicit version
nb = nbformat.reads(json.dumps(nb_dict), as_version=4)
# Strip outputs for LLM readability (outputs can be noisy/large)
clear = ClearOutputPreprocessor()
nb, _ = clear.preprocess(nb, {})
# Optionally remove cells tagged with "hide" or similar
remove = TagRemovePreprocessor(
remove_cell_tags={"hide", "hidden", "remove"},
remove_input_tags=set(),
remove_all_outputs_tags=set(),
)
nb, _ = remove.preprocess(nb, {})
# Convert to markdown
exporter = MarkdownExporter()
markdown, _ = exporter.from_notebook_node(nb)
return markdown
except json.JSONDecodeError:
return content
except Exception:
return content
def read_file(
repo: str,
path: str,
ref: str = "HEAD",
line_start: Optional[int] = None,
line_end: Optional[int] = None,
) -> ToolResult:
"""
Read file contents from a GitHub repository with line range support.
Args:
repo: Repository in format "owner/repo" (e.g., "github/github-mcp-server")
path: Path to file in repository (e.g., "pkg/github/search.go")
ref: Git reference - branch name, tag, or commit SHA (default: "HEAD")
line_start: Starting line number (1-indexed, inclusive)
line_end: Ending line number (1-indexed, inclusive)
Returns:
ToolResult with file contents and metadata
"""
token = os.environ.get("GITHUB_TOKEN")
if not token:
return {
"formatted": "Error: GITHUB_TOKEN environment variable is required",
"totalResults": 0,
"resultsShared": 0,
"isError": True,
}
# Parse repo
if "/" not in repo:
return {
"formatted": "Error: repo must be in format 'owner/repo'",
"totalResults": 0,
"resultsShared": 0,
"isError": True,
}
owner, repo_name = repo.split("/", 1)
headers = {
"Accept": "application/vnd.github+json",
"X-GitHub-Api-Version": "2022-11-28",
"Authorization": f"Bearer {token}",
}
# Fetch file contents
url = f"https://api.github.com/repos/{owner}/{repo_name}/contents/{path}"
params = {}
if ref and ref != "HEAD":
params["ref"] = ref
try:
response = requests.get(url, headers=headers, params=params, timeout=30)
if response.status_code == 404:
return {
"formatted": f"File not found: {path} in {repo} (ref: {ref})",
"totalResults": 0,
"resultsShared": 0,
"isError": True,
}
if response.status_code != 200:
error_msg = f"GitHub API error (status {response.status_code})"
try:
error_data = response.json()
if "message" in error_data:
error_msg += f": {error_data['message']}"
except Exception:
pass
return {
"formatted": error_msg,
"totalResults": 0,
"resultsShared": 0,
"isError": True,
}
data = response.json()
# Check if it's a file
if data.get("type") != "file":
return {
"formatted": f"Path {path} is not a file (type: {data.get('type')})",
"totalResults": 0,
"resultsShared": 0,
"isError": True,
}
# Decode content
content_b64 = data.get("content", "")
if content_b64:
content_b64 = content_b64.replace("\n", "").replace(" ", "")
content = base64.b64decode(content_b64).decode("utf-8", errors="replace")
else:
# For large files, fetch raw content
raw_headers = {
"Accept": "application/vnd.github.raw",
"X-GitHub-Api-Version": "2022-11-28",
"Authorization": f"Bearer {token}",
}
raw_response = requests.get(
url, headers=raw_headers, params=params, timeout=30
)
if raw_response.status_code != 200:
return {
"formatted": "Failed to fetch file content",
"totalResults": 0,
"resultsShared": 0,
"isError": True,
}
content = raw_response.text
if path.lower().endswith(".ipynb"):
content = _convert_ipynb_to_markdown(content)
# Process line ranges
lines = content.split("\n")
total_lines = len(lines)
truncated = False
if line_start is None and line_end is None:
# No range specified
if total_lines > 300:
line_start = 1
line_end = 300
truncated = True
else:
line_start = 1
line_end = total_lines
else:
# Range specified
if line_start is None:
line_start = 1
if line_end is None:
line_end = total_lines
# Validate range
line_start = max(1, line_start)
line_end = min(total_lines, line_end)
if line_start > line_end:
return {
"formatted": f"Invalid range: line_start ({line_start}) > line_end ({line_end})",
"totalResults": 0,
"resultsShared": 0,
"isError": True,
}
# Extract lines
selected_lines = lines[line_start - 1 : line_end]
selected_content = "\n".join(selected_lines)
# Format output
lines_output = [f"**Reading file from repo: {repo}, path: {path}**"]
if ref and ref != "HEAD":
lines_output.append(f"Ref: {ref}")
lines_output.append("\n**File content:")
lines_output.append("```")
lines_output.append(selected_content)
lines_output.append("```")
if truncated:
lines_output.append(
f"Currently showing lines {line_start}-{line_end} out of {total_lines} total lines. Use line_start and line_end to view more lines."
)
return {
"formatted": "\n".join(lines_output),
"totalResults": 1,
"resultsShared": 1,
}
except requests.exceptions.RequestException as e:
return {
"formatted": f"Failed to connect to GitHub API: {str(e)}",
"totalResults": 0,
"resultsShared": 0,
"isError": True,
}
# Tool specification
GITHUB_READ_FILE_TOOL_SPEC = {
"name": "github_read_file",
"description": (
"Read file contents from GitHub repositories. Returns first 300 lines by default. "
"Auto-converts Jupyter notebooks to markdown.\n\n"
"Use AFTER github_find_examples to study the working implementation. "
"The purpose is to learn current API patterns — imports, trainer configs, dataset handling — "
"so your implementation uses correct, up-to-date code.\n\n"
"Use line_start/line_end for large files (>300 lines) to read specific sections.\n\n"
"When NOT to use: when you don't know the file path (use github_find_examples first)."
),
"parameters": {
"type": "object",
"properties": {
"repo": {
"type": "string",
"description": "Repository in format 'owner/repo' (e.g., 'github/github-mcp-server'). Required.",
},
"path": {
"type": "string",
"description": "Path to file in repository (e.g., 'src/index.js'). Required.",
},
"ref": {
"type": "string",
"description": "Git reference - branch name, tag, or commit SHA. Default: 'HEAD'.",
},
"line_start": {
"type": "integer",
"description": "Starting line number (1-indexed, inclusive). Optional.",
},
"line_end": {
"type": "integer",
"description": "Ending line number (1-indexed, inclusive). Optional.",
},
},
"required": ["repo", "path"],
},
}
async def github_read_file_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
"""Handler for agent tool router"""
try:
result = read_file(
repo=arguments["repo"],
path=arguments["path"],
ref=arguments.get("ref", "HEAD"),
line_start=arguments.get("line_start"),
line_end=arguments.get("line_end"),
)
return result["formatted"], not result.get("isError", False)
except Exception as e:
return f"Error reading file: {str(e)}", False