akseljoonas HF Staff commited on
Commit
ccbe2d2
·
1 Parent(s): b70fed7

poc github tools

Browse files
agent/core/agent_loop.py CHANGED
@@ -25,9 +25,15 @@ def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
25
  args = tool_args.get("args", {})
26
  # Sometimes LLM passes args as string instead of dict
27
  if isinstance(args, str):
28
- return False, f"Tool call error: 'args' must be a JSON object, not a string. You passed: {repr(args)}"
 
 
 
29
  if not isinstance(args, dict) and args is not None:
30
- return False, f"Tool call error: 'args' must be a JSON object. You passed type: {type(args).__name__}"
 
 
 
31
  return True, None
32
 
33
 
@@ -38,8 +44,6 @@ def _needs_approval(tool_name: str, tool_args: dict) -> bool:
38
  if not args_valid:
39
  return False
40
 
41
- args = tool_args.get("args", {})
42
-
43
  if tool_name == "hf_jobs":
44
  # Check if it's a run or uv operation
45
  operation = tool_args.get("operation", "")
 
25
  args = tool_args.get("args", {})
26
  # Sometimes LLM passes args as string instead of dict
27
  if isinstance(args, str):
28
+ return (
29
+ False,
30
+ f"Tool call error: 'args' must be a JSON object, not a string. You passed: {repr(args)}",
31
+ )
32
  if not isinstance(args, dict) and args is not None:
33
+ return (
34
+ False,
35
+ f"Tool call error: 'args' must be a JSON object. You passed type: {type(args).__name__}",
36
+ )
37
  return True, None
38
 
39
 
 
44
  if not args_valid:
45
  return False
46
 
 
 
47
  if tool_name == "hf_jobs":
48
  # Check if it's a run or uv operation
49
  operation = tool_args.get("operation", "")
agent/core/tools.py CHANGED
@@ -19,6 +19,13 @@ from agent.tools.docs_tools import (
19
  explore_hf_docs_handler,
20
  hf_docs_fetch_handler,
21
  )
 
 
 
 
 
 
 
22
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
23
  from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
24
  from agent.tools.private_hf_repo_tools import (
@@ -224,7 +231,7 @@ class ToolRouter:
224
  def create_builtin_tools() -> list[ToolSpec]:
225
  """Create built-in tool specifications"""
226
  print(
227
- f"Creating built-in tools: {EXPLORE_HF_DOCS_TOOL_SPEC['name']}, {HF_DOCS_FETCH_TOOL_SPEC['name']}, {PLAN_TOOL_SPEC['name']}, {HF_JOBS_TOOL_SPEC['name']}, {PRIVATE_HF_REPO_TOOL_SPEC['name']}, {UTILS_TOOL_SPEC['name']}"
228
  )
229
  # in order of importance
230
  return [
@@ -266,4 +273,29 @@ def create_builtin_tools() -> list[ToolSpec]:
266
  parameters=UTILS_TOOL_SPEC["parameters"],
267
  handler=utils_handler,
268
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  ]
 
19
  explore_hf_docs_handler,
20
  hf_docs_fetch_handler,
21
  )
22
+ from agent.tools.github_find_examples import (
23
+ FIND_EXAMPLES_TOOL_SPEC,
24
+ find_examples_handler,
25
+ )
26
+ from agent.tools.github_list_repos import LIST_REPOS_TOOL_SPEC, list_repos_handler
27
+ from agent.tools.github_read_file import READ_FILE_TOOL_SPEC, read_file_handler
28
+ from agent.tools.github_search_code import SEARCH_CODE_TOOL_SPEC, search_code_handler
29
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, hf_jobs_handler
30
  from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
31
  from agent.tools.private_hf_repo_tools import (
 
231
  def create_builtin_tools() -> list[ToolSpec]:
232
  """Create built-in tool specifications"""
233
  print(
234
+ f"Creating built-in tools: {EXPLORE_HF_DOCS_TOOL_SPEC['name']}, {HF_DOCS_FETCH_TOOL_SPEC['name']}, {PLAN_TOOL_SPEC['name']}, {HF_JOBS_TOOL_SPEC['name']}, {PRIVATE_HF_REPO_TOOL_SPEC['name']}, {UTILS_TOOL_SPEC['name']}, {FIND_EXAMPLES_TOOL_SPEC['name']}, {READ_FILE_TOOL_SPEC['name']}, {LIST_REPOS_TOOL_SPEC['name']}, {SEARCH_CODE_TOOL_SPEC['name']}"
235
  )
236
  # in order of importance
237
  return [
 
273
  parameters=UTILS_TOOL_SPEC["parameters"],
274
  handler=utils_handler,
275
  ),
276
+ # GitHub tools - 4 separate tools
277
+ ToolSpec(
278
+ name=FIND_EXAMPLES_TOOL_SPEC["name"],
279
+ description=FIND_EXAMPLES_TOOL_SPEC["description"],
280
+ parameters=FIND_EXAMPLES_TOOL_SPEC["parameters"],
281
+ handler=find_examples_handler,
282
+ ),
283
+ ToolSpec(
284
+ name=READ_FILE_TOOL_SPEC["name"],
285
+ description=READ_FILE_TOOL_SPEC["description"],
286
+ parameters=READ_FILE_TOOL_SPEC["parameters"],
287
+ handler=read_file_handler,
288
+ ),
289
+ ToolSpec(
290
+ name=LIST_REPOS_TOOL_SPEC["name"],
291
+ description=LIST_REPOS_TOOL_SPEC["description"],
292
+ parameters=LIST_REPOS_TOOL_SPEC["parameters"],
293
+ handler=list_repos_handler,
294
+ ),
295
+ ToolSpec(
296
+ name=SEARCH_CODE_TOOL_SPEC["name"],
297
+ description=SEARCH_CODE_TOOL_SPEC["description"],
298
+ parameters=SEARCH_CODE_TOOL_SPEC["parameters"],
299
+ handler=search_code_handler,
300
+ ),
301
  ]
agent/main.py CHANGED
@@ -222,11 +222,15 @@ async def event_listener(
222
 
223
  # Build repo URL
224
  type_path = "" if repo_type == "model" else f"{repo_type}s"
225
- repo_url = f"https://huggingface.co/{type_path}/{repo_id}".replace("//", "/")
 
 
 
 
226
 
227
  print(f"Repository: {repo_id}")
228
  print(f"Type: {repo_type}")
229
- print(f"Private: Yes")
230
  print(f"URL: {repo_url}")
231
 
232
  # Show file preview for upload_file operation
@@ -237,9 +241,9 @@ async def event_listener(
237
 
238
  if isinstance(file_content, str):
239
  # Calculate metrics
240
- all_lines = file_content.split('\n')
241
  line_count = len(all_lines)
242
- size_bytes = len(file_content.encode('utf-8'))
243
  size_kb = size_bytes / 1024
244
  size_mb = size_kb / 1024
245
 
@@ -251,8 +255,10 @@ async def event_listener(
251
 
252
  # Show preview
253
  preview_lines = all_lines[:5]
254
- preview = '\n'.join(preview_lines)
255
- print(f"Content preview (first 5 lines):\n{preview}")
 
 
256
  if len(all_lines) > 5:
257
  print("...")
258
 
 
222
 
223
  # Build repo URL
224
  type_path = "" if repo_type == "model" else f"{repo_type}s"
225
+ repo_url = (
226
+ f"https://huggingface.co/{type_path}/{repo_id}".replace(
227
+ "//", "/"
228
+ )
229
+ )
230
 
231
  print(f"Repository: {repo_id}")
232
  print(f"Type: {repo_type}")
233
+ print("Private: Yes")
234
  print(f"URL: {repo_url}")
235
 
236
  # Show file preview for upload_file operation
 
241
 
242
  if isinstance(file_content, str):
243
  # Calculate metrics
244
+ all_lines = file_content.split("\n")
245
  line_count = len(all_lines)
246
+ size_bytes = len(file_content.encode("utf-8"))
247
  size_kb = size_bytes / 1024
248
  size_mb = size_kb / 1024
249
 
 
255
 
256
  # Show preview
257
  preview_lines = all_lines[:5]
258
+ preview = "\n".join(preview_lines)
259
+ print(
260
+ f"Content preview (first 5 lines):\n{preview}"
261
+ )
262
  if len(all_lines) > 5:
263
  print("...")
264
 
agent/tools/__init__.py CHANGED
@@ -2,6 +2,26 @@
2
  Hugging Face tools for the agent
3
  """
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
6
  from agent.tools.types import ToolResult
7
 
@@ -10,4 +30,16 @@ __all__ = [
10
  "HF_JOBS_TOOL_SPEC",
11
  "hf_jobs_handler",
12
  "HfJobsTool",
 
 
 
 
 
 
 
 
 
 
 
 
13
  ]
 
2
  Hugging Face tools for the agent
3
  """
4
 
5
+ from agent.tools.github_find_examples import (
6
+ FIND_EXAMPLES_TOOL_SPEC,
7
+ FindExamplesTool,
8
+ find_examples_handler,
9
+ )
10
+ from agent.tools.github_list_repos import (
11
+ LIST_REPOS_TOOL_SPEC,
12
+ ListReposTool,
13
+ list_repos_handler,
14
+ )
15
+ from agent.tools.github_read_file import (
16
+ READ_FILE_TOOL_SPEC,
17
+ ReadFileTool,
18
+ read_file_handler,
19
+ )
20
+ from agent.tools.github_search_code import (
21
+ SEARCH_CODE_TOOL_SPEC,
22
+ SearchCodeTool,
23
+ search_code_handler,
24
+ )
25
  from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
26
  from agent.tools.types import ToolResult
27
 
 
30
  "HF_JOBS_TOOL_SPEC",
31
  "hf_jobs_handler",
32
  "HfJobsTool",
33
+ "FIND_EXAMPLES_TOOL_SPEC",
34
+ "find_examples_handler",
35
+ "FindExamplesTool",
36
+ "READ_FILE_TOOL_SPEC",
37
+ "read_file_handler",
38
+ "ReadFileTool",
39
+ "LIST_REPOS_TOOL_SPEC",
40
+ "list_repos_handler",
41
+ "ListReposTool",
42
+ "SEARCH_CODE_TOOL_SPEC",
43
+ "search_code_handler",
44
+ "SearchCodeTool",
45
  ]
agent/tools/github_find_examples.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GitHub Find Examples Tool
3
+
4
+ Finds examples, guides, and tutorials for a library using deterministic queries and heuristics.
5
+ """
6
+
7
+ import asyncio
8
+ import math
9
+ import os
10
+ from dataclasses import asdict, dataclass
11
+ from datetime import datetime, timedelta
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ try:
15
+ import requests
16
+ except ImportError:
17
+ raise ImportError(
18
+ "requests library is required. Install with: pip install requests"
19
+ )
20
+
21
+ from agent.tools.types import ToolResult
22
+
23
+
24
@dataclass
class Example:
    """A single candidate example file plus its relevance metadata.

    Produced by the search/scoring pipeline; ``score`` and ``reason`` explain
    why the ranker chose this file.
    """

    repo: str          # "owner/name"
    path: str          # path within the repo
    ref: str           # blob SHA the search hit referred to
    url: str           # html_url for humans
    score: float       # heuristic relevance score
    reason: str        # comma-joined scoring reasons
    repo_stars: int
    repo_updated: str  # ISO timestamp from the repo metadata (may be "")
    file_size: int     # bytes

    def to_dict(self):
        """Return this example as a plain dict."""
        return asdict(self)


class GitHubAPIError(Exception):
    """Raised when GitHub API returns an error."""


# Path-based scoring weights.  Directory patterns come in plural/singular
# pairs with equal weight, so they are generated from one table; README
# files are the strongest signal and are listed first (scoring stops at
# the first matching pattern).
_DIR_SCORES = {
    "docs": 80,
    "examples": 90,
    "notebooks": 70,
    "tutorials": 85,
    "guides": 85,
    "tests": 40,
    "demos": 75,
    "samples": 75,
}

PATH_SCORES = {"README.md": 100, "readme.md": 100}
for _plural, _points in _DIR_SCORES.items():
    PATH_SCORES[f"{_plural}/"] = _points
    PATH_SCORES[f"{_plural[:-1]}/"] = _points

# Content-based scoring keywords, matched against search text fragments.
CONTENT_KEYWORDS = {
    'if __name__ == "__main__"': 50,
    "if __name__ == '__main__'": 50,
    "quickstart": 60,
    "quick start": 60,
    "getting started": 60,
    "tutorial": 50,
    "example usage": 55,
    "usage example": 55,
    "how to use": 45,
    "basic example": 50,
    "simple example": 50,
}

# File extension preferences: docs and notebooks beat plain source files.
PREFERRED_EXTENSIONS = {
    ".py": 10,
    ".ipynb": 15,
    ".md": 20,
    ".rst": 10,
    ".js": 10,
    ".ts": 10,
    ".go": 10,
    ".java": 10,
    ".cpp": 10,
    ".c": 10,
}
98
+
99
+
100
def _get_github_token() -> str:
    """Return the GitHub API token from the GITHUB_TOKEN env var.

    Raises:
        GitHubAPIError: if the variable is unset or empty.
    """
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        return token
    raise GitHubAPIError(
        "GITHUB_TOKEN environment variable is required. "
        "Set it with: export GITHUB_TOKEN=your_token_here"
    )
109
+
110
+
111
def _execute_search(query: str, token: str, limit: int = 20) -> List[Dict[str, Any]]:
    """Run a GitHub code-search query and return up to ``limit`` raw hits.

    Paginates through ``/search/code``, keeping repo/path/sha/url/size and
    the text-match fragments later used for content scoring.  Network or
    JSON-decoding failures end pagination early and the partial results are
    returned (best effort; never raises).

    Args:
        query: GitHub code-search query string.
        token: Bearer token for the API.
        limit: Maximum number of items to return.

    Returns:
        List of dicts with keys repo, path, sha, url, size, text_matches.
    """
    headers = {
        "Accept": "application/vnd.github.text-match+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    results: List[Dict[str, Any]] = []
    page = 1
    per_page = min(100, limit)  # API max page size is 100

    try:
        while len(results) < limit:
            params = {"q": query, "per_page": per_page, "page": page}
            response = requests.get(
                "https://api.github.com/search/code",
                headers=headers,
                params=params,
                timeout=30,
            )
            if response.status_code != 200:
                break  # rate limit / bad query: keep whatever we have

            items = response.json().get("items", [])
            if not items:
                break

            for item in items:
                results.append(
                    {
                        "repo": item.get("repository", {}).get("full_name", ""),
                        "path": item.get("path", ""),
                        "sha": item.get("sha", ""),
                        "url": item.get("html_url", ""),
                        "size": item.get("size", 0),
                        "text_matches": item.get("text_matches", []),
                    }
                )

            # Stop when we have enough or the API ran out of pages.
            if len(results) >= limit or len(items) < per_page:
                break
            page += 1
    except (requests.RequestException, ValueError):
        # Was `except Exception: pass`, which also hid programming errors;
        # only network and JSON-decoding failures stay best-effort now.
        pass

    return results[:limit]
159
+
160
+
161
def _fetch_repo_metadata(repos: List[str], token: str) -> Dict[str, Dict[str, Any]]:
    """Fetch stars / last-update / description for each repo, best effort.

    Repos whose lookup fails (network error, bad JSON, non-200 status) are
    simply omitted from the result.

    Args:
        repos: "owner/name" repository identifiers.
        token: GitHub bearer token.

    Returns:
        Mapping of repo name -> {"stars", "updated_at", "description"}.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    metadata: Dict[str, Dict[str, Any]] = {}

    for repo in repos:
        try:
            response = requests.get(
                f"https://api.github.com/repos/{repo}", headers=headers, timeout=10
            )
            if response.status_code == 200:
                data = response.json()
                metadata[repo] = {
                    "stars": data.get("stargazers_count", 0),
                    "updated_at": data.get("updated_at", ""),
                    "description": data.get("description", ""),
                }
        except (requests.RequestException, ValueError):
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit; keep only network/JSON failures silent.
            continue

    return metadata
187
+
188
+
189
def _score_and_rank(
    results: List[Dict[str, Any]], library: str, token: str
) -> List[Example]:
    """Score raw search hits with path/content/repo heuristics and rank them.

    Args:
        results: Raw hits from ``_execute_search`` (repo/path/sha/url/size/
            text_matches keys).
        library: Library name; currently unused by the scorer but kept for
            interface compatibility with callers.
        token: GitHub token, used to fetch repo star/update metadata.

    Returns:
        ``Example`` objects sorted by descending relevance score.
    """
    repos = list(set(r["repo"] for r in results))
    repo_metadata = _fetch_repo_metadata(repos, token)

    scored_examples = []

    for result in results:
        repo = result["repo"]
        path = result["path"]

        score = 0.0
        reasons = []

        # Path-based scoring: the first matching directory pattern wins.
        path_lower = path.lower()
        for pattern, points in PATH_SCORES.items():
            if pattern.lower() in path_lower:
                score += points
                reasons.append(f"in {pattern}")
                break

        # File extension scoring: first matching suffix wins.
        for ext, points in PREFERRED_EXTENSIONS.items():
            if path_lower.endswith(ext):
                score += points
                break

        # Content-based scoring over the search text-match fragments.
        text_content = ""
        for match in result.get("text_matches", []):
            text_content += match.get("fragment", "").lower() + " "

        for keyword, points in CONTENT_KEYWORDS.items():
            if keyword.lower() in text_content:
                score += points
                reasons.append(f"contains '{keyword}'")

        # Repo-based scoring: log-scaled star count plus a recency bonus.
        metadata = repo_metadata.get(repo, {})
        stars = metadata.get("stars", 0)
        updated = metadata.get("updated_at", "")

        if stars > 0:
            score += math.log10(stars + 1) * 10

        if updated:
            try:
                updated_date = datetime.fromisoformat(updated.replace("Z", "+00:00"))
                if datetime.now(updated_date.tzinfo) - updated_date < timedelta(
                    days=180
                ):
                    score += 20
                    reasons.append("recently updated")
            except ValueError:
                # Was a bare `except:`; only a malformed timestamp should
                # skip the recency bonus, not unrelated errors.
                pass

        # Filename quality: descriptive names suggest canonical examples.
        filename = path.split("/")[-1].lower()
        if any(
            word in filename
            for word in ["example", "tutorial", "guide", "quickstart", "demo"]
        ):
            score += 30
            reasons.append("descriptive filename")

        # Size penalty: very large files (>100 KB) are rarely good examples.
        if result["size"] > 100000:
            score *= 0.5
            reasons.append("large file")

        scored_examples.append(
            Example(
                repo=repo,
                path=path,
                ref=result["sha"],
                url=result["url"],
                score=score,
                reason=", ".join(reasons) if reasons else "matches library",
                repo_stars=stars,
                repo_updated=updated,
                file_size=result["size"],
            )
        )

    scored_examples.sort(key=lambda x: x.score, reverse=True)
    return scored_examples
280
+
281
+
282
def _search_by_path(
    library: str, org: str, repo_scope: Optional[str], token: str
) -> List[Dict[str, Any]]:
    """Search for the library inside example/docs/tutorial-style directories."""
    scope = f"repo:{org}/{repo_scope}" if repo_scope else f"org:{org}"
    directories = [
        "examples/",
        "example/",
        "docs/",
        "tutorials/",
        "notebooks/",
        "guides/",
    ]

    hits: List[Dict[str, Any]] = []
    for directory in directories:
        hits.extend(
            _execute_search(f"{scope} {library} path:{directory}", token, limit=20)
        )
    return hits


def _search_by_content(
    library: str, org: str, repo_scope: Optional[str], token: str
) -> List[Dict[str, Any]]:
    """Search for the library next to tutorial/quickstart-style phrases."""
    scope = f"repo:{org}/{repo_scope}" if repo_scope else f"org:{org}"
    phrases = ["if __name__", "quickstart", "tutorial", "usage example"]

    hits: List[Dict[str, Any]] = []
    for phrase in phrases:
        hits.extend(_execute_search(f"{scope} {library} {phrase}", token, limit=15))
    return hits


def _search_readmes(
    library: str, org: str, repo_scope: Optional[str], token: str
) -> List[Dict[str, Any]]:
    """Search for mentions of the library in README files."""
    scope = f"repo:{org}/{repo_scope}" if repo_scope else f"org:{org}"
    return _execute_search(f"{scope} {library} filename:README", token, limit=20)
340
+
341
+
342
def find_examples(
    library: str,
    org: str = "huggingface",
    repo_scope: Optional[str] = None,
    max_results: int = 10,
) -> List[Example]:
    """Find examples, guides, and tutorials for a library using deterministic queries.

    Runs a fixed playbook of code searches (example/docs directories,
    tutorial-style phrases, READMEs), deduplicates the hits, then scores
    them with the module's path/content/repo heuristics.

    Args:
        library: Library name to search for (e.g., "transformers", "torch")
        org: GitHub organization to search in (default: "huggingface")
        repo_scope: Optional specific repository (e.g., "transformers")
        max_results: Maximum number of results to return (default: 10)

    Returns:
        List of Example objects, ranked by relevance score

    Raises:
        GitHubAPIError: if no GITHUB_TOKEN is configured.
    """
    token = _get_github_token()

    hits: List[Dict[str, Any]] = []
    hits.extend(_search_by_path(library, org, repo_scope, token))
    hits.extend(_search_by_content(library, org, repo_scope, token))
    hits.extend(_search_readmes(library, org, repo_scope, token))

    # Deduplicate by (repo, path), keeping the first occurrence in order.
    unique: Dict[tuple, Dict[str, Any]] = {}
    for hit in hits:
        unique.setdefault((hit["repo"], hit["path"]), hit)

    ranked = _score_and_rank(list(unique.values()), library, token)
    return ranked[:max_results]


async def _async_call(func, *args, **kwargs):
    """Run a blocking callable on a worker thread and await its result."""
    return await asyncio.to_thread(func, *args, **kwargs)


def _format_examples_table(examples: List[Example]) -> str:
    """Render ranked examples as a markdown table (paths/reasons clipped)."""
    if not examples:
        return "No examples found."

    rows = [
        "| Rank | File | Score | Stars | Reason |",
        "|------|------|-------|-------|--------|",
    ]

    for rank, ex in enumerate(examples, 1):
        location = f"{ex.repo}/{ex.path}"
        if len(location) > 60:
            location = location[:57] + "..."
        why = ex.reason if len(ex.reason) < 40 else ex.reason[:37] + "..."
        rows.append(
            f"| {rank} | {location} | {ex.score:.1f} | {ex.repo_stars:,} | {why} |"
        )

    return "\n".join(rows)
410
+
411
+
412
class FindExamplesTool:
    """Tool wrapper exposing ``find_examples`` to the agent runtime."""

    @staticmethod
    def _error(message: str) -> ToolResult:
        """Build the standard error payload."""
        return {
            "formatted": message,
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    async def execute(self, params: Dict[str, Any]) -> ToolResult:
        """Run find_examples with ``params`` and format the outcome."""
        library = params.get("library")
        if not library:
            return self._error("Error: 'library' parameter is required")

        org = params.get("org", "huggingface")
        repo_scope = params.get("repo_scope")
        max_results = params.get("max_results", 10)

        try:
            examples = await _async_call(
                find_examples,
                library=library,
                org=org,
                repo_scope=repo_scope,
                max_results=max_results,
            )

            if not examples:
                return {
                    "formatted": f"No examples found for '{library}' in {org}",
                    "totalResults": 0,
                    "resultsShared": 0,
                }

            table = _format_examples_table(examples)
            parts = [
                f"**Found {len(examples)} examples for '{library}' in {org}:**\n\n{table}"
            ]

            # Point the agent at the top hits and the tool that reads them.
            parts.append("\n\n**Top examples (use read_file to view):**\n")
            for i, ex in enumerate(examples[:3], 1):
                parts.append(f"{i}. [{ex.repo}/{ex.path}]({ex.url})\n")
                parts.append(
                    f"   Use: read_file(repo='{ex.repo}', path='{ex.path}')\n"
                )

            return {
                "formatted": "".join(parts),
                "totalResults": len(examples),
                "resultsShared": len(examples),
            }

        except GitHubAPIError as e:
            return self._error(f"GitHub API Error: {str(e)}")
        except Exception as e:
            return self._error(f"Error: {str(e)}")
475
+
476
+
477
# Tool specification handed to the router; description text is the prompt
# the model sees, so it is kept verbatim.
_FIND_EXAMPLES_DESCRIPTION = (
    "Find examples, guides, and tutorials for a library using deterministic queries and heuristics.\n\n"
    "Uses best practices retrieval without semantic search:\n"
    "- Prefers README.md, docs/**, examples/**, notebooks/**, tests/**\n"
    "- Prefers files with if __name__ == '__main__', 'quickstart', 'tutorial', 'usage'\n"
    "- Prefers repos with higher stars and more recent updates\n\n"
    "Returns a ranked list of canonical example files.\n\n"
    "Examples:\n"
    "- Find transformers examples: {'library': 'transformers', 'org': 'huggingface', 'max_results': 5}\n"
    "- Find torch examples in specific repo: {'library': 'torch', 'org': 'pytorch', 'repo_scope': 'examples'}\n\n"
    "Use read_file tool to view the content of returned files.\n\n"
)

_FIND_EXAMPLES_PARAMETERS = {
    "type": "object",
    "properties": {
        "library": {
            "type": "string",
            "description": "Library name to search for (e.g., 'transformers', 'torch', 'react')",
        },
        "org": {
            "type": "string",
            "description": "GitHub organization to search in (default: 'huggingface')",
        },
        "repo_scope": {
            "type": "string",
            "description": "Optional specific repository to search within",
        },
        "max_results": {
            "type": "integer",
            "description": "Maximum number of results to return (default: 10)",
        },
    },
    "required": ["library"],
}

FIND_EXAMPLES_TOOL_SPEC = {
    "name": "find_examples",
    "description": _FIND_EXAMPLES_DESCRIPTION,
    "parameters": _FIND_EXAMPLES_PARAMETERS,
}


async def find_examples_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Handler for the agent tool router: returns (message, success)."""
    try:
        outcome = await FindExamplesTool().execute(arguments)
        return outcome["formatted"], not outcome.get("isError", False)
    except Exception as e:
        return f"Error executing find_examples: {str(e)}", False
agent/tools/github_list_repos.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GitHub List Repos Tool
3
+
4
+ Lists repositories for a user or organization with sorting options.
5
+ """
6
+
7
+ import asyncio
8
+ import os
9
+ from dataclasses import asdict, dataclass
10
+ from typing import Any, Dict, List, Literal, Optional
11
+
12
+ try:
13
+ import requests
14
+ except ImportError:
15
+ raise ImportError(
16
+ "requests library is required. Install with: pip install requests"
17
+ )
18
+
19
+ from agent.tools.types import ToolResult
20
+
21
+
22
@dataclass
class Repository:
    """Metadata for one GitHub repository, as returned by the Search API."""

    id: int
    name: str
    full_name: str  # "owner/name"
    description: Optional[str]
    html_url: str
    language: Optional[str]
    stars: int
    forks: int
    open_issues: int
    private: bool
    fork: bool
    archived: bool
    default_branch: str
    created_at: Optional[str] = None  # ISO timestamp (may be absent)
    updated_at: Optional[str] = None  # ISO timestamp (may be absent)
    topics: Optional[List[str]] = None

    def to_dict(self):
        """Return this repository as a plain dict."""
        return asdict(self)


class GitHubAPIError(Exception):
    """Raised when GitHub API returns an error."""


def _get_github_token() -> str:
    """Return the GitHub API token from the GITHUB_TOKEN env var.

    Raises:
        GitHubAPIError: if the variable is unset or empty.
    """
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        return token
    raise GitHubAPIError(
        "GITHUB_TOKEN environment variable is required. "
        "Set it with: export GITHUB_TOKEN=your_token_here"
    )
62
+
63
+
64
def _fetch_repositories(
    query: str, sort: str, order: str, limit: Optional[int], token: str
) -> List[Repository]:
    """Page through the GitHub repository-search API for ``query``.

    Stops at ``limit`` results (when given), at the end of the result set,
    or at the API's hard cap of 1000 search results.  Network or JSON
    failures end pagination early; whatever was gathered is returned.

    Args:
        query: Search qualifier, e.g. "org:huggingface".
        sort: Sort field accepted by the API (stars/forks/updated/created).
        order: "asc" or "desc".
        limit: Optional maximum number of repositories.
        token: GitHub bearer token.

    Returns:
        List of Repository objects.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    all_repos: List[Repository] = []
    page = 1
    per_page = min(100, limit) if limit else 100  # API max page size is 100

    while True:
        params = {
            "q": query,
            "sort": sort,
            "order": order,
            "page": page,
            "per_page": per_page,
        }

        try:
            response = requests.get(
                "https://api.github.com/search/repositories",
                headers=headers,
                params=params,
                timeout=30,
            )
            if response.status_code != 200:
                break

            data = response.json()
            items = data.get("items", [])
            if not items:
                break

            for item in items:
                all_repos.append(
                    Repository(
                        id=item.get("id"),
                        name=item.get("name"),
                        full_name=item.get("full_name"),
                        description=item.get("description"),
                        html_url=item.get("html_url"),
                        language=item.get("language"),
                        stars=item.get("stargazers_count", 0),
                        forks=item.get("forks_count", 0),
                        open_issues=item.get("open_issues_count", 0),
                        private=item.get("private", False),
                        fork=item.get("fork", False),
                        archived=item.get("archived", False),
                        default_branch=item.get("default_branch", "main"),
                        created_at=item.get("created_at"),
                        updated_at=item.get("updated_at"),
                        topics=item.get("topics", []),
                    )
                )

            if limit and len(all_repos) >= limit:
                all_repos = all_repos[:limit]
                break

            if len(all_repos) >= data.get("total_count", 0):
                break

            # GitHub's search API never serves past the first 1000 results.
            if page * per_page >= 1000:
                break

            page += 1

        except (requests.exceptions.RequestException, ValueError):
            # ValueError added: a malformed JSON body from response.json()
            # previously escaped the handler and crashed the whole listing.
            break

    return all_repos
139
+
140
+
141
def list_repos(
    owner: str,
    owner_type: Literal["user", "org"] = "org",
    sort: Literal["stars", "forks", "updated", "created"] = "stars",
    order: Literal["asc", "desc"] = "desc",
    limit: Optional[int] = None,
) -> List[Repository]:
    """List repositories for a user or organization via the GitHub Search API.

    Args:
        owner: GitHub username or organization name
        owner_type: Whether the owner is a "user" or "org" (default: "org")
        sort: Sort field - "stars", "forks", "updated", or "created" (default: "stars")
        order: Sort order - "asc" or "desc" (default: "desc")
        limit: Maximum number of repositories to return (default: no limit)

    Returns:
        List of Repository objects

    Raises:
        GitHubAPIError: if no GITHUB_TOKEN is configured.
    """
    token = _get_github_token()
    qualifier = "org" if owner_type == "org" else "user"
    return _fetch_repositories(
        query=f"{qualifier}:{owner}",
        sort=sort,
        order=order,
        limit=limit,
        token=token,
    )


async def _async_call(func, *args, **kwargs):
    """Run a blocking callable on a worker thread and await its result."""
    return await asyncio.to_thread(func, *args, **kwargs)


def _format_repos_table(repos: List[Repository]) -> str:
    """Render repositories as a markdown table (descriptions clipped to 50 chars)."""
    if not repos:
        return "No repositories found."

    rows = [
        "| Repo | Stars | Forks | Language | Description |",
        "|------|-------|-------|----------|-------------|",
    ]

    for repo in repos:
        summary = repo.description or "N/A"
        if len(summary) > 50:
            summary = summary[:47] + "..."
        rows.append(
            f"| {repo.full_name} | {repo.stars:,} | {repo.forks:,} "
            f"| {repo.language or 'N/A'} | {summary} |"
        )

    return "\n".join(rows)
203
+
204
+
205
class ListReposTool:
    """Tool wrapper exposing ``list_repos`` to the agent runtime."""

    @staticmethod
    def _error(message: str) -> ToolResult:
        """Build the standard error payload."""
        return {
            "formatted": message,
            "totalResults": 0,
            "resultsShared": 0,
            "isError": True,
        }

    async def execute(self, params: Dict[str, Any]) -> ToolResult:
        """Run list_repos with ``params`` and format the outcome."""
        owner = params.get("owner")
        if not owner:
            return self._error("Error: 'owner' parameter is required")

        owner_type = params.get("owner_type", "org")
        sort = params.get("sort", "stars")
        order = params.get("order", "desc")
        limit = params.get("limit")

        try:
            repos = await _async_call(
                list_repos,
                owner=owner,
                owner_type=owner_type,
                sort=sort,
                order=order,
                limit=limit,
            )

            if not repos:
                return {
                    "formatted": f"No repositories found for {owner}",
                    "totalResults": 0,
                    "resultsShared": 0,
                }

            table = _format_repos_table(repos)
            parts = [
                f"**Found {len(repos)} repositories for {owner} (sorted by {sort}, {order}):**\n\n{table}"
            ]

            # Link the top hits directly for the agent.
            parts.append("\n\n**Top repositories:**\n")
            for i, repo in enumerate(repos[:5], 1):
                parts.append(
                    f"{i}. [{repo.full_name}]({repo.html_url}) - ⭐ {repo.stars:,}\n"
                )

            return {
                "formatted": "".join(parts),
                "totalResults": len(repos),
                "resultsShared": len(repos),
            }

        except GitHubAPIError as e:
            return self._error(f"GitHub API Error: {str(e)}")
        except Exception as e:
            return self._error(f"Error: {str(e)}")
271
+
272
+
273
# Tool specification handed to the router; description text is the prompt
# the model sees, so it is kept verbatim.
_LIST_REPOS_DESCRIPTION = (
    "List repositories for a user or organization with sorting options.\n\n"
    "Backed by GitHub Search API: https://api.github.com/search/repositories?q=org:huggingface&sort=stars&order=desc\n\n"
    "Examples:\n"
    "- Top 10 starred repos: {'owner': 'huggingface', 'sort': 'stars', 'limit': 10}\n"
    "- Recently updated: {'owner': 'microsoft', 'sort': 'updated', 'order': 'desc', 'limit': 5}\n"
    "- User repos: {'owner': 'torvalds', 'owner_type': 'user', 'sort': 'stars'}\n"
    "- All repos: {'owner': 'pytorch', 'sort': 'forks'}\n\n"
)

_LIST_REPOS_PARAMETERS = {
    "type": "object",
    "properties": {
        "owner": {
            "type": "string",
            "description": "GitHub username or organization name (e.g., 'huggingface', 'torvalds')",
        },
        "owner_type": {
            "type": "string",
            "enum": ["user", "org"],
            "description": "Whether the owner is a 'user' or 'org' (default: 'org')",
        },
        "sort": {
            "type": "string",
            "enum": ["stars", "forks", "updated", "created"],
            "description": "Sort field: 'stars', 'forks', 'updated', or 'created' (default: 'stars')",
        },
        "order": {
            "type": "string",
            "enum": ["asc", "desc"],
            "description": "Sort order: 'asc' or 'desc' (default: 'desc')",
        },
        "limit": {
            "type": "integer",
            "description": "Maximum number of repositories to return (default: no limit, returns all)",
        },
    },
    "required": ["owner"],
}

LIST_REPOS_TOOL_SPEC = {
    "name": "list_repos",
    "description": _LIST_REPOS_DESCRIPTION,
    "parameters": _LIST_REPOS_PARAMETERS,
}


async def list_repos_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Handler for the agent tool router: returns (message, success)."""
    try:
        outcome = await ListReposTool().execute(arguments)
        return outcome["formatted"], not outcome.get("isError", False)
    except Exception as e:
        return f"Error executing list_repos: {str(e)}", False
agent/tools/github_read_file.py ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GitHub Read File Tool
3
+
4
+ Reads file contents from a GitHub repository with line range support.
5
+ """
6
+
7
+ import asyncio
8
+ import base64
9
+ import os
10
+ from dataclasses import asdict, dataclass
11
+ from typing import Any, Dict, Optional, Tuple
12
+
13
+ try:
14
+ import requests
15
+ except ImportError:
16
+ raise ImportError(
17
+ "requests library is required. Install with: pip install requests"
18
+ )
19
+
20
+ from agent.tools.types import ToolResult
21
+
22
+
23
@dataclass
class FileContents:
    """File contents with metadata."""

    content: str  # the selected (possibly line-sliced) text, not the whole file
    sha: str  # file SHA reported by the contents API
    path: str  # path within the repository as requested
    size: int  # full file size in bytes (not the sliced portion)
    last_modified: Optional[str]  # commit date of the last commit touching the file, if lookup succeeded
    last_commit_sha: Optional[str]  # SHA of that commit, if lookup succeeded
    line_start: int  # 1-indexed, inclusive
    line_end: int  # 1-indexed, inclusive
    total_lines: int  # line count of the full file
    truncated: bool  # True when the default 300-line cap was applied
    message: Optional[str] = None  # human-readable truncation notice, if any

    def to_dict(self):
        """Return the record as a plain dict (via dataclasses.asdict)."""
        return asdict(self)
41
+
42
+
43
class GitHubAPIError(Exception):
    """Raised when GitHub API returns an error."""
47
+
48
+
49
+ def _get_github_token() -> str:
50
+ """Get GitHub token from environment."""
51
+ token = os.environ.get("GITHUB_TOKEN")
52
+ if not token:
53
+ raise GitHubAPIError(
54
+ "GITHUB_TOKEN environment variable is required. "
55
+ "Set it with: export GITHUB_TOKEN=your_token_here"
56
+ )
57
+ return token
58
+
59
+
60
def _fetch_raw_content(owner: str, repo: str, path: str, ref: str, token: str) -> str:
    """Download a file's raw text via the contents API raw media type.

    Used as a fallback when the JSON contents response carries no inline
    base64 payload.

    Raises:
        GitHubAPIError: on any non-200 response.
    """
    response = requests.get(
        f"https://api.github.com/repos/{owner}/{repo}/contents/{path}",
        headers={
            "Accept": "application/vnd.github.raw",
            "X-GitHub-Api-Version": "2022-11-28",
            "Authorization": f"Bearer {token}",
        },
        params={"ref": ref},
        timeout=30,
    )
    if response.status_code == 200:
        return response.text
    raise GitHubAPIError(
        f"Failed to fetch raw content: HTTP {response.status_code}"
    )
79
+
80
+
81
def _get_last_commit_info(
    owner: str, repo: str, path: str, ref: Optional[str], token: str
) -> Tuple[Optional[str], Optional[str]]:
    """Best-effort lookup of the most recent commit touching *path*.

    Returns:
        (commit_date, commit_sha); both are None when the lookup fails —
        callers treat this as optional metadata, so no exception is raised.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    url = f"https://api.github.com/repos/{owner}/{repo}/commits"
    params = {"path": path, "per_page": 1}

    if ref and ref != "HEAD":
        params["sha"] = ref

    try:
        response = requests.get(url, headers=headers, params=params, timeout=30)

        if response.status_code == 200:
            commits = response.json()
            if commits:
                commit = commits[0]
                commit_sha = commit.get("sha")
                commit_date = commit.get("commit", {}).get("committer", {}).get("date")
                return commit_date, commit_sha

    # Narrowed from a bare `except:` — only network and JSON-decoding
    # failures are expected here; a bare except would also swallow
    # KeyboardInterrupt/SystemExit and genuine bugs.
    except (requests.exceptions.RequestException, ValueError):
        pass

    return None, None
112
+
113
+
114
def _fetch_file_contents(
    owner: str,
    repo: str,
    path: str,
    ref: str,
    line_start: Optional[int],
    line_end: Optional[int],
    token: str,
) -> FileContents:
    """Fetch file contents from the GitHub contents API.

    Decodes the inline base64 payload (falling back to a raw download when
    the API omits content, as it does for large files), attaches best-effort
    last-commit metadata, and slices the requested line range. When no range
    is given, files longer than 300 lines are truncated to the first 300.

    Raises:
        GitHubAPIError: missing file, non-file path, API error, or
            connectivity failure.
        ValueError: when line_start > line_end after clamping.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
    params = {}

    if ref and ref != "HEAD":
        params["ref"] = ref

    try:
        response = requests.get(url, headers=headers, params=params, timeout=30)

        if response.status_code == 404:
            raise GitHubAPIError(
                f"File not found: {path} in {owner}/{repo} (ref: {ref})"
            )

        if response.status_code != 200:
            error_msg = f"GitHub API error (status {response.status_code})"
            # Surface GitHub's own message when the error body is JSON.
            try:
                error_data = response.json()
                if "message" in error_data:
                    error_msg += f": {error_data['message']}"
            except ValueError:
                # Non-JSON error body; keep the generic message. (Narrowed
                # from a bare `except:`, which would swallow interrupts.)
                pass
            raise GitHubAPIError(error_msg)

        data = response.json()

        if data.get("type") != "file":
            raise GitHubAPIError(
                f"Path {path} is not a file (type: {data.get('type')})"
            )

        file_sha = data.get("sha")
        file_size = data.get("size", 0)

        # Decode content; GitHub wraps the base64 payload with whitespace.
        content_b64 = data.get("content", "")
        if content_b64:
            content_b64 = content_b64.replace("\n", "").replace(" ", "")
            content = base64.b64decode(content_b64).decode("utf-8", errors="replace")
        elif file_size == 0:
            # Genuinely empty file: no need for a second (raw) request.
            content = ""
        else:
            # Large files come back without inline content; fetch raw text.
            content = _fetch_raw_content(owner, repo, path, ref or "HEAD", token)

    except requests.exceptions.RequestException as e:
        raise GitHubAPIError(f"Failed to connect to GitHub API: {e}") from e

    # Last-commit metadata is best-effort; None values are acceptable.
    last_modified, last_commit_sha = _get_last_commit_info(
        owner, repo, path, ref, token
    )

    # Slice the requested line range (1-indexed, inclusive bounds).
    lines = content.split("\n")
    total_lines = len(lines)

    truncated = False
    message = None

    if line_start is None and line_end is None:
        if total_lines > 300:
            # No explicit range: cap the default output at 300 lines.
            line_start = 1
            line_end = 300
            truncated = True
            message = (
                f"File has {total_lines} lines. Returned only the first 300 lines. "
                f"To view more, use the line_start and line_end parameters."
            )
        else:
            line_start = 1
            line_end = total_lines
    else:
        if line_start is None:
            line_start = 1
        if line_end is None:
            line_end = total_lines

    # Clamp out-of-bounds values before validating the ordering.
    if line_start < 1:
        line_start = 1
    if line_end > total_lines:
        line_end = total_lines
    if line_start > line_end:
        raise ValueError(
            f"line_start ({line_start}) cannot be greater than line_end ({line_end})"
        )

    selected_lines = lines[line_start - 1 : line_end]
    selected_content = "\n".join(selected_lines)

    return FileContents(
        content=selected_content,
        sha=file_sha,
        path=path,
        size=file_size,
        last_modified=last_modified,
        last_commit_sha=last_commit_sha,
        line_start=line_start,
        line_end=line_end,
        total_lines=total_lines,
        truncated=truncated,
        message=message,
    )
230
+
231
+
232
def read_file(
    repo: str,
    path: str,
    ref: str = "HEAD",
    line_start: Optional[int] = None,
    line_end: Optional[int] = None,
) -> FileContents:
    """Read a file from a GitHub repository.

    Returns raw file text plus metadata (commit SHA, last modified). When
    the file exceeds 300 lines and no range is given, only the first 300
    lines are returned together with an explanatory message.

    Args:
        repo: "owner/repo" slug, e.g. "huggingface/transformers".
        path: File path inside the repository, e.g. "README.md".
        ref: Branch name, tag, or commit SHA (default "HEAD").
        line_start: First line to return (1-indexed, inclusive).
        line_end: Last line to return (1-indexed, inclusive).

    Returns:
        FileContents with the (possibly truncated) text and metadata.

    Raises:
        ValueError: when *repo* is not an "owner/repo" slug.
    """
    if "/" not in repo:
        raise ValueError("repo must be in format 'owner/repo'")

    owner, repo_name = repo.split("/", 1)

    return _fetch_file_contents(
        owner=owner,
        repo=repo_name,
        path=path,
        ref=ref,
        line_start=line_start,
        line_end=line_end,
        token=_get_github_token(),
    )
271
+
272
+
273
+ async def _async_call(func, *args, **kwargs):
274
+ """Wrap synchronous calls for async context."""
275
+ return await asyncio.to_thread(func, *args, **kwargs)
276
+
277
+
278
class ReadFileTool:
    """Agent-facing wrapper that reads GitHub files and formats a report."""

    async def execute(self, params: Dict[str, Any]) -> ToolResult:
        """Validate params, read the file off-thread, and format the result."""
        repo = params.get("repo")
        path = params.get("path")

        if not (repo and path):
            return {
                "formatted": "Error: 'repo' and 'path' parameters are required",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }

        try:
            contents = await _async_call(
                read_file,
                repo=repo,
                path=path,
                ref=params.get("ref", "HEAD"),
                line_start=params.get("line_start"),
                line_end=params.get("line_end"),
            )

            # Metadata header, optional notices, then the fenced file body.
            pieces = [
                f"**File: {contents.path}**\n",
                f"**Repo: {repo}**\n",
                f"**Lines:** {contents.line_start}-{contents.line_end} of {contents.total_lines}\n",
                f"**SHA:** {contents.sha}\n",
            ]
            if contents.last_modified:
                pieces.append(f"**Last modified:** {contents.last_modified}\n")
            if contents.message:
                pieces.append(f"\n⚠️ {contents.message}\n")
            pieces.append(f"\n```\n{contents.content}\n```")

            return {
                "formatted": "".join(pieces),
                "totalResults": 1,
                "resultsShared": 1,
            }

        except GitHubAPIError as e:
            return {
                "formatted": f"GitHub API Error: {str(e)}",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }
        except Exception as e:
            return {
                "formatted": f"Error: {str(e)}",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }
341
+
342
+
343
# Tool specification registered with the agent tool router; the description
# (with its inline examples) is shown to the LLM verbatim.
READ_FILE_TOOL_SPEC = {
    "name": "read_file",
    "description": (
        "Read file contents from a GitHub repository.\n\n"
        "Returns raw file text plus metadata (commit SHA, last modified).\n"
        "If file is more than 300 lines, returns only the first 300 lines and includes line_start and line_end indexes.\n"
        "Use line_start and line_end parameters to view specific line ranges.\n\n"
        "Examples:\n"
        "- Read README: {'repo': 'huggingface/transformers', 'path': 'README.md'}\n"
        "- Read specific lines: {'repo': 'huggingface/transformers', 'path': 'src/transformers/__init__.py', 'line_start': 1, 'line_end': 50}\n"
        "- Read from branch: {'repo': 'torvalds/linux', 'path': 'MAINTAINERS', 'ref': 'master', 'line_start': 1, 'line_end': 20}\n\n"
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "repo": {
                "type": "string",
                "description": "Repository in format 'owner/repo' (e.g., 'huggingface/transformers')",
            },
            "path": {
                "type": "string",
                "description": "Path to file in repository (e.g., 'README.md', 'src/main.py')",
            },
            "ref": {
                "type": "string",
                "description": "Git reference: branch name, tag, or commit SHA (default: 'HEAD')",
            },
            "line_start": {
                "type": "integer",
                "description": "Starting line number (1-indexed, inclusive). Use to read specific range.",
            },
            "line_end": {
                "type": "integer",
                "description": "Ending line number (1-indexed, inclusive). Use to read specific range.",
            },
        },
        "required": ["repo", "path"],
    },
}
383
+
384
+
385
async def read_file_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Adapt ReadFileTool.execute to the router's (text, success) contract."""
    try:
        outcome = await ReadFileTool().execute(arguments)
        return outcome["formatted"], not outcome.get("isError", False)
    except Exception as e:
        return f"Error executing read_file: {str(e)}", False
agent/tools/github_search_code.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GitHub Search Code Tool
3
+
4
+ Searches code across GitHub with glob filtering and line-level results.
5
+ """
6
+
7
+ import asyncio
8
+ import fnmatch
9
+ import os
10
+ import re
11
+ from dataclasses import asdict, dataclass
12
+ from typing import Any, Dict, List, Optional, Tuple
13
+
14
+ try:
15
+ import requests
16
+ except ImportError:
17
+ raise ImportError(
18
+ "requests library is required. Install with: pip install requests"
19
+ )
20
+
21
+ from agent.tools.types import ToolResult
22
+
23
+
24
@dataclass
class CodeMatch:
    """A code match with location information."""

    repo: str  # "owner/name" of the repository containing the match
    path: str  # file path within the repository
    ref: str  # SHA reported by the search API for the matched file
    line_start: int  # estimated — the search API does not report real positions
    line_end: int  # estimated (count of non-blank fragment lines)
    snippet: str  # matched code fragment (whitespace-stripped)

    def to_dict(self):
        """Return the record as a plain dict (via dataclasses.asdict)."""
        return asdict(self)
37
+
38
+
39
class GitHubAPIError(Exception):
    """Raised when GitHub API returns an error."""
43
+
44
+
45
+ def _get_github_token() -> str:
46
+ """Get GitHub token from environment."""
47
+ token = os.environ.get("GITHUB_TOKEN")
48
+ if not token:
49
+ raise GitHubAPIError(
50
+ "GITHUB_TOKEN environment variable is required. "
51
+ "Set it with: export GITHUB_TOKEN=your_token_here"
52
+ )
53
+ return token
54
+
55
+
56
+ def _build_github_query(
57
+ query: str, repo_glob: Optional[str], path_glob: Optional[str], regex: bool
58
+ ) -> str:
59
+ """Build GitHub search query string from parameters."""
60
+ parts = []
61
+
62
+ if regex:
63
+ parts.append(f"/{query}/")
64
+ else:
65
+ if " " in query:
66
+ parts.append(f'"{query}"')
67
+ else:
68
+ parts.append(query)
69
+
70
+ if repo_glob:
71
+ if "/" in repo_glob:
72
+ parts.append(f"repo:{repo_glob}")
73
+ else:
74
+ parts.append(f"user:{repo_glob}")
75
+
76
+ if path_glob:
77
+ if "*" not in path_glob and "?" not in path_glob:
78
+ parts.append(f"path:{path_glob}")
79
+ elif path_glob.startswith("*."):
80
+ ext = path_glob[2:]
81
+ parts.append(f"extension:{ext}")
82
+ elif "/" not in path_glob and "*" in path_glob:
83
+ parts.append(f"filename:{path_glob}")
84
+ else:
85
+ if "." in path_glob:
86
+ ext_match = re.search(r"\*\.(\w+)", path_glob)
87
+ if ext_match:
88
+ parts.append(f"extension:{ext_match.group(1)}")
89
+
90
+ return " ".join(parts)
91
+
92
+
93
def _fetch_code_search_results(
    query: str, token: str, max_results: int
) -> List[Dict[str, Any]]:
    """Page through the GitHub code-search API and collect raw result items.

    Requests the text-match media type so fragments are included.

    Raises:
        GitHubAPIError: when the very first page cannot be fetched (bad
            query, auth failure, rate limit). Failures on later pages
            return the partial results collected so far instead — the
            previous behavior of silently returning an empty list made
            rate-limit errors indistinguishable from "no matches".
    """
    headers = {
        "Accept": "application/vnd.github.text-match+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }
    url = "https://api.github.com/search/code"

    all_items: List[Dict[str, Any]] = []
    page = 1
    per_page = min(100, max_results)

    while len(all_items) < max_results:
        params = {
            "q": query,
            "page": page,
            "per_page": per_page,
        }

        try:
            response = requests.get(url, headers=headers, params=params, timeout=30)

            if response.status_code != 200:
                if page == 1:
                    raise GitHubAPIError(
                        f"GitHub code search failed (status {response.status_code})"
                    )
                break

            data = response.json()

        except requests.exceptions.RequestException as e:
            if page == 1:
                raise GitHubAPIError(f"Failed to connect to GitHub API: {e}") from e
            break

        items = data.get("items", [])
        if not items:
            break

        all_items.extend(items)

        # Stop once everything the API reports as available is collected.
        if len(all_items) >= data.get("total_count", 0):
            break

        page += 1

    return all_items[:max_results]
139
+
140
+
141
+ def _glob_match(text: str, pattern: str) -> bool:
142
+ """Check if text matches glob pattern, supporting ** for multi-level paths."""
143
+ if "**" in pattern:
144
+ regex_pattern = pattern.replace("**", "<<<DOUBLESTAR>>>")
145
+ regex_pattern = fnmatch.translate(regex_pattern)
146
+ regex_pattern = regex_pattern.replace("<<<DOUBLESTAR>>>", ".*")
147
+ return re.match(regex_pattern, text) is not None
148
+ else:
149
+ return fnmatch.fnmatch(text, pattern)
150
+
151
+
152
+ def _estimate_line_numbers(fragment: str) -> Tuple[int, int]:
153
+ """Estimate line numbers from a code fragment."""
154
+ lines = fragment.split("\n")
155
+ line_count = len([line for line in lines if line.strip()])
156
+ return 1, line_count
157
+
158
+
159
def _parse_results_to_matches(
    raw_results: List[Dict[str, Any]],
    repo_glob: Optional[str],
    path_glob: Optional[str],
) -> List[CodeMatch]:
    """Convert raw search items into CodeMatch records, re-applying globs.

    The API qualifiers are coarser than the caller's glob patterns, so the
    repo/path filters are enforced again client-side here.
    """
    matches: List[CodeMatch] = []

    for item in raw_results:
        repo_name = item.get("repository", {}).get("full_name", "unknown/unknown")
        file_path = item.get("path", "")
        sha = item.get("sha", "unknown")

        if repo_glob and not _glob_match(repo_name, repo_glob):
            continue
        if path_glob and not _glob_match(file_path, path_glob):
            continue

        text_matches = item.get("text_matches", [])

        if not text_matches:
            # Match confirmed by the API but no fragment was returned.
            matches.append(
                CodeMatch(
                    repo=repo_name,
                    path=file_path,
                    ref=sha,
                    line_start=1,
                    line_end=1,
                    snippet="<match found, but snippet not available>",
                )
            )
            continue

        for text_match in text_matches:
            fragment = text_match.get("fragment", "")
            start, end = _estimate_line_numbers(fragment)
            matches.append(
                CodeMatch(
                    repo=repo_name,
                    path=file_path,
                    ref=sha,
                    line_start=start,
                    line_end=end,
                    snippet=fragment.strip(),
                )
            )

    return matches
206
+
207
+
208
def search_code(
    query: str,
    repo_glob: Optional[str] = None,
    path_glob: Optional[str] = None,
    regex: bool = False,
    max_results: int = 100,
) -> List[CodeMatch]:
    """Search GitHub code and return line-level matches.

    Args:
        query: Term or pattern to look for.
        repo_glob: Optional glob restricting repositories (e.g. "huggingface/*").
        path_glob: Optional glob restricting file paths (e.g. "*.py").
        regex: Treat *query* as a regular expression when True.
        max_results: Upper bound on returned matches (default 100).

    Returns:
        CodeMatch records carrying repo, path, ref, line range, and snippet.
    """
    token = _get_github_token()
    gh_query = _build_github_query(query, repo_glob, path_glob, regex)
    raw_items = _fetch_code_search_results(gh_query, token, max_results)
    return _parse_results_to_matches(raw_items, repo_glob, path_glob)
237
+
238
+
239
+ async def _async_call(func, *args, **kwargs):
240
+ """Wrap synchronous calls for async context."""
241
+ return await asyncio.to_thread(func, *args, **kwargs)
242
+
243
+
244
+ def _format_code_matches(matches: List[CodeMatch]) -> str:
245
+ """Format code matches."""
246
+ if not matches:
247
+ return "No matches found."
248
+
249
+ lines = []
250
+ for i, match in enumerate(matches, 1):
251
+ lines.append(f"**{i}. {match.repo}/{match.path}:{match.line_start}**")
252
+ lines.append("```")
253
+ # Show first 5 lines of snippet
254
+ snippet_lines = match.snippet.split("\n")[:5]
255
+ lines.extend(snippet_lines)
256
+ if len(match.snippet.split("\n")) > 5:
257
+ lines.append("...")
258
+ lines.append("```")
259
+ lines.append("")
260
+
261
+ return "\n".join(lines)
262
+
263
+
264
class SearchCodeTool:
    """Agent-facing wrapper around GitHub code search."""

    async def execute(self, params: Dict[str, Any]) -> ToolResult:
        """Validate params, run the search off-thread, and format results."""
        query = params.get("query")
        if not query:
            return {
                "formatted": "Error: 'query' parameter is required",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }

        try:
            matches = await _async_call(
                search_code,
                query=query,
                repo_glob=params.get("repo_glob"),
                path_glob=params.get("path_glob"),
                regex=params.get("regex", False),
                max_results=params.get("max_results", 100),
            )

            if not matches:
                return {
                    "formatted": "No matches found",
                    "totalResults": 0,
                    "resultsShared": 0,
                }

            # Formatted list plus a ready-to-use read_file hint for the
            # top match (matches is guaranteed non-empty here).
            top = matches[0]
            response = (
                f"**Found {len(matches)} code matches:**\n\n"
                f"{_format_code_matches(matches)}"
                "\n**To view full file, use:**\n"
                f"read_file(repo='{top.repo}', path='{top.path}')"
            )

            return {
                "formatted": response,
                "totalResults": len(matches),
                "resultsShared": min(len(matches), 10),
            }

        except GitHubAPIError as e:
            return {
                "formatted": f"GitHub API Error: {str(e)}",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }
        except Exception as e:
            return {
                "formatted": f"Error: {str(e)}",
                "totalResults": 0,
                "resultsShared": 0,
                "isError": True,
            }
331
+
332
+
333
# Tool specification registered with the agent tool router; the description
# (with its inline examples) is shown to the LLM verbatim.
SEARCH_CODE_TOOL_SPEC = {
    "name": "search_code",
    "description": (
        "Search code across GitHub with glob filtering and line-level results.\n\n"
        "Returns: repo, path, ref, line_start, line_end, snippet\n\n"
        "Examples:\n"
        "- Search Python functions: {'query': 'def train', 'path_glob': '*.py', 'repo_glob': 'huggingface/*'}\n"
        "- Search TODO comments: {'query': 'TODO', 'repo_glob': 'github/*', 'max_results': 10}\n"
        "- Regex search: {'query': r'func Test\\w+', 'path_glob': '*.go', 'regex': True}\n"
        "- Search in specific repo: {'query': 'HfApi', 'repo_glob': 'huggingface/huggingface_hub', 'path_glob': '*.py'}\n\n"
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search term or pattern to find in code",
            },
            "repo_glob": {
                "type": "string",
                "description": "Glob pattern to filter repositories (e.g., 'github/*', 'facebook/react')",
            },
            "path_glob": {
                "type": "string",
                "description": "Glob pattern to filter file paths (e.g., '*.py', 'src/**/*.js', 'test_*.py')",
            },
            "regex": {
                "type": "boolean",
                "description": "Treat query as regular expression (default: false)",
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of results to return (default: 100)",
            },
        },
        "required": ["query"],
    },
}
372
+
373
+
374
async def search_code_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Adapt SearchCodeTool.execute to the router's (text, success) contract."""
    try:
        outcome = await SearchCodeTool().execute(arguments)
        return outcome["formatted"], not outcome.get("isError", False)
    except Exception as e:
        return f"Error executing search_code: {str(e)}", False
agent/tools/jobs_tool.py CHANGED
@@ -40,6 +40,20 @@ GPU_FLAVORS = [
40
  "h100",
41
  "h100x8",
42
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  SPECIALIZED_FLAVORS = ["inf2x6"]
44
  ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS
45
 
@@ -741,12 +755,12 @@ HF_JOBS_TOOL_SPEC = {
741
  "1. **Python mode:** Provide 'script' + 'dependencies' → auto-handles pip install\n"
742
  "2. **Docker mode:** Provide 'image' + 'command' → full control\n"
743
  "(script and command are mutually exclusive)\n\n"
744
- "## Hardware:\n"
745
- "CPU: cpu-basic (default), cpu-upgrade, cpu-performance, cpu-xl\n"
746
- "GPU: t4-small, t4-medium, l4x1, a10g-small, a10g-large, a100-large, h100\n\n"
747
  "## Examples:\n\n"
748
  "**Fine-tune LLM and push to Hub:**\n"
749
- "{'operation': 'run', 'script': 'from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer\\nmodel = AutoModelForCausalLM.from_pretrained(\"gpt2\")\\n# ... training code ...\\nmodel.push_to_hub(\"user-name/my-finetuned-model\")', 'dependencies': ['transformers', 'torch', 'datasets'], 'hardware_flavor': 'a10g-large', 'timeout': '4h', 'env': {'CUSTOM_VAR': 'value'}}\n\n"
750
  "**Generate dataset daily and upload:**\n"
751
  "{'operation': 'scheduled run', 'script': 'from datasets import Dataset\\nimport pandas as pd\\n# scrape/generate data\\ndf = pd.DataFrame(data)\\nds = Dataset.from_pandas(df)\\nds.push_to_hub(\"user-name/daily-dataset\")', 'dependencies': ['datasets', 'pandas'], 'schedule': '@daily'}\n\n"
752
  "**Run custom training with Docker:**\n"
@@ -807,7 +821,7 @@ HF_JOBS_TOOL_SPEC = {
807
  # Hardware and environment
808
  "hardware_flavor": {
809
  "type": "string",
810
- "description": "Hardware type. CPU: cpu-basic (default), cpu-upgrade, cpu-performance, cpu-xl. GPU: t4-small, t4-medium, l4x1, a10g-small, a10g-large, a100-large, h100. Use with 'run'/'scheduled run'.",
811
  },
812
  "timeout": {
813
  "type": "string",
 
40
  "h100",
41
  "h100x8",
42
  ]
43
+
44
# Detailed specs for display (vCPU/RAM/GPU VRAM). These strings are
# interpolated into the HF_JOBS_TOOL_SPEC description shown to the LLM;
# keep them in sync with CPU_FLAVORS / GPU_FLAVORS above.
CPU_FLAVORS_DESC = (
    "cpu-basic(2vCPU/16GB), cpu-upgrade(8vCPU/32GB), cpu-performance, cpu-xl"
)
GPU_FLAVORS_DESC = (
    "t4-small(4vCPU/15GB/GPU 16GB), t4-medium(8vCPU/30GB/GPU 16GB), "
    "l4x1(8vCPU/30GB/GPU 24GB), l4x4(48vCPU/186GB/GPU 96GB), "
    "l40sx1(8vCPU/62GB/GPU 48GB), l40sx4(48vCPU/382GB/GPU 192GB), l40sx8(192vCPU/1534GB/GPU 384GB), "
    "a10g-small(4vCPU/14GB/GPU 24GB), a10g-large(12vCPU/46GB/GPU 24GB), "
    "a10g-largex2(24vCPU/92GB/GPU 48GB), a10g-largex4(48vCPU/184GB/GPU 96GB), "
    "a100-large(12vCPU/142GB/GPU 80GB), h100(23vCPU/240GB/GPU 80GB), h100x8(184vCPU/1920GB/GPU 640GB), "
    "zero-a10g(dynamic alloc)"
)
57
  SPECIALIZED_FLAVORS = ["inf2x6"]
58
  ALL_FLAVORS = CPU_FLAVORS + GPU_FLAVORS + SPECIALIZED_FLAVORS
59
 
 
755
  "1. **Python mode:** Provide 'script' + 'dependencies' → auto-handles pip install\n"
756
  "2. **Docker mode:** Provide 'image' + 'command' → full control\n"
757
  "(script and command are mutually exclusive)\n\n"
758
+ "## Available Hardware (vCPU/RAM/GPU):\n"
759
+ f"CPU: {CPU_FLAVORS_DESC}\n"
760
+ f"GPU: {GPU_FLAVORS_DESC}\n"
761
  "## Examples:\n\n"
762
  "**Fine-tune LLM and push to Hub:**\n"
763
+ "{'operation': 'run', 'script': 'from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer\\nmodel = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen3-4B-Thinking-2507\")\\n# ... training code ...\\nmodel.push_to_hub(\"user-name/my-finetuned-model\")', 'dependencies': ['transformers', 'torch', 'datasets'], 'hardware_flavor': 'a10g-large', 'timeout': '4h', 'env': {'CUSTOM_VAR': 'value'}}\n\n"
764
  "**Generate dataset daily and upload:**\n"
765
  "{'operation': 'scheduled run', 'script': 'from datasets import Dataset\\nimport pandas as pd\\n# scrape/generate data\\ndf = pd.DataFrame(data)\\nds = Dataset.from_pandas(df)\\nds.push_to_hub(\"user-name/daily-dataset\")', 'dependencies': ['datasets', 'pandas'], 'schedule': '@daily'}\n\n"
766
  "**Run custom training with Docker:**\n"
 
821
  # Hardware and environment
822
  "hardware_flavor": {
823
  "type": "string",
824
+ "description": f"Hardware type. Available CPU flavors: {CPU_FLAVORS}. Available GPU flavors: {GPU_FLAVORS}. Use with 'run'/'scheduled run'.",
825
  },
826
  "timeout": {
827
  "type": "string",