Spaces:
Sleeping
Sleeping
wony617
committed on
Commit
·
8957aec
1
Parent(s):
10adc15
Fix translation doc finder
Browse files- agent/handler.py +2 -6
- agent/toctree_handler.py +16 -47
- agent/workflow.py +22 -16
- pr_generator/agent.py +23 -10
agent/handler.py
CHANGED
|
@@ -8,7 +8,6 @@ import gradio as gr
|
|
| 8 |
|
| 9 |
from agent.workflow import (
|
| 10 |
report_translation_target_files,
|
| 11 |
-
report_in_translation_status_files,
|
| 12 |
translate_docs_interactive,
|
| 13 |
generate_github_pr,
|
| 14 |
)
|
|
@@ -73,11 +72,8 @@ def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
|
|
| 73 |
state.step = "find_files"
|
| 74 |
|
| 75 |
status_report, files_list = report_translation_target_files(lang, k)
|
| 76 |
-
in_progress_status_report, in_progress_docs = report_in_translation_status_files(
|
| 77 |
-
lang
|
| 78 |
-
)
|
| 79 |
state.files_to_translate = (
|
| 80 |
-
[file[0] for file in files_list
|
| 81 |
if files_list
|
| 82 |
else []
|
| 83 |
)
|
|
@@ -86,7 +82,7 @@ def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
|
|
| 86 |
|
| 87 |
**Status Report:**
|
| 88 |
{status_report}
|
| 89 |
-
|
| 90 |
**π Found first {len(state.files_to_translate)} files to translate:**
|
| 91 |
"""
|
| 92 |
|
|
|
|
| 8 |
|
| 9 |
from agent.workflow import (
|
| 10 |
report_translation_target_files,
|
|
|
|
| 11 |
translate_docs_interactive,
|
| 12 |
generate_github_pr,
|
| 13 |
)
|
|
|
|
| 72 |
state.step = "find_files"
|
| 73 |
|
| 74 |
status_report, files_list = report_translation_target_files(lang, k)
|
|
|
|
|
|
|
|
|
|
| 75 |
state.files_to_translate = (
|
| 76 |
+
[file[0] for file in files_list]
|
| 77 |
if files_list
|
| 78 |
else []
|
| 79 |
)
|
|
|
|
| 82 |
|
| 83 |
**Status Report:**
|
| 84 |
{status_report}
|
| 85 |
+
|
| 86 |
**π Found first {len(state.files_to_translate)} files to translate:**
|
| 87 |
"""
|
| 88 |
|
agent/toctree_handler.py
CHANGED
|
@@ -90,26 +90,7 @@ Korean title:"""
|
|
| 90 |
'local': local_file_path,
|
| 91 |
'title': en_title
|
| 92 |
}
|
| 93 |
-
|
| 94 |
-
def update_local_toctree_file(self, new_entries: List[Dict[str, str]]):
|
| 95 |
-
"""Update or create local _toctree.yml file"""
|
| 96 |
-
toctree_path = os.path.join(self.local_docs_path, "_toctree.yml")
|
| 97 |
-
|
| 98 |
-
os.makedirs(self.local_docs_path, exist_ok=True)
|
| 99 |
-
|
| 100 |
-
if os.path.exists(toctree_path):
|
| 101 |
-
with open(toctree_path, 'r', encoding='utf-8') as f:
|
| 102 |
-
existing_data = yaml.safe_load(f) or []
|
| 103 |
-
else:
|
| 104 |
-
existing_data = []
|
| 105 |
-
|
| 106 |
-
for entry in new_entries:
|
| 107 |
-
if entry not in existing_data:
|
| 108 |
-
existing_data.append(entry)
|
| 109 |
-
|
| 110 |
-
with open(toctree_path, 'w', encoding='utf-8') as f:
|
| 111 |
-
yaml.dump(existing_data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
|
| 112 |
-
|
| 113 |
def create_updated_toctree_with_llm(self, en_toctree_yaml: str, ko_toctree_yaml: str, target_local: str) -> dict:
|
| 114 |
"""Use LLM to create updated Korean toctree with new entry at correct position"""
|
| 115 |
try:
|
|
@@ -177,7 +158,7 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
|
|
| 177 |
print(f"Error using LLM to create updated toctree: {e}")
|
| 178 |
return None
|
| 179 |
|
| 180 |
-
def process_pr_commit(self,
|
| 181 |
"""Process PR commit by using LLM to create complete updated Korean toctree"""
|
| 182 |
# Get filepath without prefix
|
| 183 |
filepath_without_prefix = filepath.replace("docs/source/en/", "").replace(".md", "")
|
|
@@ -194,16 +175,12 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
|
|
| 194 |
|
| 195 |
if not updated_ko_toctree:
|
| 196 |
print(f"Failed to create updated Korean toctree for local: {filepath_without_prefix}")
|
| 197 |
-
return
|
| 198 |
|
| 199 |
print(f"LLM successfully updated Korean toctree")
|
| 200 |
|
| 201 |
# Store the updated toctree for commit
|
| 202 |
self.updated_ko_toctree = updated_ko_toctree
|
| 203 |
-
|
| 204 |
-
print(f"Updated Korean toctree has {len(updated_ko_toctree)} items")
|
| 205 |
-
|
| 206 |
-
return []
|
| 207 |
|
| 208 |
def commit_and_push_toctree(self, pr_agent, owner: str, repo_name: str, branch_name: str):
|
| 209 |
"""Commit and push toctree updates as a separate commit"""
|
|
@@ -219,7 +196,7 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
|
|
| 219 |
toctree_content = yaml.dump(ko_data, allow_unicode=True, default_flow_style=False, sort_keys=False)
|
| 220 |
|
| 221 |
# Create toctree commit message
|
| 222 |
-
commit_message = "docs: update Korean documentation table of contents
|
| 223 |
|
| 224 |
# Commit toctree file
|
| 225 |
file_result = pr_agent.create_or_update_file(
|
|
@@ -252,7 +229,6 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
|
|
| 252 |
def update_toctree_after_translation(
|
| 253 |
self,
|
| 254 |
translation_result: dict,
|
| 255 |
-
en_title: str,
|
| 256 |
filepath: str,
|
| 257 |
pr_agent,
|
| 258 |
github_config: dict
|
|
@@ -261,7 +237,6 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
|
|
| 261 |
|
| 262 |
Args:
|
| 263 |
translation_result: Result from translation PR workflow
|
| 264 |
-
en_title: English title for toctree mapping
|
| 265 |
filepath: Original file path
|
| 266 |
pr_agent: GitHub PR agent instance
|
| 267 |
github_config: GitHub configuration dictionary
|
|
@@ -269,28 +244,22 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
|
|
| 269 |
Returns:
|
| 270 |
Dictionary with toctree update result
|
| 271 |
"""
|
| 272 |
-
if translation_result["status"] == "error"
|
| 273 |
return None
|
| 274 |
|
| 275 |
try:
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
# Create new toctree entries
|
| 279 |
-
new_entries = self.process_pr_commit([en_title], [local_path], filepath)
|
| 280 |
-
print("self.updated_ko_toctree = updated_ko_toctree:", self.updated_ko_toctree)
|
| 281 |
# Commit toctree as separate commit
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
# 'commit_message': 'docs: update Korean documentation table of contents'
|
| 292 |
-
# }
|
| 293 |
-
|
| 294 |
except Exception as e:
|
| 295 |
return {
|
| 296 |
"status": "error",
|
|
|
|
| 90 |
'local': local_file_path,
|
| 91 |
'title': en_title
|
| 92 |
}
|
| 93 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
def create_updated_toctree_with_llm(self, en_toctree_yaml: str, ko_toctree_yaml: str, target_local: str) -> dict:
|
| 95 |
"""Use LLM to create updated Korean toctree with new entry at correct position"""
|
| 96 |
try:
|
|
|
|
| 158 |
print(f"Error using LLM to create updated toctree: {e}")
|
| 159 |
return None
|
| 160 |
|
| 161 |
+
def process_pr_commit(self, filepath: str):
|
| 162 |
"""Process PR commit by using LLM to create complete updated Korean toctree"""
|
| 163 |
# Get filepath without prefix
|
| 164 |
filepath_without_prefix = filepath.replace("docs/source/en/", "").replace(".md", "")
|
|
|
|
| 175 |
|
| 176 |
if not updated_ko_toctree:
|
| 177 |
print(f"Failed to create updated Korean toctree for local: {filepath_without_prefix}")
|
| 178 |
+
return
|
| 179 |
|
| 180 |
print(f"LLM successfully updated Korean toctree")
|
| 181 |
|
| 182 |
# Store the updated toctree for commit
|
| 183 |
self.updated_ko_toctree = updated_ko_toctree
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
def commit_and_push_toctree(self, pr_agent, owner: str, repo_name: str, branch_name: str):
|
| 186 |
"""Commit and push toctree updates as a separate commit"""
|
|
|
|
| 196 |
toctree_content = yaml.dump(ko_data, allow_unicode=True, default_flow_style=False, sort_keys=False)
|
| 197 |
|
| 198 |
# Create toctree commit message
|
| 199 |
+
commit_message = "docs: update Korean documentation table of contents"
|
| 200 |
|
| 201 |
# Commit toctree file
|
| 202 |
file_result = pr_agent.create_or_update_file(
|
|
|
|
| 229 |
def update_toctree_after_translation(
|
| 230 |
self,
|
| 231 |
translation_result: dict,
|
|
|
|
| 232 |
filepath: str,
|
| 233 |
pr_agent,
|
| 234 |
github_config: dict
|
|
|
|
| 237 |
|
| 238 |
Args:
|
| 239 |
translation_result: Result from translation PR workflow
|
|
|
|
| 240 |
filepath: Original file path
|
| 241 |
pr_agent: GitHub PR agent instance
|
| 242 |
github_config: GitHub configuration dictionary
|
|
|
|
| 244 |
Returns:
|
| 245 |
Dictionary with toctree update result
|
| 246 |
"""
|
| 247 |
+
if translation_result["status"] == "error":
|
| 248 |
return None
|
| 249 |
|
| 250 |
try:
|
| 251 |
+
# Process toctree update with LLM
|
| 252 |
+
self.process_pr_commit(filepath)
|
|
|
|
|
|
|
|
|
|
| 253 |
# Commit toctree as separate commit
|
| 254 |
+
print("self.updated_ko_toctree:", self.updated_ko_toctree:)
|
| 255 |
+
if self.updated_ko_toctree:
|
| 256 |
+
return self.commit_and_push_toctree(
|
| 257 |
+
pr_agent=pr_agent,
|
| 258 |
+
owner=github_config["owner"],
|
| 259 |
+
repo_name=github_config["repo_name"],
|
| 260 |
+
branch_name=translation_result["branch"]
|
| 261 |
+
)
|
| 262 |
+
|
|
|
|
|
|
|
|
|
|
| 263 |
except Exception as e:
|
| 264 |
return {
|
| 265 |
"status": "error",
|
agent/workflow.py
CHANGED
|
@@ -28,27 +28,34 @@ except ImportError as e:
|
|
| 28 |
def report_translation_target_files(
|
| 29 |
translate_lang: str, top_k: int = 1
|
| 30 |
) -> tuple[str, list[list[str]]]:
|
| 31 |
-
"""Return the top-k files that need translation.
|
| 32 |
|
| 33 |
Args:
|
| 34 |
translate_lang: Target language to translate
|
| 35 |
top_k: Number of top-first files to return for translation. (Default 1)
|
| 36 |
"""
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
| 39 |
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
"""
|
| 48 |
-
for i, file in enumerate(docs):
|
| 49 |
status_report += f"\n{i+1}. `{file}`: {pr_info_list[i]}"
|
| 50 |
-
status_report += "\n"
|
| 51 |
-
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
def translate_docs(lang: str, file_path: str, additional_instruction: str = "") -> tuple[str, str]:
|
|
@@ -58,14 +65,14 @@ def translate_docs(lang: str, file_path: str, additional_instruction: str = "")
|
|
| 58 |
Path(__file__).resolve().parent.parent
|
| 59 |
/ f"translation_result/{file_path}"
|
| 60 |
)
|
| 61 |
-
|
| 62 |
if translation_file_path.exists():
|
| 63 |
print(f"π Found existing translation: {translation_file_path}")
|
| 64 |
with open(translation_file_path, "r", encoding="utf-8") as f:
|
| 65 |
existing_content = f.read()
|
| 66 |
if existing_content.strip():
|
| 67 |
return "Existing translation loaded (no tokens used)", existing_content
|
| 68 |
-
|
| 69 |
# step 1. Get content from file path
|
| 70 |
content = get_content(file_path)
|
| 71 |
to_translate = preprocess_content(content)
|
|
@@ -195,9 +202,8 @@ def generate_github_pr(
|
|
| 195 |
from agent.toctree_handler import TocTreeHandler
|
| 196 |
toctree_handler = TocTreeHandler()
|
| 197 |
toctree_result = toctree_handler.update_toctree_after_translation(
|
| 198 |
-
result,
|
| 199 |
)
|
| 200 |
-
print("toctree_result:", toctree_result)
|
| 201 |
|
| 202 |
# Process result
|
| 203 |
# Generate toctree status message (shared for both success and partial_success)
|
|
|
|
| 28 |
def report_translation_target_files(
|
| 29 |
translate_lang: str, top_k: int = 1
|
| 30 |
) -> tuple[str, list[list[str]]]:
|
| 31 |
+
"""Return the top-k files that need translation, excluding files already in progress.
|
| 32 |
|
| 33 |
Args:
|
| 34 |
translate_lang: Target language to translate
|
| 35 |
top_k: Number of top-first files to return for translation. (Default 1)
|
| 36 |
"""
|
| 37 |
+
# Get files in progress
|
| 38 |
+
docs_in_progress, pr_info_list = get_github_issue_open_pr(translate_lang)
|
| 39 |
+
|
| 40 |
+
# Get all available files for translation
|
| 41 |
+
all_status_report, all_filepath_list = report(translate_lang, top_k * 2) # Get more to account for filtering
|
| 42 |
|
| 43 |
+
# Filter out files that are already in progress
|
| 44 |
+
available_files = [f for f in all_filepath_list if f not in docs_in_progress]
|
| 45 |
|
| 46 |
+
# Take only the requested number
|
| 47 |
+
filepath_list = available_files[:top_k]
|
| 48 |
+
|
| 49 |
+
# Build combined status report
|
| 50 |
+
status_report = all_status_report
|
| 51 |
|
| 52 |
+
if docs_in_progress:
|
| 53 |
+
status_report += f"\n\nπ€ Found {len(docs_in_progress)} files in progress for translation:"
|
| 54 |
+
for i, file in enumerate(docs_in_progress):
|
|
|
|
|
|
|
| 55 |
status_report += f"\n{i+1}. `{file}`: {pr_info_list[i]}"
|
| 56 |
+
status_report += f"\n\nπ Showing {len(filepath_list)} available files (excluding in-progress):"
|
| 57 |
+
|
| 58 |
+
return status_report, [[file] for file in filepath_list]
|
| 59 |
|
| 60 |
|
| 61 |
def translate_docs(lang: str, file_path: str, additional_instruction: str = "") -> tuple[str, str]:
|
|
|
|
| 65 |
Path(__file__).resolve().parent.parent
|
| 66 |
/ f"translation_result/{file_path}"
|
| 67 |
)
|
| 68 |
+
|
| 69 |
if translation_file_path.exists():
|
| 70 |
print(f"π Found existing translation: {translation_file_path}")
|
| 71 |
with open(translation_file_path, "r", encoding="utf-8") as f:
|
| 72 |
existing_content = f.read()
|
| 73 |
if existing_content.strip():
|
| 74 |
return "Existing translation loaded (no tokens used)", existing_content
|
| 75 |
+
|
| 76 |
# step 1. Get content from file path
|
| 77 |
content = get_content(file_path)
|
| 78 |
to_translate = preprocess_content(content)
|
|
|
|
| 202 |
from agent.toctree_handler import TocTreeHandler
|
| 203 |
toctree_handler = TocTreeHandler()
|
| 204 |
toctree_result = toctree_handler.update_toctree_after_translation(
|
| 205 |
+
result, filepath, agent, github_config
|
| 206 |
)
|
|
|
|
| 207 |
|
| 208 |
# Process result
|
| 209 |
# Generate toctree status message (shared for both success and partial_success)
|
pr_generator/agent.py
CHANGED
|
@@ -94,10 +94,17 @@ class GitHubPRAgent:
|
|
| 94 |
if existing_pr:
|
| 95 |
return f"ERROR: {existing_pr}"
|
| 96 |
|
| 97 |
-
# 3. Verify head
|
| 98 |
repo = self.github_client.get_repo(f"{owner}/{repo_name}")
|
| 99 |
try:
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
base_branch = repo.get_branch(base)
|
| 102 |
|
| 103 |
# 4. Check if head and base branches point to the same commit
|
|
@@ -159,7 +166,9 @@ class GitHubPRAgent:
|
|
| 159 |
"""Check if there's an existing PR with the same head and base."""
|
| 160 |
try:
|
| 161 |
repo = self.github_client.get_repo(f"{owner}/{repo_name}")
|
| 162 |
-
|
|
|
|
|
|
|
| 163 |
for pr in pulls:
|
| 164 |
return f"Existing PR found: {pr.html_url}"
|
| 165 |
return None
|
|
@@ -448,12 +457,12 @@ Please return only the commit message. No other explanation is needed."""
|
|
| 448 |
pr_analysis["head_branch"], target_language, file_name
|
| 449 |
)
|
| 450 |
|
| 451 |
-
# 3. Get main branch SHA and create branch
|
| 452 |
-
|
| 453 |
-
main_branch =
|
| 454 |
main_sha = main_branch.commit.sha
|
| 455 |
|
| 456 |
-
print(f"🌿 Creating branch: {branch_name}")
|
| 457 |
branch_result = self.create_branch(owner, repo_name, branch_name, main_sha)
|
| 458 |
|
| 459 |
# Check branch creation result
|
|
@@ -466,8 +475,11 @@ Please return only the commit message. No other explanation is needed."""
|
|
| 466 |
elif branch_result.startswith("WARNING"):
|
| 467 |
print(f"⚠️ {branch_result}")
|
| 468 |
# Continue if branch already exists
|
|
|
|
|
|
|
| 469 |
else:
|
| 470 |
-
print(f"{branch_result}")
|
|
|
|
| 471 |
|
| 472 |
# 4. Generate commit message and save file
|
| 473 |
commit_messages = [commit["message"] for commit in pr_analysis["commits"]]
|
|
@@ -506,10 +518,11 @@ Please return only the commit message. No other explanation is needed."""
|
|
| 506 |
)
|
| 507 |
|
| 508 |
print(f"π Creating PR: {pr_title}")
|
| 509 |
-
print(f" Head: {branch_name} → Base: {base_branch}")
|
| 510 |
|
|
|
|
| 511 |
pr_result = self.create_pull_request(
|
| 512 |
-
|
| 513 |
)
|
| 514 |
|
| 515 |
if pr_result.startswith("ERROR"):
|
|
|
|
| 94 |
if existing_pr:
|
| 95 |
return f"ERROR: {existing_pr}"
|
| 96 |
|
| 97 |
+
# 3. Verify head and base branches exist
|
| 98 |
repo = self.github_client.get_repo(f"{owner}/{repo_name}")
|
| 99 |
try:
|
| 100 |
+
# For fork-to-upstream PR, head format is "fork_owner:branch_name"
|
| 101 |
+
if ":" in head:
|
| 102 |
+
fork_owner, branch_name = head.split(":", 1)
|
| 103 |
+
fork_repo = self.github_client.get_repo(f"{fork_owner}/{repo_name}")
|
| 104 |
+
head_branch = fork_repo.get_branch(branch_name)
|
| 105 |
+
else:
|
| 106 |
+
head_branch = repo.get_branch(head)
|
| 107 |
+
|
| 108 |
base_branch = repo.get_branch(base)
|
| 109 |
|
| 110 |
# 4. Check if head and base branches point to the same commit
|
|
|
|
| 166 |
"""Check if there's an existing PR with the same head and base."""
|
| 167 |
try:
|
| 168 |
repo = self.github_client.get_repo(f"{owner}/{repo_name}")
|
| 169 |
+
# For head parameter, use exactly what was passed (could be "fork_owner:branch" or just "branch")
|
| 170 |
+
search_head = head if ":" in head else f"{owner}:{head}"
|
| 171 |
+
pulls = repo.get_pulls(state="open", head=search_head, base=base)
|
| 172 |
for pr in pulls:
|
| 173 |
return f"Existing PR found: {pr.html_url}"
|
| 174 |
return None
|
|
|
|
| 457 |
pr_analysis["head_branch"], target_language, file_name
|
| 458 |
)
|
| 459 |
|
| 460 |
+
# 3. Get main branch SHA from upstream and create branch in fork
|
| 461 |
+
upstream_repo = self.github_client.get_repo(f"huggingface/{repo_name}")
|
| 462 |
+
main_branch = upstream_repo.get_branch(base_branch)
|
| 463 |
main_sha = main_branch.commit.sha
|
| 464 |
|
| 465 |
+
print(f"🌿 Creating branch: {branch_name} in fork repository")
|
| 466 |
branch_result = self.create_branch(owner, repo_name, branch_name, main_sha)
|
| 467 |
|
| 468 |
# Check branch creation result
|
|
|
|
| 475 |
elif branch_result.startswith("WARNING"):
|
| 476 |
print(f"⚠️ {branch_result}")
|
| 477 |
# Continue if branch already exists
|
| 478 |
+
elif branch_result.startswith("SUCCESS"):
|
| 479 |
+
print(f"✅ {branch_result}")
|
| 480 |
else:
|
| 481 |
+
print(f"⚠️ Unexpected branch creation result: {branch_result}")
|
| 482 |
+
# Continue anyway, might still work
|
| 483 |
|
| 484 |
# 4. Generate commit message and save file
|
| 485 |
commit_messages = [commit["message"] for commit in pr_analysis["commits"]]
|
|
|
|
| 518 |
)
|
| 519 |
|
| 520 |
print(f"π Creating PR: {pr_title}")
|
| 521 |
+
print(f" Head: {owner}:{branch_name} → Base: huggingface:{base_branch}")
|
| 522 |
|
| 523 |
+
# Create PR from fork to upstream repository
|
| 524 |
pr_result = self.create_pull_request(
|
| 525 |
+
"huggingface", "transformers", pr_title, f"{owner}:{branch_name}", base_branch, pr_body
|
| 526 |
)
|
| 527 |
|
| 528 |
if pr_result.startswith("ERROR"):
|