Spaces:
Sleeping
Sleeping
wony617
committed on
Commit
·
1b1c0d8
1
Parent(s):
a487d1c
fix the docs path of the open pr list
Browse files- agent/workflow.py +9 -6
- translator/retriever.py +54 -10
agent/workflow.py
CHANGED
|
@@ -11,7 +11,7 @@ from translator.content import (
|
|
| 11 |
llm_translate,
|
| 12 |
preprocess_content,
|
| 13 |
)
|
| 14 |
-
from translator.retriever import report, get_github_issue_open_pr
|
| 15 |
# GitHub PR Agent import
|
| 16 |
try:
|
| 17 |
from pr_generator.agent import GitHubPRAgent
|
|
@@ -35,11 +35,14 @@ def report_translation_target_files(
|
|
| 35 |
translate_lang: Target language to translate
|
| 36 |
top_k: Number of top-first files to return for translation. (Default 1)
|
| 37 |
"""
|
| 38 |
-
# Get files
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
# Get all available files for translation
|
| 42 |
-
all_status_report, all_filepath_list = report(project, translate_lang, top_k * 2) # Get more to account for filtering
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
# Filter out files that are already in progress
|
| 45 |
available_files = [f for f in all_filepath_list if f not in docs_in_progress]
|
|
|
|
| 11 |
llm_translate,
|
| 12 |
preprocess_content,
|
| 13 |
)
|
| 14 |
+
from translator.retriever import report, get_github_issue_open_pr, get_github_repo_files
|
| 15 |
# GitHub PR Agent import
|
| 16 |
try:
|
| 17 |
from pr_generator.agent import GitHubPRAgent
|
|
|
|
| 35 |
translate_lang: Target language to translate
|
| 36 |
top_k: Number of top-first files to return for translation. (Default 1)
|
| 37 |
"""
|
| 38 |
+
# Get repo files once to avoid duplicate API calls
|
| 39 |
+
all_repo_files = get_github_repo_files(project)
|
| 40 |
+
|
| 41 |
+
# Get all available files for translation using the file list
|
| 42 |
+
all_status_report, all_filepath_list = report(project, translate_lang, top_k * 2, all_repo_files) # Get more to account for filtering
|
| 43 |
+
|
| 44 |
+
# Get files in progress using the same file list
|
| 45 |
+
docs_in_progress, pr_info_list = get_github_issue_open_pr(project, translate_lang, all_repo_files)
|
| 46 |
|
| 47 |
# Filter out files that are already in progress
|
| 48 |
available_files = [f for f in all_filepath_list if f not in docs_in_progress]
|
translator/retriever.py
CHANGED
|
@@ -37,7 +37,7 @@ def get_github_repo_files(project: str = "transformers"):
|
|
| 37 |
return file_paths
|
| 38 |
|
| 39 |
|
| 40 |
-
def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko"):
|
| 41 |
"""
|
| 42 |
Get open PR in the github issue, filtered by title containing '[i18n-KO]'.
|
| 43 |
"""
|
|
@@ -48,6 +48,10 @@ def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko"):
|
|
| 48 |
if not issue_id:
|
| 49 |
raise ValueError(f"⚠️ No GitHub issue registered for {project}.")
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
headers = {
|
| 52 |
"Accept": "application/vnd.github+json",
|
| 53 |
}
|
|
@@ -84,20 +88,59 @@ def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko"):
|
|
| 84 |
|
| 85 |
filtered_prs = [pr for pr in all_open_prs if "[i18n-KO]" in pr["title"]]
|
| 86 |
|
| 87 |
-
# Pattern to match
|
| 88 |
-
pattern = re.compile(r"(?:`([^`]
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
filenames = []
|
|
|
|
|
|
|
| 91 |
for pr in filtered_prs:
|
| 92 |
match = pattern.search(pr["title"])
|
| 93 |
if match:
|
| 94 |
# Use group 1 (with backticks) or group 2 (without backticks)
|
| 95 |
filename = match.group(1) or match.group(2)
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
return filenames, pr_info_list
|
| 102 |
|
| 103 |
|
|
@@ -121,11 +164,12 @@ def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
|
|
| 121 |
return report, first_missing_docs
|
| 122 |
|
| 123 |
|
| 124 |
-
def report(project: str, target_lang: str, top_k: int = 1) -> tuple[str, list[str]]:
|
| 125 |
"""
|
| 126 |
Generate a report for the translated docs
|
| 127 |
"""
|
| 128 |
-
docs_file
|
|
|
|
| 129 |
|
| 130 |
base_docs_path = Path("docs/source")
|
| 131 |
en_docs_path = Path("docs/source/en")
|
|
|
|
| 37 |
return file_paths
|
| 38 |
|
| 39 |
|
| 40 |
+
def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko", all_files: list = None):
|
| 41 |
"""
|
| 42 |
Get open PR in the github issue, filtered by title containing '[i18n-KO]'.
|
| 43 |
"""
|
|
|
|
| 48 |
if not issue_id:
|
| 49 |
raise ValueError(f"⚠️ No GitHub issue registered for {project}.")
|
| 50 |
|
| 51 |
+
# Require all_files parameter
|
| 52 |
+
if all_files is None:
|
| 53 |
+
raise ValueError("Repository file list must be provided")
|
| 54 |
+
|
| 55 |
headers = {
|
| 56 |
"Accept": "application/vnd.github+json",
|
| 57 |
}
|
|
|
|
| 88 |
|
| 89 |
filtered_prs = [pr for pr in all_open_prs if "[i18n-KO]" in pr["title"]]
|
| 90 |
|
| 91 |
+
# Pattern to match filenames after "Translated" keyword
|
| 92 |
+
pattern = re.compile(r"Translated\s+(?:`([^`]+)`|(\S+))\s+to")
|
| 93 |
|
| 94 |
+
def find_original_file_path(filename_from_title, all_files):
|
| 95 |
+
"""Find the exact file path from repo files by matching filename"""
|
| 96 |
+
if not filename_from_title:
|
| 97 |
+
return None
|
| 98 |
+
|
| 99 |
+
# Remove .md extension for matching
|
| 100 |
+
base_name = filename_from_title.replace('.md', '')
|
| 101 |
+
|
| 102 |
+
# Look for exact matches in repo files
|
| 103 |
+
for file_path in all_files:
|
| 104 |
+
if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
|
| 105 |
+
file_base = file_path.split("/")[-1].replace('.md', '')
|
| 106 |
+
if file_base == base_name:
|
| 107 |
+
return file_path
|
| 108 |
+
|
| 109 |
+
# If no exact match, fallback to simple path
|
| 110 |
+
return f"docs/source/en/{filename_from_title}"
|
| 111 |
+
|
| 112 |
filenames = []
|
| 113 |
+
pr_info_list = []
|
| 114 |
+
|
| 115 |
for pr in filtered_prs:
|
| 116 |
match = pattern.search(pr["title"])
|
| 117 |
if match:
|
| 118 |
# Use group 1 (with backticks) or group 2 (without backticks)
|
| 119 |
filename = match.group(1) or match.group(2)
|
| 120 |
+
# Add .md extension if not present
|
| 121 |
+
if not filename.endswith('.md'):
|
| 122 |
+
filename += '.md'
|
| 123 |
+
|
| 124 |
+
# Find the correct file path by matching filename
|
| 125 |
+
correct_path = None
|
| 126 |
+
if filename:
|
| 127 |
+
# Remove .md extension for matching
|
| 128 |
+
base_name = filename.replace('.md', '')
|
| 129 |
+
|
| 130 |
+
# Look for exact matches in repo files
|
| 131 |
+
for file_path in all_files:
|
| 132 |
+
if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
|
| 133 |
+
file_base = file_path.split("/")[-1].replace('.md', '')
|
| 134 |
+
if file_base == base_name:
|
| 135 |
+
correct_path = file_path
|
| 136 |
+
break
|
| 137 |
+
|
| 138 |
+
# If no exact match, fallback to simple path
|
| 139 |
+
if not correct_path:
|
| 140 |
+
correct_path = f"docs/source/en/{filename}"
|
| 141 |
+
if correct_path:
|
| 142 |
+
filenames.append(correct_path)
|
| 143 |
+
pr_info_list.append(f"{config.repo_url}/pull/{pr['url'].rstrip('/').split('/')[-1]}")
|
| 144 |
return filenames, pr_info_list
|
| 145 |
|
| 146 |
|
|
|
|
| 164 |
return report, first_missing_docs
|
| 165 |
|
| 166 |
|
| 167 |
+
def report(project: str, target_lang: str, top_k: int = 1, docs_file: list = None) -> tuple[str, list[str]]:
|
| 168 |
"""
|
| 169 |
Generate a report for the translated docs
|
| 170 |
"""
|
| 171 |
+
if docs_file is None:
|
| 172 |
+
raise ValueError("Repository file list must be provided")
|
| 173 |
|
| 174 |
base_docs_path = Path("docs/source")
|
| 175 |
en_docs_path = Path("docs/source/en")
|