# Spaces: Running
"""
Script to clone Hugging Face documentation repositories and organize them
based on their toctree structure with proper naming.
"""
import json | |
import os | |
import re | |
import shutil | |
import subprocess | |
import sys | |
import argparse | |
from tqdm import tqdm | |
from pathlib import Path | |
from typing import Dict, List, Optional, Tuple | |
import yaml | |
def parse_toctree_yaml(file_path: str) -> Optional[Dict]:
    """Load a YAML toctree file and hand back whatever it deserializes to.

    Args:
        file_path: Location of the toctree file, read as UTF-8 text.

    Returns:
        The object produced by ``yaml.safe_load``; ``None`` when the file
        cannot be opened or parsed (the error is printed, not raised).
    """
    parsed = None
    try:
        with open(file_path, 'r', encoding='utf-8') as stream:
            parsed = yaml.safe_load(stream)
    except Exception as err:
        # Best effort: report the problem and let the caller see None.
        print(f"Error parsing YAML toctree {file_path}: {err}")
    return parsed
def run_command(cmd: List[str], cwd: Optional[str] = None) -> bool:
    """Run a command (without a shell) and return success status.

    Args:
        cmd: Program name followed by its arguments.
        cwd: Working directory for the child process; ``None`` keeps the
            current working directory.

    Returns:
        True when the process exits with status 0. False when it exits with
        a non-zero status or cannot be started at all (for example the
        executable or ``cwd`` does not exist). Failures are printed.
    """
    try:
        subprocess.run(cmd, cwd=cwd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running command {' '.join(cmd)}: {e}")
        print(f"STDOUT: {e.stdout}")
        print(f"STDERR: {e.stderr}")
        return False
    except OSError as e:
        # Raised before the process starts (missing executable, bad cwd,
        # permission problem); report it instead of crashing the caller.
        print(f"Error running command {' '.join(cmd)}: {e}")
        return False
    return True
def clone_repo(repo_url: str, dir_to_clone: str, target_dir: str) -> bool:
    """Sparse-clone a single directory of a git repository.

    Args:
        repo_url: URL handed to ``git clone``.
        dir_to_clone: Repository-relative directory to materialize.
        target_dir: Local directory that receives the clone.

    Returns:
        True if ``target_dir/dir_to_clone`` already exists or every git step
        succeeds; False as soon as one git step fails.
    """
    # NOTE(review): the existence check is on the subfolder, while the
    # message below reports target_dir.
    if os.path.exists(Path(target_dir) / Path(dir_to_clone)):
        print(f"Directory {target_dir} already exists, skipping clone")
        return True

    # Non-cone patterns: exclude everything at the root, then re-include
    # only the requested directory and its contents.
    only_subdir = ['/*', '!/*', f'/{dir_to_clone}/', f'/{dir_to_clone}/**']

    # (command, working directory) pairs, executed in order. The clone runs
    # from the current directory; the rest run inside the fresh clone.
    preparation = (
        (["git", "clone", "--no-checkout", repo_url, target_dir], None),
        (["git", "sparse-checkout", "init", "--no-cone"], target_dir),
        (["git", "sparse-checkout", "set", "--no-cone"] + only_subdir, target_dir),
    )
    for command, workdir in preparation:
        if not run_command(command, cwd=workdir):
            return False

    # Populate the working tree: prefer 'main', fall back to 'master'.
    for branch in ("main", "master"):
        if run_command(["git", "checkout", branch], cwd=target_dir):
            return True

    print(f"Failed to checkout main or master branch in {target_dir}")
    return False
def save_section_to_disk(section: Dict, file_path: Path, raw_docs_path: Path, prefix: str, index: int) -> None:
    """
    Recursively saves a documentation section to disk with hierarchical numbering.

    Args:
        section: Toctree entry. Must have a ``title``. An entry with a
            ``sections`` key becomes a directory and its children are saved
            inside it; any other entry is treated as a page whose source is
            ``<raw_docs_path>/<local>.md`` (or ``.mdx`` if the ``.md`` file
            is absent).
        file_path: Existing directory that receives the numbered entry.
        raw_docs_path: Root directory of the cloned documentation sources.
        prefix: Numbering prefix inherited from the parent, e.g. ``"1.2."``.
        index: 1-based position of this entry among its siblings.

    Pages that cannot be copied (no ``local`` key, missing source file, OS
    error during the copy) are reported with a printed message and skipped.
    """
    current_number = f"{prefix}{index}"

    # A path separator inside a title would split the name into nested
    # path components, so replace it before building the file name.
    title = str(section['title'])
    for separator in filter(None, (os.sep, os.altsep)):
        title = title.replace(separator, "-")
    numbered_title = f"{current_number}. {title}"

    if "sections" in section:
        # This is a directory
        new_dir_path = file_path / numbered_title
        os.makedirs(new_dir_path, exist_ok=True)
        # The new prefix for children adds the current number, e.g., "1.1."
        new_prefix = f"{current_number}."
        # An empty "sections:" key loads as None; treat it as no children.
        for i, subsection in enumerate(section["sections"] or [], 1):
            save_section_to_disk(subsection, new_dir_path, raw_docs_path, new_prefix, i)
        return

    # This is a file
    local = section.get("local")
    if not local:
        print(f"Skipping '{numbered_title}': entry has no 'local' source path")
        return

    local_path = raw_docs_path / f"{local}.md"
    if not local_path.exists():
        local_path = raw_docs_path / f"{local}.mdx"
    if not local_path.exists():
        print(f"Skipping '{numbered_title}': File {local_path} does not exist")
        return

    # Create the numbered filename, keeping the source extension
    new_filename = f"{numbered_title}{local_path.suffix}"
    try:
        shutil.copy(local_path, file_path / new_filename)
    except OSError as e:
        # Keep going on a single bad page, but say which one was lost.
        print(f"Skipping '{numbered_title}': could not copy {local_path}: {e}")
def make_docs(repos: List[Dict], args: argparse.Namespace) -> None:
    """Clone every configured repository and write its numbered docs tree.

    Args:
        repos: Repository entries; each one is read for the keys
            ``repo_url``, ``subfolder`` and ``title``.
        args: Parsed options providing ``repos_dir`` (scratch directory for
            clones) and ``docs_dir`` (output root).

    A repository whose clone fails or whose ``_toctree.yml`` cannot be
    loaded as a non-empty list is reported and skipped. It keeps its index,
    so the numbering of the other repositories does not shift. Clones are
    removed as soon as they have been processed.
    """
    for repo_index, repo in enumerate(tqdm(repos, desc="Consolidating 🤗 Documentation"), 1):
        # Strip a trailing slash so the last URL component is never empty.
        repo_name = repo["repo_url"].rstrip("/").split("/")[-1]
        save_repo_docs_path = Path(args.repos_dir) / repo_name
        try:
            if not clone_repo(repo["repo_url"], repo["subfolder"], str(save_repo_docs_path)):
                print(f"Skipping {repo['title']}: could not clone {repo['repo_url']}")
                continue

            repo_docs_path = save_repo_docs_path / repo["subfolder"]
            toctree = parse_toctree_yaml(repo_docs_path / "_toctree.yml")
            if not isinstance(toctree, list) or not toctree:
                print(f"Skipping {repo['title']}: no usable _toctree.yml in {repo_docs_path}")
                continue

            # Create the top-level numbered directory for the repo, e.g., "1. Accelerate"
            repo_title = f"{repo_index}. {repo['title']}"
            repo_output_path = Path(args.docs_dir) / repo_title
            os.makedirs(repo_output_path, exist_ok=True)

            # The initial prefix for numbering is the repo index, e.g., "1."
            prefix = f"{repo_index}."
            for block_index, block in enumerate(toctree, 1):
                # Start the recursive saving with the initial prefix and the block's index
                save_section_to_disk(block, repo_output_path, repo_docs_path, prefix, block_index)
        finally:
            # Always drop the clone, including partial ones left by a failure.
            shutil.rmtree(save_repo_docs_path, ignore_errors=True)

    # The scratch directory may not exist at all (e.g. empty repo list).
    shutil.rmtree(args.repos_dir, ignore_errors=True)
if __name__ == "__main__":
    # Command-line entry point: read the repository list, start from a clean
    # output directory, then build the consolidated documentation tree.
    parser = argparse.ArgumentParser()
    parser.add_argument("--docs_dir", type=str, default="docs")
    parser.add_argument("--repos_dir", type=str, default="repos")
    # Path of the JSON file listing the repositories to process; the default
    # keeps the previously hard-coded location.
    parser.add_argument("--config", type=str, default="repos_config.json")
    args = parser.parse_args()

    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(args.config, "r", encoding="utf-8") as f:
        repos = json.load(f)

    # Remove any previous output so stale files never mix with new ones.
    if os.path.exists(args.docs_dir):
        shutil.rmtree(args.docs_dir)

    make_docs(repos, args)