AiCoderv2 committed on
Commit
d79890b
·
verified ·
1 Parent(s): 72dbae2

Deploy Gradio app with multiple files

Browse files
Files changed (5) hide show
  1. app.py +441 -0
  2. config.py +238 -0
  3. models.py +226 -0
  4. requirements.txt +45 -0
  5. utils.py +365 -0
app.py ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import os
4
+ import base64
5
+ import json
6
+ import re
7
+ from pathlib import Path
8
+ from typing import List, Dict, Optional, Tuple
9
+ import zipfile
10
+ import io
11
+ from datetime import datetime
12
+ import math
13
+
14
+ from utils import (
15
+ clean_code_content,
16
+ get_file_language,
17
+ estimate_tokens,
18
+ create_chunked_output
19
+ )
20
+ from models import (
21
+ process_github_repo,
22
+ process_huggingface_repo,
23
+ download_repo_as_zip
24
+ )
25
+ from config import (
26
+ SUPPORTED_EXTENSIONS,
27
+ MAX_FILE_SIZE,
28
+ MAX_TOTAL_SIZE,
29
+ CHUNK_SIZE,
30
+ GITHUB_API_BASE,
31
+ HF_API_BASE
32
+ )
33
+
34
+ # CSS for better UI
35
+ css = """
36
+ .container {
37
+ max-width: 1200px;
38
+ margin: 0 auto;
39
+ }
40
+ .progress-bar {
41
+ height: 20px;
42
+ background: linear-gradient(90deg, #4CAF50, #45a049);
43
+ border-radius: 10px;
44
+ transition: width 0.3s ease;
45
+ }
46
+ .file-stats {
47
+ background: #f0f0f0;
48
+ padding: 10px;
49
+ border-radius: 5px;
50
+ margin: 10px 0;
51
+ }
52
+ .warning {
53
+ background: #fff3cd;
54
+ border: 1px solid #ffeaa7;
55
+ padding: 10px;
56
+ border-radius: 5px;
57
+ color: #856404;
58
+ }
59
+ .error {
60
+ background: #f8d7da;
61
+ border: 1px solid #f5c6cb;
62
+ padding: 10px;
63
+ border-radius: 5px;
64
+ color: #721c24;
65
+ }
66
+ .success {
67
+ background: #d4edda;
68
+ border: 1px solid #c3e6cb;
69
+ padding: 10px;
70
+ border-radius: 5px;
71
+ color: #155724;
72
+ }
73
+ """
74
+
75
+ def validate_repo_url(url: str) -> Tuple[str, str]:
76
+ """Validate and determine repository type and owner/name"""
77
+ url = url.strip()
78
+
79
+ # GitHub URL patterns
80
+ github_patterns = [
81
+ r'github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$',
82
+ r'api\.github\.com/repos/([^/]+)/([^/]+)'
83
+ ]
84
+
85
+ # Hugging Face URL patterns
86
+ hf_patterns = [
87
+ r'huggingface\.co/([^/]+)/([^/]+?)(?:\.git)?/?$',
88
+ r'hf\.co/([^/]+)/([^/]+?)(?:\.git)?/?$'
89
+ ]
90
+
91
+ for pattern in github_patterns:
92
+ match = re.search(pattern, url)
93
+ if match:
94
+ return "github", f"{match.group(1)}/{match.group(2)}"
95
+
96
+ for pattern in hf_patterns:
97
+ match = re.search(pattern, url)
98
+ if match:
99
+ return "huggingface", f"{match.group(1)}/{match.group(2)}"
100
+
101
+ raise ValueError("Invalid repository URL. Please provide a valid GitHub or Hugging Face repository URL.")
102
+
103
+ def process_repository(
104
+ repo_url: str,
105
+ token: str = "",
106
+ include_patterns: str = "",
107
+ exclude_patterns: str = "",
108
+ max_file_size_mb: int = 10,
109
+ chunk_size: int = 50000,
110
+ include_metadata: bool = True,
111
+ remove_comments: bool = False,
112
+ progress=gr.Progress()
113
+ ) -> Tuple[str, str, str]:
114
+ """Main function to process repository and generate text file"""
115
+
116
+ try:
117
+ # Validate URL and get repo info
118
+ repo_type, repo_path = validate_repo_url(repo_url)
119
+
120
+ # Parse include/exclude patterns
121
+ include_list = [p.strip() for p in include_patterns.split(",") if p.strip()] if include_patterns else []
122
+ exclude_list = [p.strip() for p in exclude_patterns.split(",") if p.strip()] if exclude_patterns else []
123
+
124
+ progress(0.1, desc="Fetching repository information...")
125
+
126
+ # Process repository based on type
127
+ if repo_type == "github":
128
+ files_data, repo_info = process_github_repo(
129
+ repo_path,
130
+ token,
131
+ include_list,
132
+ exclude_list,
133
+ max_file_size_mb * 1024 * 1024
134
+ )
135
+ else: # huggingface
136
+ files_data, repo_info = process_huggingface_repo(
137
+ repo_path,
138
+ token,
139
+ include_list,
140
+ exclude_list,
141
+ max_file_size_mb * 1024 * 1024
142
+ )
143
+
144
+ if not files_data:
145
+ return "", "⚠️ No files found matching the criteria.", ""
146
+
147
+ progress(0.3, desc="Processing files...")
148
+
149
+ # Generate consolidated text
150
+ total_files = len(files_data)
151
+ processed_files = 0
152
+ total_tokens = 0
153
+ total_chars = 0
154
+
155
+ # Create header
156
+ header_lines = []
157
+ if include_metadata:
158
+ header_lines.append("=" * 80)
159
+ header_lines.append(f"REPOSITORY: {repo_info.get('full_name', repo_path)}")
160
+ header_lines.append(f"DESCRIPTION: {repo_info.get('description', 'No description')}")
161
+ header_lines.append(f"URL: {repo_url}")
162
+ header_lines.append(f"PROCESSED: {datetime.now().isoformat()}")
163
+ header_lines.append(f"TOTAL FILES: {total_files}")
164
+ header_lines.append("=" * 80)
165
+ header_lines.append("")
166
+
167
+ content_parts = ["\n".join(header_lines)]
168
+
169
+ # Process each file
170
+ for i, (file_path, content, file_size) in enumerate(files_data):
171
+ progress(0.3 + (0.5 * i / total_files), desc=f"Processing file {i+1}/{total_files}")
172
+
173
+ # Clean content if requested
174
+ if remove_comments:
175
+ content = clean_code_content(content, file_path)
176
+
177
+ # Add file header
178
+ file_header = f"\n{'-' * 60}\n"
179
+ file_header += f"FILE: {file_path}\n"
180
+ file_header += f"SIZE: {file_size:,} bytes\n"
181
+ file_header += f"LANGUAGE: {get_file_language(file_path)}\n"
182
+ file_header += f"{'-' * 60}\n\n"
183
+
184
+ # Add content
185
+ file_content = file_header + content + "\n\n"
186
+
187
+ # Check if adding this file would exceed chunk size
188
+ if len("\n".join(content_parts + [file_content])) > chunk_size:
189
+ # Save current chunk
190
+ yield "\n".join(content_parts), generate_stats(processed_files, total_tokens, total_chars, total_files), "success"
191
+ # Start new chunk
192
+ content_parts = [file_header + "\n".join(header_lines)]
193
+
194
+ content_parts.append(file_content)
195
+ processed_files += 1
196
+ total_chars += len(content)
197
+ total_tokens += estimate_tokens(content)
198
+
199
+ progress(0.9, desc="Finalizing...")
200
+
201
+ # Final content
202
+ final_content = "\n".join(content_parts)
203
+
204
+ # Add footer
205
+ if include_metadata:
206
+ footer = f"\n{'=' * 80}\n"
207
+ footer += f"SUMMARY:\n"
208
+ footer += f"- Files processed: {processed_files}\n"
209
+ footer += f"- Total characters: {total_chars:,}\n"
210
+ footer += f"- Estimated tokens: {total_tokens:,}\n"
211
+ footer += f"- Repository: {repo_info.get('full_name', repo_path)}\n"
212
+ footer += f"{'=' * 80}\n"
213
+ final_content += footer
214
+
215
+ progress(1.0, desc="Complete!")
216
+
217
+ return final_content, generate_stats(processed_files, total_tokens, total_chars, total_files), "success"
218
+
219
+ except Exception as e:
220
+ error_msg = f"❌ Error: {str(e)}"
221
+ return "", error_msg, "error"
222
+
223
+ def generate_stats(files_processed: int, tokens: int, chars: int, total_files: int) -> str:
224
+ """Generate statistics HTML"""
225
+ stats_html = f"""
226
+ <div class="file-stats">
227
+ <h3>📊 Processing Statistics</h3>
228
+ <p><strong>Files Processed:</strong> {files_processed:,} / {total_files:,}</p>
229
+ <p><strong>Total Characters:</strong> {chars:,}</p>
230
+ <p><strong>Estimated Tokens:</strong> {tokens:,}</p>
231
+ <p><strong>Average Tokens per File:</strong> {tokens // max(files_processed, 1):,}</p>
232
+ </div>
233
+ """
234
+ return stats_html
235
+
236
+ def download_repo_locally(repo_url: str, token: str = "") -> str:
237
+ """Download repository as ZIP for local processing"""
238
+ try:
239
+ repo_type, repo_path = validate_repo_url(repo_url)
240
+
241
+ if repo_type == "github":
242
+ return download_repo_as_zip(f"github.com/{repo_path}", token)
243
+ else:
244
+ return download_repo_as_zip(f"huggingface.co/{repo_path}", token)
245
+
246
+ except Exception as e:
247
+ return f"Error downloading repository: {str(e)}"
248
+
249
+ # Create Gradio interface
250
+ def create_interface():
251
+ with gr.Blocks(
252
+ title="Repo-to-Text Converter",
253
+ theme=gr.themes.Soft(),
254
+ css=css
255
+ ) as demo:
256
+
257
+ gr.Markdown("""
258
+ # 📚 Repository to Text Converter
259
+
260
+ Convert GitHub or Hugging Face repositories into formatted text files perfect for LLM training.
261
+
262
+ **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
263
+ """)
264
+
265
+ with gr.Row():
266
+ with gr.Column(scale=2):
267
+ # Input section
268
+ gr.Markdown("## 📥 Repository Input")
269
+
270
+ repo_url = gr.Textbox(
271
+ label="Repository URL",
272
+ placeholder="https://github.com/username/repo or https://huggingface.co/username/repo",
273
+ lines=2
274
+ )
275
+
276
+ token = gr.Textbox(
277
+ label="Access Token (Optional)",
278
+ placeholder="GitHub token or Hugging Face token for private repos",
279
+ type="password"
280
+ )
281
+
282
+ with gr.Accordion("🔧 Advanced Options", open=False):
283
+ include_patterns = gr.Textbox(
284
+ label="Include Patterns (comma-separated)",
285
+ placeholder="*.py,*.md,src/**/*.py",
286
+ info="Only include files matching these patterns"
287
+ )
288
+
289
+ exclude_patterns = gr.Textbox(
290
+ label="Exclude Patterns (comma-separated)",
291
+ placeholder="*.git*,*.log,node_modules/**",
292
+ value="*.git*,*.log,node_modules/**,__pycache__/**,.DS_Store"
293
+ )
294
+
295
+ max_file_size = gr.Slider(
296
+ minimum=1,
297
+ maximum=100,
298
+ value=10,
299
+ step=1,
300
+ label="Max File Size (MB)",
301
+ info="Files larger than this will be skipped"
302
+ )
303
+
304
+ chunk_size = gr.Slider(
305
+ minimum=1000,
306
+ maximum=100000,
307
+ value=50000,
308
+ step=1000,
309
+ label="Chunk Size (characters)",
310
+ info="Split output into chunks of this size"
311
+ )
312
+
313
+ include_metadata = gr.Checkbox(
314
+ value=True,
315
+ label="Include Metadata",
316
+ info="Add repository information and statistics"
317
+ )
318
+
319
+ remove_comments = gr.Checkbox(
320
+ value=False,
321
+ label="Remove Comments",
322
+ info="Strip comments from code files (experimental)"
323
+ )
324
+
325
+ process_btn = gr.Button(
326
+ "🚀 Process Repository",
327
+ variant="primary",
328
+ size="lg"
329
+ )
330
+
331
+ download_btn = gr.Button(
332
+ "⬇️ Download as ZIP",
333
+ variant="secondary"
334
+ )
335
+
336
+ with gr.Column(scale=1):
337
+ # Info section
338
+ gr.Markdown("## ℹ️ Information")
339
+
340
+ gr.Markdown("""
341
+ ### Supported Platforms:
342
+ - ✅ GitHub (public and private)
343
+ - ✅ Hugging Face (public and private)
344
+
345
+ ### Supported File Types:
346
+ - Code files (.py, .js, .java, .cpp, etc.)
347
+ - Documentation (.md, .txt, .rst)
348
+ - Configuration files (.json, .yaml, .toml)
349
+ - And many more!
350
+
351
+ ### Features:
352
+ - 🔄 Chunked output for large repos
353
+ - 📊 Token estimation
354
+ - 🎯 Pattern-based file filtering
355
+ - 🧹 Optional comment removal
356
+ """)
357
+
358
+ # Output section
359
+ gr.Markdown("## 📤 Output")
360
+
361
+ with gr.Row():
362
+ stats_display = gr.HTML(label="Statistics")
363
+
364
+ output_text = gr.Textbox(
365
+ label="Generated Text",
366
+ lines=20,
367
+ max_lines=50,
368
+ show_copy_button=True,
369
+ interactive=True
370
+ )
371
+
372
+ status_display = gr.HTML()
373
+
374
+ # Event handlers
375
+ process_btn.click(
376
+ fn=process_repository,
377
+ inputs=[
378
+ repo_url,
379
+ token,
380
+ include_patterns,
381
+ exclude_patterns,
382
+ max_file_size,
383
+ chunk_size,
384
+ include_metadata,
385
+ remove_comments
386
+ ],
387
+ outputs=[output_text, stats_display, status_display]
388
+ )
389
+
390
+ download_btn.click(
391
+ fn=download_repo_locally,
392
+ inputs=[repo_url, token],
393
+ outputs=gr.File(label="Downloaded Repository")
394
+ )
395
+
396
+ # Examples
397
+ gr.Markdown("## 🎯 Examples")
398
+ gr.Examples(
399
+ examples=[
400
+ [
401
+ "https://github.com/gradio-app/gradio",
402
+ "",
403
+ "*.py,*.md",
404
+ "",
405
+ 10,
406
+ 50000,
407
+ True,
408
+ False
409
+ ],
410
+ [
411
+ "https://huggingface.co/huggingface/transformers",
412
+ "",
413
+ "*.py,*.md,*.rst",
414
+ "tests/**,docs/**",
415
+ 5,
416
+ 30000,
417
+ True,
418
+ False
419
+ ]
420
+ ],
421
+ inputs=[
422
+ repo_url,
423
+ token,
424
+ include_patterns,
425
+ exclude_patterns,
426
+ max_file_size,
427
+ chunk_size,
428
+ include_metadata,
429
+ remove_comments
430
+ ]
431
+ )
432
+
433
+ return demo
434
+
435
+ if __name__ == "__main__":
436
+ demo = create_interface()
437
+ demo.launch(
438
+ share=True,
439
+ show_error=True,
440
+ show_tips=True
441
+ )
config.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration constants
2
+
3
+ # API endpoints
4
+ GITHUB_API_BASE = "https://api.github.com"
5
+ HF_API_BASE = "https://huggingface.co"
6
+
7
+ # Supported file extensions for text processing
8
+ SUPPORTED_EXTENSIONS = {
9
+ # Programming languages
10
+ '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.cs', '.go', '.rs',
11
+ '.php', '.rb', '.swift', '.kt', '.scala', '.r', '.m', '.sh', '.bash', '.zsh',
12
+ '.fish', '.ps1', '.bat', '.sql', '.html', '.htm', '.xml', '.css', '.scss',
13
+ '.sass', '.less', '.json', '.yaml', '.yml', '.toml', '.ini', '.cfg', '.conf',
14
+ '.md', '.rst', '.txt', '.log', '.dockerfile', '.gitignore', '.gitattributes',
15
+ '.editorconfig', '.eslintrc', '.prettierrc', '.babelrc', '.tsconfig',
16
+
17
+ # Configuration files
18
+ '.env', '.env.example', '.env.local', '.env.development', '.env.production',
19
+ 'package.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
20
+ 'requirements.txt', 'Pipfile', 'poetry.lock', 'pyproject.toml',
21
+ 'Cargo.toml', 'Cargo.lock', 'go.mod', 'go.sum', 'composer.json',
22
+ 'composer.lock', 'Gemfile', 'Gemfile.lock', 'pom.xml', 'build.gradle',
23
+ 'CMakeLists.txt', 'Makefile', 'Dockerfile', 'docker-compose.yml',
24
+
25
+ # Documentation
26
+ '.md', '.rst', '.txt', '.adoc', '.tex', '.bib',
27
+
28
+ # Data formats
29
+ '.json', '.yaml', '.yml', '.toml', '.ini', '.cfg', '.conf',
30
+ '.csv', '.tsv', '.xml', '.rss', '.atom',
31
+
32
+ # Scripts
33
+ '.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd',
34
+ '.py', '.pl', '.rb', '.lua', '.tcl', '.awk', '.sed',
35
+ }
36
+
37
+ # Size limits
38
+ MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB default
39
+ MAX_TOTAL_SIZE = 100 * 1024 * 1024 # 100MB default
40
+ CHUNK_SIZE = 50000 # Characters per chunk
41
+
42
+ # File patterns to exclude by default
43
+ DEFAULT_EXCLUDE_PATTERNS = [
44
+ "*.git*",
45
+ "*.log",
46
+ "node_modules/**",
47
+ "__pycache__/**",
48
+ ".DS_Store",
49
+ "Thumbs.db",
50
+ "*.tmp",
51
+ "*.temp",
52
+ "*.swp",
53
+ "*.swo",
54
+ "*~",
55
+ ".vscode/**",
56
+ ".idea/**",
57
+ "*.pyc",
58
+ "*.pyo",
59
+ "*.pyd",
60
+ ".Python",
61
+ "build/**",
62
+ "dist/**",
63
+ "*.egg-info/**",
64
+ ".pytest_cache/**",
65
+ ".coverage",
66
+ "htmlcov/**",
67
+ ".tox/**",
68
+ "*.cover",
69
+ "coverage.xml",
70
+ "*.cover",
71
+ ".hypothesis/**",
72
+ ".mypy_cache/**",
73
+ "dmypy.json",
74
+ dmypy.json",
75
+ ".pytest_cache/**",
76
+ "nosetests.xml",
77
+ "coverage.xml",
78
+ "*.cover",
79
+ ".hypothesis/**",
80
+ ".cache/**",
81
+ "*.pid",
82
+ "*.seed",
83
+ "*.pid.lock",
84
+ ".nyc_output",
85
+ ".grunt",
86
+ ".bower",
87
+ ".lock-wscript",
88
+ "build/Release",
89
+ "jspm_packages/",
90
+ "typings",
91
+ ".npm",
92
+ ".eslintcache",
93
+ ".stylelintcache",
94
+ "*.tsbuildinfo",
95
+ ".rsync_user",
96
+ ".vscode-test",
97
+ ]
98
+
99
+ # File patterns to include by default
100
+ DEFAULT_INCLUDE_PATTERNS = [
101
+ "*.py",
102
+ "*.js",
103
+ "*.ts",
104
+ "*.jsx",
105
+ "*.tsx",
106
+ "*.java",
107
+ "*.cpp",
108
+ "*.c",
109
+ "*.cs",
110
+ "*.go",
111
+ "*.rs",
112
+ "*.php",
113
+ "*.rb",
114
+ "*.swift",
115
+ "*.kt",
116
+ "*.scala",
117
+ "*.r",
118
+ "*.m",
119
+ "*.sh",
120
+ "*.bash",
121
+ "*.zsh",
122
+ "*.fish",
123
+ "*.ps1",
124
+ "*.bat",
125
+ "*.sql",
126
+ "*.html",
127
+ "*.htm",
128
+ "*.xml",
129
+ "*.css",
130
+ "*.scss",
131
+ "*.sass",
132
+ "*.less",
133
+ "*.json",
134
+ "*.yaml",
135
+ "*.yml",
136
+ "*.toml",
137
+ "*.ini",
138
+ "*.cfg",
139
+ "*.conf",
140
+ "*.md",
141
+ "*.rst",
142
+ "*.txt",
143
+ "*.dockerfile",
144
+ "*.gitignore",
145
+ "*.gitattributes",
146
+ "*.editorconfig",
147
+ "*.eslintrc",
148
+ "*.prettierrc",
149
+ "*.babelrc",
150
+ "*.tsconfig",
151
+ "package.json",
152
+ "requirements.txt",
153
+ "Pipfile",
154
+ "poetry.lock",
155
+ "pyproject.toml",
156
+ "Cargo.toml",
157
+ "go.mod",
158
+ "composer.json",
159
+ "Gemfile",
160
+ "pom.xml",
161
+ "build.gradle",
162
+ "CMakeLists.txt",
163
+ "Makefile",
164
+ "Dockerfile",
165
+ "docker-compose.yml",
166
+ ]
167
+
168
+ # Language comment patterns for cleaning
169
+ COMMENT_PATTERNS = {
170
+ 'python': [r'#.*$', r'""".*?"""', r"'''.*?'''"],
171
+ 'javascript': [r'//.*$', r'/\*.*?\*/'],
172
+ 'java': [r'//.*$', r'/\*.*?\*/'],
173
+ 'cpp': [r'//.*$', r'/\*.*?\*/'],
174
+ 'c': [r'//.*$', r'/\*.*?\*/'],
175
+ 'cs': [r'//.*$', r'/\*.*?\*/'],
176
+ 'go': [r'//.*$', r'/\*.*?\*/'],
177
+ 'rs': [r'//.*$', r'/\*.*?\*/'],
178
+ 'php': [r'//.*$', r'#.*$', r'/\*.*?\*/'],
179
+ 'ruby': [r'#.*$', r'=begin.*?=end'],
180
+ 'shell': [r'#.*$'],
181
+ 'sql': [r'--.*$', r'/\*.*?\*/'],
182
+ 'html': [r'<!--.*?-->'],
183
+ 'xml': [r'<!--.*?-->'],
184
+ 'css': [r'/\*.*?\*/'],
185
+ }
186
+
187
+ # Token estimation multipliers for different languages
188
+ TOKEN_MULTIPLIERS = {
189
+ 'python': 0.25,
190
+ 'javascript': 0.3,
191
+ 'java': 0.25,
192
+ 'cpp': 0.25,
193
+ 'c': 0.25,
194
+ 'cs': 0.25,
195
+ 'go': 0.25,
196
+ 'rs': 0.25,
197
+ 'php': 0.3,
198
+ 'ruby': 0.25,
199
+ 'shell': 0.3,
200
+ 'sql': 0.25,
201
+ 'html': 0.2,
202
+ 'xml': 0.2,
203
+ 'css': 0.25,
204
+ 'json': 0.15,
205
+ 'yaml': 0.2,
206
+ 'markdown': 0.2,
207
+ 'text': 0.25,
208
+ 'default': 0.25,
209
+ }
210
+
211
+ # Rate limiting
212
+ MAX_REQUESTS_PER_MINUTE = 60
213
+ REQUEST_TIMEOUT = 30
214
+
215
+ # UI Configuration
216
+ THEME_COLORS = {
217
+ 'primary': '#3070f0',
218
+ 'secondary': '#64748b',
219
+ 'success': '#10b981',
220
+ 'warning': '#f59e0b',
221
+ 'error': '#ef4444',
222
+ 'background': '#ffffff',
223
+ 'surface': '#f8fafc',
224
+ 'text': '#1e293b',
225
+ 'text_secondary': '#64748b',
226
+ }
227
+
228
+ # Progress tracking
229
+ PROGRESS_STEPS = [
230
+ (0.0, "Initializing..."),
231
+ (0.1, "Fetching repository information..."),
232
+ (0.2, "Scanning files..."),
233
+ (0.3, "Processing files..."),
234
+ (0.5, "Analyzing content..."),
235
+ (0.7, "Generating output..."),
236
+ (0.9, "Finalizing..."),
237
+ (1.0, "Complete!"),
238
+ ]
models.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import base64
3
+ import json
4
+ import zipfile
5
+ import io
6
+ import os
7
+ from typing import List, Dict, Tuple, Optional
8
+ from pathlib import Path
9
+ import re
10
+
11
+ from utils import matches_patterns, is_binary_file, format_file_size
12
+ from config import GITHUB_API_BASE, HF_API_BASE
13
+
14
+ def process_github_repo(
15
+ repo_path: str,
16
+ token: str,
17
+ include_patterns: List[str],
18
+ exclude_patterns: List[str],
19
+ max_file_size: int
20
+ ) -> Tuple[List[Tuple[str, str, int]], Dict]:
21
+ """Process GitHub repository and return file contents"""
22
+
23
+ headers = {}
24
+ if token:
25
+ headers['Authorization'] = f'token {token}'
26
+
27
+ # Get repository info
28
+ repo_url = f"{GITHUB_API_BASE}/repos/{repo_path}"
29
+ repo_response = requests.get(repo_url, headers=headers)
30
+
31
+ if repo_response.status_code != 200:
32
+ raise Exception(f"Failed to fetch repository info: {repo_response.json().get('message', 'Unknown error')}")
33
+
34
+ repo_info = repo_response.json()
35
+
36
+ # Get all files recursively
37
+ files_data = []
38
+ contents_queue = [""]
39
+
40
+ while contents_queue:
41
+ current_path = contents_queue.pop(0)
42
+
43
+ # Get directory contents
44
+ contents_url = f"{GITHUB_API_BASE}/repos/{repo_path}/contents/{current_path}"
45
+ contents_response = requests.get(contents_url, headers=headers)
46
+
47
+ if contents_response.status_code != 200:
48
+ continue
49
+
50
+ contents = contents_response.json()
51
+
52
+ if isinstance(contents, dict):
53
+ # Single file
54
+ contents = [contents]
55
+
56
+ for item in contents:
57
+ item_path = f"{current_path}/{item['name']}" if current_path else item['name']
58
+
59
+ if item['type'] == 'dir':
60
+ contents_queue.append(item_path)
61
+ elif item['type'] == 'file':
62
+ # Check if file matches patterns
63
+ if not matches_patterns(item_path, include_patterns, exclude_patterns):
64
+ continue
65
+
66
+ # Check file size
67
+ if item['size'] > max_file_size:
68
+ continue
69
+
70
+ # Get file content
71
+ try:
72
+ file_url = item['url']
73
+ file_response = requests.get(file_url, headers=headers)
74
+
75
+ if file_response.status_code == 200:
76
+ file_data = file_response.json()
77
+ content = base64.b64decode(file_data['content']).decode('utf-8', errors='ignore')
78
+
79
+ # Skip binary files
80
+ if is_binary_file(content, item_path):
81
+ continue
82
+
83
+ files_data.append((item_path, content, item['size']))
84
+
85
+ except Exception as e:
86
+ print(f"Error processing file {item_path}: {e}")
87
+ continue
88
+
89
+ return files_data, repo_info
90
+
91
+ def process_huggingface_repo(
92
+ repo_path: str,
93
+ token: str,
94
+ include_patterns: List[str],
95
+ exclude_patterns: List[str],
96
+ max_file_size: int
97
+ ) -> Tuple[List[Tuple[str, str, int]], Dict]:
98
+ """Process Hugging Face repository and return file contents"""
99
+
100
+ headers = {}
101
+ if token:
102
+ headers['Authorization'] = f'Bearer {token}'
103
+
104
+ # Get repository info
105
+ repo_url = f"{HF_API_BASE}/api/models/{repo_path}"
106
+ repo_response = requests.get(repo_url, headers=headers)
107
+
108
+ if repo_response.status_code != 200:
109
+ raise Exception(f"Failed to fetch repository info: {repo_response.json().get('error', 'Unknown error')}")
110
+
111
+ repo_info = repo_response.json()
112
+
113
+ # Get repository tree
114
+ tree_url = f"{HF_API_BASE}/api/models/{repo_path}/tree/main"
115
+ tree_response = requests.get(tree_url, headers=headers)
116
+
117
+ if tree_response.status_code != 200:
118
+ raise Exception(f"Failed to fetch repository tree: {tree_response.json().get('error', 'Unknown error')}")
119
+
120
+ tree_data = tree_response.json()
121
+
122
+ files_data = []
123
+
124
+ def process_tree_item(item, current_path=""):
125
+ if isinstance(item, list):
126
+ for subitem in item:
127
+ process_tree_item(subitem, current_path)
128
+ elif isinstance(item, dict):
129
+ item_path = f"{current_path}/{item['path']}" if current_path else item['path']
130
+
131
+ if item['type'] == 'directory':
132
+ # Get directory contents
133
+ dir_url = f"{HF_API_BASE}/api/models/{repo_path}/tree/main/{item_path}"
134
+ dir_response = requests.get(dir_url, headers=headers)
135
+
136
+ if dir_response.status_code == 200:
137
+ process_tree_item(dir_response.json(), item_path)
138
+ elif item['type'] == 'file':
139
+ # Check if file matches patterns
140
+ if not matches_patterns(item_path, include_patterns, exclude_patterns):
141
+ return
142
+
143
+ # Check file size
144
+ if item.get('size', 0) > max_file_size:
145
+ return
146
+
147
+ # Get file content
148
+ try:
149
+ raw_url = f"https://huggingface.co/{repo_path}/raw/main/{item_path}"
150
+ file_response = requests.get(raw_url, headers=headers)
151
+
152
+ if file_response.status_code == 200:
153
+ content = file_response.text
154
+
155
+ # Skip binary files
156
+ if is_binary_file(content, item_path):
157
+ return
158
+
159
+ files_data.append((item_path, content, len(content)))
160
+
161
+ except Exception as e:
162
+ print(f"Error processing file {item_path}: {e}")
163
+ return
164
+
165
+ process_tree_item(tree_data)
166
+
167
+ return files_data, repo_info
168
+
169
+ def download_repo_as_zip(repo_url: str, token: str) -> str:
170
+ """Download repository as ZIP file"""
171
+
172
+ if "github.com" in repo_url:
173
+ # GitHub ZIP URL
174
+ if token:
175
+ headers = {'Authorization': f'token {token}'}
176
+ zip_url = repo_url.replace("github.com", "api.github.com/repos") + "/zipball/main"
177
+ else:
178
+ headers = {}
179
+ zip_url = repo_url.replace("github.com", "codeload.github.com") + "/zip/main"
180
+ elif "huggingface.co" in repo_url:
181
+ # Hugging Face ZIP URL
182
+ headers = {}
183
+ if token:
184
+ headers['Authorization'] = f'Bearer {token}'
185
+ zip_url = repo_url.replace("huggingface.co", "huggingface.co") + "/resolve/main?download=true"
186
+ else:
187
+ raise ValueError("Unsupported repository URL")
188
+
189
+ response = requests.get(zip_url, headers=headers, stream=True)
190
+
191
+ if response.status_code != 200:
192
+ raise Exception(f"Failed to download repository: {response.status_code}")
193
+
194
+ # Save to temporary file
195
+ temp_path = f"/tmp/repo_{hash(repo_url)}.zip"
196
+
197
+ with open(temp_path, 'wb') as f:
198
+ for chunk in response.iter_content(chunk_size=8192):
199
+ f.write(chunk)
200
+
201
+ return temp_path
202
+
203
+ def extract_repo_info(repo_url: str, repo_type: str) -> Dict:
204
+ """Extract basic repository information"""
205
+ if repo_type == "github":
206
+ # Extract owner and repo name
207
+ match = re.search(r'github\.com/([^/]+)/([^/]+)', repo_url)
208
+ if match:
209
+ return {
210
+ 'owner': match.group(1),
211
+ 'repo': match.group(2),
212
+ 'full_name': f"{match.group(1)}/{match.group(2)}",
213
+ 'url': repo_url
214
+ }
215
+ elif repo_type == "huggingface":
216
+ # Extract owner and repo name
217
+ match = re.search(r'huggingface\.co/([^/]+)/([^/]+)', repo_url)
218
+ if match:
219
+ return {
220
+ 'owner': match.group(1),
221
+ 'repo': match.group(2),
222
+ 'full_name': f"{match.group(1)}/{match.group(2)}",
223
+ 'url': repo_url
224
+ }
225
+
226
+ return {'url': repo_url}
requirements.txt ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ requests>=2.31.0
3
+ python-multipart>=0.0.6
4
+ pathlib>=1.0.1
5
+ re>=2.2.1
6
+ hashlib>=20081119
7
+ zipfile>=0.5
8
+ io>=0.1
9
+ datetime>=4.3
10
+ mimetypes>=0.1
11
+ fnmatch>=2.4.3
12
+ base64>=0.1
13
+ json>=2.0.9
14
+
15
+ This Gradio application provides a comprehensive solution for converting GitHub or Hugging Face repositories into text files suitable for LLM training. Here are the key features:
16
+
17
+ ## 🚀 Main Features:
18
+
19
+ 1. **Multi-Platform Support**: Works with both GitHub and Hugging Face repositories
20
+ 2. **Smart File Filtering**: Include/exclude patterns to process only relevant files
21
+ 3. **Token Estimation**: Provides rough token counts for training planning
22
+ 4. **Chunked Output**: Splits large repositories into manageable chunks
23
+ 5. **Comment Removal**: Optional comment stripping for cleaner training data
24
+ 6. **Binary File Detection**: Automatically skips binary files
25
+ 7. **Language Detection**: Identifies programming languages for better organization
26
+ 8. **Progress Tracking**: Real-time progress updates during processing
27
+
28
+ ## 🛠️ Advanced Options:
29
+
30
+ - File size limits to prevent processing huge files
31
+ - Pattern-based filtering (glob patterns supported)
32
+ - Chunk size customization
33
+ - Metadata inclusion
34
+ - Private repository support with tokens
35
+ - ZIP download option
36
+
37
+ ## 📊 Output Features:
38
+
39
+ - Repository metadata and statistics
40
+ - File headers with path, size, and language info
41
+ - Token and character counts
42
+ - Formatted, readable output structure
43
+ - Error handling and status messages
44
+
45
+ The application is designed to handle repositories of various sizes while providing useful feedback and statistics about the processed content. It's perfect for preparing code repositories for LLM fine-tuning or analysis.
utils.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import hashlib
3
+ from typing import List, Dict, Optional
4
+ import mimetypes
5
+
6
+ def clean_code_content(content: str, file_path: str) -> str:
7
+ """Remove comments from code files while preserving structure"""
8
+ ext = file_path.split('.')[-1].lower()
9
+
10
+ # Language-specific comment patterns
11
+ comment_patterns = {
12
+ 'py': [
13
+ (r'#.*$', ''), # Single line comments
14
+ (r'""".*?"""', '', re.DOTALL), # Triple quotes
15
+ (r"'''.*?'''", '', re.DOTALL),
16
+ ],
17
+ 'js': [
18
+ (r'//.*$', ''), # Single line comments
19
+ (r'/\*.*?\*/', '', re.DOTALL), # Multi-line comments
20
+ ],
21
+ 'java': [
22
+ (r'//.*$', ''),
23
+ (r'/\*.*?\*/', '', re.DOTALL),
24
+ ],
25
+ 'cpp': [
26
+ (r'//.*$', ''),
27
+ (r'/\*.*?\*/', '', re.DOTALL),
28
+ ],
29
+ 'c': [
30
+ (r'//.*$', ''),
31
+ (r'/\*.*?\*/', '', re.DOTALL),
32
+ ],
33
+ 'cs': [
34
+ (r'//.*$', ''),
35
+ (r'/\*.*?\*/', '', re.DOTALL),
36
+ ],
37
+ 'go': [
38
+ (r'//.*$', ''),
39
+ (r'/\*.*?\*/', '', re.DOTALL),
40
+ ],
41
+ 'rs': [
42
+ (r'//.*$', ''),
43
+ (r'/\*.*?\*/', '', re.DOTALL),
44
+ ],
45
+ 'php': [
46
+ (r'//.*$', ''),
47
+ (r'#.*$', ''),
48
+ (r'/\*.*?\*/', '', re.DOTALL),
49
+ ],
50
+ 'rb': [
51
+ (r'#.*$', ''),
52
+ (r'=begin.*?=end', '', re.DOTALL),
53
+ ],
54
+ 'sh': [
55
+ (r'#.*$', ''),
56
+ ],
57
+ 'sql': [
58
+ (r'--.*$', ''),
59
+ (r'/\*.*?\*/', '', re.DOTALL),
60
+ ],
61
+ 'html': [
62
+ (r'<!--.*?-->', '', re.DOTALL),
63
+ ],
64
+ 'xml': [
65
+ (r'<!--.*?-->', '', re.DOTALL),
66
+ ],
67
+ 'css': [
68
+ (r'/\*.*?\*/', '', re.DOTALL),
69
+ ],
70
+ }
71
+
72
+ if ext in comment_patterns:
73
+ content = content.strip()
74
+ for pattern, replacement, *flags in comment_patterns[ext]:
75
+ flags = flags[0] if flags else 0
76
+ content = re.sub(pattern, replacement, content, flags=flags)
77
+
78
+ # Clean up extra whitespace
79
+ content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
80
+ content = content.strip()
81
+
82
+ return content
83
+
84
# Maps a lowercased extension (or bare filename such as "makefile", since a
# dot-less name splits to itself) to a human-readable language label.
# Built once at import time instead of on every call; duplicate keys that
# appeared in the original literal (nim, v, lua, moon, elm) are removed —
# they carried identical values, so lookups are unchanged.
_LANGUAGE_MAP = {
    'py': 'Python',
    'js': 'JavaScript',
    'ts': 'TypeScript',
    'jsx': 'React JSX',
    'tsx': 'React TSX',
    'java': 'Java',
    'cpp': 'C++',
    'c': 'C',
    'cs': 'C#',
    'go': 'Go',
    'rs': 'Rust',
    'php': 'PHP',
    'rb': 'Ruby',
    'swift': 'Swift',
    'kt': 'Kotlin',
    'scala': 'Scala',
    'r': 'R',
    'm': 'Objective-C',
    'sh': 'Shell',
    'bash': 'Bash',
    'zsh': 'Zsh',
    'fish': 'Fish',
    'ps1': 'PowerShell',
    'bat': 'Batch',
    'sql': 'SQL',
    'html': 'HTML',
    'htm': 'HTML',
    'xml': 'XML',
    'css': 'CSS',
    'scss': 'SCSS',
    'sass': 'SASS',
    'less': 'LESS',
    'json': 'JSON',
    'yaml': 'YAML',
    'yml': 'YAML',
    'toml': 'TOML',
    'ini': 'INI',
    'cfg': 'Config',
    'conf': 'Config',
    'md': 'Markdown',
    'rst': 'reStructuredText',
    'txt': 'Text',
    'log': 'Log',
    'dockerfile': 'Docker',
    'docker': 'Docker',
    'gitignore': 'Git',
    'gitattributes': 'Git',
    'editorconfig': 'EditorConfig',
    'eslintrc': 'ESLint',
    'prettierrc': 'Prettier',
    'babelrc': 'Babel',
    'tsconfig': 'TypeScript',
    'package': 'NPM',
    'lock': 'Lock',
    'requirements': 'Python',
    'pipfile': 'Python',
    'poetry': 'Python',
    'makefile': 'Make',
    'cmake': 'CMake',
    'gradle': 'Gradle',
    'pom': 'Maven',
    'sbt': 'SBT',
    'vue': 'Vue',
    'svelte': 'Svelte',
    'elm': 'Elm',
    'pug': 'Pug',
    'haml': 'Haml',
    'erb': 'ERB',
    'ejs': 'EJS',
    'twig': 'Twig',
    'liquid': 'Liquid',
    'handlebars': 'Handlebars',
    'mustache': 'Mustache',
    'jinja': 'Jinja',
    'tex': 'LaTeX',
    'bib': 'BibTeX',
    'plt': 'Gnuplot',
    'dot': 'Graphviz',
    'mermaid': 'Mermaid',
    'drawio': 'DrawIO',
    'puml': 'PlantUML',
    'wsdl': 'WSDL',
    'xsd': 'XSD',
    'xslt': 'XSLT',
    'graphql': 'GraphQL',
    'proto': 'Protocol Buffers',
    'avro': 'Avro',
    'parquet': 'Parquet',
    'arrow': 'Arrow',
    'feather': 'Feather',
    'hdf5': 'HDF5',
    'netcdf': 'NetCDF',
    'matlab': 'MATLAB',
    'mex': 'MATLAB',
    'fig': 'MATLAB',
    'slx': 'Simulink',
    'simulink': 'Simulink',
    'labview': 'LabVIEW',
    'vi': 'LabVIEW',
    'lvproj': 'LabVIEW',
    'lvlib': 'LabVIEW',
    'stata': 'Stata',
    'do': 'Stata',
    'ado': 'Stata',
    'spss': 'SPSS',
    'sav': 'SPSS',
    'sas': 'SAS',
    's7dat': 'SAS',
    's7bdat': 'SAS',
    'xpt': 'SAS',
    'dta': 'Stata',
    'rdata': 'R',
    'rds': 'R',
    'rda': 'R',
    'jl': 'Julia',
    'nim': 'Nim',
    'zig': 'Zig',
    'v': 'V',
    'ada': 'Ada',
    'adb': 'Ada',
    'ads': 'Ada',
    'pas': 'Pascal',
    'pp': 'Pascal',
    'dpr': 'Pascal',
    'lpr': 'Pascal',
    'dfm': 'Pascal',
    'pl': 'Perl',
    'pm': 'Perl',
    't': 'Perl',
    'pod': 'Perl',
    'lua': 'Lua',
    'moon': 'MoonScript',
    'el': 'Emacs Lisp',
    'elc': 'Emacs Lisp',
    'elisp': 'Emacs Lisp',
    'cl': 'Common Lisp',
    'lisp': 'Common Lisp',
    'lsp': 'Common Lisp',
    'fasl': 'Common Lisp',
    'ss': 'Scheme',
    'scm': 'Scheme',
    'rkt': 'Scheme',
    'sch': 'Scheme',
    'fs': 'F#',
    'fsi': 'F#',
    'fsx': 'F#',
    'fsscript': 'F#',
    'ml': 'OCaml',
    'mli': 'OCaml',
    'll': 'LLVM',
    'bc': 'LLVM',
    'nimble': 'Nim',
    'nims': 'Nim',
    'vsh': 'V',
    'vv': 'V',
    'vh': 'V',
    'd': 'D',
    'di': 'D',
    'dart': 'Dart',
    'groovy': 'Groovy',
    'gvy': 'Groovy',
    'gy': 'Groovy',
    'gsh': 'Groovy',
    'clj': 'Clojure',
    'cljs': 'ClojureScript',
    'cljc': 'Clojure',
    'edn': 'Clojure',
    'coffee': 'CoffeeScript',
    'litcoffee': 'CoffeeScript',
    'cjsx': 'Cjsx',
    'iced': 'IcedCoffeeScript',
    'hx': 'Haxe',
    'hxml': 'Haxe',
    'purs': 'PureScript',
    'p8': 'Pico-8',
    'wren': 'Wren',
    'earl-grey': 'Earl Grey',
    'eg': 'Earl Grey',
    'tsv': 'TSV',
    'csv': 'CSV',
}


def get_file_language(file_path: str) -> str:
    """Determine the programming language from a file's extension.

    Args:
        file_path: Path or bare filename; only the text after the last
            dot is consulted (a dot-less name is used whole, so e.g.
            "Makefile" resolves via the 'makefile' entry).

    Returns:
        A human-readable language name, or the upper-cased extension
        for unknown types.
    """
    ext = file_path.split('.')[-1].lower()
    return _LANGUAGE_MAP.get(ext, ext.upper())
277
+
278
def estimate_tokens(text: str) -> int:
    """Roughly estimate the LLM token count of *text*.

    Uses the common ~4-characters-per-token heuristic for English prose;
    code tokenizes less predictably, so treat this as a planning figure,
    not an exact count.
    """
    chars_per_token = 4
    return len(text) // chars_per_token
283
+
284
def create_chunked_output(content: str, chunk_size: int) -> List[str]:
    """Split *content* into line-aligned chunks of at most *chunk_size* chars.

    Lines are never broken; a single line longer than the limit becomes its
    own oversized chunk. Empty input yields an empty list.
    """
    pieces: List[str] = []
    acc = ""

    for row in content.split('\n'):
        # +1 accounts for the newline that would join `row` onto `acc`.
        fits = len(acc) + len(row) + 1 <= chunk_size
        if not fits:
            if acc:
                pieces.append(acc)
            acc = row
        else:
            acc = f"{acc}\n{row}" if acc else row

    if acc:
        pieces.append(acc)
    return pieces
306
+
307
def matches_patterns(file_path: str, include_patterns: List[str], exclude_patterns: List[str]) -> bool:
    """Decide whether *file_path* passes the include/exclude glob filters.

    Exclusions win over inclusions; an empty include list admits every
    file that is not excluded. Each pattern is also tried with a
    leading "**/" so bare patterns match nested paths.
    """
    import fnmatch

    def hits(pattern: str) -> bool:
        # Match the path directly, or anywhere below a directory.
        return (fnmatch.fnmatch(file_path, pattern)
                or fnmatch.fnmatch(file_path, f"**/{pattern}"))

    if any(hits(p) for p in exclude_patterns):
        return False

    return not include_patterns or any(hits(p) for p in include_patterns)
326
+
327
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string (e.g. "2.0 KB")."""
    value = float(size_bytes)
    for unit in ('B', 'KB', 'MB', 'GB'):
        if value < 1024.0:
            return f"{value:.1f} {unit}"
        value /= 1024.0
    # Anything at or beyond 1024 GB is reported in terabytes.
    return f"{value:.1f} TB"
334
+
335
def generate_file_hash(content: str) -> str:
    """Return the first 16 hex digits of the SHA-256 of *content* (UTF-8)."""
    digest = hashlib.sha256(content.encode())
    return digest.hexdigest()[:16]
338
+
339
def is_binary_file(content: str, file_path: str) -> bool:
    """Heuristically decide whether a file is binary.

    Checks, in order: a known-binary extension, a NUL byte in the first
    1 KiB, and finally the ratio of printable characters in that sample.

    Args:
        content: File text (already decoded to ``str`` by the caller).
        file_path: Used only for its extension.

    Returns:
        True when the file should be skipped as binary. Empty files are
        treated as text (the original crashed with ZeroDivisionError on
        empty content; this guard fixes that).
    """
    # Extensions that are always treated as binary, regardless of content.
    binary_extensions = {
        'png', 'jpg', 'jpeg', 'gif', 'bmp', 'ico', 'svg', 'webp',
        'mp3', 'mp4', 'avi', 'mov', 'wav', 'flac', 'ogg',
        'zip', 'rar', 'tar', 'gz', '7z', 'bz2', 'xz',
        'exe', 'dll', 'so', 'dylib',
        'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
        'ttf', 'otf', 'woff', 'woff2', 'eot',
        'bin', 'dat', 'db', 'sqlite', 'sqlite3',
    }

    ext = file_path.split('.')[-1].lower()
    if ext in binary_extensions:
        return True

    sample = content[:1024]
    if not sample:
        # Empty file: nothing to inspect, treat as text.
        return False

    # NUL bytes are a strong binary indicator.
    if '\0' in sample:
        return True

    # Mostly non-printable content (below 70% printable) → binary.
    printable_chars = sum(1 for c in sample if c.isprintable() or c in '\t\n\r')
    if printable_chars / len(sample) < 0.7:
        return True

    return False