AiCoderv2 committed on
Commit
d79890b
·
verified ·
1 Parent(s): 72dbae2

Deploy Gradio app with multiple files

Browse files
Files changed (5) hide show
  1. app.py +441 -0
  2. config.py +238 -0
  3. models.py +226 -0
  4. requirements.txt +45 -0
  5. utils.py +365 -0
app.py ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import os
4
+ import base64
5
+ import json
6
+ import re
7
+ from pathlib import Path
8
+ from typing import List, Dict, Optional, Tuple
9
+ import zipfile
10
+ import io
11
+ from datetime import datetime
12
+ import math
13
+
14
+ from utils import (
15
+ clean_code_content,
16
+ get_file_language,
17
+ estimate_tokens,
18
+ create_chunked_output
19
+ )
20
+ from models import (
21
+ process_github_repo,
22
+ process_huggingface_repo,
23
+ download_repo_as_zip
24
+ )
25
+ from config import (
26
+ SUPPORTED_EXTENSIONS,
27
+ MAX_FILE_SIZE,
28
+ MAX_TOTAL_SIZE,
29
+ CHUNK_SIZE,
30
+ GITHUB_API_BASE,
31
+ HF_API_BASE
32
+ )
33
+
34
+ # CSS for better UI
35
+ css = """
36
+ .container {
37
+ max-width: 1200px;
38
+ margin: 0 auto;
39
+ }
40
+ .progress-bar {
41
+ height: 20px;
42
+ background: linear-gradient(90deg, #4CAF50, #45a049);
43
+ border-radius: 10px;
44
+ transition: width 0.3s ease;
45
+ }
46
+ .file-stats {
47
+ background: #f0f0f0;
48
+ padding: 10px;
49
+ border-radius: 5px;
50
+ margin: 10px 0;
51
+ }
52
+ .warning {
53
+ background: #fff3cd;
54
+ border: 1px solid #ffeaa7;
55
+ padding: 10px;
56
+ border-radius: 5px;
57
+ color: #856404;
58
+ }
59
+ .error {
60
+ background: #f8d7da;
61
+ border: 1px solid #f5c6cb;
62
+ padding: 10px;
63
+ border-radius: 5px;
64
+ color: #721c24;
65
+ }
66
+ .success {
67
+ background: #d4edda;
68
+ border: 1px solid #c3e6cb;
69
+ padding: 10px;
70
+ border-radius: 5px;
71
+ color: #155724;
72
+ }
73
+ """
74
+
75
+ def validate_repo_url(url: str) -> Tuple[str, str]:
76
+ """Validate and determine repository type and owner/name"""
77
+ url = url.strip()
78
+
79
+ # GitHub URL patterns
80
+ github_patterns = [
81
+ r'github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$',
82
+ r'api\.github\.com/repos/([^/]+)/([^/]+)'
83
+ ]
84
+
85
+ # Hugging Face URL patterns
86
+ hf_patterns = [
87
+ r'huggingface\.co/([^/]+)/([^/]+?)(?:\.git)?/?$',
88
+ r'hf\.co/([^/]+)/([^/]+?)(?:\.git)?/?$'
89
+ ]
90
+
91
+ for pattern in github_patterns:
92
+ match = re.search(pattern, url)
93
+ if match:
94
+ return "github", f"{match.group(1)}/{match.group(2)}"
95
+
96
+ for pattern in hf_patterns:
97
+ match = re.search(pattern, url)
98
+ if match:
99
+ return "huggingface", f"{match.group(1)}/{match.group(2)}"
100
+
101
+ raise ValueError("Invalid repository URL. Please provide a valid GitHub or Hugging Face repository URL.")
102
+
103
+ def process_repository(
104
+ repo_url: str,
105
+ token: str = "",
106
+ include_patterns: str = "",
107
+ exclude_patterns: str = "",
108
+ max_file_size_mb: int = 10,
109
+ chunk_size: int = 50000,
110
+ include_metadata: bool = True,
111
+ remove_comments: bool = False,
112
+ progress=gr.Progress()
113
+ ) -> Tuple[str, str, str]:
114
+ """Main function to process repository and generate text file"""
115
+
116
+ try:
117
+ # Validate URL and get repo info
118
+ repo_type, repo_path = validate_repo_url(repo_url)
119
+
120
+ # Parse include/exclude patterns
121
+ include_list = [p.strip() for p in include_patterns.split(",") if p.strip()] if include_patterns else []
122
+ exclude_list = [p.strip() for p in exclude_patterns.split(",") if p.strip()] if exclude_patterns else []
123
+
124
+ progress(0.1, desc="Fetching repository information...")
125
+
126
+ # Process repository based on type
127
+ if repo_type == "github":
128
+ files_data, repo_info = process_github_repo(
129
+ repo_path,
130
+ token,
131
+ include_list,
132
+ exclude_list,
133
+ max_file_size_mb * 1024 * 1024
134
+ )
135
+ else: # huggingface
136
+ files_data, repo_info = process_huggingface_repo(
137
+ repo_path,
138
+ token,
139
+ include_list,
140
+ exclude_list,
141
+ max_file_size_mb * 1024 * 1024
142
+ )
143
+
144
+ if not files_data:
145
+ return "", "⚠️ No files found matching the criteria.", ""
146
+
147
+ progress(0.3, desc="Processing files...")
148
+
149
+ # Generate consolidated text
150
+ total_files = len(files_data)
151
+ processed_files = 0
152
+ total_tokens = 0
153
+ total_chars = 0
154
+
155
+ # Create header
156
+ header_lines = []
157
+ if include_metadata:
158
+ header_lines.append("=" * 80)
159
+ header_lines.append(f"REPOSITORY: {repo_info.get('full_name', repo_path)}")
160
+ header_lines.append(f"DESCRIPTION: {repo_info.get('description', 'No description')}")
161
+ header_lines.append(f"URL: {repo_url}")
162
+ header_lines.append(f"PROCESSED: {datetime.now().isoformat()}")
163
+ header_lines.append(f"TOTAL FILES: {total_files}")
164
+ header_lines.append("=" * 80)
165
+ header_lines.append("")
166
+
167
+ content_parts = ["\n".join(header_lines)]
168
+
169
+ # Process each file
170
+ for i, (file_path, content, file_size) in enumerate(files_data):
171
+ progress(0.3 + (0.5 * i / total_files), desc=f"Processing file {i+1}/{total_files}")
172
+
173
+ # Clean content if requested
174
+ if remove_comments:
175
+ content = clean_code_content(content, file_path)
176
+
177
+ # Add file header
178
+ file_header = f"\n{'-' * 60}\n"
179
+ file_header += f"FILE: {file_path}\n"
180
+ file_header += f"SIZE: {file_size:,} bytes\n"
181
+ file_header += f"LANGUAGE: {get_file_language(file_path)}\n"
182
+ file_header += f"{'-' * 60}\n\n"
183
+
184
+ # Add content
185
+ file_content = file_header + content + "\n\n"
186
+
187
+ # Check if adding this file would exceed chunk size
188
+ if len("\n".join(content_parts + [file_content])) > chunk_size:
189
+ # Save current chunk
190
+ yield "\n".join(content_parts), generate_stats(processed_files, total_tokens, total_chars, total_files), "success"
191
+ # Start new chunk
192
+ content_parts = [file_header + "\n".join(header_lines)]
193
+
194
+ content_parts.append(file_content)
195
+ processed_files += 1
196
+ total_chars += len(content)
197
+ total_tokens += estimate_tokens(content)
198
+
199
+ progress(0.9, desc="Finalizing...")
200
+
201
+ # Final content
202
+ final_content = "\n".join(content_parts)
203
+
204
+ # Add footer
205
+ if include_metadata:
206
+ footer = f"\n{'=' * 80}\n"
207
+ footer += f"SUMMARY:\n"
208
+ footer += f"- Files processed: {processed_files}\n"
209
+ footer += f"- Total characters: {total_chars:,}\n"
210
+ footer += f"- Estimated tokens: {total_tokens:,}\n"
211
+ footer += f"- Repository: {repo_info.get('full_name', repo_path)}\n"
212
+ footer += f"{'=' * 80}\n"
213
+ final_content += footer
214
+
215
+ progress(1.0, desc="Complete!")
216
+
217
+ return final_content, generate_stats(processed_files, total_tokens, total_chars, total_files), "success"
218
+
219
+ except Exception as e:
220
+ error_msg = f"❌ Error: {str(e)}"
221
+ return "", error_msg, "error"
222
+
223
+ def generate_stats(files_processed: int, tokens: int, chars: int, total_files: int) -> str:
224
+ """Generate statistics HTML"""
225
+ stats_html = f"""
226
+ <div class="file-stats">
227
+ <h3>📊 Processing Statistics</h3>
228
+ <p><strong>Files Processed:</strong> {files_processed:,} / {total_files:,}</p>
229
+ <p><strong>Total Characters:</strong> {chars:,}</p>
230
+ <p><strong>Estimated Tokens:</strong> {tokens:,}</p>
231
+ <p><strong>Average Tokens per File:</strong> {tokens // max(files_processed, 1):,}</p>
232
+ </div>
233
+ """
234
+ return stats_html
235
+
236
+ def download_repo_locally(repo_url: str, token: str = "") -> str:
237
+ """Download repository as ZIP for local processing"""
238
+ try:
239
+ repo_type, repo_path = validate_repo_url(repo_url)
240
+
241
+ if repo_type == "github":
242
+ return download_repo_as_zip(f"github.com/{repo_path}", token)
243
+ else:
244
+ return download_repo_as_zip(f"huggingface.co/{repo_path}", token)
245
+
246
+ except Exception as e:
247
+ return f"Error downloading repository: {str(e)}"
248
+
249
+ # Create Gradio interface
250
+ def create_interface():
251
+ with gr.Blocks(
252
+ title="Repo-to-Text Converter",
253
+ theme=gr.themes.Soft(),
254
+ css=css
255
+ ) as demo:
256
+
257
+ gr.Markdown("""
258
+ # 📚 Repository to Text Converter
259
+
260
+ Convert GitHub or Hugging Face repositories into formatted text files perfect for LLM training.
261
+
262
+ **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
263
+ """)
264
+
265
+ with gr.Row():
266
+ with gr.Column(scale=2):
267
+ # Input section
268
+ gr.Markdown("## 📥 Repository Input")
269
+
270
+ repo_url = gr.Textbox(
271
+ label="Repository URL",
272
+ placeholder="https://github.com/username/repo or https://huggingface.co/username/repo",
273
+ lines=2
274
+ )
275
+
276
+ token = gr.Textbox(
277
+ label="Access Token (Optional)",
278
+ placeholder="GitHub token or Hugging Face token for private repos",
279
+ type="password"
280
+ )
281
+
282
+ with gr.Accordion("🔧 Advanced Options", open=False):
283
+ include_patterns = gr.Textbox(
284
+ label="Include Patterns (comma-separated)",
285
+ placeholder="*.py,*.md,src/**/*.py",
286
+ info="Only include files matching these patterns"
287
+ )
288
+
289
+ exclude_patterns = gr.Textbox(
290
+ label="Exclude Patterns (comma-separated)",
291
+ placeholder="*.git*,*.log,node_modules/**",
292
+ value="*.git*,*.log,node_modules/**,__pycache__/**,.DS_Store"
293
+ )
294
+
295
+ max_file_size = gr.Slider(
296
+ minimum=1,
297
+ maximum=100,
298
+ value=10,
299
+ step=1,
300
+ label="Max File Size (MB)",
301
+ info="Files larger than this will be skipped"
302
+ )
303
+
304
+ chunk_size = gr.Slider(
305
+ minimum=1000,
306
+ maximum=100000,
307
+ value=50000,
308
+ step=1000,
309
+ label="Chunk Size (characters)",
310
+ info="Split output into chunks of this size"
311
+ )
312
+
313
+ include_metadata = gr.Checkbox(
314
+ value=True,
315
+ label="Include Metadata",
316
+ info="Add repository information and statistics"
317
+ )
318
+
319
+ remove_comments = gr.Checkbox(
320
+ value=False,
321
+ label="Remove Comments",
322
+ info="Strip comments from code files (experimental)"
323
+ )
324
+
325
+ process_btn = gr.Button(
326
+ "🚀 Process Repository",
327
+ variant="primary",
328
+ size="lg"
329
+ )
330
+
331
+ download_btn = gr.Button(
332
+ "⬇️ Download as ZIP",
333
+ variant="secondary"
334
+ )
335
+
336
+ with gr.Column(scale=1):
337
+ # Info section
338
+ gr.Markdown("## ℹ️ Information")
339
+
340
+ gr.Markdown("""
341
+ ### Supported Platforms:
342
+ - ✅ GitHub (public and private)
343
+ - ✅ Hugging Face (public and private)
344
+
345
+ ### Supported File Types:
346
+ - Code files (.py, .js, .java, .cpp, etc.)
347
+ - Documentation (.md, .txt, .rst)
348
+ - Configuration files (.json, .yaml, .toml)
349
+ - And many more!
350
+
351
+ ### Features:
352
+ - 🔄 Chunked output for large repos
353
+ - 📊 Token estimation
354
+ - 🎯 Pattern-based file filtering
355
+ - 🧹 Optional comment removal
356
+ """)
357
+
358
+ # Output section
359
+ gr.Markdown("## 📤 Output")
360
+
361
+ with gr.Row():
362
+ stats_display = gr.HTML(label="Statistics")
363
+
364
+ output_text = gr.Textbox(
365
+ label="Generated Text",
366
+ lines=20,
367
+ max_lines=50,
368
+ show_copy_button=True,
369
+ interactive=True
370
+ )
371
+
372
+ status_display = gr.HTML()
373
+
374
+ # Event handlers
375
+ process_btn.click(
376
+ fn=process_repository,
377
+ inputs=[
378
+ repo_url,
379
+ token,
380
+ include_patterns,
381
+ exclude_patterns,
382
+ max_file_size,
383
+ chunk_size,
384
+ include_metadata,
385
+ remove_comments
386
+ ],
387
+ outputs=[output_text, stats_display, status_display]
388
+ )
389
+
390
+ download_btn.click(
391
+ fn=download_repo_locally,
392
+ inputs=[repo_url, token],
393
+ outputs=gr.File(label="Downloaded Repository")
394
+ )
395
+
396
+ # Examples
397
+ gr.Markdown("## 🎯 Examples")
398
+ gr.Examples(
399
+ examples=[
400
+ [
401
+ "https://github.com/gradio-app/gradio",
402
+ "",
403
+ "*.py,*.md",
404
+ "",
405
+ 10,
406
+ 50000,
407
+ True,
408
+ False
409
+ ],
410
+ [
411
+ "https://huggingface.co/huggingface/transformers",
412
+ "",
413
+ "*.py,*.md,*.rst",
414
+ "tests/**,docs/**",
415
+ 5,
416
+ 30000,
417
+ True,
418
+ False
419
+ ]
420
+ ],
421
+ inputs=[
422
+ repo_url,
423
+ token,
424
+ include_patterns,
425
+ exclude_patterns,
426
+ max_file_size,
427
+ chunk_size,
428
+ include_metadata,
429
+ remove_comments
430
+ ]
431
+ )
432
+
433
+ return demo
434
+
435
+ if __name__ == "__main__":
436
+ demo = create_interface()
437
+ demo.launch(
438
+ share=True,
439
+ show_error=True,
440
+ show_tips=True
441
+ )
config.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration constants
2
+
3
+ # API endpoints
4
+ GITHUB_API_BASE = "https://api.github.com"
5
+ HF_API_BASE = "https://huggingface.co"
6
+
7
+ # Supported file extensions for text processing
8
+ SUPPORTED_EXTENSIONS = {
9
+ # Programming languages
10
+ '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.cs', '.go', '.rs',
11
+ '.php', '.rb', '.swift', '.kt', '.scala', '.r', '.m', '.sh', '.bash', '.zsh',
12
+ '.fish', '.ps1', '.bat', '.sql', '.html', '.htm', '.xml', '.css', '.scss',
13
+ '.sass', '.less', '.json', '.yaml', '.yml', '.toml', '.ini', '.cfg', '.conf',
14
+ '.md', '.rst', '.txt', '.log', '.dockerfile', '.gitignore', '.gitattributes',
15
+ '.editorconfig', '.eslintrc', '.prettierrc', '.babelrc', '.tsconfig',
16
+
17
+ # Configuration files
18
+ '.env', '.env.example', '.env.local', '.env.development', '.env.production',
19
+ 'package.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
20
+ 'requirements.txt', 'Pipfile', 'poetry.lock', 'pyproject.toml',
21
+ 'Cargo.toml', 'Cargo.lock', 'go.mod', 'go.sum', 'composer.json',
22
+ 'composer.lock', 'Gemfile', 'Gemfile.lock', 'pom.xml', 'build.gradle',
23
+ 'CMakeLists.txt', 'Makefile', 'Dockerfile', 'docker-compose.yml',
24
+
25
+ # Documentation
26
+ '.md', '.rst', '.txt', '.adoc', '.tex', '.bib',
27
+
28
+ # Data formats
29
+ '.json', '.yaml', '.yml', '.toml', '.ini', '.cfg', '.conf',
30
+ '.csv', '.tsv', '.xml', '.rss', '.atom',
31
+
32
+ # Scripts
33
+ '.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd',
34
+ '.py', '.pl', '.rb', '.lua', '.tcl', '.awk', '.sed',
35
+ }
36
+
37
+ # Size limits
38
+ MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB default
39
+ MAX_TOTAL_SIZE = 100 * 1024 * 1024 # 100MB default
40
+ CHUNK_SIZE = 50000 # Characters per chunk
41
+
42
+ # File patterns to exclude by default
43
+ DEFAULT_EXCLUDE_PATTERNS = [
44
+ "*.git*",
45
+ "*.log",
46
+ "node_modules/**",
47
+ "__pycache__/**",
48
+ ".DS_Store",
49
+ "Thumbs.db",
50
+ "*.tmp",
51
+ "*.temp",
52
+ "*.swp",
53
+ "*.swo",
54
+ "*~",
55
+ ".vscode/**",
56
+ ".idea/**",
57
+ "*.pyc",
58
+ "*.pyo",
59
+ "*.pyd",
60
+ ".Python",
61
+ "build/**",
62
+ "dist/**",
63
+ "*.egg-info/**",
64
+ ".pytest_cache/**",
65
+ ".coverage",
66
+ "htmlcov/**",
67
+ ".tox/**",
68
+ "*.cover",
69
+ "coverage.xml",
70
+ "*.cover",
71
+ ".hypothesis/**",
72
+ ".mypy_cache/**",
73
+ "dmypy.json",
74
+ dmypy.json",
75
+ ".pytest_cache/**",
76
+ "nosetests.xml",
77
+ "coverage.xml",
78
+ "*.cover",
79
+ ".hypothesis/**",
80
+ ".cache/**",
81
+ "*.pid",
82
+ "*.seed",
83
+ "*.pid.lock",
84
+ ".nyc_output",
85
+ ".grunt",
86
+ ".bower",
87
+ ".lock-wscript",
88
+ "build/Release",
89
+ "jspm_packages/",
90
+ "typings",
91
+ ".npm",
92
+ ".eslintcache",
93
+ ".stylelintcache",
94
+ "*.tsbuildinfo",
95
+ ".rsync_user",
96
+ ".vscode-test",
97
+ ]
98
+
99
+ # File patterns to include by default
100
+ DEFAULT_INCLUDE_PATTERNS = [
101
+ "*.py",
102
+ "*.js",
103
+ "*.ts",
104
+ "*.jsx",
105
+ "*.tsx",
106
+ "*.java",
107
+ "*.cpp",
108
+ "*.c",
109
+ "*.cs",
110
+ "*.go",
111
+ "*.rs",
112
+ "*.php",
113
+ "*.rb",
114
+ "*.swift",
115
+ "*.kt",
116
+ "*.scala",
117
+ "*.r",
118
+ "*.m",
119
+ "*.sh",
120
+ "*.bash",
121
+ "*.zsh",
122
+ "*.fish",
123
+ "*.ps1",
124
+ "*.bat",
125
+ "*.sql",
126
+ "*.html",
127
+ "*.htm",
128
+ "*.xml",
129
+ "*.css",
130
+ "*.scss",
131
+ "*.sass",
132
+ "*.less",
133
+ "*.json",
134
+ "*.yaml",
135
+ "*.yml",
136
+ "*.toml",
137
+ "*.ini",
138
+ "*.cfg",
139
+ "*.conf",
140
+ "*.md",
141
+ "*.rst",
142
+ "*.txt",
143
+ "*.dockerfile",
144
+ "*.gitignore",
145
+ "*.gitattributes",
146
+ "*.editorconfig",
147
+ "*.eslintrc",
148
+ "*.prettierrc",
149
+ "*.babelrc",
150
+ "*.tsconfig",
151
+ "package.json",
152
+ "requirements.txt",
153
+ "Pipfile",
154
+ "poetry.lock",
155
+ "pyproject.toml",
156
+ "Cargo.toml",
157
+ "go.mod",
158
+ "composer.json",
159
+ "Gemfile",
160
+ "pom.xml",
161
+ "build.gradle",
162
+ "CMakeLists.txt",
163
+ "Makefile",
164
+ "Dockerfile",
165
+ "docker-compose.yml",
166
+ ]
167
+
168
+ # Language comment patterns for cleaning
169
+ COMMENT_PATTERNS = {
170
+ 'python': [r'#.*$', r'""".*?"""', r"'''.*?'''"],
171
+ 'javascript': [r'//.*$', r'/\*.*?\*/'],
172
+ 'java': [r'//.*$', r'/\*.*?\*/'],
173
+ 'cpp': [r'//.*$', r'/\*.*?\*/'],
174
+ 'c': [r'//.*$', r'/\*.*?\*/'],
175
+ 'cs': [r'//.*$', r'/\*.*?\*/'],
176
+ 'go': [r'//.*$', r'/\*.*?\*/'],
177
+ 'rs': [r'//.*$', r'/\*.*?\*/'],
178
+ 'php': [r'//.*$', r'#.*$', r'/\*.*?\*/'],
179
+ 'ruby': [r'#.*$', r'=begin.*?=end'],
180
+ 'shell': [r'#.*$'],
181
+ 'sql': [r'--.*$', r'/\*.*?\*/'],
182
+ 'html': [r'<!--.*?-->'],
183
+ 'xml': [r'<!--.*?-->'],
184
+ 'css': [r'/\*.*?\*/'],
185
+ }
186
+
187
+ # Token estimation multipliers for different languages
188
+ TOKEN_MULTIPLIERS = {
189
+ 'python': 0.25,
190
+ 'javascript': 0.3,
191
+ 'java': 0.25,
192
+ 'cpp': 0.25,
193
+ 'c': 0.25,
194
+ 'cs': 0.25,
195
+ 'go': 0.25,
196
+ 'rs': 0.25,
197
+ 'php': 0.3,
198
+ 'ruby': 0.25,
199
+ 'shell': 0.3,
200
+ 'sql': 0.25,
201
+ 'html': 0.2,
202
+ 'xml': 0.2,
203
+ 'css': 0.25,
204
+ 'json': 0.15,
205
+ 'yaml': 0.2,
206
+ 'markdown': 0.2,
207
+ 'text': 0.25,
208
+ 'default': 0.25,
209
+ }
210
+
211
+ # Rate limiting
212
+ MAX_REQUESTS_PER_MINUTE = 60
213
+ REQUEST_TIMEOUT = 30
214
+
215
+ # UI Configuration
216
+ THEME_COLORS = {
217
+ 'primary': '#3070f0',
218
+ 'secondary': '#64748b',
219
+ 'success': '#10b981',
220
+ 'warning': '#f59e0b',
221
+ 'error': '#ef4444',
222
+ 'background': '#ffffff',
223
+ 'surface': '#f8fafc',
224
+ 'text': '#1e293b',
225
+ 'text_secondary': '#64748b',
226
+ }
227
+
228
+ # Progress tracking
229
+ PROGRESS_STEPS = [
230
+ (0.0, "Initializing..."),
231
+ (0.1, "Fetching repository information..."),
232
+ (0.2, "Scanning files..."),
233
+ (0.3, "Processing files..."),
234
+ (0.5, "Analyzing content..."),
235
+ (0.7, "Generating output..."),
236
+ (0.9, "Finalizing..."),
237
+ (1.0, "Complete!"),
238
+ ]
models.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import base64
3
+ import json
4
+ import zipfile
5
+ import io
6
+ import os
7
+ from typing import List, Dict, Tuple, Optional
8
+ from pathlib import Path
9
+ import re
10
+
11
+ from utils import matches_patterns, is_binary_file, format_file_size
12
+ from config import GITHUB_API_BASE, HF_API_BASE
13
+
14
+ def process_github_repo(
15
+ repo_path: str,
16
+ token: str,
17
+ include_patterns: List[str],
18
+ exclude_patterns: List[str],
19
+ max_file_size: int
20
+ ) -> Tuple[List[Tuple[str, str, int]], Dict]:
21
+ """Process GitHub repository and return file contents"""
22
+
23
+ headers = {}
24
+ if token:
25
+ headers['Authorization'] = f'token {token}'
26
+
27
+ # Get repository info
28
+ repo_url = f"{GITHUB_API_BASE}/repos/{repo_path}"
29
+ repo_response = requests.get(repo_url, headers=headers)
30
+
31
+ if repo_response.status_code != 200:
32
+ raise Exception(f"Failed to fetch repository info: {repo_response.json().get('message', 'Unknown error')}")
33
+
34
+ repo_info = repo_response.json()
35
+
36
+ # Get all files recursively
37
+ files_data = []
38
+ contents_queue = [""]
39
+
40
+ while contents_queue:
41
+ current_path = contents_queue.pop(0)
42
+
43
+ # Get directory contents
44
+ contents_url = f"{GITHUB_API_BASE}/repos/{repo_path}/contents/{current_path}"
45
+ contents_response = requests.get(contents_url, headers=headers)
46
+
47
+ if contents_response.status_code != 200:
48
+ continue
49
+
50
+ contents = contents_response.json()
51
+
52
+ if isinstance(contents, dict):
53
+ # Single file
54
+ contents = [contents]
55
+
56
+ for item in contents:
57
+ item_path = f"{current_path}/{item['name']}" if current_path else item['name']
58
+
59
+ if item['type'] == 'dir':
60
+ contents_queue.append(item_path)
61
+ elif item['type'] == 'file':
62
+ # Check if file matches patterns
63
+ if not matches_patterns(item_path, include_patterns, exclude_patterns):
64
+ continue
65
+
66
+ # Check file size
67
+ if item['size'] > max_file_size:
68
+ continue
69
+
70
+ # Get file content
71
+ try:
72
+ file_url = item['url']
73
+ file_response = requests.get(file_url, headers=headers)
74
+
75
+ if file_response.status_code == 200:
76
+ file_data = file_response.json()
77
+ content = base64.b64decode(file_data['content']).decode('utf-8', errors='ignore')
78
+
79
+ # Skip binary files
80
+ if is_binary_file(content, item_path):
81
+ continue
82
+
83
+ files_data.append((item_path, content, item['size']))
84
+
85
+ except Exception as e:
86
+ print(f"Error processing file {item_path}: {e}")
87
+ continue
88
+
89
+ return files_data, repo_info
90
+
91
+ def process_huggingface_repo(
92
+ repo_path: str,
93
+ token: str,
94
+ include_patterns: List[str],
95
+ exclude_patterns: List[str],
96
+ max_file_size: int
97
+ ) -> Tuple[List[Tuple[str, str, int]], Dict]:
98
+ """Process Hugging Face repository and return file contents"""
99
+
100
+ headers = {}
101
+ if token:
102
+ headers['Authorization'] = f'Bearer {token}'
103
+
104
+ # Get repository info
105
+ repo_url = f"{HF_API_BASE}/api/models/{repo_path}"
106
+ repo_response = requests.get(repo_url, headers=headers)
107
+
108
+ if repo_response.status_code != 200:
109
+ raise Exception(f"Failed to fetch repository info: {repo_response.json().get('error', 'Unknown error')}")
110
+
111
+ repo_info = repo_response.json()
112
+
113
+ # Get repository tree
114
+ tree_url = f"{HF_API_BASE}/api/models/{repo_path}/tree/main"
115
+ tree_response = requests.get(tree_url, headers=headers)
116
+
117
+ if tree_response.status_code != 200:
118
+ raise Exception(f"Failed to fetch repository tree: {tree_response.json().get('error', 'Unknown error')}")
119
+
120
+ tree_data = tree_response.json()
121
+
122
+ files_data = []
123
+
124
+ def process_tree_item(item, current_path=""):
125
+ if isinstance(item, list):
126
+ for subitem in item:
127
+ process_tree_item(subitem, current_path)
128
+ elif isinstance(item, dict):
129
+ item_path = f"{current_path}/{item['path']}" if current_path else item['path']
130
+
131
+ if item['type'] == 'directory':
132
+ # Get directory contents
133
+ dir_url = f"{HF_API_BASE}/api/models/{repo_path}/tree/main/{item_path}"
134
+ dir_response = requests.get(dir_url, headers=headers)
135
+
136
+ if dir_response.status_code == 200:
137
+ process_tree_item(dir_response.json(), item_path)
138
+ elif item['type'] == 'file':
139
+ # Check if file matches patterns
140
+ if not matches_patterns(item_path, include_patterns, exclude_patterns):
141
+ return
142
+
143
+ # Check file size
144
+ if item.get('size', 0) > max_file_size:
145
+ return
146
+
147
+ # Get file content
148
+ try:
149
+ raw_url = f"https://huggingface.co/{repo_path}/raw/main/{item_path}"
150
+ file_response = requests.get(raw_url, headers=headers)
151
+
152
+ if file_response.status_code == 200:
153
+ content = file_response.text
154
+
155
+ # Skip binary files
156
+ if is_binary_file(content, item_path):
157
+ return
158
+
159
+ files_data.append((item_path, content, len(content)))
160
+
161
+ except Exception as e:
162
+ print(f"Error processing file {item_path}: {e}")
163
+ return
164
+
165
+ process_tree_item(tree_data)
166
+
167
+ return files_data, repo_info
168
+
169
+ def download_repo_as_zip(repo_url: str, token: str) -> str:
170
+ """Download repository as ZIP file"""
171
+
172
+ if "github.com" in repo_url:
173
+ # GitHub ZIP URL
174
+ if token:
175
+ headers = {'Authorization': f'token {token}'}
176
+ zip_url = repo_url.replace("github.com", "api.github.com/repos") + "/zipball/main"
177
+ else:
178
+ headers = {}
179
+ zip_url = repo_url.replace("github.com", "codeload.github.com") + "/zip/main"
180
+ elif "huggingface.co" in repo_url:
181
+ # Hugging Face ZIP URL
182
+ headers = {}
183
+ if token:
184
+ headers['Authorization'] = f'Bearer {token}'
185
+ zip_url = repo_url.replace("huggingface.co", "huggingface.co") + "/resolve/main?download=true"
186
+ else:
187
+ raise ValueError("Unsupported repository URL")
188
+
189
+ response = requests.get(zip_url, headers=headers, stream=True)
190
+
191
+ if response.status_code != 200:
192
+ raise Exception(f"Failed to download repository: {response.status_code}")
193
+
194
+ # Save to temporary file
195
+ temp_path = f"/tmp/repo_{hash(repo_url)}.zip"
196
+
197
+ with open(temp_path, 'wb') as f:
198
+ for chunk in response.iter_content(chunk_size=8192):
199
+ f.write(chunk)
200
+
201
+ return temp_path
202
+
203
+ def extract_repo_info(repo_url: str, repo_type: str) -> Dict:
204
+ """Extract basic repository information"""
205
+ if repo_type == "github":
206
+ # Extract owner and repo name
207
+ match = re.search(r'github\.com/([^/]+)/([^/]+)', repo_url)
208
+ if match:
209
+ return {
210
+ 'owner': match.group(1),
211
+ 'repo': match.group(2),
212
+ 'full_name': f"{match.group(1)}/{match.group(2)}",
213
+ 'url': repo_url
214
+ }
215
+ elif repo_type == "huggingface":
216
+ # Extract owner and repo name
217
+ match = re.search(r'huggingface\.co/([^/]+)/([^/]+)', repo_url)
218
+ if match:
219
+ return {
220
+ 'owner': match.group(1),
221
+ 'repo': match.group(2),
222
+ 'full_name': f"{match.group(1)}/{match.group(2)}",
223
+ 'url': repo_url
224
+ }
225
+
226
+ return {'url': repo_url}
requirements.txt ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ requests>=2.31.0
3
+ python-multipart>=0.0.6
4
+ pathlib>=1.0.1
5
+ re>=2.2.1
6
+ hashlib>=20081119
7
+ zipfile>=0.5
8
+ io>=0.1
9
+ datetime>=4.3
10
+ mimetypes>=0.1
11
+ fnmatch>=2.4.3
12
+ base64>=0.1
13
+ json>=2.0.9
14
+
15
+ This Gradio application provides a comprehensive solution for converting GitHub or Hugging Face repositories into text files suitable for LLM training. Here are the key features:
16
+
17
+ ## 🚀 Main Features:
18
+
19
+ 1. **Multi-Platform Support**: Works with both GitHub and Hugging Face repositories
20
+ 2. **Smart File Filtering**: Include/exclude patterns to process only relevant files
21
+ 3. **Token Estimation**: Provides rough token counts for training planning
22
+ 4. **Chunked Output**: Splits large repositories into manageable chunks
23
+ 5. **Comment Removal**: Optional comment stripping for cleaner training data
24
+ 6. **Binary File Detection**: Automatically skips binary files
25
+ 7. **Language Detection**: Identifies programming languages for better organization
26
+ 8. **Progress Tracking**: Real-time progress updates during processing
27
+
28
+ ## 🛠️ Advanced Options:
29
+
30
+ - File size limits to prevent processing huge files
31
+ - Pattern-based filtering (glob patterns supported)
32
+ - Chunk size customization
33
+ - Metadata inclusion
34
+ - Private repository support with tokens
35
+ - ZIP download option
36
+
37
+ ## 📊 Output Features:
38
+
39
+ - Repository metadata and statistics
40
+ - File headers with path, size, and language info
41
+ - Token and character counts
42
+ - Formatted, readable output structure
43
+ - Error handling and status messages
44
+
45
+ The application is designed to handle repositories of various sizes while providing useful feedback and statistics about the processed content. It's perfect for preparing code repositories for LLM fine-tuning or analysis.
utils.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import hashlib
3
+ from typing import List, Dict, Optional
4
+ import mimetypes
5
+
6
+ def clean_code_content(content: str, file_path: str) -> str:
7
+ """Remove comments from code files while preserving structure"""
8
+ ext = file_path.split('.')[-1].lower()
9
+
10
+ # Language-specific comment patterns
11
+ comment_patterns = {
12
+ 'py': [
13
+ (r'#.*$', ''), # Single line comments
14
+ (r'""".*?"""', '', re.DOTALL), # Triple quotes
15
+ (r"'''.*?'''", '', re.DOTALL),
16
+ ],
17
+ 'js': [
18
+ (r'//.*$', ''), # Single line comments
19
+ (r'/\*.*?\*/', '', re.DOTALL), # Multi-line comments
20
+ ],
21
+ 'java': [
22
+ (r'//.*$', ''),
23
+ (r'/\*.*?\*/', '', re.DOTALL),
24
+ ],
25
+ 'cpp': [
26
+ (r'//.*$', ''),
27
+ (r'/\*.*?\*/', '', re.DOTALL),
28
+ ],
29
+ 'c': [
30
+ (r'//.*$', ''),
31
+ (r'/\*.*?\*/', '', re.DOTALL),
32
+ ],
33
+ 'cs': [
34
+ (r'//.*$', ''),
35
+ (r'/\*.*?\*/', '', re.DOTALL),
36
+ ],
37
+ 'go': [
38
+ (r'//.*$', ''),
39
+ (r'/\*.*?\*/', '', re.DOTALL),
40
+ ],
41
+ 'rs': [
42
+ (r'//.*$', ''),
43
+ (r'/\*.*?\*/', '', re.DOTALL),
44
+ ],
45
+ 'php': [
46
+ (r'//.*$', ''),
47
+ (r'#.*$', ''),
48
+ (r'/\*.*?\*/', '', re.DOTALL),
49
+ ],
50
+ 'rb': [
51
+ (r'#.*$', ''),
52
+ (r'=begin.*?=end', '', re.DOTALL),
53
+ ],
54
+ 'sh': [
55
+ (r'#.*$', ''),
56
+ ],
57
+ 'sql': [
58
+ (r'--.*$', ''),
59
+ (r'/\*.*?\*/', '', re.DOTALL),
60
+ ],
61
+ 'html': [
62
+ (r'<!--.*?-->', '', re.DOTALL),
63
+ ],
64
+ 'xml': [
65
+ (r'<!--.*?-->', '', re.DOTALL),
66
+ ],
67
+ 'css': [
68
+ (r'/\*.*?\*/', '', re.DOTALL),
69
+ ],
70
+ }
71
+
72
+ if ext in comment_patterns:
73
+ content = content.strip()
74
+ for pattern, replacement, *flags in comment_patterns[ext]:
75
+ flags = flags[0] if flags else 0
76
+ content = re.sub(pattern, replacement, content, flags=flags)
77
+
78
+ # Clean up extra whitespace
79
+ content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
80
+ content = content.strip()
81
+
82
+ return content
83
+
84
# Maps a lowercased extension (or bare filename such as "makefile", since a
# dot-less name splits to itself) to a human-readable language label.
# Built once at import time instead of on every call; duplicate keys that
# appeared in the original literal (nim, v, lua, moon, elm) are removed —
# they carried identical values, so lookups are unchanged.
_LANGUAGE_MAP = {
    'py': 'Python',
    'js': 'JavaScript',
    'ts': 'TypeScript',
    'jsx': 'React JSX',
    'tsx': 'React TSX',
    'java': 'Java',
    'cpp': 'C++',
    'c': 'C',
    'cs': 'C#',
    'go': 'Go',
    'rs': 'Rust',
    'php': 'PHP',
    'rb': 'Ruby',
    'swift': 'Swift',
    'kt': 'Kotlin',
    'scala': 'Scala',
    'r': 'R',
    'm': 'Objective-C',
    'sh': 'Shell',
    'bash': 'Bash',
    'zsh': 'Zsh',
    'fish': 'Fish',
    'ps1': 'PowerShell',
    'bat': 'Batch',
    'sql': 'SQL',
    'html': 'HTML',
    'htm': 'HTML',
    'xml': 'XML',
    'css': 'CSS',
    'scss': 'SCSS',
    'sass': 'SASS',
    'less': 'LESS',
    'json': 'JSON',
    'yaml': 'YAML',
    'yml': 'YAML',
    'toml': 'TOML',
    'ini': 'INI',
    'cfg': 'Config',
    'conf': 'Config',
    'md': 'Markdown',
    'rst': 'reStructuredText',
    'txt': 'Text',
    'log': 'Log',
    'dockerfile': 'Docker',
    'docker': 'Docker',
    'gitignore': 'Git',
    'gitattributes': 'Git',
    'editorconfig': 'EditorConfig',
    'eslintrc': 'ESLint',
    'prettierrc': 'Prettier',
    'babelrc': 'Babel',
    'tsconfig': 'TypeScript',
    'package': 'NPM',
    'lock': 'Lock',
    'requirements': 'Python',
    'pipfile': 'Python',
    'poetry': 'Python',
    'makefile': 'Make',
    'cmake': 'CMake',
    'gradle': 'Gradle',
    'pom': 'Maven',
    'sbt': 'SBT',
    'vue': 'Vue',
    'svelte': 'Svelte',
    'elm': 'Elm',
    'pug': 'Pug',
    'haml': 'Haml',
    'erb': 'ERB',
    'ejs': 'EJS',
    'twig': 'Twig',
    'liquid': 'Liquid',
    'handlebars': 'Handlebars',
    'mustache': 'Mustache',
    'jinja': 'Jinja',
    'tex': 'LaTeX',
    'bib': 'BibTeX',
    'plt': 'Gnuplot',
    'dot': 'Graphviz',
    'mermaid': 'Mermaid',
    'drawio': 'DrawIO',
    'puml': 'PlantUML',
    'wsdl': 'WSDL',
    'xsd': 'XSD',
    'xslt': 'XSLT',
    'graphql': 'GraphQL',
    'proto': 'Protocol Buffers',
    'avro': 'Avro',
    'parquet': 'Parquet',
    'arrow': 'Arrow',
    'feather': 'Feather',
    'hdf5': 'HDF5',
    'netcdf': 'NetCDF',
    'matlab': 'MATLAB',
    'mex': 'MATLAB',
    'fig': 'MATLAB',
    'slx': 'Simulink',
    'simulink': 'Simulink',
    'labview': 'LabVIEW',
    'vi': 'LabVIEW',
    'lvproj': 'LabVIEW',
    'lvlib': 'LabVIEW',
    'stata': 'Stata',
    'do': 'Stata',
    'ado': 'Stata',
    'spss': 'SPSS',
    'sav': 'SPSS',
    'sas': 'SAS',
    's7dat': 'SAS',
    's7bdat': 'SAS',
    'xpt': 'SAS',
    'dta': 'Stata',
    'rdata': 'R',
    'rds': 'R',
    'rda': 'R',
    'jl': 'Julia',
    'nim': 'Nim',
    'zig': 'Zig',
    'v': 'V',
    'ada': 'Ada',
    'adb': 'Ada',
    'ads': 'Ada',
    'pas': 'Pascal',
    'pp': 'Pascal',
    'dpr': 'Pascal',
    'lpr': 'Pascal',
    'dfm': 'Pascal',
    'pl': 'Perl',
    'pm': 'Perl',
    't': 'Perl',
    'pod': 'Perl',
    'lua': 'Lua',
    'moon': 'MoonScript',
    'el': 'Emacs Lisp',
    'elc': 'Emacs Lisp',
    'elisp': 'Emacs Lisp',
    'cl': 'Common Lisp',
    'lisp': 'Common Lisp',
    'lsp': 'Common Lisp',
    'fasl': 'Common Lisp',
    'ss': 'Scheme',
    'scm': 'Scheme',
    'rkt': 'Scheme',
    'sch': 'Scheme',
    'fs': 'F#',
    'fsi': 'F#',
    'fsx': 'F#',
    'fsscript': 'F#',
    'ml': 'OCaml',
    'mli': 'OCaml',
    'll': 'LLVM',
    'bc': 'LLVM',
    'nimble': 'Nim',
    'nims': 'Nim',
    'vsh': 'V',
    'vv': 'V',
    'vh': 'V',
    'd': 'D',
    'di': 'D',
    'dart': 'Dart',
    'groovy': 'Groovy',
    'gvy': 'Groovy',
    'gy': 'Groovy',
    'gsh': 'Groovy',
    'clj': 'Clojure',
    'cljs': 'ClojureScript',
    'cljc': 'Clojure',
    'edn': 'Clojure',
    'coffee': 'CoffeeScript',
    'litcoffee': 'CoffeeScript',
    'cjsx': 'Cjsx',
    'iced': 'IcedCoffeeScript',
    'hx': 'Haxe',
    'hxml': 'Haxe',
    'purs': 'PureScript',
    'p8': 'Pico-8',
    'wren': 'Wren',
    'earl-grey': 'Earl Grey',
    'eg': 'Earl Grey',
    'tsv': 'TSV',
    'csv': 'CSV',
}


def get_file_language(file_path: str) -> str:
    """Determine the programming language from a file's extension.

    Args:
        file_path: Path or bare filename; only the text after the last
            dot is consulted (a dot-less name is used whole, so e.g.
            "Makefile" resolves via the 'makefile' entry).

    Returns:
        A human-readable language name, or the upper-cased extension
        for unknown types.
    """
    ext = file_path.split('.')[-1].lower()
    return _LANGUAGE_MAP.get(ext, ext.upper())
277
+
278
def estimate_tokens(text: str) -> int:
    """Roughly estimate the LLM token count of *text*.

    Uses the common ~4-characters-per-token heuristic for English prose;
    code tokenizes less predictably, so treat this as a planning figure,
    not an exact count.
    """
    chars_per_token = 4
    return len(text) // chars_per_token
283
+
284
def create_chunked_output(content: str, chunk_size: int) -> List[str]:
    """Split *content* into line-aligned chunks of at most *chunk_size* chars.

    Lines are never broken; a single line longer than the limit becomes its
    own oversized chunk. Empty input yields an empty list.
    """
    pieces: List[str] = []
    acc = ""

    for row in content.split('\n'):
        # +1 accounts for the newline that would join `row` onto `acc`.
        fits = len(acc) + len(row) + 1 <= chunk_size
        if not fits:
            if acc:
                pieces.append(acc)
            acc = row
        else:
            acc = f"{acc}\n{row}" if acc else row

    if acc:
        pieces.append(acc)
    return pieces
306
+
307
def matches_patterns(file_path: str, include_patterns: List[str], exclude_patterns: List[str]) -> bool:
    """Decide whether *file_path* passes the include/exclude glob filters.

    Exclusions win over inclusions; an empty include list admits every
    file that is not excluded. Each pattern is also tried with a
    leading "**/" so bare patterns match nested paths.
    """
    import fnmatch

    def hits(pattern: str) -> bool:
        # Match the path directly, or anywhere below a directory.
        return (fnmatch.fnmatch(file_path, pattern)
                or fnmatch.fnmatch(file_path, f"**/{pattern}"))

    if any(hits(p) for p in exclude_patterns):
        return False

    return not include_patterns or any(hits(p) for p in include_patterns)
326
+
327
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string (e.g. "2.0 KB")."""
    value = float(size_bytes)
    for unit in ('B', 'KB', 'MB', 'GB'):
        if value < 1024.0:
            return f"{value:.1f} {unit}"
        value /= 1024.0
    # Anything at or beyond 1024 GB is reported in terabytes.
    return f"{value:.1f} TB"
334
+
335
def generate_file_hash(content: str) -> str:
    """Return the first 16 hex digits of the SHA-256 of *content* (UTF-8)."""
    digest = hashlib.sha256(content.encode())
    return digest.hexdigest()[:16]
338
+
339
def is_binary_file(content: str, file_path: str) -> bool:
    """Heuristically decide whether a file is binary.

    Checks, in order: a known-binary extension, a NUL byte in the first
    1 KiB, and finally the ratio of printable characters in that sample.

    Args:
        content: File text (already decoded to ``str`` by the caller).
        file_path: Used only for its extension.

    Returns:
        True when the file should be skipped as binary. Empty files are
        treated as text (the original crashed with ZeroDivisionError on
        empty content; this guard fixes that).
    """
    # Extensions that are always treated as binary, regardless of content.
    binary_extensions = {
        'png', 'jpg', 'jpeg', 'gif', 'bmp', 'ico', 'svg', 'webp',
        'mp3', 'mp4', 'avi', 'mov', 'wav', 'flac', 'ogg',
        'zip', 'rar', 'tar', 'gz', '7z', 'bz2', 'xz',
        'exe', 'dll', 'so', 'dylib',
        'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
        'ttf', 'otf', 'woff', 'woff2', 'eot',
        'bin', 'dat', 'db', 'sqlite', 'sqlite3',
    }

    ext = file_path.split('.')[-1].lower()
    if ext in binary_extensions:
        return True

    sample = content[:1024]
    if not sample:
        # Empty file: nothing to inspect, treat as text.
        return False

    # NUL bytes are a strong binary indicator.
    if '\0' in sample:
        return True

    # Mostly non-printable content (below 70% printable) → binary.
    printable_chars = sum(1 for c in sample if c.isprintable() or c in '\t\n\r')
    if printable_chars / len(sample) < 0.7:
        return True

    return False