Spaces:
Build error
Build error
Deploy Gradio app with multiple files
Browse files- app.py +441 -0
- config.py +238 -0
- models.py +226 -0
- requirements.txt +45 -0
- utils.py +365 -0
app.py
ADDED
@@ -0,0 +1,441 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import requests
|
3 |
+
import os
|
4 |
+
import base64
|
5 |
+
import json
|
6 |
+
import re
|
7 |
+
from pathlib import Path
|
8 |
+
from typing import List, Dict, Optional, Tuple
|
9 |
+
import zipfile
|
10 |
+
import io
|
11 |
+
from datetime import datetime
|
12 |
+
import math
|
13 |
+
|
14 |
+
from utils import (
|
15 |
+
clean_code_content,
|
16 |
+
get_file_language,
|
17 |
+
estimate_tokens,
|
18 |
+
create_chunked_output
|
19 |
+
)
|
20 |
+
from models import (
|
21 |
+
process_github_repo,
|
22 |
+
process_huggingface_repo,
|
23 |
+
download_repo_as_zip
|
24 |
+
)
|
25 |
+
from config import (
|
26 |
+
SUPPORTED_EXTENSIONS,
|
27 |
+
MAX_FILE_SIZE,
|
28 |
+
MAX_TOTAL_SIZE,
|
29 |
+
CHUNK_SIZE,
|
30 |
+
GITHUB_API_BASE,
|
31 |
+
HF_API_BASE
|
32 |
+
)
|
33 |
+
|
34 |
+
# CSS for better UI
# Injected into gr.Blocks(css=...) in create_interface().
# `.file-stats` is the class emitted by generate_stats(); the
# `.warning`/`.error`/`.success` classes are defined for status HTML
# (NOTE(review): status_display currently receives plain strings, so these
# three classes appear unused — confirm before removing).
css = """
.container {
    max-width: 1200px;
    margin: 0 auto;
}
.progress-bar {
    height: 20px;
    background: linear-gradient(90deg, #4CAF50, #45a049);
    border-radius: 10px;
    transition: width 0.3s ease;
}
.file-stats {
    background: #f0f0f0;
    padding: 10px;
    border-radius: 5px;
    margin: 10px 0;
}
.warning {
    background: #fff3cd;
    border: 1px solid #ffeaa7;
    padding: 10px;
    border-radius: 5px;
    color: #856404;
}
.error {
    background: #f8d7da;
    border: 1px solid #f5c6cb;
    padding: 10px;
    border-radius: 5px;
    color: #721c24;
}
.success {
    background: #d4edda;
    border: 1px solid #c3e6cb;
    padding: 10px;
    border-radius: 5px;
    color: #155724;
}
"""
|
74 |
+
|
75 |
+
def validate_repo_url(url: str) -> Tuple[str, str]:
    """Validate a repository URL and extract its host and "owner/name" path.

    Accepts GitHub and Hugging Face URLs, optionally with a trailing ``.git``,
    a trailing slash, or extra path segments such as ``/tree/main`` or
    ``/blob/<ref>/<file>`` (a generalization over the previous exact-match
    behaviour, which rejected such URLs).

    Args:
        url: Repository URL as typed by the user; surrounding whitespace is
            ignored.

    Returns:
        A ``(repo_type, "owner/repo")`` tuple where ``repo_type`` is
        ``"github"`` or ``"huggingface"``.

    Raises:
        ValueError: If the URL matches neither platform.
    """
    url = url.strip()

    # GitHub URL patterns. The api.github.com form must be tried first:
    # the generalized plain pattern below would otherwise mis-parse
    # "api.github.com/repos/owner/repo" as owner="repos".
    github_patterns = [
        r'api\.github\.com/repos/([^/]+)/([^/]+)',
        r'github\.com/([^/]+)/([^/]+?)(?:\.git)?(?:/.*)?$',
    ]

    # Hugging Face URL patterns (hf.co is the official short domain).
    hf_patterns = [
        r'huggingface\.co/([^/]+)/([^/]+?)(?:\.git)?(?:/.*)?$',
        r'hf\.co/([^/]+)/([^/]+?)(?:\.git)?(?:/.*)?$',
    ]

    for pattern in github_patterns:
        match = re.search(pattern, url)
        if match:
            return "github", f"{match.group(1)}/{match.group(2)}"

    for pattern in hf_patterns:
        match = re.search(pattern, url)
        if match:
            return "huggingface", f"{match.group(1)}/{match.group(2)}"

    raise ValueError("Invalid repository URL. Please provide a valid GitHub or Hugging Face repository URL.")
|
102 |
+
|
103 |
+
def process_repository(
    repo_url: str,
    token: str = "",
    include_patterns: str = "",
    exclude_patterns: str = "",
    max_file_size_mb: int = 10,
    chunk_size: int = 50000,
    include_metadata: bool = True,
    remove_comments: bool = False,
    progress=gr.Progress()
) -> Tuple[str, str, str]:
    """Fetch a repository and stream its contents as consolidated text.

    Because the body contains ``yield`` (one per completed chunk), Python
    makes the whole function a generator; any plain ``return value`` in a
    generator is silently discarded by Gradio. The original code mixed
    ``yield`` and ``return``, so the final output and error messages never
    reached the UI — every result is now ``yield``-ed as a
    ``(text, stats_html, status)`` tuple.

    Args:
        repo_url: GitHub or Hugging Face repository URL.
        token: Optional access token for private repositories.
        include_patterns: Comma-separated glob patterns to include.
        exclude_patterns: Comma-separated glob patterns to exclude.
        max_file_size_mb: Skip files larger than this (megabytes).
        chunk_size: Flush a chunk once the accumulated text exceeds this
            many characters.
        include_metadata: Prepend a repo header and append a summary footer.
        remove_comments: Strip comments via clean_code_content (experimental).
        progress: Gradio progress tracker (injected by Gradio).
    """
    try:
        # Validate URL and determine which backend to use.
        repo_type, repo_path = validate_repo_url(repo_url)

        # Parse the comma-separated pattern strings into clean lists.
        include_list = [p.strip() for p in include_patterns.split(",") if p.strip()] if include_patterns else []
        exclude_list = [p.strip() for p in exclude_patterns.split(",") if p.strip()] if exclude_patterns else []

        progress(0.1, desc="Fetching repository information...")

        # Fetch (path, content, size) triples from the matching platform.
        if repo_type == "github":
            files_data, repo_info = process_github_repo(
                repo_path,
                token,
                include_list,
                exclude_list,
                max_file_size_mb * 1024 * 1024
            )
        else:  # huggingface
            files_data, repo_info = process_huggingface_repo(
                repo_path,
                token,
                include_list,
                exclude_list,
                max_file_size_mb * 1024 * 1024
            )

        if not files_data:
            # BUG FIX: used to be `return`, whose value a generator drops.
            yield "", "⚠️ No files found matching the criteria.", ""
            return

        progress(0.3, desc="Processing files...")

        total_files = len(files_data)
        processed_files = 0
        total_tokens = 0
        total_chars = 0

        # Optional repository header, reused at the start of every chunk.
        header_lines = []
        if include_metadata:
            header_lines.append("=" * 80)
            header_lines.append(f"REPOSITORY: {repo_info.get('full_name', repo_path)}")
            header_lines.append(f"DESCRIPTION: {repo_info.get('description', 'No description')}")
            header_lines.append(f"URL: {repo_url}")
            header_lines.append(f"PROCESSED: {datetime.now().isoformat()}")
            header_lines.append(f"TOTAL FILES: {total_files}")
            header_lines.append("=" * 80)
            header_lines.append("")

        content_parts = ["\n".join(header_lines)]

        for i, (file_path, content, file_size) in enumerate(files_data):
            progress(0.3 + (0.5 * i / total_files), desc=f"Processing file {i+1}/{total_files}")

            if remove_comments:
                content = clean_code_content(content, file_path)

            # Per-file header with size and detected language.
            file_header = f"\n{'-' * 60}\n"
            file_header += f"FILE: {file_path}\n"
            file_header += f"SIZE: {file_size:,} bytes\n"
            file_header += f"LANGUAGE: {get_file_language(file_path)}\n"
            file_header += f"{'-' * 60}\n\n"

            file_content = file_header + content + "\n\n"

            # Flush the current chunk before it would exceed chunk_size.
            if len("\n".join(content_parts + [file_content])) > chunk_size:
                yield "\n".join(content_parts), generate_stats(processed_files, total_tokens, total_chars, total_files), "success"
                # BUG FIX: restart with only the repo header; the old code
                # also prepended this file's header, duplicating it below.
                content_parts = ["\n".join(header_lines)]

            content_parts.append(file_content)
            processed_files += 1
            total_chars += len(content)
            total_tokens += estimate_tokens(content)

        progress(0.9, desc="Finalizing...")

        final_content = "\n".join(content_parts)

        # Optional summary footer.
        if include_metadata:
            footer = f"\n{'=' * 80}\n"
            footer += f"SUMMARY:\n"
            footer += f"- Files processed: {processed_files}\n"
            footer += f"- Total characters: {total_chars:,}\n"
            footer += f"- Estimated tokens: {total_tokens:,}\n"
            footer += f"- Repository: {repo_info.get('full_name', repo_path)}\n"
            footer += f"{'=' * 80}\n"
            final_content += footer

        progress(1.0, desc="Complete!")

        # BUG FIX: was `return ...` — Gradio never sees a generator's return.
        yield final_content, generate_stats(processed_files, total_tokens, total_chars, total_files), "success"

    except Exception as e:
        # BUG FIX: errors must also be yielded to reach the UI.
        yield "", f"❌ Error: {str(e)}", "error"
|
222 |
+
|
223 |
+
def generate_stats(files_processed: int, tokens: int, chars: int, total_files: int) -> str:
    """Render the processing counters as an HTML statistics card.

    Args:
        files_processed: Number of files actually included in the output.
        tokens: Estimated token total across processed files.
        chars: Character total across processed files.
        total_files: Number of candidate files found in the repository.

    Returns:
        An HTML fragment using the ``.file-stats`` CSS class.
    """
    # max(..., 1) guards the division when nothing was processed yet.
    average = tokens // max(files_processed, 1)
    return f"""
    <div class="file-stats">
        <h3>📊 Processing Statistics</h3>
        <p><strong>Files Processed:</strong> {files_processed:,} / {total_files:,}</p>
        <p><strong>Total Characters:</strong> {chars:,}</p>
        <p><strong>Estimated Tokens:</strong> {tokens:,}</p>
        <p><strong>Average Tokens per File:</strong> {average:,}</p>
    </div>
    """
|
235 |
+
|
236 |
+
def download_repo_locally(repo_url: str, token: str = "") -> str:
    """Download the repository as a ZIP archive for local processing.

    Returns the local path of the downloaded archive on success, or a
    human-readable error string on failure (the caller displays either).
    """
    try:
        repo_type, repo_path = validate_repo_url(repo_url)
        # Map the detected platform back to its canonical host name.
        host = "github.com" if repo_type == "github" else "huggingface.co"
        return download_repo_as_zip(f"{host}/{repo_path}", token)
    except Exception as e:
        return f"Error downloading repository: {str(e)}"
|
248 |
+
|
249 |
+
# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks UI and wire up its event handlers.

    Returns:
        The (unlaunched) ``gr.Blocks`` app; the caller decides how to launch.
    """
    with gr.Blocks(
        title="Repo-to-Text Converter",
        theme=gr.themes.Soft(),
        css=css
    ) as demo:

        gr.Markdown("""
        # 📚 Repository to Text Converter

        Convert GitHub or Hugging Face repositories into formatted text files perfect for LLM training.

        **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
        """)

        with gr.Row():
            with gr.Column(scale=2):
                # Input section: URL, token, and advanced filtering options.
                gr.Markdown("## 📥 Repository Input")

                repo_url = gr.Textbox(
                    label="Repository URL",
                    placeholder="https://github.com/username/repo or https://huggingface.co/username/repo",
                    lines=2
                )

                token = gr.Textbox(
                    label="Access Token (Optional)",
                    placeholder="GitHub token or Hugging Face token for private repos",
                    type="password"
                )

                with gr.Accordion("🔧 Advanced Options", open=False):
                    include_patterns = gr.Textbox(
                        label="Include Patterns (comma-separated)",
                        placeholder="*.py,*.md,src/**/*.py",
                        info="Only include files matching these patterns"
                    )

                    exclude_patterns = gr.Textbox(
                        label="Exclude Patterns (comma-separated)",
                        placeholder="*.git*,*.log,node_modules/**",
                        value="*.git*,*.log,node_modules/**,__pycache__/**,.DS_Store"
                    )

                    # Slider value is megabytes; converted to bytes in
                    # process_repository.
                    max_file_size = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=10,
                        step=1,
                        label="Max File Size (MB)",
                        info="Files larger than this will be skipped"
                    )

                    chunk_size = gr.Slider(
                        minimum=1000,
                        maximum=100000,
                        value=50000,
                        step=1000,
                        label="Chunk Size (characters)",
                        info="Split output into chunks of this size"
                    )

                    include_metadata = gr.Checkbox(
                        value=True,
                        label="Include Metadata",
                        info="Add repository information and statistics"
                    )

                    remove_comments = gr.Checkbox(
                        value=False,
                        label="Remove Comments",
                        info="Strip comments from code files (experimental)"
                    )

                process_btn = gr.Button(
                    "🚀 Process Repository",
                    variant="primary",
                    size="lg"
                )

                download_btn = gr.Button(
                    "⬇️ Download as ZIP",
                    variant="secondary"
                )

            with gr.Column(scale=1):
                # Info section: static help text only.
                gr.Markdown("## ℹ️ Information")

                gr.Markdown("""
                ### Supported Platforms:
                - ✅ GitHub (public and private)
                - ✅ Hugging Face (public and private)

                ### Supported File Types:
                - Code files (.py, .js, .java, .cpp, etc.)
                - Documentation (.md, .txt, .rst)
                - Configuration files (.json, .yaml, .toml)
                - And many more!

                ### Features:
                - 🔄 Chunked output for large repos
                - 📊 Token estimation
                - 🎯 Pattern-based file filtering
                - 🧹 Optional comment removal
                """)

        # Output section: stats card, consolidated text, and status message.
        gr.Markdown("## 📤 Output")

        with gr.Row():
            stats_display = gr.HTML(label="Statistics")

        output_text = gr.Textbox(
            label="Generated Text",
            lines=20,
            max_lines=50,
            show_copy_button=True,
            interactive=True
        )

        status_display = gr.HTML()

        # Event handlers
        # process_repository is a generator, so outputs update per chunk.
        process_btn.click(
            fn=process_repository,
            inputs=[
                repo_url,
                token,
                include_patterns,
                exclude_patterns,
                max_file_size,
                chunk_size,
                include_metadata,
                remove_comments
            ],
            outputs=[output_text, stats_display, status_display]
        )

        # The gr.File component is created inline here, so it is rendered at
        # this position in the layout (below status_display).
        download_btn.click(
            fn=download_repo_locally,
            inputs=[repo_url, token],
            outputs=gr.File(label="Downloaded Repository")
        )

        # Examples: values are listed in the same order as the inputs below.
        gr.Markdown("## 🎯 Examples")
        gr.Examples(
            examples=[
                [
                    "https://github.com/gradio-app/gradio",
                    "",
                    "*.py,*.md",
                    "",
                    10,
                    50000,
                    True,
                    False
                ],
                [
                    "https://huggingface.co/huggingface/transformers",
                    "",
                    "*.py,*.md,*.rst",
                    "tests/**,docs/**",
                    5,
                    30000,
                    True,
                    False
                ]
            ],
            inputs=[
                repo_url,
                token,
                include_patterns,
                exclude_patterns,
                max_file_size,
                chunk_size,
                include_metadata,
                remove_comments
            ]
        )

    return demo
|
434 |
+
|
435 |
+
if __name__ == "__main__":
    demo = create_interface()
    # BUG FIX: `show_tips=True` was removed in Gradio 4.x (this project pins
    # gradio>=4.0.0), so passing it raised a TypeError at startup and broke
    # the build. `share=True` is kept: it is ignored (with a warning) on
    # hosted Spaces but useful for local runs.
    demo.launch(
        share=True,
        show_error=True
    )
|
config.py
ADDED
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Configuration constants

# API endpoints
GITHUB_API_BASE = "https://api.github.com"  # GitHub REST API root
HF_API_BASE = "https://huggingface.co"  # Hub root; API routes live under /api

# Supported file extensions for text processing.
# NOTE: this is a set literal, so the repeated entries across the groups
# below (e.g. '.py', '.md', '.json') collapse into one — redundant but
# harmless. NOTE(review): some entries ('Makefile', 'package.json', ...) are
# full filenames, not suffixes; confirm the matching code in utils treats
# them as such.
SUPPORTED_EXTENSIONS = {
    # Programming languages
    '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.cs', '.go', '.rs',
    '.php', '.rb', '.swift', '.kt', '.scala', '.r', '.m', '.sh', '.bash', '.zsh',
    '.fish', '.ps1', '.bat', '.sql', '.html', '.htm', '.xml', '.css', '.scss',
    '.sass', '.less', '.json', '.yaml', '.yml', '.toml', '.ini', '.cfg', '.conf',
    '.md', '.rst', '.txt', '.log', '.dockerfile', '.gitignore', '.gitattributes',
    '.editorconfig', '.eslintrc', '.prettierrc', '.babelrc', '.tsconfig',

    # Configuration files
    '.env', '.env.example', '.env.local', '.env.development', '.env.production',
    'package.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
    'requirements.txt', 'Pipfile', 'poetry.lock', 'pyproject.toml',
    'Cargo.toml', 'Cargo.lock', 'go.mod', 'go.sum', 'composer.json',
    'composer.lock', 'Gemfile', 'Gemfile.lock', 'pom.xml', 'build.gradle',
    'CMakeLists.txt', 'Makefile', 'Dockerfile', 'docker-compose.yml',

    # Documentation
    '.md', '.rst', '.txt', '.adoc', '.tex', '.bib',

    # Data formats
    '.json', '.yaml', '.yml', '.toml', '.ini', '.cfg', '.conf',
    '.csv', '.tsv', '.xml', '.rss', '.atom',

    # Scripts
    '.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd',
    '.py', '.pl', '.rb', '.lua', '.tcl', '.awk', '.sed',
}

# Size limits (bytes / characters). The UI sliders in app.py can override
# the per-run values; these are the module-level defaults.
MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB default
MAX_TOTAL_SIZE = 100 * 1024 * 1024  # 100MB default
CHUNK_SIZE = 50000  # Characters per chunk
|
41 |
+
|
42 |
+
# File patterns to exclude by default.
# BUG FIX: the original list contained the line `dmypy.json",` with a missing
# opening quote — a SyntaxError that prevented config.py from importing at
# all (the Space "Build error"). Fixed as ".dmypy.json" (mypy daemon state
# file, paired with "dmypy.json" on the previous line). Duplicate entries
# (".pytest_cache/**", "*.cover" x3, "coverage.xml" x2, ".hypothesis/**" x2)
# were also removed; order of first occurrence is preserved.
DEFAULT_EXCLUDE_PATTERNS = [
    "*.git*",
    "*.log",
    "node_modules/**",
    "__pycache__/**",
    ".DS_Store",
    "Thumbs.db",
    "*.tmp",
    "*.temp",
    "*.swp",
    "*.swo",
    "*~",
    ".vscode/**",
    ".idea/**",
    "*.pyc",
    "*.pyo",
    "*.pyd",
    ".Python",
    "build/**",
    "dist/**",
    "*.egg-info/**",
    ".pytest_cache/**",
    ".coverage",
    "htmlcov/**",
    ".tox/**",
    "*.cover",
    "coverage.xml",
    ".hypothesis/**",
    ".mypy_cache/**",
    "dmypy.json",
    ".dmypy.json",
    "nosetests.xml",
    ".cache/**",
    "*.pid",
    "*.seed",
    "*.pid.lock",
    ".nyc_output",
    ".grunt",
    ".bower",
    ".lock-wscript",
    "build/Release",
    "jspm_packages/",
    "typings",
    ".npm",
    ".eslintcache",
    ".stylelintcache",
    "*.tsbuildinfo",
    ".rsync_user",
    ".vscode-test",
]
|
98 |
+
|
99 |
+
# File patterns to include by default (glob-style; consumed by the pattern
# matching in utils — full filenames like "Makefile" match literally).
DEFAULT_INCLUDE_PATTERNS = [
    # Programming languages
    "*.py",
    "*.js",
    "*.ts",
    "*.jsx",
    "*.tsx",
    "*.java",
    "*.cpp",
    "*.c",
    "*.cs",
    "*.go",
    "*.rs",
    "*.php",
    "*.rb",
    "*.swift",
    "*.kt",
    "*.scala",
    "*.r",
    "*.m",
    # Shell / scripting
    "*.sh",
    "*.bash",
    "*.zsh",
    "*.fish",
    "*.ps1",
    "*.bat",
    "*.sql",
    # Web / markup / styles
    "*.html",
    "*.htm",
    "*.xml",
    "*.css",
    "*.scss",
    "*.sass",
    "*.less",
    # Data / config formats
    "*.json",
    "*.yaml",
    "*.yml",
    "*.toml",
    "*.ini",
    "*.cfg",
    "*.conf",
    # Documentation
    "*.md",
    "*.rst",
    "*.txt",
    # Tool dotfiles
    "*.dockerfile",
    "*.gitignore",
    "*.gitattributes",
    "*.editorconfig",
    "*.eslintrc",
    "*.prettierrc",
    "*.babelrc",
    "*.tsconfig",
    # Well-known project manifests (matched by full filename)
    "package.json",
    "requirements.txt",
    "Pipfile",
    "poetry.lock",
    "pyproject.toml",
    "Cargo.toml",
    "go.mod",
    "composer.json",
    "Gemfile",
    "pom.xml",
    "build.gradle",
    "CMakeLists.txt",
    "Makefile",
    "Dockerfile",
    "docker-compose.yml",
]

# Language comment patterns for cleaning (used when "Remove Comments" is on).
# NOTE(review): the block-comment patterns (`/\*.*?\*/`, `""".*?"""`, etc.)
# only match within a single line unless the consumer compiles them with
# re.DOTALL / re.MULTILINE — confirm how utils.clean_code_content applies
# these before relying on multi-line comment stripping.
COMMENT_PATTERNS = {
    'python': [r'#.*$', r'""".*?"""', r"'''.*?'''"],
    'javascript': [r'//.*$', r'/\*.*?\*/'],
    'java': [r'//.*$', r'/\*.*?\*/'],
    'cpp': [r'//.*$', r'/\*.*?\*/'],
    'c': [r'//.*$', r'/\*.*?\*/'],
    'cs': [r'//.*$', r'/\*.*?\*/'],
    'go': [r'//.*$', r'/\*.*?\*/'],
    'rs': [r'//.*$', r'/\*.*?\*/'],
    'php': [r'//.*$', r'#.*$', r'/\*.*?\*/'],
    'ruby': [r'#.*$', r'=begin.*?=end'],
    'shell': [r'#.*$'],
    'sql': [r'--.*$', r'/\*.*?\*/'],
    'html': [r'<!--.*?-->'],
    'xml': [r'<!--.*?-->'],
    'css': [r'/\*.*?\*/'],
}

# Token estimation multipliers for different languages
# (rough tokens-per-character factors; presumably consumed by
# utils.estimate_tokens — verify against that implementation).
TOKEN_MULTIPLIERS = {
    'python': 0.25,
    'javascript': 0.3,
    'java': 0.25,
    'cpp': 0.25,
    'c': 0.25,
    'cs': 0.25,
    'go': 0.25,
    'rs': 0.25,
    'php': 0.3,
    'ruby': 0.25,
    'shell': 0.3,
    'sql': 0.25,
    'html': 0.2,
    'xml': 0.2,
    'css': 0.25,
    'json': 0.15,
    'yaml': 0.2,
    'markdown': 0.2,
    'text': 0.25,
    'default': 0.25,  # fallback for languages not listed above
}

# Rate limiting
MAX_REQUESTS_PER_MINUTE = 60  # NOTE(review): not enforced anywhere visible here
REQUEST_TIMEOUT = 30  # seconds, intended for HTTP requests

# UI Configuration — hex colour palette for theming.
THEME_COLORS = {
    'primary': '#3070f0',
    'secondary': '#64748b',
    'success': '#10b981',
    'warning': '#f59e0b',
    'error': '#ef4444',
    'background': '#ffffff',
    'surface': '#f8fafc',
    'text': '#1e293b',
    'text_secondary': '#64748b',
}

# Progress tracking: (fraction_complete, status_message) milestones.
PROGRESS_STEPS = [
    (0.0, "Initializing..."),
    (0.1, "Fetching repository information..."),
    (0.2, "Scanning files..."),
    (0.3, "Processing files..."),
    (0.5, "Analyzing content..."),
    (0.7, "Generating output..."),
    (0.9, "Finalizing..."),
    (1.0, "Complete!"),
]
|
models.py
ADDED
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import base64
|
3 |
+
import json
|
4 |
+
import zipfile
|
5 |
+
import io
|
6 |
+
import os
|
7 |
+
from typing import List, Dict, Tuple, Optional
|
8 |
+
from pathlib import Path
|
9 |
+
import re
|
10 |
+
|
11 |
+
from utils import matches_patterns, is_binary_file, format_file_size
|
12 |
+
from config import GITHUB_API_BASE, HF_API_BASE
|
13 |
+
|
14 |
+
def process_github_repo(
    repo_path: str,
    token: str,
    include_patterns: List[str],
    exclude_patterns: List[str],
    max_file_size: int
) -> Tuple[List[Tuple[str, str, int]], Dict]:
    """Fetch text files from a GitHub repository via the REST contents API.

    Walks the repository tree breadth-first, one API call per directory,
    skipping files that fail the include/exclude patterns, exceed
    ``max_file_size`` (bytes), or look binary.

    Args:
        repo_path: "owner/repo" slug.
        token: Optional personal access token (enables private repos and a
            higher rate limit).
        include_patterns: Glob patterns a file must match (empty = all).
        exclude_patterns: Glob patterns that reject a file.
        max_file_size: Per-file byte limit.

    Returns:
        ``(files_data, repo_info)`` where files_data is a list of
        ``(path, text_content, size_bytes)`` and repo_info is the repository
        metadata JSON from the API.

    Raises:
        Exception: If the repository metadata request fails.
    """
    headers = {}
    if token:
        headers['Authorization'] = f'token {token}'

    # Get repository info.
    # BUG FIX: every request now carries a timeout so a stalled API call
    # cannot hang the whole app indefinitely.
    repo_url = f"{GITHUB_API_BASE}/repos/{repo_path}"
    repo_response = requests.get(repo_url, headers=headers, timeout=30)

    if repo_response.status_code != 200:
        raise Exception(f"Failed to fetch repository info: {repo_response.json().get('message', 'Unknown error')}")

    repo_info = repo_response.json()

    # Breadth-first walk over directories, starting at the repo root ("").
    files_data = []
    contents_queue = [""]

    while contents_queue:
        current_path = contents_queue.pop(0)

        contents_url = f"{GITHUB_API_BASE}/repos/{repo_path}/contents/{current_path}"
        contents_response = requests.get(contents_url, headers=headers, timeout=30)

        if contents_response.status_code != 200:
            # Skip unreadable directories rather than aborting the walk.
            continue

        contents = contents_response.json()

        if isinstance(contents, dict):
            # The API returns a bare object (not a list) for a single file.
            contents = [contents]

        for item in contents:
            item_path = f"{current_path}/{item['name']}" if current_path else item['name']

            if item['type'] == 'dir':
                contents_queue.append(item_path)
            elif item['type'] == 'file':
                # Pattern filter first — cheapest check, no extra request.
                if not matches_patterns(item_path, include_patterns, exclude_patterns):
                    continue

                # Size filter; .get guards entries without a size field.
                if item.get('size', 0) > max_file_size:
                    continue

                # Fetch and decode the file content (base64 per the API).
                try:
                    file_response = requests.get(item['url'], headers=headers, timeout=30)

                    if file_response.status_code == 200:
                        file_data = file_response.json()
                        content = base64.b64decode(file_data['content']).decode('utf-8', errors='ignore')

                        # Skip binary files that survived decoding.
                        if is_binary_file(content, item_path):
                            continue

                        files_data.append((item_path, content, item.get('size', 0)))

                except Exception as e:
                    # Best-effort: log and move on to the next file.
                    print(f"Error processing file {item_path}: {e}")
                    continue

    return files_data, repo_info
|
90 |
+
|
91 |
+
def process_huggingface_repo(
    repo_path: str,
    token: str,
    include_patterns: List[str],
    exclude_patterns: List[str],
    max_file_size: int
) -> Tuple[List[Tuple[str, str, int]], Dict]:
    """Fetch text files from a Hugging Face Hub repository.

    NOTE(review): only *model* repos are handled (all URLs use
    ``/api/models/``); dataset or Space repos would need ``/api/datasets/``
    or ``/api/spaces/``. The branch is hard-coded to ``main`` — repos whose
    default branch differs will fail. Confirm intended scope.

    Args:
        repo_path: "owner/repo" slug.
        token: Optional Hub access token (Bearer auth).
        include_patterns: Glob patterns a file must match (empty = all).
        exclude_patterns: Glob patterns that reject a file.
        max_file_size: Per-file byte limit.

    Returns:
        ``(files_data, repo_info)`` — list of ``(path, content, size)`` plus
        the repo metadata JSON.

    Raises:
        Exception: If the metadata or tree request fails.
    """
    headers = {}
    if token:
        headers['Authorization'] = f'Bearer {token}'

    # Get repository info.
    # BUG FIX: all requests now carry timeouts so a stalled Hub API call
    # cannot hang the app.
    repo_url = f"{HF_API_BASE}/api/models/{repo_path}"
    repo_response = requests.get(repo_url, headers=headers, timeout=30)

    if repo_response.status_code != 200:
        raise Exception(f"Failed to fetch repository info: {repo_response.json().get('error', 'Unknown error')}")

    repo_info = repo_response.json()

    # Get the root of the repository tree.
    tree_url = f"{HF_API_BASE}/api/models/{repo_path}/tree/main"
    tree_response = requests.get(tree_url, headers=headers, timeout=30)

    if tree_response.status_code != 200:
        raise Exception(f"Failed to fetch repository tree: {tree_response.json().get('error', 'Unknown error')}")

    tree_data = tree_response.json()

    files_data = []

    def process_tree_item(item, current_path=""):
        # Recursive walk: lists fan out to their elements; dicts are
        # individual tree entries (file or directory).
        # NOTE(review): if the Hub returns repo-relative paths in
        # item['path'], prefixing current_path would duplicate directory
        # names — verify against the tree API response shape.
        if isinstance(item, list):
            for subitem in item:
                process_tree_item(subitem, current_path)
        elif isinstance(item, dict):
            item_path = f"{current_path}/{item['path']}" if current_path else item['path']

            if item['type'] == 'directory':
                # One extra API call per directory to list its contents.
                dir_url = f"{HF_API_BASE}/api/models/{repo_path}/tree/main/{item_path}"
                dir_response = requests.get(dir_url, headers=headers, timeout=30)

                if dir_response.status_code == 200:
                    process_tree_item(dir_response.json(), item_path)
            elif item['type'] == 'file':
                # Pattern filter first — cheapest check, no extra request.
                if not matches_patterns(item_path, include_patterns, exclude_patterns):
                    return

                if item.get('size', 0) > max_file_size:
                    return

                # Fetch raw file content.
                try:
                    raw_url = f"https://huggingface.co/{repo_path}/raw/main/{item_path}"
                    file_response = requests.get(raw_url, headers=headers, timeout=30)

                    if file_response.status_code == 200:
                        content = file_response.text

                        # Skip binary files.
                        if is_binary_file(content, item_path):
                            return

                        files_data.append((item_path, content, len(content)))

                except Exception as e:
                    # Best-effort: log and skip this file.
                    print(f"Error processing file {item_path}: {e}")
                    return

    process_tree_item(tree_data)

    return files_data, repo_info
|
168 |
+
|
169 |
+
def download_repo_as_zip(repo_url: str, token: str) -> str:
    """Download a repository archive and return the local file path.

    Args:
        repo_url: Host-qualified slug, e.g. "github.com/owner/repo" or
            "huggingface.co/owner/repo" (no scheme required).
        token: Optional access token for private repositories.

    Returns:
        Path of the downloaded archive on local disk.

    Raises:
        ValueError: If the URL belongs to neither supported host.
        Exception: If the HTTP download fails.
    """
    import tempfile  # local import: only this function needs it

    if "github.com" in repo_url:
        # GitHub archive endpoints; the branch is hard-coded to "main".
        # NOTE(review): repos whose default branch is e.g. "master" will 404.
        if token:
            headers = {'Authorization': f'token {token}'}
            zip_url = repo_url.replace("github.com", "api.github.com/repos") + "/zipball/main"
        else:
            headers = {}
            zip_url = repo_url.replace("github.com", "codeload.github.com") + "/zip/main"
    elif "huggingface.co" in repo_url:
        headers = {}
        if token:
            headers['Authorization'] = f'Bearer {token}'
        # NOTE(review): /resolve/main serves a single file, not a ZIP of the
        # whole repo — confirm against the Hub docs before relying on this.
        # (The original also contained a no-op replace() here, now removed.)
        zip_url = repo_url + "/resolve/main?download=true"
    else:
        raise ValueError("Unsupported repository URL")

    # BUG FIX: added a timeout so a stalled download cannot hang the app.
    response = requests.get(zip_url, headers=headers, stream=True, timeout=60)

    if response.status_code != 200:
        raise Exception(f"Failed to download repository: {response.status_code}")

    # BUG FIX: use tempfile instead of a hand-built "/tmp/..." path — the
    # old path was not portable (no /tmp on Windows) and hash() collisions
    # could silently overwrite a concurrent download.
    fd, temp_path = tempfile.mkstemp(suffix=".zip", prefix="repo_")
    with os.fdopen(fd, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

    return temp_path
|
202 |
+
|
203 |
+
def extract_repo_info(repo_url: str, repo_type: str) -> Dict:
    """Extract basic repository information from its URL.

    Args:
        repo_url: Repository URL (GitHub or Hugging Face).
        repo_type: Either "github" or "huggingface".

    Returns:
        A dict with 'owner', 'repo', 'full_name' and 'url' when the URL
        matches the expected host pattern, otherwise just {'url': repo_url}.
    """
    host_patterns = {
        # [^/?#] keeps query strings and fragments out of the repo name.
        'github': r'github\.com/([^/]+)/([^/?#]+)',
        'huggingface': r'huggingface\.co/([^/]+)/([^/?#]+)',
    }

    pattern = host_patterns.get(repo_type)
    if pattern:
        match = re.search(pattern, repo_url)
        if match:
            owner = match.group(1)
            # Strip a trailing clone suffix so "demo.git" becomes "demo"
            # (the original regex kept '.git' in the repo name).
            repo = re.sub(r'\.git$', '', match.group(2))
            return {
                'owner': owner,
                'repo': repo,
                'full_name': f"{owner}/{repo}",
                'url': repo_url
            }

    return {'url': repo_url}
requirements.txt
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=4.0.0
|
2 |
+
requests>=2.31.0
|
3 |
+
python-multipart>=0.0.6
|
# NOTE: the entries previously listed here (pathlib, re, hashlib, zipfile,
# io, datetime, mimetypes, fnmatch, base64, json) were removed: they are
# Python standard-library modules, not PyPI packages, so pinning them makes
# `pip install -r requirements.txt` fail (the cause of the Space build
# error). They ship with the interpreter and need no requirement entry.
14 |
+
|
15 |
+
This Gradio application provides a comprehensive solution for converting GitHub or Hugging Face repositories into text files suitable for LLM training. Here are the key features:
|
16 |
+
|
17 |
+
## 🚀 Main Features:
|
18 |
+
|
19 |
+
1. **Multi-Platform Support**: Works with both GitHub and Hugging Face repositories
|
20 |
+
2. **Smart File Filtering**: Include/exclude patterns to process only relevant files
|
21 |
+
3. **Token Estimation**: Provides rough token counts for training planning
|
22 |
+
4. **Chunked Output**: Splits large repositories into manageable chunks
|
23 |
+
5. **Comment Removal**: Optional comment stripping for cleaner training data
|
24 |
+
6. **Binary File Detection**: Automatically skips binary files
|
25 |
+
7. **Language Detection**: Identifies programming languages for better organization
|
26 |
+
8. **Progress Tracking**: Real-time progress updates during processing
|
27 |
+
|
28 |
+
## 🛠️ Advanced Options:
|
29 |
+
|
30 |
+
- File size limits to prevent processing huge files
|
31 |
+
- Pattern-based filtering (glob patterns supported)
|
32 |
+
- Chunk size customization
|
33 |
+
- Metadata inclusion
|
34 |
+
- Private repository support with tokens
|
35 |
+
- ZIP download option
|
36 |
+
|
37 |
+
## 📊 Output Features:
|
38 |
+
|
39 |
+
- Repository metadata and statistics
|
40 |
+
- File headers with path, size, and language info
|
41 |
+
- Token and character counts
|
42 |
+
- Formatted, readable output structure
|
43 |
+
- Error handling and status messages
|
44 |
+
|
45 |
+
The application is designed to handle repositories of various sizes while providing useful feedback and statistics about the processed content. It's perfect for preparing code repositories for LLM fine-tuning or analysis.
|
utils.py
ADDED
@@ -0,0 +1,365 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import hashlib
|
3 |
+
from typing import List, Dict, Optional
|
4 |
+
import mimetypes
|
5 |
+
|
6 |
+
def clean_code_content(content: str, file_path: str) -> str:
    """Remove comments from code files while preserving structure.

    Args:
        content: Raw text of the source file.
        file_path: Path whose extension selects the comment syntax.

    Returns:
        The content with comments stripped and runs of blank lines collapsed;
        content for unknown extensions is returned unchanged.

    NOTE(review): the regexes are syntax-unaware, so comment markers inside
    string literals (e.g. a '#' in a Python string) are also stripped.
    """
    ext = file_path.split('.')[-1].lower()

    # Shared (pattern, flags) fragments. re.MULTILINE makes '$' match at every
    # line end — without it (as in the original) the line-comment patterns
    # could only ever match a comment on the file's very last line.
    hash_line = (r'#.*$', re.MULTILINE)
    slash_line = (r'//.*$', re.MULTILINE)
    c_block = (r'/\*.*?\*/', re.DOTALL)
    html_block = (r'<!--.*?-->', re.DOTALL)

    comment_patterns = {
        'py': [hash_line, (r'""".*?"""', re.DOTALL), (r"'''.*?'''", re.DOTALL)],
        'js': [slash_line, c_block],
        'java': [slash_line, c_block],
        'cpp': [slash_line, c_block],
        'c': [slash_line, c_block],
        'cs': [slash_line, c_block],
        'go': [slash_line, c_block],
        'rs': [slash_line, c_block],
        'php': [slash_line, hash_line, c_block],
        'rb': [hash_line, (r'=begin.*?=end', re.DOTALL)],
        'sh': [hash_line],
        'sql': [(r'--.*$', re.MULTILINE), c_block],
        'html': [html_block],
        'xml': [html_block],
        'css': [c_block],
    }

    if ext in comment_patterns:
        content = content.strip()
        for pattern, flags in comment_patterns[ext]:
            content = re.sub(pattern, '', content, flags=flags)

        # Collapse the blank-line runs left behind by removed comments.
        content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
        content = content.strip()

    return content
83 |
+
|
84 |
+
def get_file_language(file_path: str) -> str:
    """Map a file's extension to a human-readable language name.

    Unknown extensions fall back to the upper-cased extension itself. For
    names without a dot (e.g. 'Makefile') the whole lower-cased name becomes
    the lookup key, which is why keys like 'makefile' and 'dockerfile' exist.

    The mapping is the original table with its duplicate keys removed
    ('nim', 'v', 'elm', 'lua', 'moon' were each defined twice with
    identical values, which ruff flags as F601-style dict-key duplication).
    """
    ext = file_path.split('.')[-1].lower()

    language_map = {
        # General-purpose languages
        'py': 'Python', 'js': 'JavaScript', 'ts': 'TypeScript',
        'jsx': 'React JSX', 'tsx': 'React TSX', 'java': 'Java',
        'cpp': 'C++', 'c': 'C', 'cs': 'C#', 'go': 'Go', 'rs': 'Rust',
        'php': 'PHP', 'rb': 'Ruby', 'swift': 'Swift', 'kt': 'Kotlin',
        'scala': 'Scala', 'r': 'R', 'm': 'Objective-C',
        # Shells and scripting
        'sh': 'Shell', 'bash': 'Bash', 'zsh': 'Zsh', 'fish': 'Fish',
        'ps1': 'PowerShell', 'bat': 'Batch',
        # Markup, styles and data formats
        'sql': 'SQL', 'html': 'HTML', 'htm': 'HTML', 'xml': 'XML',
        'css': 'CSS', 'scss': 'SCSS', 'sass': 'SASS', 'less': 'LESS',
        'json': 'JSON', 'yaml': 'YAML', 'yml': 'YAML', 'toml': 'TOML',
        'ini': 'INI', 'cfg': 'Config', 'conf': 'Config',
        'md': 'Markdown', 'rst': 'reStructuredText', 'txt': 'Text',
        'log': 'Log',
        # Tooling / config files (matched via their dot-less names)
        'dockerfile': 'Docker', 'docker': 'Docker',
        'gitignore': 'Git', 'gitattributes': 'Git',
        'editorconfig': 'EditorConfig', 'eslintrc': 'ESLint',
        'prettierrc': 'Prettier', 'babelrc': 'Babel',
        'tsconfig': 'TypeScript', 'package': 'NPM', 'lock': 'Lock',
        'requirements': 'Python', 'pipfile': 'Python', 'poetry': 'Python',
        'makefile': 'Make', 'cmake': 'CMake', 'gradle': 'Gradle',
        'pom': 'Maven', 'sbt': 'SBT',
        # Frontend frameworks / templating
        'vue': 'Vue', 'svelte': 'Svelte', 'elm': 'Elm', 'pug': 'Pug',
        'haml': 'Haml', 'erb': 'ERB', 'ejs': 'EJS', 'twig': 'Twig',
        'liquid': 'Liquid', 'handlebars': 'Handlebars',
        'mustache': 'Mustache', 'jinja': 'Jinja',
        # Documents, diagrams and schemas
        'tex': 'LaTeX', 'bib': 'BibTeX', 'plt': 'Gnuplot',
        'dot': 'Graphviz', 'mermaid': 'Mermaid', 'drawio': 'DrawIO',
        'puml': 'PlantUML', 'wsdl': 'WSDL', 'xsd': 'XSD', 'xslt': 'XSLT',
        'graphql': 'GraphQL', 'proto': 'Protocol Buffers', 'avro': 'Avro',
        # Data / scientific formats and tools
        'parquet': 'Parquet', 'arrow': 'Arrow', 'feather': 'Feather',
        'hdf5': 'HDF5', 'netcdf': 'NetCDF',
        'matlab': 'MATLAB', 'mex': 'MATLAB', 'fig': 'MATLAB',
        'slx': 'Simulink', 'simulink': 'Simulink',
        'labview': 'LabVIEW', 'vi': 'LabVIEW', 'lvproj': 'LabVIEW',
        'lvlib': 'LabVIEW',
        'stata': 'Stata', 'do': 'Stata', 'ado': 'Stata', 'dta': 'Stata',
        'spss': 'SPSS', 'sav': 'SPSS',
        'sas': 'SAS', 's7dat': 'SAS', 's7bdat': 'SAS', 'xpt': 'SAS',
        'rdata': 'R', 'rds': 'R', 'rda': 'R',
        # Other languages
        'jl': 'Julia', 'nim': 'Nim', 'nimble': 'Nim', 'nims': 'Nim',
        'zig': 'Zig', 'v': 'V', 'vsh': 'V', 'vv': 'V', 'vh': 'V',
        'ada': 'Ada', 'adb': 'Ada', 'ads': 'Ada',
        'pas': 'Pascal', 'pp': 'Pascal', 'dpr': 'Pascal', 'lpr': 'Pascal',
        'dfm': 'Pascal',
        'pl': 'Perl', 'pm': 'Perl', 't': 'Perl', 'pod': 'Perl',
        'lua': 'Lua', 'moon': 'MoonScript',
        'el': 'Emacs Lisp', 'elc': 'Emacs Lisp', 'elisp': 'Emacs Lisp',
        'cl': 'Common Lisp', 'lisp': 'Common Lisp', 'lsp': 'Common Lisp',
        'fasl': 'Common Lisp',
        'ss': 'Scheme', 'scm': 'Scheme', 'rkt': 'Scheme', 'sch': 'Scheme',
        'fs': 'F#', 'fsi': 'F#', 'fsx': 'F#', 'fsscript': 'F#',
        'ml': 'OCaml', 'mli': 'OCaml', 'll': 'LLVM', 'bc': 'LLVM',
        'd': 'D', 'di': 'D', 'dart': 'Dart',
        'groovy': 'Groovy', 'gvy': 'Groovy', 'gy': 'Groovy',
        'gsh': 'Groovy',
        'clj': 'Clojure', 'cljs': 'ClojureScript', 'cljc': 'Clojure',
        'edn': 'Clojure',
        'coffee': 'CoffeeScript', 'litcoffee': 'CoffeeScript',
        'cjsx': 'Cjsx', 'iced': 'IcedCoffeeScript',
        'hx': 'Haxe', 'hxml': 'Haxe', 'purs': 'PureScript',
        'p8': 'Pico-8', 'wren': 'Wren',
        'earl-grey': 'Earl Grey', 'eg': 'Earl Grey',
        'tsv': 'TSV', 'csv': 'CSV',
    }

    return language_map.get(ext, ext.upper())
277 |
+
|
278 |
+
def estimate_tokens(text: str) -> int:
    """Roughly estimate how many LLM tokens *text* contains.

    Uses the common heuristic of ~4 characters per token for English text;
    code varies more, but this stays a reasonable planning approximation.
    """
    chars_per_token = 4
    return len(text) // chars_per_token
283 |
+
|
284 |
+
def create_chunked_output(content: str, chunk_size: int) -> List[str]:
    """Split *content* into chunks of roughly *chunk_size* characters.

    Splitting happens on line boundaries only, so a single line longer than
    chunk_size still ends up in a chunk of its own rather than truncated.
    """
    chunks: List[str] = []
    buffer = ""

    for line in content.split('\n'):
        # +1 accounts for the newline that would join the line to the buffer.
        if len(buffer) + len(line) + 1 > chunk_size:
            if buffer:
                chunks.append(buffer)
            buffer = line
        elif buffer:
            buffer = f"{buffer}\n{line}"
        else:
            buffer = line

    if buffer:
        chunks.append(buffer)

    return chunks
306 |
+
|
307 |
+
def matches_patterns(file_path: str, include_patterns: List[str], exclude_patterns: List[str]) -> bool:
    """Decide whether *file_path* passes the include/exclude glob filters.

    Exclude patterns win over include patterns, and an empty include list
    means "include everything not excluded". Each pattern is also tried with
    a '**/' prefix so it can match files nested under directories.
    """
    import fnmatch

    def hits(pattern: str) -> bool:
        return (fnmatch.fnmatch(file_path, pattern)
                or fnmatch.fnmatch(file_path, f"**/{pattern}"))

    # Exclusions are checked first and short-circuit everything else.
    if any(hits(p) for p in exclude_patterns):
        return False

    # No include patterns: everything that survived exclusion is accepted.
    if not include_patterns:
        return True

    return any(hits(p) for p in include_patterns)
326 |
+
|
327 |
+
def format_file_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string, e.g. '1.5 KB'."""
    size = float(size_bytes)
    for unit in ('B', 'KB', 'MB', 'GB'):
        if size < 1024.0:
            return f"{size:.1f} {unit}"
        size /= 1024.0
    # Anything past gigabytes is reported in terabytes.
    return f"{size:.1f} TB"
334 |
+
|
335 |
+
def generate_file_hash(content: str) -> str:
    """Return a short fingerprint (first 16 hex chars of SHA-256) of *content*."""
    digest = hashlib.sha256(content.encode())
    return digest.hexdigest()[:16]
338 |
+
|
339 |
+
def is_binary_file(content: str, file_path: str) -> bool:
    """Heuristically decide whether a file is binary and should be skipped.

    Checks, in order: a known-binary file extension, NUL bytes in the first
    1 KB, and the ratio of printable characters in the first 1 KB.

    Args:
        content: File content decoded as text.
        file_path: Path used for the extension check.

    Returns:
        True if the file looks binary, False otherwise.
    """
    binary_extensions = {
        'png', 'jpg', 'jpeg', 'gif', 'bmp', 'ico', 'svg', 'webp',
        'mp3', 'mp4', 'avi', 'mov', 'wav', 'flac', 'ogg',
        'zip', 'rar', 'tar', 'gz', '7z', 'bz2', 'xz',
        'exe', 'dll', 'so', 'dylib',
        'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
        'ttf', 'otf', 'woff', 'woff2', 'eot',
        'bin', 'dat', 'db', 'sqlite', 'sqlite3',
    }

    ext = file_path.split('.')[-1].lower()
    if ext in binary_extensions:
        return True

    sample = content[:1024]
    if not sample:
        # Empty files are trivially text. (The original divided by
        # len(content[:1024]) unguarded and raised ZeroDivisionError here.)
        return False

    # A NUL byte is a strong indicator of binary data.
    if '\0' in sample:
        return True

    # Too many non-printable characters also suggests binary content.
    printable = sum(1 for ch in sample if ch.isprintable() or ch in '\t\n\r')
    return printable / len(sample) < 0.7