Spaces:
Sleeping
Sleeping
Add document parser app
Browse files
app.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import zipfile
|
| 3 |
+
import os
|
| 4 |
+
import io
|
| 5 |
+
import json
|
| 6 |
+
import tempfile
|
| 7 |
+
import shutil
|
| 8 |
+
|
| 9 |
+
# Supported text-based extensions
TEXT_EXTS = {
    ".txt", ".md", ".py", ".js", ".ts", ".jsx", ".tsx", ".html", ".css",
    ".json", ".yaml", ".yml", ".csv", ".xml", ".toml", ".cfg", ".ini",
    ".sh", ".bash", ".bat", ".ps1", ".r", ".java", ".c", ".cpp", ".h",
    ".hpp", ".go", ".rs", ".rb", ".php", ".swift", ".kt", ".scala",
    ".sql", ".dockerfile", ".makefile", ".gitignore", ".env", ".log",
}

# Extensions we can parse with special libraries
PDF_EXTS = {".pdf"}
DOCX_EXTS = {".docx"}
XLSX_EXTS = {".xlsx"}
IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp", ".ico"}

# Well-known file names that are text but carry no extension.
# NOTE: os.path.splitext treats a leading dot as part of the *name*, so
# ".gitignore" / ".env" have an empty extension and would otherwise fall
# through to "binary" despite being listed in TEXT_EXTS.
_SPECIAL_TEXT_NAMES = {
    "Makefile", "Dockerfile", "Procfile", ".gitignore", ".dockerignore", ".env",
}


def get_file_type(filename):
    """Categorize a zip-entry name by extension.

    Args:
        filename: Archive member path (zip entries use "/" separators).

    Returns:
        Tuple ``(category, extension)`` where category is one of
        "text", "pdf", "docx", "xlsx", "image", "binary" and extension
        is the lower-cased suffix ("" for extensionless names).
    """
    ext = os.path.splitext(filename)[1].lower()
    # Extensionless special names (Makefile, dotfiles like ".env") are text.
    if not ext and filename.split("/")[-1] in _SPECIAL_TEXT_NAMES:
        return "text", ext
    if ext in TEXT_EXTS:
        return "text", ext
    if ext in PDF_EXTS:
        return "pdf", ext
    if ext in DOCX_EXTS:
        return "docx", ext
    if ext in XLSX_EXTS:
        return "xlsx", ext
    if ext in IMAGE_EXTS:
        return "image", ext
    return "binary", ext
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def parse_pdf_content(data):
    """Extract text from raw PDF bytes via PyMuPDF (fitz).

    Returns a best-effort plain-text rendering with per-page markers, or a
    bracketed placeholder when the library is missing, the PDF cannot be
    opened, or no text could be extracted.
    """
    try:
        import fitz

        document = fitz.open(stream=data, filetype="pdf")
        pieces = []
        for index, page in enumerate(document):
            pieces.append(f"\n--- Page {index + 1} ---\n")
            pieces.append(page.get_text())
        document.close()
        combined = "".join(pieces)
        if combined.strip():
            return combined.strip()
        return "[PDF: no extractable text]"
    except ImportError:
        return "[PDF parsing unavailable - PyMuPDF not installed]"
    except Exception as e:
        return f"[PDF parse error: {e}]"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def parse_docx_content(data):
    """Extract paragraph text from raw DOCX bytes via python-docx.

    Returns non-blank paragraphs joined by newlines, or a bracketed
    placeholder when the library is missing, parsing fails, or the
    document contains no non-blank paragraphs.
    """
    try:
        from docx import Document

        document = Document(io.BytesIO(data))
        lines = [para.text for para in document.paragraphs if para.text.strip()]
        if not lines:
            return "[DOCX: empty document]"
        return "\n".join(lines)
    except ImportError:
        return "[DOCX parsing unavailable - python-docx not installed]"
    except Exception as e:
        return f"[DOCX parse error: {e}]"
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def parse_xlsx_content(data):
|
| 74 |
+
"""Parse XLSX bytes to text summary."""
|
| 75 |
+
try:
|
| 76 |
+
import openpyxl
|
| 77 |
+
wb = openpyxl.load_workbook(io.BytesIO(data), read_only=True)
|
| 78 |
+
text = ""
|
| 79 |
+
for sheet_name in wb.sheetnames:
|
| 80 |
+
ws = wb[sheet_name]
|
| 81 |
+
text += f"\n--- Sheet: {sheet_name} ---\n"
|
| 82 |
+
row_count = 0
|
| 83 |
+
for row in ws.iter_rows(values_only=True):
|
| 84 |
+
if row_count >= 50: # Limit rows shown
|
| 85 |
+
text += f"\n... (more rows exist)\n"
|
| 86 |
+
break
|
| 87 |
+
text += " | ".join(str(cell) if cell is not None else "" for cell in row) + "\n"
|
| 88 |
+
row_count += 1
|
| 89 |
+
wb.close()
|
| 90 |
+
return text.strip() if text.strip() else "[XLSX: empty workbook]"
|
| 91 |
+
except ImportError:
|
| 92 |
+
return "[XLSX parsing unavailable - openpyxl not installed]"
|
| 93 |
+
except Exception as e:
|
| 94 |
+
return f"[XLSX parse error: {e}]"
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def format_size(size_bytes):
    """Return *size_bytes* as a human-readable string (B / KB / MB)."""
    kib = 1024
    if size_bytes < kib:
        return f"{size_bytes} B"
    if size_bytes < kib * kib:
        return f"{size_bytes / kib:.1f} KB"
    return f"{size_bytes / (kib * kib):.1f} MB"
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def parse_zip(file_obj):
    """Parse an uploaded ZIP archive and extract text from its members.

    Args:
        file_obj: Path string (or an object exposing the path as ``.name``)
            of the uploaded archive, or ``None`` when nothing was uploaded.

    Returns:
        Tuple ``(summary_markdown, table_rows, full_text, results)``:
        a markdown stats summary, display rows for the Dataframe, the
        concatenated extracted text, and a list of per-file dicts with
        keys filename/type/extension/size/size_formatted/preview.
    """
    if file_obj is None:
        return "β οΈ Please upload a ZIP file.", [], "", []

    # Gradio may hand us a plain path or a tempfile-like wrapper.
    file_path = file_obj if isinstance(file_obj, str) else file_obj.name

    if not zipfile.is_zipfile(file_path):
        return "β The uploaded file is not a valid ZIP archive.", [], "", []

    results = []
    table_rows = []
    full_text_parts = []
    stats = {"total_files": 0, "text_files": 0, "pdf_files": 0, "docx_files": 0,
             "xlsx_files": 0, "image_files": 0, "binary_files": 0, "total_size": 0}

    # Map content category -> (bytes-to-text parser, stats counter key).
    # Built inside the function so the module-level helpers are resolved
    # lazily; this replaces four duplicated per-type branches.
    content_parsers = {
        "text": (lambda data: data.decode("utf-8", errors="replace"), "text_files"),
        "pdf": (parse_pdf_content, "pdf_files"),
        "docx": (parse_docx_content, "docx_files"),
        "xlsx": (parse_xlsx_content, "xlsx_files"),
    }

    with zipfile.ZipFile(file_path, "r") as zf:
        for info in zf.infolist():
            if info.is_dir():
                continue

            stats["total_files"] += 1
            stats["total_size"] += info.file_size
            file_type, ext = get_file_type(info.filename)
            content_preview = ""

            try:
                raw_data = zf.read(info)
            except Exception as e:
                content_preview = f"[Read error: {e}]"
                raw_data = None

            if raw_data is not None:
                if file_type in content_parsers:
                    parser, stat_key = content_parsers[file_type]
                    stats[stat_key] += 1
                    try:
                        content = parser(raw_data)
                    except Exception as e:
                        content = None
                        content_preview = f"[Decode error: {e}]"
                    if content is not None:
                        content_preview = content[:2000]
                        full_text_parts.append(
                            f"\n{'='*60}\nπ {info.filename}\n{'='*60}\n{content}"
                        )
                elif file_type == "image":
                    stats["image_files"] += 1
                    content_preview = f"[Image: {ext}]"
                else:
                    stats["binary_files"] += 1
                    content_preview = f"[Binary file: {ext}]"

            results.append({
                "filename": info.filename,
                "type": file_type,
                "extension": ext or "(none)",
                "size": info.file_size,
                "size_formatted": format_size(info.file_size),
                "preview": content_preview[:500],
            })

            table_rows.append([
                info.filename,
                ext or "(none)",
                file_type,
                format_size(info.file_size),
                content_preview[:200].replace("\n", " "),
            ])

    # Build summary
    summary = f"""## π¦ ZIP Archive Summary

| Metric | Value |
|--------|-------|
| **Total files** | {stats['total_files']} |
| **Total size** | {format_size(stats['total_size'])} |
| **Text/Code files** | {stats['text_files']} |
| **PDF files** | {stats['pdf_files']} |
| **DOCX files** | {stats['docx_files']} |
| **XLSX files** | {stats['xlsx_files']} |
| **Image files** | {stats['image_files']} |
| **Binary files** | {stats['binary_files']} |
"""

    full_text = "\n".join(full_text_parts) if full_text_parts else "(No text content extracted)"

    return summary, table_rows, full_text, results
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def select_file_content(file_data_json, evt: gr.SelectData):
    """Render a markdown detail card for the table row the user clicked.

    ``evt.index`` may arrive as a bare row index or as a (row, col)
    pair; only the row component is used.
    """
    if not file_data_json or not isinstance(file_data_json, list):
        return "Select a file from the table above."

    if isinstance(evt.index, (list, tuple)):
        row = evt.index[0]
    else:
        row = evt.index

    if not (0 <= row < len(file_data_json)):
        return "File not found."

    entry = file_data_json[row]
    body = entry.get('preview', '(no preview)')
    return (
        f"## π {entry['filename']}\n"
        f"**Type:** {entry['type']} | **Size:** {entry['size_formatted']}\n\n"
        f"```\n{body}\n```"
    )
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
# βββ Gradio UI βββββββββββββββββββββββββββββββββββββββββββ

with gr.Blocks(
    title="π¦ Document Parser",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown("""
# π¦ Document Parser
Upload a **ZIP file** containing documents and this tool will parse and extract text from all supported formats.

**Supported formats:** `.txt`, `.md`, `.py`, `.js`, `.json`, `.yaml`, `.csv`, `.html`, `.pdf`, `.docx`, `.xlsx`, and 30+ more text/code formats.
""")

    with gr.Row():
        with gr.Column(scale=1):
            zip_input = gr.File(
                label="Upload ZIP File",
                file_types=[".zip"],
                type="filepath",
            )
            parse_btn = gr.Button("π Parse Documents", variant="primary", size="lg")

    summary_output = gr.Markdown(label="Summary")

    with gr.Tabs():
        with gr.Tab("π File Listing"):
            file_table = gr.Dataframe(
                headers=["Filename", "Extension", "Type", "Size", "Preview"],
                label="Files in Archive",
                interactive=False,
                wrap=True,
            )
        with gr.Tab("π Extracted Text"):
            text_output = gr.Textbox(
                label="Full Extracted Text",
                lines=30,
                max_lines=100,
                show_copy_button=True,
            )
        with gr.Tab("π File Detail"):
            gr.Markdown("*Click a row in the File Listing tab to see its full preview here.*")
            detail_output = gr.Markdown("Select a file from the table above.")
        with gr.Tab("π JSON Data"):
            json_output = gr.JSON(label="Structured Parse Results")

    # Hidden state for file data
    file_data_state = gr.State([])

    def run_parse(file_obj):
        """Run parse_zip and fan its results out to every output component.

        The parsed file list is returned twice: once for the visible JSON
        tab and once for the hidden state consumed by row selection.
        """
        summary, table, text, data = parse_zip(file_obj)
        return summary, table, text, data, data

    # Button click and direct upload trigger the identical parse; share
    # the wiring instead of duplicating the outputs list.
    parse_outputs = [summary_output, file_table, text_output, json_output, file_data_state]
    parse_btn.click(fn=run_parse, inputs=zip_input, outputs=parse_outputs)
    zip_input.upload(fn=run_parse, inputs=zip_input, outputs=parse_outputs)

    file_table.select(
        fn=select_file_content,
        inputs=file_data_state,
        outputs=detail_output,
    )

if __name__ == "__main__":
    demo.launch()
|