Spaces:
Sleeping
Sleeping
Add document parser app
Browse files
app.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import zipfile
|
| 3 |
+
import os
|
| 4 |
+
import io
|
| 5 |
+
import json
|
| 6 |
+
import tempfile
|
| 7 |
+
import shutil
|
| 8 |
+
|
| 9 |
+
# Supported text-based extensions
TEXT_EXTS = {
    ".txt", ".md", ".py", ".js", ".ts", ".jsx", ".tsx", ".html", ".css",
    ".json", ".yaml", ".yml", ".csv", ".xml", ".toml", ".cfg", ".ini",
    ".sh", ".bash", ".bat", ".ps1", ".r", ".java", ".c", ".cpp", ".h",
    ".hpp", ".go", ".rs", ".rb", ".php", ".swift", ".kt", ".scala",
    ".sql", ".dockerfile", ".makefile", ".gitignore", ".env", ".log",
}

# Extensions we can parse with special libraries
PDF_EXTS = {".pdf"}
DOCX_EXTS = {".docx"}
XLSX_EXTS = {".xlsx"}
IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp", ".ico"}

# Well-known file names that are text but carry no extension.
# NOTE: os.path.splitext treats a leading dot as part of the *name*, so
# ".gitignore" / ".env" have an empty extension and would otherwise fall
# through to "binary" despite being listed in TEXT_EXTS.
_SPECIAL_TEXT_NAMES = {
    "Makefile", "Dockerfile", "Procfile", ".gitignore", ".dockerignore", ".env",
}


def get_file_type(filename):
    """Categorize a zip-entry name by extension.

    Args:
        filename: Archive member path (zip entries use "/" separators).

    Returns:
        Tuple ``(category, extension)`` where category is one of
        "text", "pdf", "docx", "xlsx", "image", "binary" and extension
        is the lower-cased suffix ("" for extensionless names).
    """
    ext = os.path.splitext(filename)[1].lower()
    # Extensionless special names (Makefile, dotfiles like ".env") are text.
    if not ext and filename.split("/")[-1] in _SPECIAL_TEXT_NAMES:
        return "text", ext
    if ext in TEXT_EXTS:
        return "text", ext
    if ext in PDF_EXTS:
        return "pdf", ext
    if ext in DOCX_EXTS:
        return "docx", ext
    if ext in XLSX_EXTS:
        return "xlsx", ext
    if ext in IMAGE_EXTS:
        return "image", ext
    return "binary", ext
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def parse_pdf_content(data):
    """Extract text from raw PDF bytes via PyMuPDF (fitz).

    Returns a best-effort plain-text rendering with per-page markers, or a
    bracketed placeholder when the library is missing, the PDF cannot be
    opened, or no text could be extracted.
    """
    try:
        import fitz

        document = fitz.open(stream=data, filetype="pdf")
        pieces = []
        for index, page in enumerate(document):
            pieces.append(f"\n--- Page {index + 1} ---\n")
            pieces.append(page.get_text())
        document.close()
        combined = "".join(pieces)
        if combined.strip():
            return combined.strip()
        return "[PDF: no extractable text]"
    except ImportError:
        return "[PDF parsing unavailable - PyMuPDF not installed]"
    except Exception as e:
        return f"[PDF parse error: {e}]"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def parse_docx_content(data):
    """Extract paragraph text from raw DOCX bytes via python-docx.

    Returns non-blank paragraphs joined by newlines, or a bracketed
    placeholder when the library is missing, parsing fails, or the
    document contains no non-blank paragraphs.
    """
    try:
        from docx import Document

        document = Document(io.BytesIO(data))
        lines = [para.text for para in document.paragraphs if para.text.strip()]
        if not lines:
            return "[DOCX: empty document]"
        return "\n".join(lines)
    except ImportError:
        return "[DOCX parsing unavailable - python-docx not installed]"
    except Exception as e:
        return f"[DOCX parse error: {e}]"
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def parse_xlsx_content(data):
|
| 74 |
+
"""Parse XLSX bytes to text summary."""
|
| 75 |
+
try:
|
| 76 |
+
import openpyxl
|
| 77 |
+
wb = openpyxl.load_workbook(io.BytesIO(data), read_only=True)
|
| 78 |
+
text = ""
|
| 79 |
+
for sheet_name in wb.sheetnames:
|
| 80 |
+
ws = wb[sheet_name]
|
| 81 |
+
text += f"\n--- Sheet: {sheet_name} ---\n"
|
| 82 |
+
row_count = 0
|
| 83 |
+
for row in ws.iter_rows(values_only=True):
|
| 84 |
+
if row_count >= 50: # Limit rows shown
|
| 85 |
+
text += f"\n... (more rows exist)\n"
|
| 86 |
+
break
|
| 87 |
+
text += " | ".join(str(cell) if cell is not None else "" for cell in row) + "\n"
|
| 88 |
+
row_count += 1
|
| 89 |
+
wb.close()
|
| 90 |
+
return text.strip() if text.strip() else "[XLSX: empty workbook]"
|
| 91 |
+
except ImportError:
|
| 92 |
+
return "[XLSX parsing unavailable - openpyxl not installed]"
|
| 93 |
+
except Exception as e:
|
| 94 |
+
return f"[XLSX parse error: {e}]"
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def format_size(size_bytes):
    """Return *size_bytes* as a human-readable string (B / KB / MB)."""
    kib = 1024
    if size_bytes < kib:
        return f"{size_bytes} B"
    if size_bytes < kib * kib:
        return f"{size_bytes / kib:.1f} KB"
    return f"{size_bytes / (kib * kib):.1f} MB"
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def parse_zip(file_obj):
    """Parse an uploaded ZIP archive and extract text from its members.

    Args:
        file_obj: Path string (or an object exposing the path as ``.name``)
            of the uploaded archive, or ``None`` when nothing was uploaded.

    Returns:
        Tuple ``(summary_markdown, table_rows, full_text, results)``:
        a markdown stats summary, display rows for the Dataframe, the
        concatenated extracted text, and a list of per-file dicts with
        keys filename/type/extension/size/size_formatted/preview.
    """
    if file_obj is None:
        return "β οΈ Please upload a ZIP file.", [], "", []

    # Gradio may hand us a plain path or a tempfile-like wrapper.
    file_path = file_obj if isinstance(file_obj, str) else file_obj.name

    if not zipfile.is_zipfile(file_path):
        return "β The uploaded file is not a valid ZIP archive.", [], "", []

    results = []
    table_rows = []
    full_text_parts = []
    stats = {"total_files": 0, "text_files": 0, "pdf_files": 0, "docx_files": 0,
             "xlsx_files": 0, "image_files": 0, "binary_files": 0, "total_size": 0}

    # Map content category -> (bytes-to-text parser, stats counter key).
    # Built inside the function so the module-level helpers are resolved
    # lazily; this replaces four duplicated per-type branches.
    content_parsers = {
        "text": (lambda data: data.decode("utf-8", errors="replace"), "text_files"),
        "pdf": (parse_pdf_content, "pdf_files"),
        "docx": (parse_docx_content, "docx_files"),
        "xlsx": (parse_xlsx_content, "xlsx_files"),
    }

    with zipfile.ZipFile(file_path, "r") as zf:
        for info in zf.infolist():
            if info.is_dir():
                continue

            stats["total_files"] += 1
            stats["total_size"] += info.file_size
            file_type, ext = get_file_type(info.filename)
            content_preview = ""

            try:
                raw_data = zf.read(info)
            except Exception as e:
                content_preview = f"[Read error: {e}]"
                raw_data = None

            if raw_data is not None:
                if file_type in content_parsers:
                    parser, stat_key = content_parsers[file_type]
                    stats[stat_key] += 1
                    try:
                        content = parser(raw_data)
                    except Exception as e:
                        content = None
                        content_preview = f"[Decode error: {e}]"
                    if content is not None:
                        content_preview = content[:2000]
                        full_text_parts.append(
                            f"\n{'='*60}\nπ {info.filename}\n{'='*60}\n{content}"
                        )
                elif file_type == "image":
                    stats["image_files"] += 1
                    content_preview = f"[Image: {ext}]"
                else:
                    stats["binary_files"] += 1
                    content_preview = f"[Binary file: {ext}]"

            results.append({
                "filename": info.filename,
                "type": file_type,
                "extension": ext or "(none)",
                "size": info.file_size,
                "size_formatted": format_size(info.file_size),
                "preview": content_preview[:500],
            })

            table_rows.append([
                info.filename,
                ext or "(none)",
                file_type,
                format_size(info.file_size),
                content_preview[:200].replace("\n", " "),
            ])

    # Build summary
    summary = f"""## π¦ ZIP Archive Summary

| Metric | Value |
|--------|-------|
| **Total files** | {stats['total_files']} |
| **Total size** | {format_size(stats['total_size'])} |
| **Text/Code files** | {stats['text_files']} |
| **PDF files** | {stats['pdf_files']} |
| **DOCX files** | {stats['docx_files']} |
| **XLSX files** | {stats['xlsx_files']} |
| **Image files** | {stats['image_files']} |
| **Binary files** | {stats['binary_files']} |
"""

    full_text = "\n".join(full_text_parts) if full_text_parts else "(No text content extracted)"

    return summary, table_rows, full_text, results
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def select_file_content(file_data_json, evt: gr.SelectData):
    """Render a markdown detail card for the table row the user clicked.

    ``evt.index`` may arrive as a bare row index or as a (row, col)
    pair; only the row component is used.
    """
    if not file_data_json or not isinstance(file_data_json, list):
        return "Select a file from the table above."

    if isinstance(evt.index, (list, tuple)):
        row = evt.index[0]
    else:
        row = evt.index

    if not (0 <= row < len(file_data_json)):
        return "File not found."

    entry = file_data_json[row]
    body = entry.get('preview', '(no preview)')
    return (
        f"## π {entry['filename']}\n"
        f"**Type:** {entry['type']} | **Size:** {entry['size_formatted']}\n\n"
        f"```\n{body}\n```"
    )
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
# βββ Gradio UI βββββββββββββββββββββββββββββββββββββββββββ

with gr.Blocks(
    title="π¦ Document Parser",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown("""
# π¦ Document Parser
Upload a **ZIP file** containing documents and this tool will parse and extract text from all supported formats.

**Supported formats:** `.txt`, `.md`, `.py`, `.js`, `.json`, `.yaml`, `.csv`, `.html`, `.pdf`, `.docx`, `.xlsx`, and 30+ more text/code formats.
""")

    with gr.Row():
        with gr.Column(scale=1):
            zip_input = gr.File(
                label="Upload ZIP File",
                file_types=[".zip"],
                type="filepath",
            )
            parse_btn = gr.Button("π Parse Documents", variant="primary", size="lg")

    summary_output = gr.Markdown(label="Summary")

    with gr.Tabs():
        with gr.Tab("π File Listing"):
            file_table = gr.Dataframe(
                headers=["Filename", "Extension", "Type", "Size", "Preview"],
                label="Files in Archive",
                interactive=False,
                wrap=True,
            )
        with gr.Tab("π Extracted Text"):
            text_output = gr.Textbox(
                label="Full Extracted Text",
                lines=30,
                max_lines=100,
                show_copy_button=True,
            )
        with gr.Tab("π File Detail"):
            gr.Markdown("*Click a row in the File Listing tab to see its full preview here.*")
            detail_output = gr.Markdown("Select a file from the table above.")
        with gr.Tab("π JSON Data"):
            json_output = gr.JSON(label="Structured Parse Results")

    # Hidden state for file data
    file_data_state = gr.State([])

    def run_parse(file_obj):
        """Run parse_zip and fan its results out to every output component.

        The parsed file list is returned twice: once for the visible JSON
        tab and once for the hidden state consumed by row selection.
        """
        summary, table, text, data = parse_zip(file_obj)
        return summary, table, text, data, data

    # Button click and direct upload trigger the identical parse; share
    # the wiring instead of duplicating the outputs list.
    parse_outputs = [summary_output, file_table, text_output, json_output, file_data_state]
    parse_btn.click(fn=run_parse, inputs=zip_input, outputs=parse_outputs)
    zip_input.upload(fn=run_parse, inputs=zip_input, outputs=parse_outputs)

    file_table.select(
        fn=select_file_content,
        inputs=file_data_state,
        outputs=detail_output,
    )

if __name__ == "__main__":
    demo.launch()
|