Spaces:

SlouchyBuffalo
/

pages-converter-pro

Running on Zero

App Files Files Community

SlouchyBuffalo committed on May 12

Commit

af6cf59

verified ·

1 Parent(s): 864712e

Create app.py

Browse files

Files changed (1) hide show

app.py +382 -0

app.py CHANGED Viewed

	@@ -0,0 +1,382 @@

+# app.py - Corrected ZeroGPU Pages Converter with PDF Fix
+import gradio as gr
+import os
+import spaces
+import tempfile
+import zipfile
+import json
+from pathlib import Path
+from huggingface_hub import InferenceClient
+import time
# Startup diagnostics: report only whether HF_TOKEN is set and how long it is,
# never the token value itself.
token = os.environ.get("HF_TOKEN")
print(f"Debug: Token exists = {token is not None}")
print(f"Debug: Token length = {len(token) if token else 0}")
# Shared inference client: Llama-3.3-70B-Instruct served through the Cerebras provider.
client = InferenceClient(
    model="meta-llama/Llama-3.3-70B-Instruct",
    provider="cerebras",
    token=token,
)
# NOTE: deliberately not decorated with @spaces.GPU. This function only does
# zip reading and regex scanning on the CPU, and its caller in this file
# (convert_pages_document) is already a @spaces.GPU function.
def extract_pages_content(file_path):
    """Extract readable text from an Apple Pages (.pages) package.

    The file is opened as a zip archive and every ``Index/*.iwa`` member
    (direct children of ``Index/`` only) is scanned for runs of printable
    ASCII characters. Runs of more than 5 characters (after stripping) are
    kept, de-duplicated in first-seen order, and joined with blank lines.

    This is a best-effort heuristic, not a real parser: the .iwa payload is
    not decoded, so any text that is stored compressed or is not ASCII will
    be missed.

    Args:
        file_path: Path to the uploaded .pages file.

    Returns:
        str: The extracted text on success. On failure a human-readable
        message is returned instead of raising:
        ``"Could not extract readable content from .pages file"`` when no
        text was found, or ``"Error extracting content: <reason>"`` when the
        archive could not be opened or read.
    """
    import re

    print(f"DEBUG: Processing file: {file_path}")
    print(f"DEBUG: File exists: {os.path.exists(file_path)}")

    # Runs of printable ASCII; compiled once rather than per archive member.
    printable_run = re.compile(r'[\x20-\x7E]+')
    index_prefix = "Index/"
    content_parts = []
    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            member_names = zip_ref.namelist()
            print(f"DEBUG: Archive members: {member_names}")
            # Read members straight from the archive instead of extracting
            # everything (images, previews, ...) to disk first.
            iwa_members = [
                name for name in member_names
                if name.startswith(index_prefix)
                and name.endswith(".iwa")
                and "/" not in name[len(index_prefix):]
            ]
            if not iwa_members:
                print("DEBUG: No Index/*.iwa members found")
            for member in iwa_members:
                try:
                    binary_content = zip_ref.read(member)
                except (OSError, RuntimeError, NotImplementedError,
                        zipfile.BadZipFile) as exc:
                    # Best effort: one unreadable member must not discard
                    # the text found in the others.
                    print(f"DEBUG: Skipping unreadable member {member}: {exc}")
                    continue
                # Undecodable bytes are dropped; only printable runs survive.
                text_content = binary_content.decode('utf-8', errors='ignore')
                for run in printable_run.findall(text_content):
                    candidate = run.strip()
                    if len(candidate) > 5:
                        content_parts.append(candidate)
    except Exception as e:
        # Top-level boundary: callers expect a message string, not an exception.
        return f"Error extracting content: {str(e)}"

    if content_parts:
        # dict.fromkeys de-duplicates while preserving first-seen order.
        unique_content = list(dict.fromkeys(content_parts))
        return "\n\n".join(unique_content)
    return "Could not extract readable content from .pages file"
# NOTE(review): no GPU work is visible in this function (the model call goes
# to a remote provider). The decorator is presumably kept so the ZeroGPU Space
# detects a GPU function at startup - confirm before removing.
@spaces.GPU
def convert_pages_document(file, output_format, progress=gr.Progress()):
    """Convert an uploaded Pages document to *output_format*.

    Pipeline: extract text with ``extract_pages_content``, build a prompt
    with ``create_conversion_prompt``, send it to the module-level ``client``
    via ``chat_completion``, then write the reply to disk with
    ``create_output_file``.

    Args:
        file: The upload value passed by Gradio. Either an object exposing a
            ``.name`` path attribute or a plain path string is accepted.
        output_format: One of the format labels offered by the UI
            ("PDF", "DOCX", "TXT", "HTML", "Markdown").
        progress: Gradio progress tracker; the ``gr.Progress()`` default is
            how Gradio is asked to inject one.

    Returns:
        tuple: ``(output_path, status_message)``. ``output_path`` is ``None``
        whenever the conversion did not produce a file. The status message is
        shown in an HTML component, so dynamic error text is HTML-escaped.
    """
    import html

    if not file:
        return None, "❌ Please upload a .pages file"
    try:
        progress(0.1, desc="📖 Extracting content from .pages file...")
        # Depending on Gradio version/config the upload is either a path
        # string or an object carrying the path in `.name`.
        file_path = getattr(file, "name", file)
        content = extract_pages_content(file_path)

        # extract_pages_content reports failures as message strings; these
        # are longer than 10 characters, so they must be caught explicitly
        # or they would be sent to the model as if they were the document.
        if isinstance(content, str) and content.startswith("Error extracting content:"):
            return None, f"❌ {html.escape(content)}"
        if (
            not content
            or len(content.strip()) < 10
            or content == "Could not extract readable content from .pages file"
        ):
            return None, "❌ Could not extract sufficient content from .pages file"

        progress(0.4, desc="🤖 Preparing conversion with Cerebras...")
        # Create format-specific prompt
        prompt = create_conversion_prompt(content, output_format)

        progress(0.6, desc="⚡ Converting with Cerebras Lightning Speed...")
        try:
            messages = [{"role": "user", "content": prompt}]
            completion = client.chat_completion(
                messages=messages,
                max_tokens=4096,
                temperature=0.1
            )
            converted_text = completion.choices[0].message.content
        except Exception as e:
            return None, f"❌ Conversion error: {html.escape(str(e))}"
        if not converted_text or not converted_text.strip():
            return None, "❌ Conversion error: model returned an empty response"

        progress(0.9, desc="💫 Creating output file...")
        output_path = create_output_file(converted_text, output_format)

        progress(1.0, desc="✅ Conversion complete!")
        return output_path, f"✅ Successfully converted to {output_format} using ZeroGPU!"
    except Exception as e:
        # UI boundary: report the failure in the status area instead of raising.
        return None, f"❌ Error: {html.escape(str(e))}"
def create_conversion_prompt(content, output_format):
    """Build the conversion prompt sent to the model.

    Args:
        content: Text extracted from the Pages document; embedded verbatim
            under the ``ORIGINAL CONTENT:`` heading.
        output_format: Target format label. Known labels ("PDF", "DOCX",
            "TXT", "HTML", "Markdown") get a format-specific hint as
            instruction 4; any other label gets a generic hint.

    Returns:
        str: The full prompt, ending with ``CONVERTED <FORMAT> OUTPUT:``.
    """
    hints_by_format = {
        "PDF": "Create content suitable for PDF format with proper structure and formatting",
        "DOCX": "Format as Microsoft Word document with headers, paragraphs, and proper styling",
        "TXT": "Convert to clean, readable plain text preserving structure",
        "HTML": "Create well-structured HTML with semantic markup",
        "Markdown": "Convert to properly formatted Markdown with headers and structure",
    }
    format_hint = hints_by_format.get(
        output_format, "Format appropriately for the requested output type"
    )
    # One entry per prompt line; joined with newlines at the end.
    prompt_lines = [
        f"You are an expert document converter. Convert the following Apple Pages document content to {output_format} format.",
        "INSTRUCTIONS:",
        "1. Preserve the original structure, formatting, and content organization",
        "2. Maintain headings, paragraphs, lists, and any tables if present",
        "3. Ensure the output is clean, professional, and well-formatted",
        f"4. {format_hint}",
        "5. Return ONLY the converted content without explanations or meta-commentary",
        "ORIGINAL CONTENT:",
        f"{content}",
        f"CONVERTED {output_format.upper()} OUTPUT:",
    ]
    return "\n".join(prompt_lines)
def _write_pdf(content, output_path):
    """Render *content* as plain wrapped text into a letter-size PDF at *output_path*."""
    # Imported here so reportlab is only required when a PDF is produced.
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import letter
    import textwrap

    pdf = canvas.Canvas(output_path, pagesize=letter)
    _, height = letter
    y_position = height - 50  # start 50 points below the top edge

    # Split content into lines and wrap long lines at 80 characters,
    # preserving empty lines as vertical spacing.
    lines = []
    for paragraph in content.split('\n'):
        if paragraph.strip():
            lines.extend(textwrap.wrap(paragraph, width=80))
        else:
            lines.append('')

    for line in lines:
        if y_position < 50:  # bottom margin reached: start a new page
            pdf.showPage()
            y_position = height - 50
        pdf.drawString(50, y_position, line)
        y_position -= 20
    pdf.save()


def _write_docx(content, output_path):
    """Write *content* to a .docx at *output_path*, one paragraph per blank-line-separated chunk."""
    # Imported here so python-docx is only required when a DOCX is produced.
    from docx import Document

    doc = Document()
    for para in content.split('\n\n'):
        if para.strip():
            doc.add_paragraph(para.strip())
    doc.save(output_path)


def create_output_file(content, output_format):
    """Write converted *content* to a new temporary file and return its path.

    Args:
        content: Converted text returned by the model. Leading/trailing
            whitespace is stripped; ``None`` is treated as empty text.
        output_format: "PDF" and "DOCX" are rendered with reportlab and
            python-docx respectively; "TXT", "HTML" and "Markdown" are
            written as UTF-8 text. Any other label is written as UTF-8 text
            with a ``.txt`` extension.

    Returns:
        str: Path of the created file. The file is not deleted automatically;
        the caller owns it.

    Raises:
        Exception: Whatever the underlying writer raises; the partially
        written temporary file is removed before re-raising.
    """
    # Clean the content (remove potential prompt artifacts)
    content = (content or "").strip()

    extensions = {
        "PDF": ".pdf",
        "DOCX": ".docx",
        "TXT": ".txt",
        "HTML": ".html",
        "Markdown": ".md"
    }
    suffix = extensions.get(output_format, ".txt")

    # Reserve a unique path and close the handle immediately: the PDF/DOCX
    # writers reopen the file by name, which is not portable while a
    # NamedTemporaryFile handle is still open on it.
    fd, output_path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)
    try:
        if output_format == "PDF":
            _write_pdf(content, output_path)
        elif output_format == "DOCX":
            _write_docx(content, output_path)
        else:
            # For TXT, HTML, Markdown (and unknown labels)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(content)
    except Exception:
        # Do not leave a half-written temp file behind.
        try:
            os.unlink(output_path)
        except OSError:
            pass
        raise
    return output_path
# Custom CSS for professional appearance.
# Passed to gr.Blocks(css=...) when the interface is built. The selectors
# correspond to class names used by the UI in this file:
#   - .main-content, .upload-section, .format-selector and .convert-button
#     are attached to Gradio components through elem_classes;
#   - .hero-section, .zerogpu-badge and .pro-features appear in the raw
#     markup handed to gr.HTML.
css = """
.gradio-container {
    background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
    min-height: 100vh;
}
.main-content {
    max-width: 1000px;
    margin: 0 auto;
    padding: 2rem;
}
.hero-section {
    background: white;
    border-radius: 1rem;
    padding: 2rem;
    text-align: center;
    box-shadow: 0 10px 30px rgba(0,0,0,0.1);
    margin-bottom: 2rem;
}
.upload-section {
    background: white;
    border-radius: 1rem;
    padding: 2rem;
    box-shadow: 0 5px 15px rgba(0,0,0,0.1);
}
.format-selector {
    background: #f8f9fa;
    border-radius: 0.5rem;
    padding: 1rem;
    margin: 1rem 0;
}
.convert-button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border: none;
    padding: 1rem 2rem;
    border-radius: 0.5rem;
    font-size: 1.1rem;
    font-weight: bold;
    width: 100%;
    cursor: pointer;
    transition: all 0.3s ease;
}
.convert-button:hover {
    transform: translateY(-2px);
    box-shadow: 0 5px 15px rgba(102, 126, 234, 0.3);
}
.zerogpu-badge {
    display: inline-block;
    background: linear-gradient(45deg, #ff6b6b, #feca57);
    color: white;
    padding: 0.5rem 1rem;
    border-radius: 2rem;
    font-weight: bold;
    font-size: 0.9rem;
}
.pro-features {
    background: #e8f5e9;
    border-radius: 0.5rem;
    padding: 1rem;
    margin-top: 1rem;
}
"""
# Create the Gradio interface.
# Layout, top to bottom: hero banner, static "Pro benefits" panel, a two-column
# row (upload + format + convert button | static feature list), the download
# slot, a status line, and a footer. Components are rendered in the order they
# are created inside the `with` blocks, so statement order here is the layout.
with gr.Blocks(css=css, title="Pages Converter Pro - ZeroGPU", theme=gr.themes.Soft()) as app:
    with gr.Column(elem_classes=["main-content"]):
        # Hero section
        gr.HTML("""
        <div class="hero-section">
            <h1>📄 Pages Converter Pro</h1>
            <span class="zerogpu-badge">⚡ ZeroGPU Accelerated</span>
            <p style="margin-top: 1rem; color: #666;">
                Convert Apple Pages documents with lightning-fast Cerebras Llama-3.3-70B
            </p>
        </div>
        """)
        # Pro benefits showcase.
        # Static markup only: nothing in this file checks the visitor's plan.
        gr.HTML("""
        <div class="pro-features">
            <h3>🚀 HuggingFace Pro Benefits Active</h3>
            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; margin-top: 1rem;">
                <div>✅ 5x Usage Quota</div>
                <div>🔥 Priority Queue Access</div>
                <div>💎 H200 GPU Hardware</div>
                <div>⚡ Zero-GPU Acceleration</div>
            </div>
        </div>
        """)
        # Main conversion interface
        with gr.Row():
            # Left column: the three inputs that drive a conversion.
            with gr.Column(scale=2, elem_classes=["upload-section"]):
                gr.HTML("<h3>📎 Upload Your Document</h3>")
                # Upload widget, restricted to the .pages extension.
                file_input = gr.File(
                    label="Select .pages file",
                    file_types=[".pages"],
                    elem_id="file-upload"
                )
                # These labels are the values passed to convert_pages_document
                # as output_format; PDF is preselected.
                output_format = gr.Radio(
                    choices=["PDF", "DOCX", "TXT", "HTML", "Markdown"],
                    value="PDF",
                    label="🎯 Output Format",
                    elem_classes=["format-selector"]
                )
                convert_btn = gr.Button(
                    "⚡ Convert with ZeroGPU",
                    variant="primary",
                    elem_classes=["convert-button"]
                )
            # Right column: static informational panel.
            with gr.Column(scale=1):
                gr.HTML("""
                <div style="background: white; padding: 1.5rem; border-radius: 1rem; box-shadow: 0 5px 15px rgba(0,0,0,0.1);">
                    <h3>⚡ ZeroGPU Features</h3>
                    <ul style="color: #666;">
                        <li>Lightning-fast processing</li>
                        <li>H200 hardware acceleration</li>
                        <li>Priority queue access</li>
                        <li>Cerebras optimization</li>
                    </ul>
                    <h3>📋 Supported Formats</h3>
                    <ul style="color: #666;">
                        <li>📄 PDF (best quality)</li>
                        <li>📝 Microsoft Word (DOCX)</li>
                        <li>📋 Plain Text (TXT)</li>
                        <li>🌐 Web Page (HTML)</li>
                        <li>✏️ Markdown (MD)</li>
                    </ul>
                </div>
                """)
        # Output section: receives the first element of the handler's return
        # tuple (the output file path, or None on failure).
        with gr.Row():
            output_file = gr.File(
                label="📁 Download Your Converted File",
                elem_id="output-download"
            )
        # Status line: receives the second element of the handler's return
        # tuple (the status message).
        with gr.Row():
            status_html = gr.HTML(
                value="<div style='text-align: center; padding: 1rem; color: #666;'>Ready to convert your Pages document</div>",
                elem_id="status-display"
            )
        # Connect the interface: clicking the button calls
        # convert_pages_document(file_input, output_format) and routes its
        # (path, status) result to the two output components.
        convert_btn.click(
            fn=convert_pages_document,
            inputs=[file_input, output_format],
            outputs=[output_file, status_html],
            show_progress=True
        )
        # Footer
        gr.HTML("""
        <div style="text-align: center; margin-top: 3rem; padding: 2rem; color: white;">
            <p>💎 Built exclusively for HuggingFace Pro users</p>
            <p><small>Powered by Cerebras • Accelerated by ZeroGPU • Made with ❤️</small></p>
        </div>
        """)
# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    app.launch()