SlouchyBuffalo committed on
Commit
af6cf59
·
verified ·
1 Parent(s): 864712e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +382 -0
app.py CHANGED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py - Corrected ZeroGPU Pages Converter with PDF Fix
2
+ import gradio as gr
3
+ import os
4
+ import spaces
5
+ import tempfile
6
+ import zipfile
7
+ import json
8
+ from pathlib import Path
9
+ from huggingface_hub import InferenceClient
10
+ import time
11
+
12
# Startup diagnostics: report only whether HF_TOKEN is set and how long it is,
# never the token value itself.
token = os.environ.get("HF_TOKEN")
token_length = 0 if not token else len(token)
print(f"Debug: Token exists = {token is not None}")
print(f"Debug: Token length = {token_length}")

# Shared inference client; requests for the Llama model go through the
# "cerebras" provider using the token read above.
client = InferenceClient(
    "meta-llama/Llama-3.3-70B-Instruct",
    provider="cerebras",
    token=token,
)
23
+
24
@spaces.GPU
def extract_pages_content(file_path):
    """Pull human-readable text out of an Apple Pages (.pages) archive.

    The file is opened as a zip archive and every ``*.iwa`` member directly
    inside the ``Index/`` folder is scanned for runs of printable ASCII.
    Runs longer than five characters (after stripping) are kept,
    de-duplicated in first-seen order, and joined with blank lines.

    This is a heuristic byte scan, not a real IWA parser, so the result may
    contain non-document strings or miss text entirely.

    Args:
        file_path: Path to the uploaded ``.pages`` file.

    Returns:
        The extracted text on success. On failure a plain string is returned
        instead of raising: either
        ``"Could not extract readable content from .pages file"`` or
        ``"Error extracting content: <reason>"``.
    """
    # Local import: `re` is not imported at module level in this file.
    import re

    print(f"DEBUG: Processing file: {file_path}")
    print(f"DEBUG: File exists: {os.path.exists(file_path)}")

    # Compiled once instead of on every archive member.
    printable_run = re.compile(r'[\x20-\x7E]+')
    index_prefix = "Index/"

    try:
        content_parts = []

        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            member_names = zip_ref.namelist()
            print(f"DEBUG: Archive members: {member_names}")

            # Only files sitting directly in Index/ (no deeper nesting).
            iwa_members = [
                name for name in member_names
                if name.startswith(index_prefix)
                and name.endswith(".iwa")
                and "/" not in name[len(index_prefix):]
            ]
            print(f"DEBUG: Index .iwa members: {iwa_members or 'none found'}")

            # Read members straight from the archive: the upload is untrusted,
            # so avoid unpacking the whole thing to disk just to read a few files.
            for member in iwa_members:
                try:
                    binary_content = zip_ref.read(member)
                except Exception as exc:
                    # Best effort: one unreadable member must not abort the
                    # whole extraction, but the failure is no longer silent.
                    print(f"DEBUG: Skipping {member}: {exc}")
                    continue

                text_content = binary_content.decode('utf-8', errors='ignore')
                for run in printable_run.findall(text_content):
                    cleaned = run.strip()
                    if len(cleaned) > 5:
                        content_parts.append(cleaned)

        if content_parts:
            # dict.fromkeys drops duplicates while preserving first-seen order.
            unique_content = list(dict.fromkeys(content_parts))
            return "\n\n".join(unique_content)

        return "Could not extract readable content from .pages file"

    except Exception as e:
        return f"Error extracting content: {str(e)}"
66
+
67
@spaces.GPU
def convert_pages_document(file, output_format, progress=gr.Progress()):
    """Convert an uploaded .pages file to ``output_format`` using the Cerebras-hosted model.

    Steps: extract text with ``extract_pages_content``, build a prompt with
    ``create_conversion_prompt``, send it through ``client.chat_completion``,
    then write the reply to disk with ``create_output_file``.

    Args:
        file: Value delivered by the ``gr.File`` input. Accepted either as a
            filesystem path or as an object exposing the path as ``.name``
            (which form arrives depends on the Gradio version/configuration).
            Falsy when nothing was uploaded.
        output_format: Target format label chosen in the UI (e.g. ``"PDF"``).
        progress: Gradio progress tracker, called with a fraction and a
            description at each stage.

    Returns:
        ``(output_path, status_message)``. ``output_path`` is ``None``
        whenever the conversion did not produce a file; this function reports
        errors through ``status_message`` rather than raising.
    """
    if not file:
        return None, "❌ Please upload a .pages file"

    # extract_pages_content signals failure with these ordinary strings rather
    # than raising; they must not be mistaken for document text.
    extraction_failures = (
        "Could not extract readable content from .pages file",
        "Error extracting content:",
    )

    try:
        progress(0.1, desc="📖 Extracting content from .pages file...")

        # Support both a plain path and a file-like wrapper with `.name`.
        file_path = file if isinstance(file, (str, os.PathLike)) else file.name
        content = extract_pages_content(file_path)

        if not content or len(content.strip()) < 10:
            return None, "❌ Could not extract sufficient content from .pages file"

        if content.startswith(extraction_failures):
            # Previously these messages passed the length check above and were
            # sent to the model as if they were the document.
            return None, f"❌ {content}"

        progress(0.4, desc="🤖 Preparing conversion with Cerebras...")

        prompt = create_conversion_prompt(content, output_format)

        progress(0.6, desc="⚡ Converting with Cerebras Lightning Speed...")

        try:
            completion = client.chat_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=4096,
                temperature=0.1,
            )
            converted = completion.choices[0].message.content
        except Exception as e:
            return None, f"❌ Conversion error: {str(e)}"

        if not converted or not converted.strip():
            # Guard against a missing/blank reply before trying to write a file.
            return None, "❌ Conversion error: model returned an empty response"

        progress(0.9, desc="💫 Creating output file...")

        output_path = create_output_file(converted, output_format)

        progress(1.0, desc="✅ Conversion complete!")

        return output_path, f"✅ Successfully converted to {output_format} using ZeroGPU!"

    except Exception as e:
        return None, f"❌ Error: {str(e)}"
114
+
115
def create_conversion_prompt(content, output_format):
    """Assemble the conversion instructions sent to the model.

    Args:
        content: Text extracted from the Pages document; embedded verbatim.
        output_format: Target format label. Known labels ("PDF", "DOCX",
            "TXT", "HTML", "Markdown") get a format-specific hint; any other
            label gets a generic one.

    Returns:
        The complete prompt as a single string.
    """
    hints_by_format = {
        "PDF": "Create content suitable for PDF format with proper structure and formatting",
        "DOCX": "Format as Microsoft Word document with headers, paragraphs, and proper styling",
        "TXT": "Convert to clean, readable plain text preserving structure",
        "HTML": "Create well-structured HTML with semantic markup",
        "Markdown": "Convert to properly formatted Markdown with headers and structure",
    }
    if output_format in hints_by_format:
        format_hint = hints_by_format[output_format]
    else:
        format_hint = "Format appropriately for the requested output type"

    # One list entry per prompt line; empty strings are the blank separator lines.
    prompt_lines = [
        f"You are an expert document converter. Convert the following Apple Pages document content to {output_format} format.",
        "",
        "INSTRUCTIONS:",
        "1. Preserve the original structure, formatting, and content organization",
        "2. Maintain headings, paragraphs, lists, and any tables if present",
        "3. Ensure the output is clean, professional, and well-formatted",
        f"4. {format_hint}",
        "5. Return ONLY the converted content without explanations or meta-commentary",
        "",
        "ORIGINAL CONTENT:",
        f"{content}",
        "",
        f"CONVERTED {output_format.upper()} OUTPUT:",
    ]
    return "\n".join(prompt_lines)
138
+
139
def _reserve_temp_path(suffix):
    """Create an empty, closed temporary file with ``suffix`` and return its path."""
    fd, path = tempfile.mkstemp(suffix=suffix)
    # Close our handle immediately so the writer below can open the path itself.
    os.close(fd)
    return path


def _write_pdf(content, path):
    """Render ``content`` as left-aligned text lines into a letter-size PDF at ``path``."""
    from reportlab.lib.pagesizes import letter
    from reportlab.pdfgen import canvas
    import textwrap

    pdf = canvas.Canvas(path, pagesize=letter)
    _, page_height = letter
    margin = 50
    line_step = 20
    top_y = page_height - margin
    y_position = top_y

    for paragraph in content.split('\n'):
        # Wrap at 80 characters; blank/whitespace-only input lines are kept
        # as empty lines so paragraph spacing survives.
        for line in textwrap.wrap(paragraph, width=80) or ['']:
            if y_position < margin:  # ran off the bottom: start a new page
                pdf.showPage()
                y_position = top_y
            pdf.drawString(margin, y_position, line)
            y_position -= line_step

    pdf.save()


def _write_docx(content, path):
    """Write ``content`` to a .docx at ``path``, one paragraph per blank-line-separated chunk."""
    from docx import Document

    doc = Document()
    for para in content.split('\n\n'):
        if para.strip():
            doc.add_paragraph(para.strip())
    doc.save(path)


def create_output_file(content, output_format):
    """Write converted content to a new temporary file and return its path.

    Args:
        content: Converted document text. ``None`` is treated as empty text.
        output_format: "PDF" and "DOCX" are rendered with reportlab and
            python-docx respectively; "TXT", "HTML" and "Markdown" are
            written as UTF-8 text. Any other label is written as ``.txt``.

    Returns:
        Path of the created file. The file is not deleted automatically.
    """
    # Strip surrounding whitespace left over from the model reply.
    content = (content or "").strip()

    extensions = {
        "PDF": ".pdf",
        "DOCX": ".docx",
        "TXT": ".txt",
        "HTML": ".html",
        "Markdown": ".md",
    }
    # The path is reserved and closed first, so each writer opens it on its
    # own instead of writing by name to a file that is still held open.
    output_path = _reserve_temp_path(extensions.get(output_format, ".txt"))

    if output_format == "PDF":
        _write_pdf(content, output_path)
    elif output_format == "DOCX":
        _write_docx(content, output_path)
    else:
        # TXT, HTML, Markdown and unknown formats: plain UTF-8 text.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(content)

    return output_path
205
+
206
# Custom CSS for professional appearance.
# Passed to gr.Blocks(css=...) below. The class names here are the ones the
# interface attaches through elem_classes (main-content, upload-section,
# format-selector, convert-button) or uses inside its raw HTML snippets
# (hero-section, zerogpu-badge, pro-features).
css = """
.gradio-container {
    background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
    min-height: 100vh;
}

.main-content {
    max-width: 1000px;
    margin: 0 auto;
    padding: 2rem;
}

.hero-section {
    background: white;
    border-radius: 1rem;
    padding: 2rem;
    text-align: center;
    box-shadow: 0 10px 30px rgba(0,0,0,0.1);
    margin-bottom: 2rem;
}

.upload-section {
    background: white;
    border-radius: 1rem;
    padding: 2rem;
    box-shadow: 0 5px 15px rgba(0,0,0,0.1);
}

.format-selector {
    background: #f8f9fa;
    border-radius: 0.5rem;
    padding: 1rem;
    margin: 1rem 0;
}

.convert-button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border: none;
    padding: 1rem 2rem;
    border-radius: 0.5rem;
    font-size: 1.1rem;
    font-weight: bold;
    width: 100%;
    cursor: pointer;
    transition: all 0.3s ease;
}

.convert-button:hover {
    transform: translateY(-2px);
    box-shadow: 0 5px 15px rgba(102, 126, 234, 0.3);
}

.zerogpu-badge {
    display: inline-block;
    background: linear-gradient(45deg, #ff6b6b, #feca57);
    color: white;
    padding: 0.5rem 1rem;
    border-radius: 2rem;
    font-weight: bold;
    font-size: 0.9rem;
}

.pro-features {
    background: #e8f5e9;
    border-radius: 0.5rem;
    padding: 1rem;
    margin-top: 1rem;
}
"""
277
+
278
# Create the Gradio interface.
# Emoji in the labels and HTML below were restored from mis-decoded UTF-8.
# NOTE(review): the glyphs for "Microsoft Word (DOCX)" and "Download Your
# Converted File" were truncated in the source and are best-guess
# reconstructions - confirm against the intended design.
with gr.Blocks(css=css, title="Pages Converter Pro - ZeroGPU", theme=gr.themes.Soft()) as app:
    with gr.Column(elem_classes=["main-content"]):
        # Hero section
        gr.HTML("""
        <div class="hero-section">
            <h1>📄 Pages Converter Pro</h1>
            <span class="zerogpu-badge">⚡ ZeroGPU Accelerated</span>
            <p style="margin-top: 1rem; color: #666;">
                Convert Apple Pages documents with lightning-fast Cerebras Llama-3.3-70B
            </p>
        </div>
        """)

        # Pro benefits showcase
        gr.HTML("""
        <div class="pro-features">
            <h3>🚀 HuggingFace Pro Benefits Active</h3>
            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; margin-top: 1rem;">
                <div>✅ 5x Usage Quota</div>
                <div>🔥 Priority Queue Access</div>
                <div>💎 H200 GPU Hardware</div>
                <div>⚡ Zero-GPU Acceleration</div>
            </div>
        </div>
        """)

        # Main conversion interface: upload controls on the left, info panel on the right
        with gr.Row():
            with gr.Column(scale=2, elem_classes=["upload-section"]):
                gr.HTML("<h3>📎 Upload Your Document</h3>")

                file_input = gr.File(
                    label="Select .pages file",
                    file_types=[".pages"],
                    elem_id="file-upload"
                )

                output_format = gr.Radio(
                    choices=["PDF", "DOCX", "TXT", "HTML", "Markdown"],
                    value="PDF",
                    label="🎯 Output Format",
                    elem_classes=["format-selector"]
                )

                convert_btn = gr.Button(
                    "⚡ Convert with ZeroGPU",
                    variant="primary",
                    elem_classes=["convert-button"]
                )

            with gr.Column(scale=1):
                gr.HTML("""
                <div style="background: white; padding: 1.5rem; border-radius: 1rem; box-shadow: 0 5px 15px rgba(0,0,0,0.1);">
                    <h3>⚡ ZeroGPU Features</h3>
                    <ul style="color: #666;">
                        <li>Lightning-fast processing</li>
                        <li>H200 hardware acceleration</li>
                        <li>Priority queue access</li>
                        <li>Cerebras optimization</li>
                    </ul>

                    <h3>📋 Supported Formats</h3>
                    <ul style="color: #666;">
                        <li>📄 PDF (best quality)</li>
                        <li>📝 Microsoft Word (DOCX)</li>
                        <li>📋 Plain Text (TXT)</li>
                        <li>🌐 Web Page (HTML)</li>
                        <li>✏️ Markdown (MD)</li>
                    </ul>
                </div>
                """)

        # Output section
        with gr.Row():
            output_file = gr.File(
                label="📁 Download Your Converted File",
                elem_id="output-download"
            )

        with gr.Row():
            status_html = gr.HTML(
                value="<div style='text-align: center; padding: 1rem; color: #666;'>Ready to convert your Pages document</div>",
                elem_id="status-display"
            )

        # Connect the interface: the button runs the conversion and fills in
        # the download slot and the status line.
        convert_btn.click(
            fn=convert_pages_document,
            inputs=[file_input, output_format],
            outputs=[output_file, status_html],
            show_progress=True
        )

        # Footer
        gr.HTML("""
        <div style="text-align: center; margin-top: 3rem; padding: 2rem; color: white;">
            <p>💎 Built exclusively for HuggingFace Pro users</p>
            <p><small>Powered by Cerebras • Accelerated by ZeroGPU • Made with ❤️</small></p>
        </div>
        """)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    app.launch()