root committed on
Commit
7928625
·
1 Parent(s): 1f4004d
Files changed (1) hide show
  1. app.py +120 -604
app.py CHANGED
@@ -9,13 +9,14 @@ from typing import Tuple
9
  import markdown
10
  from dotenv import load_dotenv
11
  from openai import OpenAI
 
12
 
13
  # Load environment variables from .env file
14
  load_dotenv()
15
 
16
- # API Configuration
17
  API_URL = os.getenv("API_URL", "")
18
- API_TOKEN = os.getenv("API_TOKEN", "")
19
 
20
 
21
  class Doc2PageConverter:
@@ -29,172 +30,91 @@ class Doc2PageConverter:
29
  base_url="https://qianfan.baidubce.com/v2",
30
  api_key=self.qianfan_token
31
  )
32
-
 
 
33
 
34
-
35
- def extract_text_with_api(self, file_path: str) -> str:
36
- """Extract text and structure using PP-StructureV3 API"""
 
37
  try:
38
- if not API_URL or not API_TOKEN:
39
- raise ValueError(
40
- "API_URL and API_TOKEN must be configured in .env file")
41
-
42
- # Determine file type
43
- file_extension = Path(file_path).suffix.lower()
44
- if file_extension == ".pdf":
45
- file_type = 0 # PDF
46
  else:
47
- file_type = 1 # Image
48
-
49
- # Read file content
50
- with open(file_path, "rb") as f:
51
- file_bytes = f.read()
52
-
53
- # Encode file to base64
54
- file_data = base64.b64encode(file_bytes).decode("ascii")
55
-
56
- # Prepare API request
57
- headers = {
58
- "Authorization": f"token {API_TOKEN}",
59
- "Content-Type": "application/json",
60
- }
61
-
62
- # Use default settings for simplicity
63
- payload = {
64
- "file": file_data,
65
- "fileType": file_type,
66
- "useFormulaRecognition": True,
67
- "useChartRecognition": False,
68
- "useDocOrientationClassify": False,
69
- "useDocUnwarping": False,
70
- "useTextlineOrientation": False,
71
- "useSealRecognition": True,
72
- "useRegionDetection": True,
73
- "useTableRecognition": True,
74
- "layoutThreshold": 0.5,
75
- "layoutNms": True,
76
- "layoutUnclipRatio": 1.0,
77
- "textDetLimitType": "min",
78
- "textTetLimitSideLen": 736,
79
- "textDetThresh": 0.30,
80
- "textDetBoxThresh": 0.60,
81
- "textDetUnclipRatio": 1.5,
82
- "textRecScoreThresh": 0.00,
83
- "sealDetLimitType": "min",
84
- "sealDetLimitSideLen": 736,
85
- "sealDetThresh": 0.20,
86
- "sealDetBoxThresh": 0.60,
87
- "sealDetUnclipRatio": 0.5,
88
- "sealRecScoreThresh": 0.00,
89
- "useOcrResultsWithTableCells": True,
90
- "useE2eWiredTableRecModel": False,
91
- "useE2eWirelessTableRecModel": False,
92
- "useWiredTableCellsTransToHtml": False,
93
- "useWirelessWableCellsTransToHtml": False,
94
- "useTableOrientationClassify": True,
95
- }
96
-
97
- # Call API
98
- response = requests.post(
99
- API_URL,
100
- json=payload,
101
- headers=headers,
102
- timeout=300, # 5 minutes timeout
103
- )
104
-
105
  response.raise_for_status()
106
- result = response.json()
107
-
108
- # Process API response
109
- layout_results = result.get("result", {}).get(
110
- "layoutParsingResults", [])
111
-
112
- markdown_content_list = []
113
- markdown_list = []
114
-
115
- for res in layout_results:
116
- markdown_data = res["markdown"]
117
- markdown_text = markdown_data["text"]
118
- img_path_to_url = markdown_data["images"]
119
-
120
- # Embed images into markdown
121
- markdown_content = self.embed_images_into_markdown_text(
122
- markdown_text, img_path_to_url
123
- )
124
- markdown_content_list.append(markdown_content)
125
-
126
- # Prepare for concatenation
127
- markdown_with_content = markdown_data.copy()
128
- markdown_with_content["text"] = markdown_content
129
- markdown_list.append(markdown_with_content)
130
-
131
- # Concatenate all pages
132
- concatenated_markdown = self.concatenate_markdown_pages(markdown_list)
133
-
134
- return concatenated_markdown
135
-
136
  except requests.exceptions.RequestException as e:
137
- raise RuntimeError(f"API request failed: {str(e)}")
138
- except Exception as e:
139
- print(f"Error in API extraction: {e}")
 
 
 
 
 
 
 
140
  return ""
141
-
142
- def embed_images_into_markdown_text(self, markdown_text, markdown_images):
143
- """Embed images into markdown text"""
144
- for img_path, img_url in markdown_images.items():
145
- markdown_text = markdown_text.replace(
146
- f'<img src="{img_path}"', f'<img src="{img_url}"'
147
- )
148
- return markdown_text
149
 
150
- def concatenate_markdown_pages(self, markdown_list):
151
- """Concatenate markdown pages into single document"""
152
- markdown_texts = ""
153
- previous_page_last_element_paragraph_end_flag = True
 
 
154
 
155
- for res in markdown_list:
156
- page_first_element_paragraph_start_flag: bool = res["isStart"]
157
- page_last_element_paragraph_end_flag: bool = res["isEnd"]
158
 
159
- if (
160
- not page_first_element_paragraph_start_flag
161
- and not previous_page_last_element_paragraph_end_flag
162
- ):
163
- last_char_of_markdown = (markdown_texts[-1]
164
- if markdown_texts else "")
165
- first_char_of_handler = res["text"]
166
 
167
- last_is_chinese_char = (
168
- re.match(r"[\u4e00-\u9fff]", last_char_of_markdown)
169
- if last_char_of_markdown
170
- else False
171
- )
172
- first_is_chinese_char = (
173
- re.match(r"[\u4e00-\u9fff]", first_char_of_handler)
174
- if first_char_of_handler
175
- else False
176
- )
177
- if not (last_is_chinese_char or first_is_chinese_char):
178
- markdown_texts += " " + res["text"]
179
- else:
180
- markdown_texts += res["text"]
181
- else:
182
- markdown_texts += "\n\n" + res["text"]
183
- previous_page_last_element_paragraph_end_flag = (
184
- page_last_element_paragraph_end_flag
185
- )
186
 
187
- return markdown_texts
188
-
189
  def markdown_to_html_with_ernie(self, markdown_text: str) -> str:
190
- """Convert markdown to HTML using ERNIE API"""
191
  if not self.client:
192
- # Fallback to basic markdown conversion if no API client
193
  return self.basic_markdown_to_html(markdown_text)
194
 
195
  try:
196
  prompt = f"""Please convert the following markdown text into a modern, clean HTML page. Use contemporary typography with the Inter font family and clean design principles. Make it visually appealing with proper CSS styling, responsive design, and excellent readability.
197
-
198
  Design requirements:
199
  - Use Inter font from Google Fonts
200
  - Clean, modern spacing and typography
@@ -202,12 +122,9 @@ Design requirements:
202
  - Good color contrast and hierarchy
203
  - Responsive design that works on all devices
204
  - Include proper HTML structure with head, body, and semantic elements
205
-
206
- Important: Add a footer at the bottom with "Powered by PaddleOCR and ERNIE" where PaddleOCR links to https://github.com/PaddlePaddle/PaddleOCR and ERNIE links to https://huggingface.co/BAIDU. Style it with modern, subtle styling.
207
-
208
  Markdown content:
209
  {markdown_text}
210
-
211
  IMPORTANT: Return ONLY the raw HTML code starting with <!DOCTYPE html> and ending with </html>. Do NOT wrap it in markdown code blocks or add any explanations. I need the pure HTML content that can be directly saved as an .html file."""
212
 
213
  messages = [{"role": "user", "content": prompt}]
@@ -218,31 +135,26 @@ IMPORTANT: Return ONLY the raw HTML code starting with <!DOCTYPE html> and endin
218
  max_tokens=64000,
219
  )
220
 
221
- html_content = response.choices[0].message.content
222
 
223
- # Clean up markdown code block markers if present
224
  if html_content.startswith('```html'):
225
- html_content = html_content[7:] # Remove ```html
226
  elif html_content.startswith('```'):
227
- html_content = html_content[3:] # Remove ```
228
 
229
  if html_content.endswith('```'):
230
- html_content = html_content[:-3] # Remove ending ```
231
 
232
- # Strip any extra whitespace
233
- html_content = html_content.strip()
234
-
235
- return html_content
236
 
237
  except Exception as e:
238
  print(f"Error calling ERNIE API: {e}")
239
  return self.basic_markdown_to_html(markdown_text)
240
 
241
  def basic_markdown_to_html(self, markdown_text: str) -> str:
242
- """Fallback markdown to HTML conversion"""
243
  html = markdown.markdown(markdown_text)
244
 
245
- # Wrap in a complete HTML document with styling
246
  complete_html = f"""
247
  <!DOCTYPE html>
248
  <html lang="en">
@@ -251,201 +163,29 @@ IMPORTANT: Return ONLY the raw HTML code starting with <!DOCTYPE html> and endin
251
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
252
  <title>Converted Document</title>
253
  <style>
254
- /* Modern, clean typography */
255
  @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
256
-
257
- * {{
258
- margin: 0;
259
- padding: 0;
260
- box-sizing: border-box;
261
- }}
262
-
263
  body {{
264
- font-family: 'Inter', system-ui, -apple-system, sans-serif;
265
- font-weight: 400;
266
- line-height: 1.7;
267
- color: #1a1a1a;
268
- max-width: 850px;
269
- margin: 0 auto;
270
- padding: 32px 24px;
271
- background: #fafafa;
272
- font-size: 16px;
273
  }}
274
-
275
  .container {{
276
- background: #ffffff;
277
- padding: 48px;
278
- border-radius: 12px;
279
  box-shadow: 0 1px 3px rgba(0,0,0,0.08), 0 4px 24px rgba(0,0,0,0.04);
280
- border: 1px solid rgba(0,0,0,0.06);
281
- }}
282
-
283
- /* Typography hierarchy */
284
- h1, h2, h3, h4, h5, h6 {{
285
- font-weight: 600;
286
- color: #0f0f0f;
287
- margin: 32px 0 16px 0;
288
- letter-spacing: -0.02em;
289
- }}
290
-
291
- h1 {{
292
- font-size: 2.25rem;
293
- font-weight: 700;
294
- margin-top: 0;
295
- margin-bottom: 24px;
296
- border-bottom: 2px solid #e5e7eb;
297
- padding-bottom: 16px;
298
- }}
299
-
300
- h2 {{
301
- font-size: 1.75rem;
302
- margin-top: 48px;
303
- }}
304
-
305
- h3 {{
306
- font-size: 1.375rem;
307
- margin-top: 40px;
308
- }}
309
-
310
- h4 {{
311
- font-size: 1.125rem;
312
- }}
313
-
314
- p {{
315
- margin-bottom: 20px;
316
- color: #374151;
317
- line-height: 1.75;
318
- }}
319
-
320
- /* Code styling */
321
- code {{
322
- font-family: 'SF Mono', Consolas, 'Liberation Mono', monospace;
323
- background-color: #f3f4f6;
324
- color: #1f2937;
325
- padding: 3px 6px;
326
- border-radius: 4px;
327
- font-size: 0.875rem;
328
- font-weight: 500;
329
- }}
330
-
331
- pre {{
332
- background-color: #f8fafc;
333
- border: 1px solid #e5e7eb;
334
- padding: 20px;
335
- border-radius: 8px;
336
- overflow-x: auto;
337
- margin: 24px 0;
338
- font-size: 0.875rem;
339
- line-height: 1.6;
340
  }}
341
-
342
- pre code {{
343
- background: none;
344
- padding: 0;
345
- border-radius: 0;
346
- }}
347
-
348
- /* Blockquotes */
349
- blockquote {{
350
- border-left: 4px solid #6366f1;
351
- padding-left: 20px;
352
- margin: 24px 0;
353
- font-style: normal;
354
- color: #4b5563;
355
- background-color: #f8fafc;
356
- padding: 16px 20px;
357
- border-radius: 0 8px 8px 0;
358
- }}
359
-
360
- /* Images */
361
- img {{
362
- max-width: 100%;
363
- height: auto;
364
- border-radius: 8px;
365
- margin: 20px 0;
366
- box-shadow: 0 4px 12px rgba(0,0,0,0.1);
367
- }}
368
-
369
- /* Tables */
370
- table {{
371
- border-collapse: collapse;
372
- width: 100%;
373
- margin: 24px 0;
374
- background: #ffffff;
375
- border-radius: 8px;
376
- overflow: hidden;
377
- box-shadow: 0 1px 3px rgba(0,0,0,0.1);
378
- }}
379
-
380
- th, td {{
381
- padding: 16px;
382
- text-align: left;
383
- border-bottom: 1px solid #e5e7eb;
384
- }}
385
-
386
- th {{
387
- background-color: #f9fafb;
388
- font-weight: 600;
389
- color: #374151;
390
- font-size: 0.875rem;
391
- text-transform: uppercase;
392
- letter-spacing: 0.05em;
393
- }}
394
-
395
- tr:last-child td {{
396
- border-bottom: none;
397
- }}
398
-
399
- /* Lists */
400
- ul, ol {{
401
- margin: 16px 0 20px 24px;
402
- color: #374151;
403
- }}
404
-
405
- li {{
406
- margin-bottom: 8px;
407
- line-height: 1.6;
408
- }}
409
-
410
- /* Links */
411
- a {{
412
- color: #6366f1;
413
- text-decoration: none;
414
- font-weight: 500;
415
- }}
416
-
417
- a:hover {{
418
- color: #4f46e5;
419
- text-decoration: underline;
420
- }}
421
- /* Footer */
422
  .footer {{
423
- margin-top: 64px;
424
- padding-top: 24px;
425
- border-top: 1px solid #e5e7eb;
426
- text-align: center;
427
- font-size: 14px;
428
- color: #6b7280;
429
- font-weight: 400;
430
- }}
431
-
432
- .footer a {{
433
- color: #6366f1;
434
- font-weight: 500;
435
- text-decoration: none;
436
- }}
437
-
438
- .footer a:hover {{
439
- color: #4f46e5;
440
- text-decoration: underline;
441
  }}
 
 
442
  </style>
443
  </head>
444
  <body>
445
  <div class="container">
446
  {html}
447
  <div class="footer">
448
- Powered by <a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">PaddleOCR</a> and
449
  <a href="https://huggingface.co/BAIDU" target="_blank">ERNIE</a>
450
  </div>
451
  </div>
@@ -457,22 +197,11 @@ IMPORTANT: Return ONLY the raw HTML code starting with <!DOCTYPE html> and endin
457
  def process_document(self, file_path: str) -> Tuple[str, str]:
458
  """Process uploaded document and convert to HTML"""
459
  try:
460
- file_extension = Path(file_path).suffix.lower()
461
-
462
- # Check supported formats
463
- if file_extension == '.pdf' or file_extension in [
464
- '.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
465
- # Process with PP-StructureV3 API
466
- markdown_content = self.extract_text_with_api(file_path)
467
- else:
468
- return ("Error: Unsupported file format. "
469
- "Please upload PDF or image files."), ""
470
 
471
  if not markdown_content.strip():
472
- return ("Warning: No text content extracted "
473
- "from the document."), ""
474
 
475
- # Convert markdown to HTML using ERNIE or fallback
476
  html_content = self.markdown_to_html_with_ernie(markdown_content)
477
 
478
  return markdown_content, html_content
@@ -480,311 +209,98 @@ IMPORTANT: Return ONLY the raw HTML code starting with <!DOCTYPE html> and endin
480
  except Exception as e:
481
  return f"Error processing document: {str(e)}", ""
482
 
483
- # Initialize converter
484
  converter = Doc2PageConverter()
485
 
486
  def process_upload(file):
487
- """Process uploaded file and return markdown and HTML"""
488
  if file is None:
489
  return "Please upload a file.", "", ""
490
-
491
  try:
492
- # Process the document
493
  markdown_result, html_result = converter.process_document(file.name)
494
-
495
  if html_result:
496
  return "Document processed successfully!", markdown_result, html_result
497
  else:
498
- return markdown_result, "", "" # Error message in markdown_result
499
-
500
  except Exception as e:
501
  return f"Error: {str(e)}", "", ""
502
 
503
  def save_html_file(html_content, filename="converted_page"):
504
- """Save HTML content to file for download"""
505
  if not html_content:
506
  return None
507
-
508
- # Create temporary file
509
  temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False,
510
  prefix=f"{filename}_")
511
  temp_file.write(html_content)
512
  temp_file.close()
513
-
514
  return temp_file.name
515
 
516
- # Create custom theme for a clean, modern look
517
  custom_theme = gr.themes.Default(
518
- primary_hue="blue",
519
- secondary_hue="gray",
520
- neutral_hue="gray",
521
  font=("Inter", "system-ui", "sans-serif"),
522
- font_mono=("SF Mono", "Consolas", "monospace")
523
  ).set(
524
- body_background_fill="#fafafa",
525
- background_fill_primary="#ffffff",
526
- background_fill_secondary="#f8f9fa",
527
- border_color_primary="#e5e7eb",
528
- button_primary_background_fill="#6366f1",
529
- button_primary_background_fill_hover="#4f46e5",
530
- button_primary_text_color="#ffffff",
531
  )
532
 
533
- # Create Gradio interface
534
  with gr.Blocks(
535
- title="Doc2Page - Simple Document Converter",
536
  theme=custom_theme,
537
- css="""
538
- .gradio-container {
539
- max-width: 1200px !important;
540
- margin: auto;
541
- padding: 32px 16px;
542
- }
543
-
544
- /* Enhanced button styling */
545
- .gr-button {
546
- font-weight: 500;
547
- border-radius: 10px;
548
- font-size: 14px;
549
- transition: all 0.2s ease;
550
- box-shadow: 0 2px 4px rgba(99, 102, 241, 0.1);
551
- }
552
-
553
- .gr-button:hover {
554
- transform: translateY(-1px);
555
- box-shadow: 0 4px 8px rgba(99, 102, 241, 0.2);
556
- }
557
-
558
- /* Input styling */
559
- .gr-textbox, .gr-file {
560
- border-radius: 10px;
561
- font-family: 'Inter', system-ui, sans-serif;
562
- border: 1px solid #e5e7eb;
563
- transition: border-color 0.2s ease;
564
- }
565
-
566
- .gr-textbox:focus, .gr-file:focus {
567
- border-color: #6366f1;
568
- box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.1);
569
- }
570
-
571
- /* Typography */
572
- h1 {
573
- font-weight: 700;
574
- color: #1a1a1a;
575
- margin-bottom: 8px;
576
- font-size: 2.5rem;
577
- }
578
-
579
- .app-description {
580
- color: #6b7280;
581
- font-size: 18px;
582
- margin-bottom: 40px;
583
- font-weight: 400;
584
- }
585
-
586
- /* Tab styling */
587
- .gr-tab {
588
- border-radius: 8px 8px 0 0;
589
- font-weight: 500;
590
- }
591
-
592
- /* Card-like sections */
593
- .gr-column {
594
- background: rgba(255, 255, 255, 0.5);
595
- border-radius: 12px;
596
- padding: 16px;
597
- margin: 8px;
598
- }
599
-
600
- /* Status styling */
601
- .gr-textbox[data-testid*="status"] {
602
- background-color: #f8fafc;
603
- border: 1px solid #e2e8f0;
604
- }
605
-
606
- /* Download section styling */
607
- .download-section {
608
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
609
- border-radius: 12px;
610
- padding: 20px;
611
- color: white;
612
- margin-top: 20px;
613
- }
614
- """
615
  ) as app:
616
 
617
- # Header
618
- gr.Markdown(
619
- "# Doc2Page",
620
- elem_classes="main-title"
621
- )
622
- gr.Markdown(
623
- "🥃 Transform your documents into beautiful webpages!",
624
- elem_classes="app-description"
625
- )
626
 
627
- # Main interface
628
  with gr.Row():
629
  with gr.Column(scale=1, min_width=350):
630
- with gr.Group():
631
- gr.Markdown("### 📄 Upload Document")
632
- file_input = gr.File(
633
- label="Choose your file",
634
- file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"],
635
- file_count="single",
636
- height=140
637
- )
638
-
639
- process_btn = gr.Button(
640
- "✨ Convert to Webpage",
641
- variant="primary",
642
- size="lg",
643
- scale=1
644
- )
645
-
646
- status_output = gr.Textbox(
647
- label="Status",
648
- placeholder="Ready to convert your document...",
649
- interactive=False,
650
- lines=3,
651
- max_lines=3
652
- )
653
 
654
  with gr.Column(scale=2):
655
- gr.Markdown("### 📋 Results")
656
  with gr.Tabs():
657
- with gr.TabItem("❤️ Preview", id="preview"):
658
- html_preview = gr.HTML(
659
- label="",
660
- value="<div style='padding: 40px; text-align: center; color: #6b7280;'>Your converted webpage will appear here</div>",
661
- )
662
-
663
- with gr.TabItem("📝 Markdown Source", id="markdown"):
664
- markdown_output = gr.Textbox(
665
- label="",
666
- placeholder="Extracted markdown content will appear here...",
667
- lines=22,
668
- interactive=False,
669
- show_copy_button=True
670
- )
671
-
672
- with gr.TabItem("🌐 HTML Source", id="html"):
673
- html_output = gr.Code(
674
- label="",
675
- language="html",
676
- lines=22,
677
- interactive=False
678
- )
679
 
680
- # Success & Download section
681
  with gr.Row(visible=False) as download_section:
682
- with gr.Column():
683
- gr.Markdown("""
684
- <div style="background: linear-gradient(135deg, #10b981, #059669); border-radius: 12px; padding: 20px; color: white; text-align: center; margin: 20px 0;">
685
- <h3 style="margin: 0 0 8px 0; color: white;">✅ Conversion Successful!</h3>
686
- <p style="margin: 0; opacity: 0.9;">Your document has been converted to a beautiful webpage</p>
687
- </div>
688
- """)
689
-
690
- with gr.Row():
691
- with gr.Column(scale=1):
692
- gr.Markdown("### 📥 Download Your Webpage")
693
- download_btn = gr.File(
694
- label="HTML File",
695
- visible=True
696
- )
697
-
698
- with gr.Column(scale=1):
699
- gr.Markdown("### 🚀 Quick Deploy Guide")
700
- gr.Markdown("""
701
- 1. **GitHub Pages**: Upload as `index.html` to your repo
702
- 2. **Netlify**: Drag & drop the file to netlify.app
703
- 3. **Vercel**: Use their simple file deployment
704
- 4. **Local**: Double-click to open in browser
705
- """, elem_classes="deploy-guide")
706
 
707
- # Event handlers
708
  def process_and_update(file):
709
  status, markdown_content, html_content = process_upload(file)
710
 
711
- # Create download file if HTML was generated
712
  download_file = None
713
  show_download = False
714
-
715
  if html_content:
716
  filename = Path(file.name).stem if file else "converted_page"
717
  download_file = save_html_file(html_content, filename)
718
  show_download = True
719
 
720
- # Preview content with better styling when no content
721
- preview_content = html_content if html_content else """
722
- <div style='padding: 60px 20px; text-align: center; color: #6b7280;
723
- background: #f9fafb; border-radius: 8px; border: 2px dashed #d1d5db;'>
724
- <h3 style='color: #9ca3af; margin: 0;'>No preview available</h3>
725
- <p style='margin: 8px 0 0 0;'>Convert a document to see the preview</p>
726
- </div>
727
- """
728
 
729
  return (
730
- status, # status_output
731
- markdown_content, # markdown_output
732
- html_content, # html_output
733
- preview_content, # html_preview
734
- download_file, # download_btn
735
- gr.update(visible=show_download) # download_section
736
  )
737
 
738
  process_btn.click(
739
  fn=process_and_update,
740
  inputs=[file_input],
741
- outputs=[
742
- status_output,
743
- markdown_output,
744
- html_output,
745
- html_preview,
746
- download_btn,
747
- download_section
748
- ]
749
  )
750
 
751
- # Footer
752
  gr.Markdown(
753
- """
754
- <div style="text-align: center; padding: 20px 0; margin-top: 40px; border-top: 1px solid #e5e7eb; color: #6b7280; font-size: 14px;">
755
- Powered by <a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank" style="color: #6366f1; text-decoration: none;">PaddleOCR</a>
756
- for text extraction and <a href="https://huggingface.co/BAIDU" target="_blank" style="color: #6366f1; text-decoration: none;">ERNIE</a>
757
- for HTML generation
758
- </div>
759
- """,
760
- elem_id="footer"
761
  )
762
-
763
- # Tips section
764
- with gr.Accordion("💡 Tips for Best Results", open=False):
765
- gr.Markdown("""
766
- **File Types:** PDF, PNG, JPG, JPEG, BMP, TIFF
767
-
768
- **For Best OCR Results:**
769
- - Use high-resolution, clear images
770
- - Ensure good contrast between text and background
771
- - Avoid skewed or rotated documents
772
- - PDFs generally produce the best results
773
-
774
- **🚀 Deploy to GitHub Pages:**
775
- 1. Create a new GitHub repository or use an existing one
776
- 2. Download the generated HTML file from above
777
- 3. Upload it to your repository as `index.html`
778
- 4. Go to repository Settings → Pages
779
- 5. Select "Deploy from a branch" → Choose "main" branch
780
- 6. Your page will be live at `https://yourusername.github.io/yourrepository`
781
-
782
- **💡 Pro Tips:**
783
- - Enable custom domains in GitHub Pages settings
784
- - Use GitHub Actions for automated deployments
785
- - Consider using Jekyll themes for enhanced styling
786
- """)
787
-
788
 
789
  if __name__ == "__main__":
790
  app.launch()
 
9
  import markdown
10
  from dotenv import load_dotenv
11
  from openai import OpenAI
12
+ from urllib.parse import urlparse
13
 
14
  # Load environment variables from .env file
15
  load_dotenv()
16
 
17
+ # API Configuration for PaddleOCR-VL
18
  API_URL = os.getenv("API_URL", "")
19
+ TOKEN = os.getenv("TOKEN", "")
20
 
21
 
22
  class Doc2PageConverter:
 
30
  base_url="https://qianfan.baidubce.com/v2",
31
  api_key=self.qianfan_token
32
  )
33
+ def extract_text_with_vl_api(self, file_path: str) -> str:
34
+ if not API_URL:
35
+ raise ValueError("API_URL must be configured in .env file")
36
 
37
+ headers = {"Content-Type": "application/json"}
38
+ if TOKEN:
39
+ headers["Authorization"] = f"bearer {TOKEN}"
40
+
41
  try:
42
+ is_url = isinstance(file_path, str) and file_path.startswith(("http://", "https://"))
43
+
44
+ if is_url:
45
+ path = urlparse(file_path).path
46
+ ext = os.path.splitext(path)[1].lower()
 
 
 
47
  else:
48
+ ext = os.path.splitext(file_path)[1].lower()
49
+
50
+ if ext == '.pdf':
51
+ file_type = 0 # PDF 文件
52
+ elif ext in ['.png', '.jpg', '.jpeg', '.bmp', '.gif']:
53
+ file_type = 1 # 图片文件
54
+ else:
55
+ raise ValueError(f"不支持的文件类型: '{ext}'")
56
+
57
+ if is_url:
58
+ response = requests.get(file_path, timeout=60)
59
+ response.raise_for_status()
60
+ content = response.content
61
+ else:
62
+ with open(file_path, "rb") as f:
63
+ content = f.read()
64
+
65
+ b64_content = base64.b64encode(content).decode("utf-8")
66
+
67
+ except Exception as e:
68
+ raise RuntimeError(f"读取和编码文件失败: {e}")
69
+
70
+ payload = {
71
+ "file": b64_content,
72
+ "fileType": file_type,
73
+ "useLayoutDetection": True,
74
+ "useDocUnwarping": False,
75
+ "useDocOrientationClassify": False,
76
+ "useChartRecognition": False,
77
+ }
78
+
79
+ try:
80
+ print(f"Sending PaddleOCR-VL API request to {API_URL}...")
81
+ response = requests.post(API_URL, json=payload, headers=headers, timeout=300)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  response.raise_for_status()
83
+ result_data = response.json()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  except requests.exceptions.RequestException as e:
85
+ raise RuntimeError(f"PaddleOCR-VL API request failed: {e}")
86
+ except json.JSONDecodeError:
87
+ raise RuntimeError(f"Invalid JSON response from VL API: {response.text}")
88
+
89
+ if result_data.get("errorCode", -1) != 0:
90
+ error_msg = result_data.get("errorMessage", "Unknown API error")
91
+ raise RuntimeError(f"PaddleOCR-VL API returned an error: {error_msg}")
92
+
93
+ layout_results = result_data.get("result", {}).get("layoutParsingResults", [])
94
+ if not layout_results:
95
  return ""
 
 
 
 
 
 
 
 
96
 
97
+ first_page_result = layout_results[0]
98
+ # print(first_page_result.get("prunedResult"))
99
+ markdown_data = first_page_result.get("markdown", {})
100
+
101
+ full_markdown_text = markdown_data.get("text", "")
102
+ image_map = markdown_data.get("images", {})
103
 
104
+ if image_map:
105
+ for placeholder, real_url in image_map.items():
106
+ full_markdown_text = full_markdown_text.replace(f'src="{placeholder}"', f'src="{real_url}"')
107
 
108
+ return full_markdown_text
 
 
 
 
 
 
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
 
 
111
  def markdown_to_html_with_ernie(self, markdown_text: str) -> str:
112
+ """Convert markdown to HTML using ERNIE API. (No changes needed)"""
113
  if not self.client:
 
114
  return self.basic_markdown_to_html(markdown_text)
115
 
116
  try:
117
  prompt = f"""Please convert the following markdown text into a modern, clean HTML page. Use contemporary typography with the Inter font family and clean design principles. Make it visually appealing with proper CSS styling, responsive design, and excellent readability.
 
118
  Design requirements:
119
  - Use Inter font from Google Fonts
120
  - Clean, modern spacing and typography
 
122
  - Good color contrast and hierarchy
123
  - Responsive design that works on all devices
124
  - Include proper HTML structure with head, body, and semantic elements
125
+ Important: Add a footer at the bottom with "Powered by PaddleOCR-VL and ERNIE" where PaddleOCR-VL links to https://github.com/PaddlePaddle/PaddleOCR and ERNIE links to https://huggingface.co/BAIDU. Style it with modern, subtle styling.
 
 
126
  Markdown content:
127
  {markdown_text}
 
128
  IMPORTANT: Return ONLY the raw HTML code starting with <!DOCTYPE html> and ending with </html>. Do NOT wrap it in markdown code blocks or add any explanations. I need the pure HTML content that can be directly saved as an .html file."""
129
 
130
  messages = [{"role": "user", "content": prompt}]
 
135
  max_tokens=64000,
136
  )
137
 
138
+ html_content = response.choices[0].message.content.strip()
139
 
 
140
  if html_content.startswith('```html'):
141
+ html_content = html_content[7:]
142
  elif html_content.startswith('```'):
143
+ html_content = html_content[3:]
144
 
145
  if html_content.endswith('```'):
146
+ html_content = html_content[:-3]
147
 
148
+ return html_content.strip()
 
 
 
149
 
150
  except Exception as e:
151
  print(f"Error calling ERNIE API: {e}")
152
  return self.basic_markdown_to_html(markdown_text)
153
 
154
  def basic_markdown_to_html(self, markdown_text: str) -> str:
155
+ """Fallback markdown to HTML conversion. (No changes needed)"""
156
  html = markdown.markdown(markdown_text)
157
 
 
158
  complete_html = f"""
159
  <!DOCTYPE html>
160
  <html lang="en">
 
163
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
164
  <title>Converted Document</title>
165
  <style>
 
166
  @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
 
 
 
 
 
 
 
167
  body {{
168
+ font-family: 'Inter', system-ui, sans-serif; line-height: 1.7; color: #1a1a1a;
169
+ max-width: 850px; margin: 0 auto; padding: 32px 24px; background: #fafafa;
 
 
 
 
 
 
 
170
  }}
 
171
  .container {{
172
+ background: #ffffff; padding: 48px; border-radius: 12px;
 
 
173
  box-shadow: 0 1px 3px rgba(0,0,0,0.08), 0 4px 24px rgba(0,0,0,0.04);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  }}
175
+ img {{ max-width: 100%; height: auto; border-radius: 8px; margin: 20px 0; }}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  .footer {{
177
+ margin-top: 64px; padding-top: 24px; border-top: 1px solid #e5e7eb;
178
+ text-align: center; font-size: 14px; color: #6b7280;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  }}
180
+ .footer a {{ color: #6366f1; text-decoration: none; }}
181
+ .footer a:hover {{ text-decoration: underline; }}
182
  </style>
183
  </head>
184
  <body>
185
  <div class="container">
186
  {html}
187
  <div class="footer">
188
+ Powered by <a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">PaddleOCR-VL</a> and
189
  <a href="https://huggingface.co/BAIDU" target="_blank">ERNIE</a>
190
  </div>
191
  </div>
 
197
  def process_document(self, file_path: str) -> Tuple[str, str]:
198
  """Process uploaded document and convert to HTML"""
199
  try:
200
+ markdown_content = self.extract_text_with_vl_api(file_path)
 
 
 
 
 
 
 
 
 
201
 
202
  if not markdown_content.strip():
203
+ return ("Warning: No text content extracted from the document."), ""
 
204
 
 
205
  html_content = self.markdown_to_html_with_ernie(markdown_content)
206
 
207
  return markdown_content, html_content
 
209
  except Exception as e:
210
  return f"Error processing document: {str(e)}", ""
211
 
212
+ # --- Gradio UI and event handling logic (unchanged) ---
213
  converter = Doc2PageConverter()
214
 
215
  def process_upload(file):
 
216
  if file is None:
217
  return "Please upload a file.", "", ""
 
218
  try:
 
219
  markdown_result, html_result = converter.process_document(file.name)
 
220
  if html_result:
221
  return "Document processed successfully!", markdown_result, html_result
222
  else:
223
+ return markdown_result, "", ""
 
224
  except Exception as e:
225
  return f"Error: {str(e)}", "", ""
226
 
227
  def save_html_file(html_content, filename="converted_page"):
 
228
  if not html_content:
229
  return None
 
 
230
  temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False,
231
  prefix=f"{filename}_")
232
  temp_file.write(html_content)
233
  temp_file.close()
 
234
  return temp_file.name
235
 
 
236
  custom_theme = gr.themes.Default(
237
+ primary_hue="blue", secondary_hue="gray", neutral_hue="gray",
 
 
238
  font=("Inter", "system-ui", "sans-serif"),
 
239
  ).set(
240
+ body_background_fill="#fafafa", background_fill_primary="#ffffff",
241
+ border_color_primary="#e5e7eb", button_primary_background_fill="#6366f1",
242
+ button_primary_background_fill_hover="#4f46e5", button_primary_text_color="#ffffff",
 
 
 
 
243
  )
244
 
 
245
  with gr.Blocks(
246
+ title="Doc2Page - Document to Webpage Converter",
247
  theme=custom_theme,
248
+ css=".gradio-container { max-width: 1200px !important; margin: auto; }"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  ) as app:
250
 
251
+ gr.Markdown("# Doc2Page\n🥃 Transform your documents into beautiful webpages!")
 
 
 
 
 
 
 
 
252
 
 
253
  with gr.Row():
254
  with gr.Column(scale=1, min_width=350):
255
+ file_input = gr.File(
256
+ label="📄 Upload Document",
257
+ file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"],
258
+ )
259
+ process_btn = gr.Button("✨ Convert to Webpage", variant="primary")
260
+ status_output = gr.Textbox(label="Status", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
  with gr.Column(scale=2):
 
263
  with gr.Tabs():
264
+ with gr.TabItem("❤️ Preview"):
265
+ html_preview = gr.HTML(label="", value="<div style='text-align: center; color: #6b7280;'>Your converted webpage will appear here</div>")
266
+ with gr.TabItem("📝 Markdown Source"):
267
+ markdown_output = gr.Textbox(label="", interactive=False, show_copy_button=True)
268
+ with gr.TabItem("🌐 HTML Source"):
269
+ html_output = gr.Code(label="", language="html", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
 
271
  with gr.Row(visible=False) as download_section:
272
+ gr.Markdown("### 📥 Download Your Webpage")
273
+ download_btn = gr.File(label="HTML File", visible=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
 
275
  def process_and_update(file):
276
  status, markdown_content, html_content = process_upload(file)
277
 
 
278
  download_file = None
279
  show_download = False
 
280
  if html_content:
281
  filename = Path(file.name).stem if file else "converted_page"
282
  download_file = save_html_file(html_content, filename)
283
  show_download = True
284
 
285
+ preview_content = html_content or "<div style='text-align: center; color: #9ca3af;'>No preview available</div>"
 
 
 
 
 
 
 
286
 
287
  return (
288
+ status, markdown_content, html_content, preview_content,
289
+ download_file, gr.update(visible=show_download)
 
 
 
 
290
  )
291
 
292
  process_btn.click(
293
  fn=process_and_update,
294
  inputs=[file_input],
295
+ outputs=[status_output, markdown_output, html_output, html_preview, download_btn, download_section]
 
 
 
 
 
 
 
296
  )
297
 
 
298
  gr.Markdown(
299
+ """<div style="text-align: center; padding: 20px 0; color: #6b7280;">
300
+ Powered by <a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">PaddleOCR-VL</a> &
301
+ <a href="https://huggingface.co/BAIDU" target="_blank">ERNIE</a>
302
+ </div>"""
 
 
 
 
303
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
 
305
  if __name__ == "__main__":
306
  app.launch()