Spaces:
Sleeping
Sleeping
| """ | |
| FastAPI Backend for HTML to PDF Conversion | |
| Runs alongside Streamlit on port 7860 | |
| """ | |
| from fastapi import FastAPI, UploadFile, File, Form, HTTPException | |
| from fastapi.responses import Response, JSONResponse | |
| from fastapi.middleware.cors import CORSMiddleware | |
| import subprocess | |
| import os | |
| import tempfile | |
| import shutil | |
| import base64 | |
| import re | |
| import mimetypes | |
| from typing import List, Optional | |
| from pathlib import Path | |
# Application object; these keyword arguments are the metadata FastAPI
# is constructed with (title, description and version of the API).
app = FastAPI(
    version="1.0.0",
    title="HTML to PDF API",
    description="Convert HTML to PDF with image support and page breaks",
)

# CORS policy: every origin, method and header is accepted.
# NOTE(review): a wildcard origin together with allow_credentials=True is
# discouraged by the FastAPI CORS documentation -- confirm whether
# credentialed cross-origin requests are actually required here.
_cors_settings = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_settings)
def detect_aspect_ratio(html_content):
    """Guess the page aspect ratio for an HTML document.

    The heuristics are tried in this order and the first hit wins:

    1. A ``<meta ... viewport ... content="...">`` tag whose content
       contains ``orientation=portrait`` ("9:16") or
       ``orientation=landscape`` ("16:9").
    2. The first CSS ``aspect-ratio: W / H`` declaration with integer
       operands: ratio > 1.5 gives "16:9", ratio < 0.7 gives "9:16",
       anything in between gives "1:1".
    3. Presentation keywords (reveal.js, impress.js, slide,
       presentation) anywhere in the document give "16:9".
    4. Otherwise "9:16".

    Args:
        html_content: The HTML document as a string.

    Returns:
        One of "16:9", "1:1" or "9:16".
    """
    viewport_match = re.search(
        r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']',
        html_content,
        re.IGNORECASE,
    )
    if viewport_match:
        viewport = viewport_match.group(1).lower()
        if 'orientation=portrait' in viewport:
            return "9:16"
        if 'orientation=landscape' in viewport:
            return "16:9"

    aspect_match = re.search(
        r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)', html_content, re.IGNORECASE
    )
    if aspect_match:
        width = int(aspect_match.group(1))
        height = int(aspect_match.group(2))
        # "aspect-ratio: N/0" is meaningless and used to raise
        # ZeroDivisionError; ignore it and fall through to the keyword check.
        if height > 0:
            ratio = width / height
            if ratio > 1.5:
                return "16:9"
            if ratio < 0.7:
                return "9:16"
            return "1:1"

    # Lower-case once instead of once per keyword.
    lowered = html_content.lower()
    presentation_keywords = ('reveal.js', 'impress.js', 'slide', 'presentation')
    if any(keyword in lowered for keyword in presentation_keywords):
        return "16:9"
    return "9:16"
def image_to_base64(image_bytes, filename):
    """Encode raw image bytes as a ``data:<mime>;base64,<payload>`` URL.

    The MIME type is guessed from ``filename`` with ``mimetypes``; when
    that yields nothing, a small extension table is consulted and
    ``image/png`` is the final fallback.

    Raises:
        HTTPException: status 400 if anything goes wrong while encoding.
    """
    fallback_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.svg': 'image/svg+xml',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp'
    }
    try:
        guessed = mimetypes.guess_type(filename)[0]
        if guessed:
            mime_type = guessed
        else:
            # Extension lookup is case-insensitive.
            suffix = os.path.splitext(filename)[1].lower()
            mime_type = fallback_types.get(suffix, 'image/png')
        payload = base64.b64encode(image_bytes).decode('utf-8')
        return f"data:{mime_type};base64,{payload}"
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Error converting {filename} to base64: {str(e)}")
def embed_images_as_base64(html_content, images_dict):
    """Replace references to uploaded images with their data URLs.

    For each ``filename -> data_url`` pair, three kinds of reference are
    rewritten, in this order:

    1. ``<img ... src="[path/]filename">``          (key suffix "img src")
    2. ``background-image: url([path/]filename)``   (key suffix "bg-image")
    3. any remaining ``url([path/]filename)``       (key suffix "url")

    The quoting style found in the document is kept. In particular a CSS
    ``url(...)`` is never re-quoted with double quotes, because that would
    terminate an enclosing ``style="..."`` attribute early.

    Args:
        html_content: The HTML document as a string.
        images_dict: Mapping of image file name to replacement URL.

    Returns:
        ``(new_html, replacements)`` where ``replacements`` maps
        ``"<filename> (<kind>)"`` to the number of substitutions made;
        kinds with no match are omitted.
    """
    if not images_dict:
        return html_content, {}

    replacements = {}
    for filename, data_url in images_dict.items():
        escaped_name = re.escape(filename)

        # Replacements are built by functions rather than template strings so
        # the URL is inserted literally: a template would interpret any
        # backslash sequence inside it as an escape or group reference.
        def as_img_src(match, url=data_url):
            quote = match.group(2)
            return f"{match.group(1)}{quote}{url}{quote}"

        def as_css_url(match, url=data_url):
            quote = match.group(2)  # may be empty for an unquoted url(...)
            return f"{match.group(1)}{quote}{url}{quote}{match.group(3)}"

        rules = (
            (
                "img src",
                rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2',
                re.IGNORECASE | re.DOTALL,
                as_img_src,
            ),
            (
                "bg-image",
                rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
                re.IGNORECASE,
                as_css_url,
            ),
            (
                "url",
                rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
                re.IGNORECASE,
                as_css_url,
            ),
        )

        for kind, pattern, flags, build in rules:
            # subn substitutes and counts in a single pass.
            html_content, count = re.subn(pattern, build, html_content, flags=flags)
            if count:
                replacements[f"{filename} ({kind})"] = count

    return html_content, replacements
def inject_page_breaks(html_content: str, aspect_ratio: str):
    """Insert a ``<style id="auto-page-breaks">`` block with print rules.

    The stylesheet sets the ``@page`` size from ``aspect_ratio``
    ("16:9" -> A4 landscape, "1:1" -> 210mm x 210mm, anything else ->
    A4 portrait), zeroes margins, forces page breaks after page/slide-like
    containers and asks the renderer to keep colours exact.

    The block is inserted exactly once: before the first ``</head>`` if
    there is one, otherwise before the first ``<body`` tag, otherwise at
    the very start of the document. Tag matching is case-insensitive.

    Args:
        html_content: The HTML document as a string.
        aspect_ratio: "16:9", "1:1" or "9:16".

    Returns:
        The HTML document with the stylesheet inserted.
    """
    if aspect_ratio == "16:9":
        page_size = "A4 landscape"
    elif aspect_ratio == "1:1":
        page_size = "210mm 210mm"
    else:
        page_size = "A4 portrait"

    # Braces are doubled because this is an f-string; only {page_size} is
    # interpolated.
    page_css = f"""
    <style id="auto-page-breaks">
    @page {{
        size: {page_size};
        margin: 0;
    }}
    html, body {{
        margin: 0 !important;
        padding: 0 !important;
        width: 100% !important;
        height: 100% !important;
    }}
    .page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
        width: 100% !important;
        min-height: 100vh !important;
        height: 100vh !important;
        page-break-after: always !important;
        break-after: page !important;
        page-break-inside: avoid !important;
        break-inside: avoid !important;
        position: relative !important;
        box-sizing: border-box !important;
        overflow: hidden !important;
    }}
    .page:last-child, .slide:last-child,
    section.page:last-child, article.page:last-child {{
        page-break-after: auto !important;
        break-after: auto !important;
    }}
    body > section:not(.no-page-break),
    body > article:not(.no-page-break),
    body > div:not(.no-page-break) {{
        page-break-after: always !important;
        break-after: page !important;
        min-height: 100vh;
    }}
    body > section:last-child,
    body > article:last-child,
    body > div:last-child {{
        page-break-after: auto !important;
    }}
    .page-break, .page-break-after {{
        page-break-after: always !important;
        break-after: page !important;
    }}
    .page-break-before {{
        page-break-before: always !important;
        break-before: page !important;
    }}
    .no-page-break, .keep-together {{
        page-break-inside: avoid !important;
        break-inside: avoid !important;
    }}
    h1, h2, h3, h4, h5, h6 {{
        page-break-after: avoid !important;
        break-after: avoid !important;
        page-break-inside: avoid !important;
        break-inside: avoid !important;
    }}
    img, figure, table, pre, blockquote {{
        page-break-inside: avoid !important;
        break-inside: avoid !important;
    }}
    * {{
        -webkit-print-color-adjust: exact !important;
        print-color-adjust: exact !important;
        color-adjust: exact !important;
    }}
    </style>
    """

    # Insert before the first </head> only: a later "</head>" (for example
    # inside a script string) must not receive a duplicate <style> with the
    # same id.
    anchor = re.search(r'</head\s*>', html_content, re.IGNORECASE)
    if anchor is None:
        anchor = re.search(r'<body\b', html_content, re.IGNORECASE)
    if anchor is None:
        return page_css + html_content

    position = anchor.start()
    return html_content[:position] + page_css + html_content[position:]
def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
    """Render HTML to PDF by running the ``puppeteer_pdf.js`` Node script.

    The HTML (with page-break CSS injected) is written to
    ``<temp_dir>/input.html`` and the script is invoked as
    ``node puppeteer_pdf.js <html_file> <aspect_ratio>``. The resulting PDF
    is then read from ``<temp_dir>/input.pdf``.

    Args:
        html_content: The HTML document as a string.
        aspect_ratio: Aspect ratio string passed through to the script.
        temp_dir: Existing directory used for the input and output files.

    Returns:
        The generated PDF as bytes.

    Raises:
        Exception: if the script cannot be found, exits non-zero, produces
            no PDF, or runs longer than 60 seconds. Messages other than the
            timeout one are prefixed with "Error: ".
    """
    try:
        html_content = inject_page_breaks(html_content, aspect_ratio)

        # Absolute path: the subprocess runs with a different working
        # directory, so a relative temp_dir would not resolve there.
        html_file = os.path.abspath(os.path.join(temp_dir, "input.html"))
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(html_content)

        # Find puppeteer script
        possible_paths = [
            'puppeteer_pdf.js',
            '/app/puppeteer_pdf.js',
            os.path.join(os.path.dirname(__file__), 'puppeteer_pdf.js'),
        ]
        puppeteer_script = next(
            (os.path.abspath(path) for path in possible_paths if os.path.exists(path)),
            None,
        )
        if not puppeteer_script:
            raise Exception("puppeteer_pdf.js not found")

        result = subprocess.run(
            ['node', puppeteer_script, html_file, aspect_ratio],
            capture_output=True,
            text=True,
            timeout=60,
            cwd=os.path.dirname(puppeteer_script)
        )
        if result.returncode != 0:
            raise Exception(f"PDF conversion failed: {result.stderr}")

        # Swap only the file extension; str.replace would also rewrite any
        # ".html" occurring in a directory name.
        pdf_file = os.path.splitext(html_file)[0] + '.pdf'
        if not os.path.exists(pdf_file):
            raise Exception("PDF file was not generated")

        with open(pdf_file, 'rb') as f:
            return f.read()
    except subprocess.TimeoutExpired as e:
        raise Exception("PDF conversion timed out (60 seconds)") from e
    except Exception as e:
        raise Exception(f"Error: {str(e)}") from e
async def root():
    """Return a short description of the API and its endpoints."""
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file -- confirm it is registered with the app.
    endpoint_summary = {
        "POST /convert": "Convert HTML to PDF",
        "GET /health": "Health check",
        "GET /docs": "API documentation"
    }
    api_info = {
        "message": "HTML to PDF Converter API",
        "version": "1.0.0",
    }
    api_info["endpoints"] = endpoint_summary
    return api_info
async def health():
    """Report that the service is up; always returns a "healthy" status."""
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file -- confirm it is registered with the app.
    return dict(status="healthy")
async def convert_to_pdf(
    html_file: UploadFile = File(..., description="HTML file to convert"),
    aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
    auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML"),
    images: Optional[List[UploadFile]] = File(None, description="Images to embed in HTML")
):
    """
    Convert HTML to PDF with optional image embedding

    - **html_file**: HTML file to convert (required)
    - **aspect_ratio**: Page aspect ratio; only used when auto_detect=false
      (defaults to 9:16 if omitted)
    - **auto_detect**: Auto-detect aspect ratio from HTML content; when true
      the detected value replaces any supplied aspect_ratio
    - **images**: Image files to embed as base64 in HTML

    Returns the PDF as an attachment, with the aspect ratio used, the number
    of image reference kinds replaced and the PDF size in response headers.
    """
    # Local import keeps this handler self-contained; fastapi is already a
    # dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    temp_dir = None
    try:
        # Read HTML content
        raw_html = await html_file.read()
        try:
            html_content = raw_html.decode('utf-8')
        except UnicodeDecodeError:
            # latin-1 maps every byte value, so this fallback cannot fail.
            html_content = raw_html.decode('latin-1')

        # Detect or use provided aspect ratio
        if auto_detect:
            aspect_ratio = detect_aspect_ratio(html_content)
        elif not aspect_ratio:
            aspect_ratio = "9:16"

        # Validate aspect ratio
        if aspect_ratio not in ("16:9", "1:1", "9:16"):
            raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")

        # Process images if provided
        image_replacements = {}
        if images:
            images_dict = {}
            for img in images:
                # An upload without a file name cannot be referenced from the
                # HTML, and an empty name would make the embedding patterns
                # match any reference that ends in "/".
                if not img.filename:
                    continue
                img_bytes = await img.read()
                images_dict[img.filename] = image_to_base64(img_bytes, img.filename)
            html_content, image_replacements = embed_images_as_base64(html_content, images_dict)

        # Create temp directory and convert. The conversion blocks on a
        # subprocess for up to 60 seconds, so it runs in a worker thread
        # instead of stalling the event loop.
        temp_dir = tempfile.mkdtemp()
        pdf_bytes = await run_in_threadpool(
            convert_html_to_pdf, html_content, aspect_ratio, temp_dir
        )

        # Return PDF
        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={
                "Content-Disposition": "attachment; filename=converted.pdf",
                "X-Aspect-Ratio": aspect_ratio,
                # Number of (file, reference kind) entries that matched, not
                # the total count of substitutions.
                "X-Image-Replacements": str(len(image_replacements)),
                "X-PDF-Size": str(len(pdf_bytes))
            }
        )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Best-effort cleanup of the working directory.
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir, ignore_errors=True)
async def convert_to_pdf_base64(
    html_content: str = Form(..., description="HTML content as string"),
    aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
    auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML")
):
    """
    Convert HTML string to PDF and return as base64

    - **html_content**: HTML content as string (required)
    - **aspect_ratio**: Page aspect ratio; only used when auto_detect=false
      (defaults to 9:16 if omitted)
    - **auto_detect**: Auto-detect aspect ratio from HTML content; when true
      the detected value replaces any supplied aspect_ratio

    Returns JSON with `success`, `pdf_base64`, `aspect_ratio` and
    `size_bytes` (size of the PDF before base64 encoding).
    """
    # Local import keeps this handler self-contained; fastapi is already a
    # dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    temp_dir = None
    try:
        # Detect or use provided aspect ratio
        if auto_detect:
            aspect_ratio = detect_aspect_ratio(html_content)
        elif not aspect_ratio:
            aspect_ratio = "9:16"

        # Validate aspect ratio
        if aspect_ratio not in ("16:9", "1:1", "9:16"):
            raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")

        # Create temp directory and convert. The conversion blocks on a
        # subprocess for up to 60 seconds, so it runs in a worker thread
        # instead of stalling the event loop.
        temp_dir = tempfile.mkdtemp()
        pdf_bytes = await run_in_threadpool(
            convert_html_to_pdf, html_content, aspect_ratio, temp_dir
        )

        # Convert to base64
        pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
        return JSONResponse({
            "success": True,
            "pdf_base64": pdf_base64,
            "aspect_ratio": aspect_ratio,
            "size_bytes": len(pdf_bytes)
        })
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Best-effort cleanup of the working directory.
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir, ignore_errors=True)
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces.
    # uvicorn is imported here so it is only needed when run directly.
    import uvicorn

    bind_host, bind_port = "0.0.0.0", 7860
    uvicorn.run(app, host=bind_host, port=bind_port)