web-crawling

Running

File size: 3,643 Bytes

from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks, Response
from fastapi.responses import FileResponse
from pydantic import BaseModel
from pdf2docx import Converter
import os
import shutil
import pdfkit
import uuid

# Router on which the conversion endpoints in this module are registered.
router = APIRouter()

# Directory for intermediate conversion files. Note this is an absolute path
# at the filesystem root, not a path relative to the working directory.
TEMP_DIR = "/.tempfiles"

class HTMLRequest(BaseModel):
    """Request body carrying the HTML markup to be converted."""

    # Raw HTML source; passed as-is to the HTML-to-PDF renderer.
    html_content: str

def ensure_temp_dir():
    """Create the ``TEMP_DIR`` directory (and any parents) if it is missing.

    Calling this when the directory already exists is a no-op.
    """
    os.makedirs(name=TEMP_DIR, exist_ok=True)

def remove_file(path: str) -> None:
    """Delete the file at ``path``, doing nothing if it is already gone.

    Args:
        path: Filesystem path of the file to delete.

    Raises:
        OSError: If the path exists but cannot be unlinked (for example it is
            a directory, or permissions forbid the removal).
    """
    # Attempt the unlink directly rather than testing os.path.exists() first:
    # the check-then-delete form can still fail if the file disappears between
    # the two calls, and it silently skips dangling symlinks.
    try:
        os.unlink(path)
    except (FileNotFoundError, NotADirectoryError):
        # Nothing to remove; keep cleanup best-effort as before.
        pass

def generate_temp_filepath(extension: str) -> str:
    """Build a unique file path inside ``TEMP_DIR``.

    The file name has the form ``temp_<uuid4>.<extension>``. Only the path
    string is produced; no file is created.

    Args:
        extension: File extension without the leading dot (e.g. ``"pdf"``).

    Returns:
        The generated path as a string.
    """
    unique_id = uuid.uuid4()
    file_name = f"temp_{unique_id}.{extension}"
    return os.path.join(TEMP_DIR, file_name)

def html_to_pdf(html_content: str, output_path: str) -> None:
    """Render an HTML string to a PDF file via ``pdfkit.from_string``.

    The PDF is laid out on A4 pages with a 0.75in margin on every side and
    UTF-8 encoding.

    Args:
        html_content: HTML markup to render.
        output_path: Path the PDF is written to.
    """
    page_margin = '0.75in'
    render_options = {
        'page-size': 'A4',
        'margin-top': page_margin,
        'margin-right': page_margin,
        'margin-bottom': page_margin,
        'margin-left': page_margin,
        'encoding': "UTF-8",
    }
    pdfkit.from_string(html_content, output_path, options=render_options)

def pdf_to_docx(pdf_path: str, docx_path: str) -> None:
    """Convert the PDF at ``pdf_path`` into a DOCX file at ``docx_path``.

    Args:
        pdf_path: Path of the source PDF.
        docx_path: Path the DOCX output is written to.

    Raises:
        Exception: Whatever ``Converter.convert`` raises is propagated after
            the converter has been closed.
    """
    cv = Converter(pdf_path)
    try:
        cv.convert(docx_path)
    finally:
        # Always release the converter, even when conversion fails, so the
        # source PDF is not left open.
        cv.close()

def handle_conversion(convert_func, input_path: str, output_path: str, background_tasks: BackgroundTasks):
    """Run a file conversion and return the result as a DOCX download.

    Args:
        convert_func: Callable invoked as ``convert_func(input_path, output_path)``.
        input_path: Path of the file to convert.
        output_path: Path where the converted file is expected afterwards.
        background_tasks: Task collection used to schedule removal of both
            files.

    Returns:
        A ``FileResponse`` serving ``output_path`` with the DOCX media type
        and a generated download file name.

    Raises:
        HTTPException: With status 500 if anything in the conversion path
            fails; both files are removed before raising.
    """
    docx_media_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    try:
        convert_func(input_path, output_path)
        output_missing = not os.path.exists(output_path)
        if output_missing:
            raise FileNotFoundError(f"Converted file not found: {output_path}")
        # Schedule cleanup of the input first, then the output.
        for temp_path in (input_path, output_path):
            background_tasks.add_task(remove_file, temp_path)
        download_name = f"converted_document_{uuid.uuid4()}.docx"
        return FileResponse(
            output_path,
            media_type=docx_media_type,
            filename=download_name,
        )
    except Exception as exc:
        # Conversion failed: remove the files immediately instead of deferring.
        for temp_path in (input_path, output_path):
            remove_file(temp_path)
        reason = str(exc)
        raise HTTPException(status_code=500, detail=f"Conversion failed: {reason}")

@router.post("/convert/pdf_to_docx")
async def convert_pdf_to_docx(background_tasks: BackgroundTasks, file: UploadFile = File(...)):
    """Convert an uploaded PDF to DOCX and return it as a download.

    Args:
        background_tasks: Task collection used to schedule temp-file cleanup.
        file: The uploaded file; its name must end in ``.pdf`` (any case).

    Returns:
        The response produced by ``handle_conversion`` for the DOCX output.

    Raises:
        HTTPException: 400 if the upload is not named like a PDF; 500 if the
            conversion fails.
    """
    # The filename may be missing, and the extension check should not reject
    # names such as "REPORT.PDF".
    upload_name = file.filename or ""
    if not upload_name.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="File must be a PDF")

    ensure_temp_dir()
    pdf_temp_path = generate_temp_filepath("pdf")
    # Swap only the final extension; str.replace would rewrite every ".pdf"
    # occurring anywhere in the path.
    docx_temp_path = os.path.splitext(pdf_temp_path)[0] + '.docx'

    try:
        with open(pdf_temp_path, "wb") as pdf_file:
            shutil.copyfileobj(file.file, pdf_file)
    except Exception:
        # Do not leave a partially written upload behind.
        remove_file(pdf_temp_path)
        raise

    return handle_conversion(pdf_to_docx, pdf_temp_path, docx_temp_path, background_tasks)

@router.post("/convert/html_to_pdf")
async def convert_html_to_pdf(request: HTMLRequest):
    """Render the submitted HTML to PDF and return the PDF bytes.

    The PDF is written to a temporary file, read back into memory, and the
    temporary file is removed before the response is returned.

    Raises:
        HTTPException: With status 500 and the error text as detail if any
            step fails.
    """
    ensure_temp_dir()
    output_pdf = generate_temp_filepath("pdf")

    try:
        html_to_pdf(request.html_content, output_pdf)
        with open(output_pdf, "rb") as rendered:
            payload = rendered.read()
        remove_file(output_pdf)
        return Response(content=payload, media_type="application/pdf")
    except Exception as exc:
        remove_file(output_pdf)
        failure_detail = str(exc)
        raise HTTPException(status_code=500, detail=failure_detail)

@router.post("/convert/html_to_docx")
async def convert_html_to_docx(background_tasks: BackgroundTasks, request: HTMLRequest):
    """Convert the submitted HTML to DOCX by way of an intermediate PDF.

    Args:
        background_tasks: Task collection used to schedule temp-file cleanup.
        request: Body carrying the HTML markup to convert.

    Returns:
        The response produced by ``handle_conversion`` for the DOCX output.

    Raises:
        HTTPException: With status 500 if either the HTML-to-PDF or the
            PDF-to-DOCX step fails.
    """
    ensure_temp_dir()
    pdf_temp_path = generate_temp_filepath("pdf")
    # Swap only the final extension; str.replace would rewrite every ".pdf"
    # occurring anywhere in the path.
    docx_temp_path = os.path.splitext(pdf_temp_path)[0] + '.docx'

    # Guard only the HTML-to-PDF step here. handle_conversion already cleans
    # up and raises its own HTTPException; catching that again would re-wrap
    # it and produce a nested "Conversion failed: 500: Conversion failed: ..."
    # detail.
    try:
        html_to_pdf(request.html_content, pdf_temp_path)
    except Exception as e:
        remove_file(pdf_temp_path)
        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}") from e

    return handle_conversion(pdf_to_docx, pdf_temp_path, docx_temp_path, background_tasks)