File size: 891 Bytes
1e9ae83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import fitz
from pdf2docx import Converter
import pypandoc


def export_to_word(pdf_path: str, output_path: str = "output.docx") -> str:
    """Convert a PDF to DOCX (layout-aware) using pdf2docx.

    Args:
        pdf_path: Path of the PDF to convert.
        output_path: Path of the DOCX file to write.

    Returns:
        ``output_path``, unchanged, so callers can chain on the result.
    """
    cv = Converter(pdf_path)
    try:
        # start=0, end=None: convert from the first page through the last.
        cv.convert(output_path, start=0, end=None)
    finally:
        # Release the converter even when conversion raises, so the
        # underlying PDF handle is not leaked.
        cv.close()
    return output_path


def export_to_text(pdf_path: str, output_path: str = "output.txt") -> str:
    """Export the selectable text of a PDF to a UTF-8 text file.

    Opens ``pdf_path`` with ``fitz``, concatenates ``page.get_text()`` for
    every page in iteration order, and writes the result to ``output_path``.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Destination text file; created or overwritten.

    Returns:
        ``output_path``, unchanged.
    """
    with fitz.open(pdf_path) as doc:
        # Join once rather than growing a string with ``+=`` on every page.
        text = "".join(page.get_text() for page in doc)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(text)
    return output_path


def export_text_to_markdown(text: str, output_path: str = "output.md") -> str:
    """Write already-extracted text to a Markdown file through pypandoc.

    The text is passed to ``pypandoc.convert_text`` with ``"md"`` as both
    the source and target format, plus the ``--standalone`` pandoc flag;
    the converted result is written to ``output_path``.

    Args:
        text: The text to export.
        output_path: Destination Markdown file.

    Returns:
        ``output_path``, unchanged.
    """
    pandoc_flags = ["--standalone"]
    pypandoc.convert_text(
        text,
        "md",
        format="md",
        outputfile=output_path,
        extra_args=pandoc_flags,
    )
    return output_path