Spaces:
Running
Running
import fitz | |
from pdf2docx import Converter | |
import pypandoc | |
def export_to_word(pdf_path: str, output_path: str = "output.docx") -> str: | |
"""Convert PDF to DOCX (layout-aware).""" | |
cv = Converter(pdf_path) | |
cv.convert(output_path, start=0, end=None) | |
cv.close() | |
return output_path | |
def export_to_text(pdf_path: str, output_path: str = "output.txt") -> str: | |
"""Export selectable text to TXT.""" | |
text = "" | |
with fitz.open(pdf_path) as doc: | |
for page in doc: | |
text += page.get_text() | |
with open(output_path, "w", encoding="utf-8") as f: | |
f.write(text) | |
return output_path | |
def export_text_to_markdown(text: str, output_path: str = "output.md") -> str: | |
"""Export text (already extracted) to Markdown.""" | |
pypandoc.convert_text(text, "md", format="md", outputfile=output_path, extra_args=["--standalone"]) | |
return output_path | |