| import os | |
| import pymupdf.layout | |
| import pymupdf4llm | |
| from pathlib import Path | |
| import glob | |
# Module-level side effect: forces TOKENIZERS_PARALLELISM to "false" for this
# process as soon as the module is imported, overriding any inherited value.
# NOTE(review): presumably aimed at a tokenizer library pulled in by the PDF
# layout stack — confirm which dependency actually reads this variable.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def _markdown_path(pdf_path, output_dir):
    """Return the ``.md`` path inside *output_dir* that corresponds to *pdf_path*."""
    # Append ".md" to the stem instead of calling with_suffix(): with_suffix()
    # replaces whatever follows the last dot in the stem, so "report.v2.pdf"
    # would be written as "report.md" and collide with "report.v1.pdf".
    return Path(output_dir) / f"{Path(pdf_path).stem}.md"


def pdf_to_markdown(pdf_path, output_dir):
    """Convert one PDF to Markdown and write it to ``<output_dir>/<pdf stem>.md``.

    Args:
        pdf_path: Location of the PDF to convert (``str`` or ``Path``).
        output_dir: Directory that receives the Markdown file. It is created
            (including parents) when it does not exist yet.

    Returns:
        None. The result is the file written to disk; an existing file with
        the same name is overwritten.
    """
    # The context manager closes the document even when conversion raises.
    with pymupdf.open(pdf_path) as doc:
        # Conversion options are unchanged from the original call; see the
        # pymupdf4llm documentation for the exact meaning of each keyword.
        md = pymupdf4llm.to_markdown(
            doc,
            header=False,
            footer=False,
            page_separators=True,
            ignore_images=True,
            write_images=False,
            image_path=None,
        )

    # Extracted text can contain lone surrogate code points, which are not
    # encodable as UTF-8. errors="ignore" drops them, which is what the former
    # surrogatepass-encode / ignore-decode / encode round-trip achieved.
    md_bytes = md.encode("utf-8", errors="ignore")

    target = _markdown_path(pdf_path, output_dir)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(md_bytes)


def pdfs_to_markdowns(path_pattern, overwrite: bool = False):
    """Convert every PDF matching *path_pattern* into ``./docs/markdowns``.

    Args:
        path_pattern: Glob pattern (as understood by ``glob.glob``) selecting
            the PDFs to convert.
        overwrite: When False (the default), a PDF whose Markdown file already
            exists in the output directory is skipped.
    """
    output_dir = Path("./docs/markdowns")
    output_dir.mkdir(parents=True, exist_ok=True)
    for pdf_path in map(Path, glob.glob(path_pattern)):
        # Same path helper as pdf_to_markdown, so the "already converted"
        # check always looks at the file that would actually be written.
        if overwrite or not _markdown_path(pdf_path, output_dir).exists():
            pdf_to_markdown(pdf_path, output_dir)
if __name__ == "__main__":
    # Script entry point: convert every PDF found directly inside ./docs/pdf
    # (no recursion) and always regenerate its Markdown file.
    pdf_folder = Path("./docs/pdf")
    markdown_folder = Path("./docs/markdowns")

    # pdf_to_markdown writes into this directory, so make sure it exists
    # before the first conversion instead of failing on a fresh checkout.
    markdown_folder.mkdir(parents=True, exist_ok=True)

    # Sorted so the processing order (and the log output) is deterministic;
    # directory listing order is otherwise arbitrary.
    for file_path in sorted(pdf_folder.iterdir()):
        # Case-insensitive extension match; skip anything that is not a
        # regular file, e.g. a directory whose name happens to end in ".pdf".
        if file_path.suffix.lower() != ".pdf" or not file_path.is_file():
            continue
        print(f"Processing file: {file_path}")
        pdf_to_markdown(file_path, markdown_folder)