"""Streamlit front end that converts uploaded documents with Docling.

The user uploads one or more files, chooses OCR / table / export options and
presses "Convertir". Each supported file is stored under ``output/``,
converted, exported to the selected text formats, and its figures and tables
are written to ``output/figures`` and ``output/tables`` before being
previewed in the page.
"""

import datetime
import json
import os
import time
from pathlib import Path
from typing import List, Optional, Tuple

from PIL import Image
from PyPDF2 import PdfReader
import streamlit as st
import pandas as pd

from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionStatus
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    AcceleratorOptions,
    AcceleratorDevice,
    TableStructureOptions,
    TableFormerMode,
    EasyOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
    RapidOcrOptions,
    OcrMacOptions,
)
from docling_core.types.doc import PictureItem, TableItem

# Output directories (created on import so later writes cannot fail on a
# missing folder).
OUTPUT_DIR = Path("output")
OUTPUT_DIR.mkdir(exist_ok=True)
FIGURES_DIR = OUTPUT_DIR / "figures"
FIGURES_DIR.mkdir(exist_ok=True)
TABLES_DIR = OUTPUT_DIR / "tables"
TABLES_DIR.mkdir(exist_ok=True)

# File extensions accepted by the uploader (compared case-insensitively).
SUPPORTED_SUFFIXES = frozenset(
    {".pdf", ".docx", ".pptx", ".html", ".png", ".jpg", ".jpeg"}
)

# Maps the OCR engine name shown in the UI to its Docling options class.
OCR_OPTION_CLASSES = {
    "easyocr": EasyOcrOptions,
    "tesseract_cli": TesseractCliOcrOptions,
    "tesserocr": TesseractOcrOptions,
    "rapidocr": RapidOcrOptions,
    "ocrmac": OcrMacOptions,
}

# Language used when the OCR language field is left empty; this is the
# default value of the text input below.
DEFAULT_OCR_LANGUAGE = "eng"


def is_valid_file(file_path):
    """Return True when *file_path* has a supported extension.

    An unsupported extension, or any error raised while inspecting the path
    (for example when the argument is not a ``Path``), is reported to the
    user through ``st.error`` and yields False.
    """
    try:
        if file_path.suffix.lower() in SUPPORTED_SUFFIXES:
            return True
        st.error(f"❌ Format non supporté : {file_path.suffix}")
        return False
    except Exception as e:
        st.error(f"❌ Erreur lors de la vérification du fichier : {e}")
        return False


def create_document_converter(
    use_ocr: bool,
    export_figures: bool,
    export_tables: bool,
    accelerator: str,
    ocr_engine: str,
    table_mode: str,
    ocr_languages: List[str],
) -> DocumentConverter:
    """Build a ``DocumentConverter`` configured from the UI settings.

    Args:
        use_ocr: Whether the PDF pipeline runs OCR.
        export_figures: Whether picture images are generated.
        export_tables: Whether table images are generated.
        accelerator: Name of an ``AcceleratorDevice`` member (any case).
        ocr_engine: One of the keys of ``OCR_OPTION_CLASSES``.
        table_mode: Name of a ``TableFormerMode`` member (any case).
        ocr_languages: Language codes handed to the OCR engine.
            NOTE(review): code conventions appear to differ per engine
            (e.g. "eng" vs "en") -- confirm against each engine's docs.

    Raises:
        ValueError: If ``ocr_engine`` is not a supported engine name.
        KeyError: If ``accelerator`` or ``table_mode`` is not a member name.
    """
    accelerator_options = AcceleratorOptions(
        num_threads=8,
        device=AcceleratorDevice[accelerator.upper()],
    )
    table_structure_options = TableStructureOptions(
        mode=TableFormerMode[table_mode.upper()],
        do_cell_matching=True,
    )

    # OCR options for the selected engine.
    ocr_options_class = OCR_OPTION_CLASSES.get(ocr_engine)
    if ocr_options_class is None:
        raise ValueError(f"Moteur OCR non pris en charge : {ocr_engine}")
    ocr_options = ocr_options_class(lang=ocr_languages)

    pipeline_options = PdfPipelineOptions(
        do_ocr=use_ocr,
        generate_page_images=True,
        generate_picture_images=export_figures,
        generate_table_images=export_tables,
        accelerator_options=accelerator_options,
        table_structure_options=table_structure_options,
        ocr_options=ocr_options,
    )
    return DocumentConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.DOCX,
            InputFormat.PPTX,
            InputFormat.HTML,
            InputFormat.IMAGE,
        ],
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        },
    )


def _parse_ocr_languages(raw: str) -> List[str]:
    """Split a comma-separated language string, dropping blanks and spaces."""
    languages = [code.strip() for code in raw.split(",") if code.strip()]
    return languages or [DEFAULT_OCR_LANGUAGE]


def _save_upload(uploaded_file) -> Optional[Path]:
    """Store one uploaded file under OUTPUT_DIR; return None if unsupported."""
    # Keep only the final path component so a crafted name cannot escape
    # OUTPUT_DIR.
    file_path = OUTPUT_DIR / Path(uploaded_file.name).name
    # Validate before writing so rejected files are never left on disk.
    if not is_valid_file(file_path):
        return None
    with open(file_path, "wb") as f:
        f.write(uploaded_file.read())
    st.write(
        f"📥 Fichier reçu : `{file_path.name}` "
        f"({os.path.getsize(file_path)} octets)"
    )
    return file_path


def _export_document(conv_res, formats: List[str]) -> List[Path]:
    """Write the converted document in each requested format.

    Returns only the files that were actually written, so callers can
    safely open every returned path.
    """
    written = []
    stem = conv_res.input.file.stem
    for fmt in formats:
        output_file = OUTPUT_DIR / f"{stem}.{fmt}"
        if fmt == "md":
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(conv_res.document.export_to_markdown())
        elif fmt == "json":
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(
                    conv_res.document.export_to_dict(),
                    f,
                    ensure_ascii=False,
                    indent=2,
                )
        elif fmt == "yaml":
            # PyYAML is third-party; imported lazily so it is only required
            # when YAML export is actually selected.
            import yaml

            with open(output_file, "w", encoding="utf-8") as f:
                yaml.dump(
                    conv_res.document.export_to_dict(), f, allow_unicode=True
                )
        else:
            # No writer exists for this format (e.g. "multimodal"): report
            # it instead of listing a file that was never created.
            st.warning(f"⚠️ Format d'exportation non pris en charge : {fmt}")
            continue
        written.append(output_file)
    return written


def _export_figures_and_tables(
    conv_res, export_figures: bool, export_tables: bool
) -> Tuple[List[Path], List[Path]]:
    """Save the document's pictures as PNG and tables as CSV.

    Files are numbered per document so several figures or tables from the
    same input do not overwrite one another.
    """
    figure_paths = []
    table_paths = []
    stem = conv_res.input.file.stem
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, PictureItem):
            # The image is absent when picture images were not generated.
            if not export_figures or element.image is None:
                continue
            fig_path = FIGURES_DIR / f"{stem}_figure_{len(figure_paths) + 1}.png"
            element.image.pil_image.save(fig_path)
            figure_paths.append(fig_path)
        elif isinstance(element, TableItem):
            if not export_tables:
                continue
            table_path = TABLES_DIR / f"{stem}_table_{len(table_paths) + 1}.csv"
            element.export_to_dataframe().to_csv(table_path, index=False)
            table_paths.append(table_path)
    return figure_paths, table_paths


def _run_conversion(
    uploaded_files,
    use_ocr: bool,
    export_figures: bool,
    export_tables: bool,
    ocr_engine: str,
    table_mode: str,
    ocr_languages: List[str],
    export_formats: List[str],
) -> None:
    """Save, convert and export the uploads, then preview the results."""
    start_time = time.time()  # Start of the elapsed-time measurement.

    # Store the uploaded files, keeping only the supported ones.
    input_paths = []
    for uploaded_file in uploaded_files:
        file_path = _save_upload(uploaded_file)
        if file_path is not None:
            input_paths.append(file_path)

    if not input_paths:
        st.error("❌ Aucun fichier valide à convertir.")
        return

    # Configure the converter.
    converter = create_document_converter(
        use_ocr,
        export_figures,
        export_tables,
        accelerator="cpu",
        ocr_engine=ocr_engine,
        table_mode=table_mode,
        ocr_languages=ocr_languages,
    )

    # Progress bar; counts only files that will really be converted so it
    # reaches 100% even when some uploads were rejected.
    total_files = len(input_paths)
    progress_bar = st.progress(0)
    status_placeholder = st.empty()

    generated_files = []
    figures = []
    tables = []

    # Convert the files one at a time so progress can be reported per file.
    for i, file_path in enumerate(input_paths):
        status_placeholder.info(
            f"🔄 Traitement de `{file_path.name}` ({i + 1}/{total_files})"
        )
        for conv_res in converter.convert_all([file_path], raises_on_error=False):
            if conv_res.status == ConversionStatus.SUCCESS:
                st.success(f"✅ Conversion réussie : `{conv_res.input.file}`")
                generated_files.extend(_export_document(conv_res, export_formats))
                new_figures, new_tables = _export_figures_and_tables(
                    conv_res, export_figures, export_tables
                )
                figures.extend(new_figures)
                tables.extend(new_tables)
            else:
                st.error(
                    f"❌ Échec de la conversion pour : `{conv_res.input.file}`"
                )
        progress_bar.progress((i + 1) / total_files)

    # Show the generated files.
    st.subheader("📂 Fichiers générés")
    for generated_file in generated_files:
        st.markdown(f"📄 **{generated_file.name}**")
        with open(generated_file, "r", encoding="utf-8") as f:
            content = f.read()
        st.text_area(
            f"Prévisualisation : {generated_file.name}",
            value=content,
            height=200,
        )

    # Show the extracted figures.
    if figures:
        st.subheader("🖼️ Figures extraites")
        for fig in figures:
            st.image(Image.open(fig), caption=fig.name)

    # Show the extracted tables.
    if tables:
        st.subheader("📋 Tableaux extraits")
        for table in tables:
            st.markdown(f"📄 **{table.name}**")
            table_df = pd.read_csv(table)
            st.dataframe(table_df)

    # Total elapsed time.
    total_time = time.time() - start_time
    st.success(f"✅ Conversion terminée en {int(total_time)} secondes !")


# Streamlit user interface.
st.title("📊 Docling document converter ")
st.subheader("📤 Téléchargez un, ou plusieurs document pour commencer le traitement")

uploaded_files = st.file_uploader(
    "Sélectionnez vos fichiers (PDF, DOCX, PPTX, HTML, Images)",
    accept_multiple_files=True,
)
use_ocr = st.checkbox("👁️‍🗨️ Activer l'OCR", value=True)
export_figures = st.checkbox("🖼️ Exporter les images", value=True)
export_tables = st.checkbox("📋 Exporter les tableaux", value=True)
ocr_engine = st.selectbox(
    "Moteur OCR", ["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"]
)
# Strip whitespace so "eng, fra" yields ["eng", "fra"] rather than " fra".
ocr_languages = _parse_ocr_languages(
    st.text_input("Langues OCR (ex : eng, fra)", "eng")
)
table_mode = st.selectbox("Mode des tableaux", ["ACCURATE", "FAST"])
export_formats = st.multiselect(
    "Formats d'exportation", ["json", "yaml", "md", "multimodal"], default=["md"]
)

if st.button("Convertir"):
    if uploaded_files:
        _run_conversion(
            uploaded_files,
            use_ocr,
            export_figures,
            export_tables,
            ocr_engine,
            table_mode,
            ocr_languages,
            export_formats,
        )
    else:
        st.error("❌ Veuillez télécharger au moins un fichier.")