import datetime
import json
import os
import time
from pathlib import Path
from typing import List

import pandas as pd
import streamlit as st
import yaml
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionStatus
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    AcceleratorOptions,
    AcceleratorDevice,
    TableStructureOptions,
    TableFormerMode,
    EasyOcrOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
    RapidOcrOptions,
    OcrMacOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import PictureItem, TableItem
from PIL import Image
from PyPDF2 import PdfReader

# Output directories: everything the app writes lives under ./output
# (relative to the current working directory).
OUTPUT_DIR = Path("output")
FIGURES_DIR = OUTPUT_DIR / "figures"
TABLES_DIR = OUTPUT_DIR / "tables"

# Create the parent first so the sub-directories can be created without
# parents=True; existing directories are left untouched.
for _output_subdir in (OUTPUT_DIR, FIGURES_DIR, TABLES_DIR):
    _output_subdir.mkdir(exist_ok=True)

# File validity check
def is_valid_file(file_path):
    """Tell whether ``file_path`` has one of the supported extensions.

    The suffix is compared case-insensitively against the supported list.
    When the suffix is not supported, a Streamlit error naming the suffix is
    displayed. Any exception raised during the check is reported through a
    Streamlit error as well.

    Args:
        file_path: Object exposing a ``suffix`` attribute (e.g. a
            ``pathlib.Path``).

    Returns:
        ``True`` if the suffix is supported, ``False`` otherwise (including
        when the check itself raised).
    """
    supported_suffixes = [".pdf", ".docx", ".pptx", ".html", ".png", ".jpg"]
    try:
        is_supported = file_path.suffix.lower() in supported_suffixes
        if not is_supported:
            st.error(f"❌ Format non supporté : {file_path.suffix}")
        return is_supported
    except Exception as e:
        # Deliberate catch-all: a failed check is reported, not propagated.
        st.error(f"❌ Erreur lors de la vérification du fichier : {e}")
        return False

# Build the configured document converter
def create_document_converter(
    use_ocr: bool,
    export_figures: bool,
    export_tables: bool,
    accelerator: str,
    ocr_engine: str,
    table_mode: str,
    ocr_languages: List[str],
) -> DocumentConverter:
    """Assemble a ``DocumentConverter`` from the user-selected options.

    Args:
        use_ocr: Passed as ``do_ocr`` to the PDF pipeline options.
        export_figures: Passed as ``generate_picture_images``.
        export_tables: Passed as ``generate_table_images``.
        accelerator: Member name of ``AcceleratorDevice``; upper-cased before
            the lookup.
        ocr_engine: One of ``"easyocr"``, ``"tesseract_cli"``,
            ``"tesserocr"``, ``"rapidocr"`` or ``"ocrmac"``.
        table_mode: Member name of ``TableFormerMode``; upper-cased before
            the lookup.
        ocr_languages: Language codes handed to the OCR options as ``lang``.

    Returns:
        A converter that allows PDF, DOCX, PPTX, HTML and image inputs. The
        pipeline options built here are attached to the PDF format only.

    Raises:
        ValueError: If ``ocr_engine`` is not one of the supported names.
    """
    device = AcceleratorDevice[accelerator.upper()]
    accelerator_options = AcceleratorOptions(num_threads=8, device=device)

    tableformer_mode = TableFormerMode[table_mode.upper()]
    table_structure_options = TableStructureOptions(
        mode=tableformer_mode,
        do_cell_matching=True,
    )

    # Map each engine name to the options class that configures it.
    ocr_option_classes = {
        "easyocr": EasyOcrOptions,
        "tesseract_cli": TesseractCliOcrOptions,
        "tesserocr": TesseractOcrOptions,
        "rapidocr": RapidOcrOptions,
        "ocrmac": OcrMacOptions,
    }
    ocr_options_cls = ocr_option_classes.get(ocr_engine)
    if ocr_options_cls is None:
        raise ValueError(f"Moteur OCR non pris en charge : {ocr_engine}")
    ocr_options = ocr_options_cls(lang=ocr_languages)

    # Page images are always generated; picture/table images follow the flags.
    pipeline_options = PdfPipelineOptions(
        do_ocr=use_ocr,
        generate_page_images=True,
        generate_picture_images=export_figures,
        generate_table_images=export_tables,
        accelerator_options=accelerator_options,
        table_structure_options=table_structure_options,
        ocr_options=ocr_options,
    )

    accepted_formats = [
        InputFormat.PDF,
        InputFormat.DOCX,
        InputFormat.PPTX,
        InputFormat.HTML,
        InputFormat.IMAGE,
    ]
    pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
    return DocumentConverter(
        allowed_formats=accepted_formats,
        format_options={InputFormat.PDF: pdf_format_option},
    )

# Streamlit user interface


def _save_uploads(uploaded_files) -> List[Path]:
    """Write the supported uploads into OUTPUT_DIR and return their paths."""
    saved_paths = []
    for uploaded_file in uploaded_files:
        # The name is supplied by the browser: keep only its last component
        # so the file cannot be written outside OUTPUT_DIR.
        file_path = OUTPUT_DIR / Path(uploaded_file.name).name
        # Validate before writing so unsupported files never reach the disk.
        if not is_valid_file(file_path):
            continue
        # getvalue() returns the whole buffer whatever the read position is.
        data = uploaded_file.getvalue()
        file_path.write_bytes(data)
        st.write(f"📥 Fichier reçu : `{file_path.name}` ({len(data)} octets)")
        saved_paths.append(file_path)
    return saved_paths


def _export_document(conv_res, formats: List[str]) -> List[Path]:
    """Write the converted document in each requested format.

    Only files that were actually written are returned, so every returned
    path can be opened for the preview.
    """
    written_files = []
    stem = conv_res.input.file.stem
    for fmt in formats:
        output_file = OUTPUT_DIR / f"{stem}.{fmt}"
        if fmt == "md":
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(conv_res.document.export_to_markdown())
        elif fmt == "json":
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(conv_res.document.export_to_dict(), f, ensure_ascii=False, indent=2)
        elif fmt == "yaml":
            with open(output_file, "w", encoding="utf-8") as f:
                yaml.dump(conv_res.document.export_to_dict(), f, allow_unicode=True)
        else:
            # No writer exists for this format (e.g. "multimodal"): report it
            # instead of listing a file that was never created.
            st.warning(f"⚠️ Format d'exportation non pris en charge : `{fmt}`")
            continue
        written_files.append(output_file)
    return written_files


def _export_items(conv_res, with_figures: bool, with_tables: bool):
    """Save the document's pictures as PNG and its tables as CSV.

    Files are numbered per document so that several figures or tables of the
    same document do not overwrite each other.

    Returns:
        A ``(figure_paths, table_paths)`` pair of lists.
    """
    stem = conv_res.input.file.stem
    figure_paths = []
    table_paths = []
    for element, _ in conv_res.document.iterate_items():
        if with_figures and isinstance(element, PictureItem):
            # NOTE(review): the image is presumably unset when picture images
            # were not generated for this input - skip rather than crash.
            image_ref = getattr(element, "image", None)
            if image_ref is None:
                continue
            fig_path = FIGURES_DIR / f"{stem}_figure_{len(figure_paths) + 1}.png"
            image_ref.pil_image.save(fig_path)
            figure_paths.append(fig_path)
        elif with_tables and isinstance(element, TableItem):
            table_path = TABLES_DIR / f"{stem}_table_{len(table_paths) + 1}.csv"
            element.export_to_dataframe().to_csv(table_path, index=False)
            table_paths.append(table_path)
    return figure_paths, table_paths


def _process_uploads(
    uploaded_files,
    *,
    use_ocr: bool,
    export_figures: bool,
    export_tables: bool,
    ocr_engine: str,
    table_mode: str,
    ocr_languages: List[str],
    export_formats: List[str],
) -> None:
    """Convert the uploaded files and display everything that was produced."""
    start_time = time.time()  # Start of the timing

    input_paths = _save_uploads(uploaded_files)
    if not input_paths:
        # Nothing left after validation: do not build a converter for nothing.
        st.error("❌ Aucun fichier valide à convertir.")
        return

    generated_files = []
    figures = []
    tables = []
    # Progress is measured against the files actually converted, so the bar
    # reaches 100% even when some uploads were rejected.
    total_files = len(input_paths)

    converter = create_document_converter(
        use_ocr,
        export_figures,
        export_tables,
        accelerator="cpu",
        ocr_engine=ocr_engine,
        table_mode=table_mode,
        ocr_languages=ocr_languages,
    )

    progress_bar = st.progress(0)
    status_placeholder = st.empty()

    for i, file_path in enumerate(input_paths):
        status_placeholder.info(
            f"🔄 Traitement de `{file_path.name}` ({i + 1}/{total_files})"
        )

        for conv_res in converter.convert_all([file_path], raises_on_error=False):
            if conv_res.status != ConversionStatus.SUCCESS:
                st.error(f"❌ Échec de la conversion pour : `{conv_res.input.file}`")
                continue

            st.success(f"✅ Conversion réussie : `{conv_res.input.file}`")
            generated_files.extend(_export_document(conv_res, export_formats))
            new_figures, new_tables = _export_items(conv_res, export_figures, export_tables)
            figures.extend(new_figures)
            tables.extend(new_tables)

        progress_bar.progress((i + 1) / total_files)

    # Generated files
    st.subheader("📂 Fichiers générés")
    for idx, generated_file in enumerate(generated_files):
        st.markdown(f"📄 **{generated_file.name}**")
        with open(generated_file, "r", encoding="utf-8") as f:
            content = f.read()
        # Explicit key: two inputs sharing a stem would otherwise produce two
        # widgets with the same label.
        st.text_area(
            f"Prévisualisation : {generated_file.name}",
            value=content,
            height=200,
            key=f"preview_{idx}",
        )

    # Extracted figures
    if figures:
        st.subheader("🖼️ Figures extraites")
        for fig in figures:
            st.image(Image.open(fig), caption=fig.name)

    # Extracted tables
    if tables:
        st.subheader("📋 Tableaux extraits")
        for table in tables:
            st.markdown(f"📄 **{table.name}**")
            table_df = pd.read_csv(table)
            st.dataframe(table_df)

    # Total elapsed time
    total_time = time.time() - start_time
    st.success(f"✅ Conversion terminée en {int(total_time)} secondes !")


st.title("📊 Docling document converter ")
st.subheader("📤 Téléchargez un, ou plusieurs document pour commencer le traitement")

uploaded_files = st.file_uploader(
    "Sélectionnez vos fichiers (PDF, DOCX, PPTX, HTML, Images)", accept_multiple_files=True
)
use_ocr = st.checkbox("👁️‍🗨️ Activer l'OCR", value=True)
export_figures = st.checkbox("🖼️ Exporter les images", value=True)
export_tables = st.checkbox("📋 Exporter les tableaux", value=True)
ocr_engine = st.selectbox("Moteur OCR", ["easyocr", "tesseract_cli", "tesserocr", "rapidocr", "ocrmac"])
# Strip each code and drop empty entries so that "eng, fra" yields
# ["eng", "fra"] rather than ["eng", " fra"].
ocr_languages = [
    lang.strip()
    for lang in st.text_input("Langues OCR (ex : eng, fra)", "eng").split(",")
    if lang.strip()
]
table_mode = st.selectbox("Mode des tableaux", ["ACCURATE", "FAST"])
export_formats = st.multiselect(
    "Formats d'exportation", ["json", "yaml", "md", "multimodal"], default=["md"]
)

if st.button("Convertir"):
    if uploaded_files:
        _process_uploads(
            uploaded_files,
            use_ocr=use_ocr,
            export_figures=export_figures,
            export_tables=export_tables,
            ocr_engine=ocr_engine,
            table_mode=table_mode,
            ocr_languages=ocr_languages,
            export_formats=export_formats,
        )
    else:
        st.error("❌ Veuillez télécharger au moins un fichier.")