Spaces:
Sleeping
Sleeping
| """ | |
| # VERSION EXC POUR NOTIFICATIONS | |
| # RENOMMAGE : AAAAMMJJ-NOM_PRENOM-DI1-DI2 | |
| # DOUBLONS : SUFFIXES _X | |
| # VERSION COMPLETE | |
| """ | |
| """" | |
| !pip install gradio | |
| !apt-get -qq install poppler-utils tesseract-ocr > /dev/null | |
| !pip install -q pdf2image pytesseract | |
| """ | |
import csv
import os
import re
import shutil
import zipfile
from datetime import datetime

import gradio as gr
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
# Radio-button value that selects classification by session date; any other
# value classifies by extracted name.
_MODE_BY_DATE = "Par date de séance (AAAAMMJJ)"

# Header row of the CSV log.
_LOG_HEADER = [
    "original_filename",
    "new_filename",
    "date_folder",
    "nom_prenom",
    "ref",
    "date_séance",
]

# Salutation followed by 2 to 4 name-like words. "Mevrouw" (Dutch) is accepted
# in addition to the historical spelling "Morveuwer", which is kept so that
# anything that matched before still matches.
_TITLE_PATTERN = re.compile(
    r"(?:Monsieur|Madame|Mevrouw|Morveuwer|De heer)\s+"
    r"((?:[\w\-éèêëàâäîïôöùûüç']+\s+){1,3}[\w\-éèêëàâäîïôöùûüç']+)",
    re.IGNORECASE,
)
# "SEANCE du DD/MM/YYYY" (or with dashes).
_SEANCE_PATTERN = re.compile(
    r"SEANCE\s+du\s+(\d{2})[\/\-](\d{2})[\/\-](\d{4})", re.IGNORECASE
)
# "n.réf : das/<...>/<4 digits>/<digits>".
_REF_PATTERN = re.compile(
    r"n\.réf\s*[:\-]?\s*das\/(?:[\w]+\/)*(\d{4})\/(\d+)", re.IGNORECASE
)
# Organisational words that the name capture may swallow after the real name.
_ORG_WORDS_PATTERN = re.compile(
    r"(_?DEPARTEMENT|_?ACTION|_?DIRECTION|_?SERVICE|_?UNITE|_?DIVISION)+",
    re.IGNORECASE,
)


def _upload_path(item):
    """Return the filesystem path of one uploaded item.

    Accepts either a plain path (``str`` / ``os.PathLike``) or an object
    exposing the path as ``.name`` (tempfile-style wrapper).
    """
    if isinstance(item, (str, os.PathLike)):
        return os.fspath(item)
    return item.name


def _stage_uploads(files, pdf_folder):
    """Copy uploads into ``pdf_folder``, extracting ZIP archives in place."""
    if files is None:
        return
    items = files if isinstance(files, (list, tuple)) else [files]
    for item in items:
        path = _upload_path(item)
        if zipfile.is_zipfile(path):
            with zipfile.ZipFile(path, "r") as archive:
                archive.extractall(pdf_folder)
        else:
            shutil.copy(path, pdf_folder)


def _extract_fields(text):
    """Extract ``(safe_name, date_parts, di1, di2)`` from OCR text.

    ``safe_name`` is ``None`` when no usable name is found; ``date_parts`` is
    ``(day, month, year)`` or ``None``; ``di1``/``di2`` default to ``"0000"``.
    """
    safe_name = None
    name_match = _TITLE_PATTERN.search(text)
    if name_match:
        cleaned = re.sub(r"[^\w]", "_", name_match.group(1).strip())
        cleaned = re.sub(r"_+", "_", cleaned)
        cleaned = _ORG_WORDS_PATTERN.sub("", cleaned)
        # A name made only of organisational words collapses to "": treat it
        # as "no name" instead of producing an empty folder component.
        safe_name = cleaned.strip("_") or None

    date_match = _SEANCE_PATTERN.search(text)
    date_parts = date_match.groups() if date_match else None

    ref_match = _REF_PATTERN.search(text)
    di1, di2 = ref_match.groups() if ref_match else ("0000", "0000")
    return safe_name, date_parts, di1, di2


def _unique_filename(folder, base, used):
    """Return ``base.pdf`` or the first free ``base_<n>.pdf`` in ``folder``.

    ``used`` maps a base name to the next suffix counter to try, so repeated
    bases within one run get increasing suffixes; the on-disk check also
    protects against a file that happens to carry that name already.
    """
    counter = used.get(base, 0)
    while True:
        candidate = f"{base}.pdf" if counter == 0 else f"{base}_{counter}.pdf"
        counter += 1
        if not os.path.exists(os.path.join(folder, candidate)):
            used[base] = counter
            return candidate


def process_pdfs(files, classement_mode):
    """OCR the first page of each uploaded PDF, then rename and classify it.

    Args:
        files: One upload or a list of uploads (PDFs and/or ZIP archives).
            Each upload is a path or an object whose ``.name`` is the path.
            ``None`` is treated as "nothing uploaded".
        classement_mode: ``"Par date de séance (AAAAMMJJ)"`` to file by
            session date; any other value files by extracted name.

    Returns:
        Tuple ``(zip_path, log_path, report)``: the archive of the working
        folder, the CSV log and a Markdown summary.

    Side effects:
        Recreates ``pdf_folder`` in the current directory and overwrites
        ``rename_log.csv`` and ``renamed_pdfs.zip`` there.
    """
    pdf_folder = "pdf_folder"
    output_log = "rename_log.csv"
    errors_folder = os.path.join(pdf_folder, "erreurs")

    # Start every run from an empty working folder.
    if os.path.exists(pdf_folder):
        shutil.rmtree(pdf_folder)
    os.makedirs(errors_folder, exist_ok=True)

    _stage_uploads(files, pdf_folder)

    log_rows = [_LOG_HEADER]
    processed_files = []
    error_files = []
    used_filenames = {}

    # NOTE(review): only PDFs at the top level of pdf_folder are processed;
    # PDFs nested in sub-folders of an uploaded ZIP are archived unchanged.
    # Sorted so that duplicate suffixes are assigned deterministically.
    for filename in sorted(os.listdir(pdf_folder)):
        filepath = os.path.join(pdf_folder, filename)
        if not filename.lower().endswith(".pdf") or not os.path.isfile(filepath):
            continue
        try:
            pages = convert_from_path(filepath, first_page=1, last_page=1)
            text = pytesseract.image_to_string(pages[0], lang="fra+eng")
            safe_name, date_parts, di1, di2 = _extract_fields(text)

            if date_parts:
                day, month, year = date_parts
                date_str = f"{year}{month}{day}"
                date_csv = f"{day}/{month}/{year}"
            else:
                date_str = "NO_DATE"
                date_csv = "NA"
            display_name = safe_name or "NO_NAME"

            # Files lacking the classification key go to the errors folder.
            if classement_mode == _MODE_BY_DATE:
                key = date_str if date_parts else None
            else:
                key = safe_name
            folder_path = os.path.join(pdf_folder, key) if key else errors_folder
            os.makedirs(folder_path, exist_ok=True)

            base_filename = f"{date_str}-{display_name}-{di1}-{di2}"
            new_filename = _unique_filename(folder_path, base_filename, used_filenames)
            shutil.move(filepath, os.path.join(folder_path, new_filename))

            # CSV fields come from the extracted values, not from re-parsing
            # the filename, so duplicate suffixes never leak into them.
            log_rows.append([
                filename,
                new_filename,
                date_str,
                display_name.replace("_", " "),
                f"{di1}/{di2}",
                date_csv,
            ])
            processed_files.append(f"✅ {filename} → {new_filename}")
        except Exception as exc:
            # Deliberately broad: conversion/OCR can fail in many ways and one
            # bad file must not abort the whole batch.
            if os.path.isfile(filepath):
                shutil.move(filepath, os.path.join(errors_folder, filename))
            log_rows.append([filename, f"ERROR:{exc}", "NO_DATE", "NA", "NA", "NA"])
            error_files.append(f"❌ {filename} (Erreur: {exc})")

    # csv.writer quotes fields containing commas, quotes or newlines.
    with open(output_log, "w", encoding="utf-8", newline="") as handle:
        csv.writer(handle, lineterminator="\n").writerows(log_rows)

    shutil.make_archive("renamed_pdfs", "zip", pdf_folder)

    if processed_files or error_files:
        last_files = "\n".join(processed_files[-5:] + error_files[-3:])
    else:
        last_files = "Aucun fichier traité"
    report = "\n".join([
        "",
        "**Traitement terminé !**",
        f"- Fichiers traités : {len(log_rows) - 1}",
        f"- Avec succès : {len(processed_files)}",
        f"- En erreur : {len(error_files)}",
        "- Derniers fichiers :",
        last_files,
        "",
    ])
    return "renamed_pdfs.zip", output_log, report
def afficher_doc():
    """Return the Markdown help text displayed at the top of the interface."""
    doc_lines = (
        "",
        "# 📄 Documentation - Traitement des Notifications du Comité de l’Action Sociale",
        "Cet outil vous permet de :",
        "- 🧠 **Extraire automatiquement** le **nom**, la **référence** et la **date de séance** des notifications PDF (scans)",
        "- ✅ Valable uniquement pour les documents contenant : `Monsieur`, `Madame`, `Morveuwer`, `De heer`",
        "- 📅 Le classement repose sur la mention **\"SEANCE du\"**",
        "- 🆔 Extraction de la **référence DI (interne)**",
        "- 📄 Analyse uniquement de la **1ère page (moitié supérieure)** du document",
        "- 🗂️ **Renommer les fichiers** selon le format : `AAAAMMJJ-NOM_PRENOM-DI1-DI2.pdf`",
        "- 📆 **Classer automatiquement** les fichiers dans des dossiers selon la **date de séance** (format : `AAAAMMJJ`)",
        "- 🔀 **Gérer les doublons** en ajoutant un suffixe `_x` si un nom existe déjà",
        "- 📾 **Générer un fichier CSV de log** pour le suivi des traitements",
        "👤 *Conçu pour les collaborateurs du CPAS Bruxelles*",
        "📬 Contact : [omar.bajouk@cpasbxl.brussels](mailto:omar.bajouk@cpasbxl.brussels)",
        "",
    )
    # Leading and trailing empty entries reproduce the newline right after the
    # opening and right before the closing triple quote of a block literal.
    return "\n".join(doc_lines)
# Gradio interface: documentation banner, upload + classification inputs,
# a run button, then the ZIP / CSV downloads and the Markdown report.
# NOTE(review): the row grouping below is reconstructed from a source whose
# indentation was lost — confirm against the intended layout.
with gr.Blocks(title="Renommer les notifications et trier par date du séance") as demo:
    gr.Markdown(afficher_doc())

    with gr.Row():
        input_files = gr.File(
            label="1. ZIP ou plusieurs PDFs",
            file_types=[".zip", ".pdf"],
            file_count="multiple",
        )
        # The first choice must stay identical to the string process_pdfs
        # compares against; any other value selects classification by name.
        class_option = gr.Radio(
            choices=[
                "Par date de séance (AAAAMMJJ)",
                "Par nom/référence (NOM_PRENOM-DI1-DI2)",
            ],
            label="2. Choisir le mode de classement",
            value="Par date de séance (AAAAMMJJ)",
        )

    btn_process = gr.Button("🚀 Traiter les fichiers")

    with gr.Row():
        # Step numbers continue after the two inputs (previously "2." was
        # used twice, once for the mode selector and once for the ZIP output).
        output_zip = gr.File(label="3. PDFs Renommés (ZIP)")
        output_log = gr.File(label="4. Fichier Log (CSV)")

    output_report = gr.Markdown()

    # process_pdfs returns (zip path, log path, report) in this output order.
    btn_process.click(
        fn=process_pdfs,
        inputs=[input_files, class_option],
        outputs=[output_zip, output_log, output_report],
    )

demo.launch(share=True)