# omarbajouk's picture
# Update app.py
# 59a0659 verified
"""
# VERSION EXC POUR NOTIFICATIONS
# RENOMMAGE : AAAAMMJJ-NOM_PRENOM-DI1-DI2
# DOUBLONS : SUFFIXES _X
# VERSION COMPLETE
"""
"""
!pip install gradio
!apt-get -qq install poppler-utils tesseract-ocr > /dev/null
!pip install -q pdf2image pytesseract
"""
import gradio as gr
import os
import shutil
import zipfile
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import re
from datetime import datetime
# Radio label that selects filing by session date; any other value files by name.
_MODE_BY_DATE = "Par date de séance (AAAAMMJJ)"

# Salutation followed by two to four name tokens (whitespace may include newlines).
# NOTE(review): "Morveuwer" looks like a misspelling of the Dutch "Mevrouw" —
# confirm with real scans before changing the pattern.
_TITLE_PATTERN = re.compile(
    r"(?:Monsieur|Madame|Morveuwer|De heer)\s+((?:[\w\-éèêëàâäîïôöùûüç']+\s+){1,3}[\w\-éèêëàâäîïôöùûüç']+)",
    re.IGNORECASE,
)
# "SEANCE du DD/MM/YYYY" (or with dashes).
_SEANCE_PATTERN = re.compile(
    r"SEANCE\s+du\s+(\d{2})[\/\-](\d{2})[\/\-](\d{4})", re.IGNORECASE
)
# "n.réf: das/.../YYYY/NNN" -> (DI1, DI2).
_REF_PATTERN = re.compile(
    r"n\.réf\s*[:\-]?\s*das\/(?:[\w]+\/)*(\d{4})\/(\d+)", re.IGNORECASE
)
# Organisational words that the name pattern tends to swallow after the name.
_NAME_NOISE_PATTERN = re.compile(
    r"(_?DEPARTEMENT|_?ACTION|_?DIRECTION|_?SERVICE|_?UNITE|_?DIVISION)+",
    re.IGNORECASE,
)
_LOG_HEADER = [
    "original_filename",
    "new_filename",
    "date_folder",
    "nom_prenom",
    "ref",
    "date_séance",
]


def _stage_uploads(files, pdf_folder):
    """Copy uploaded PDFs (or extract uploaded ZIPs) into *pdf_folder*."""
    if files is None:
        uploads = []
    elif isinstance(files, (list, tuple)):
        uploads = files
    else:
        uploads = [files]
    for upload in uploads:
        # Accept both objects exposing ``.name`` and plain path strings.
        path = getattr(upload, "name", upload)
        if zipfile.is_zipfile(path):
            with zipfile.ZipFile(path, "r") as archive:
                archive.extractall(pdf_folder)
        else:
            shutil.copy(path, pdf_folder)


def _parse_notification(text):
    """Extract ``(safe_name, date_parts, di1, di2)`` from OCR text.

    ``safe_name`` is ``None`` when no usable name was found, ``date_parts`` is
    ``None`` or a ``(day, month, year)`` tuple of strings, and ``di1``/``di2``
    default to ``"0000"`` when no reference is present.
    """
    safe_name = None
    name_match = _TITLE_PATTERN.search(text)
    if name_match:
        cleaned = re.sub(r"[^\w]", "_", name_match.group(1).strip())
        cleaned = re.sub(r"_+", "_", cleaned)
        cleaned = _NAME_NOISE_PATTERN.sub("", cleaned).strip("_")
        # A name made only of noise words collapses to "", i.e. no name.
        safe_name = cleaned or None

    date_match = _SEANCE_PATTERN.search(text)
    date_parts = date_match.groups() if date_match else None

    ref_match = _REF_PATTERN.search(text)
    di1, di2 = ref_match.groups() if ref_match else ("0000", "0000")
    return safe_name, date_parts, di1, di2


def _reserve_filename(folder, base_filename, used_filenames):
    """Return a ``.pdf`` name for *base_filename* that is free inside *folder*.

    The first occurrence keeps the bare name; later ones get ``_1``, ``_2``...
    Names already present on disk (e.g. extracted from a ZIP) are skipped too.
    """
    counter = used_filenames.get(base_filename, 0)
    while True:
        suffix = "" if counter == 0 else f"_{counter}"
        candidate = f"{base_filename}{suffix}.pdf"
        counter += 1
        if not os.path.exists(os.path.join(folder, candidate)):
            break
    used_filenames[base_filename] = counter
    return candidate


def process_pdfs(files, classement_mode):
    """OCR, rename and file the uploaded notification PDFs.

    Each top-level PDF in the working folder has its first page OCR'd; the
    recipient name, session date and DI reference are extracted and the file
    is renamed ``AAAAMMJJ-NOM_PRENOM-DI1-DI2.pdf`` (``_n`` suffix on
    duplicates). Files go into a per-date or per-name sub-folder depending on
    *classement_mode*; files lacking the sorting key, or failing to process,
    go to ``erreurs``.

    Args:
        files: One upload or a list of uploads (objects with a ``.name`` path
            attribute, or path strings); ``None`` is treated as no upload.
            ZIP archives are extracted, other files are copied as-is.
        classement_mode: The date-mode radio label selects filing by session
            date; any other value selects filing by extracted name.

    Returns:
        A tuple ``(zip_path, csv_log_path, markdown_report)``. The working
        folder, the CSV log and the ZIP are written to the current working
        directory under fixed names and overwritten on every call.
    """
    import csv  # local import: the file's import header does not provide it

    pdf_folder = "pdf_folder"
    output_log = "rename_log.csv"
    errors_folder = os.path.join(pdf_folder, "erreurs")

    # Start from a clean working folder on every run.
    if os.path.exists(pdf_folder):
        shutil.rmtree(pdf_folder)
    os.makedirs(pdf_folder, exist_ok=True)
    os.makedirs(errors_folder, exist_ok=True)

    _stage_uploads(files, pdf_folder)

    log_rows = [_LOG_HEADER]
    processed_files = []
    error_files = []
    used_filenames = {}

    for filename in os.listdir(pdf_folder):
        filepath = os.path.join(pdf_folder, filename)
        if not filename.lower().endswith(".pdf") or not os.path.isfile(filepath):
            continue
        try:
            images = convert_from_path(filepath, first_page=1, last_page=1)
            text = pytesseract.image_to_string(images[0], lang='fra+eng')
            safe_name, date_parts, di1, di2 = _parse_notification(text)

            name_token = safe_name or "NO_NAME"
            if date_parts:
                day, month, year = date_parts
                date_str = f"{year}{month}{day}"
                date_formatted = f"{day}/{month}/{year}"
            else:
                date_str = "NO_DATE"
                date_formatted = "NA"

            # Only the folder for the selected mode is created, so the other
            # mode no longer leaves empty directories in the archive.
            if classement_mode == _MODE_BY_DATE:
                sort_key = date_str if date_parts else None
            else:
                sort_key = safe_name
            folder_path = (
                os.path.join(pdf_folder, sort_key) if sort_key else errors_folder
            )
            os.makedirs(folder_path, exist_ok=True)

            base_filename = f"{date_str}-{name_token}-{di1}-{di2}"
            new_filename = _reserve_filename(folder_path, base_filename, used_filenames)
            os.rename(filepath, os.path.join(folder_path, new_filename))

            # Log columns come from the extracted values, not from re-parsing
            # the filename, so duplicate suffixes cannot leak into them.
            log_rows.append([
                filename,
                new_filename,
                date_str,
                name_token.replace("_", " "),
                f"{di1}/{di2}",
                date_formatted,
            ])
            processed_files.append(f"✅ {filename} → {new_filename}")
        except Exception as e:
            # Per-file best effort: one bad PDF must not abort the whole batch.
            if os.path.exists(filepath):
                shutil.move(filepath, os.path.join(errors_folder, filename))
            log_rows.append([
                filename,
                f"ERROR:{str(e).replace(',', ';')}",
                "NO_DATE",
                "NA",
                "NA",
                "NA",
            ])
            error_files.append(f"❌ {filename} (Erreur: {str(e)})")

    # The csv module quotes commas and newlines in filenames or error text.
    with open(output_log, "w", encoding="utf-8", newline="") as f:
        csv.writer(f, lineterminator="\n").writerows(log_rows)

    shutil.make_archive("renamed_pdfs", 'zip', pdf_folder)

    if processed_files or error_files:
        last_files = "\n".join(processed_files[-5:] + error_files[-3:])
    else:
        last_files = "Aucun fichier traité"
    report = (
        "\n**Traitement terminé !**\n"
        f"- Fichiers traités : {len(log_rows) - 1}\n"
        f"- Avec succès : {len(processed_files)}\n"
        f"- En erreur : {len(error_files)}\n"
        "- Derniers fichiers :\n"
        f"{last_files}\n"
    )
    return "renamed_pdfs.zip", output_log, report
# Builds the Markdown documentation shown at the top of the interface.
def afficher_doc():
    """Return the user-facing Markdown help text (French) for the tool."""
    doc_lines = (
        "",
        "# 📄 Documentation - Traitement des Notifications du Comité de l’Action Sociale",
        "Cet outil vous permet de :",
        "- 🧠 **Extraire automatiquement** le **nom**, la **référence** et la **date de séance** des notifications PDF (scans)",
        "- ✅ Valable uniquement pour les documents contenant : `Monsieur`, `Madame`, `Morveuwer`, `De heer`",
        '- 📅 Le classement repose sur la mention **"SEANCE du"**',
        "- 🆔 Extraction de la **référence DI (interne)**",
        "- 📄 Analyse uniquement de la **1ère page (moitié supérieure)** du document",
        "- 🗂️ **Renommer les fichiers** selon le format : `AAAAMMJJ-NOM_PRENOM-DI1-DI2.pdf`",
        "- 📆 **Classer automatiquement** les fichiers dans des dossiers selon la **date de séance** (format : `AAAAMMJJ`)",
        "- 🔀 **Gérer les doublons** en ajoutant un suffixe `_x` si un nom existe déjà",
        "- 📾 **Générer un fichier CSV de log** pour le suivi des traitements",
        "👤 *Conçu pour les collaborateurs du CPAS Bruxelles*",
        "📬 Contact : [omar.bajouk@cpasbxl.brussels](mailto:omar.bajouk@cpasbxl.brussels)",
        "",
    )
    # The empty first and last entries reproduce the leading and trailing newline.
    return "\n".join(doc_lines)
# Gradio interface: documentation header, upload widget and filing-mode
# selector, a trigger button, then the two downloads and a Markdown report.
# The step labels are numbered 1-4 (the outputs previously repeated "2.").
# NOTE(review): the source lost its indentation, so the nesting of the first
# row (file input + radio side by side) is reconstructed — confirm the layout.
with gr.Blocks(title="Renommer les notifications et trier par date du séance") as demo:
    gr.Markdown(afficher_doc())
    with gr.Row():
        input_files = gr.File(
            label="1. ZIP ou plusieurs PDFs",
            file_types=[".zip", ".pdf"],
            file_count="multiple",
        )
        class_option = gr.Radio(
            choices=[
                "Par date de séance (AAAAMMJJ)",
                "Par nom/référence (NOM_PRENOM-DI1-DI2)",
            ],
            label="2. Choisir le mode de classement",
            value="Par date de séance (AAAAMMJJ)",
        )
    btn_process = gr.Button("🚀 Traiter les fichiers")
    with gr.Row():
        output_zip = gr.File(label="3. PDFs Renommés (ZIP)")
        output_log = gr.File(label="4. Fichier Log (CSV)")
    output_report = gr.Markdown()
    # process_pdfs returns (zip path, csv path, report) in this output order.
    btn_process.click(
        fn=process_pdfs,
        inputs=[input_files, class_option],
        outputs=[output_zip, output_log, output_report],
    )
demo.launch(share=True)