# HuggingFace Space: PDF text extraction, OCR annotation, Gemini-based data
# extraction, and DOCX report generation via a Gradio interface.
# Standard library
import hmac
import json
import logging
import os
import shutil
import zipfile
from typing import Any, Dict, List, Union  # typing imports required below

# Third-party
import cv2
import fitz
import google.generativeai as genai
import gradio as gr
import numpy as np
import pytesseract
from docx import Document
from docx.enum.section import WD_SECTION
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Pt, RGBColor, Inches
from pdf2image import convert_from_path
from PIL import Image
from pytesseract import Output

# Local helpers
from helpers.rapport_generator import RapportGenerator
from helpers.text_extraction import *
def authenticate(username, password):
    """Validate Gradio login credentials against environment variables.

    Compares the submitted credentials with the ``HF_USERNAME`` and
    ``HF_PASSWORD`` environment variables using ``hmac.compare_digest``
    (constant-time) so an attacker cannot learn a prefix of the secret
    through response-timing differences.

    Args:
        username: Username submitted through the Gradio login form.
        password: Password submitted through the Gradio login form.

    Returns:
        bool: True only when both credentials match their env-var values.
        False when either env var is unset (auth not configured).
    """
    expected_user = os.getenv("HF_USERNAME")
    expected_pass = os.getenv("HF_PASSWORD")
    if expected_user is None or expected_pass is None:
        # Credentials not configured on the Space -> reject every login.
        return False
    # Encode to bytes: compare_digest on str requires ASCII-only input,
    # while bytes comparison accepts any UTF-8 credential.
    user_ok = hmac.compare_digest(username.encode("utf-8"), expected_user.encode("utf-8"))
    pass_ok = hmac.compare_digest(password.encode("utf-8"), expected_pass.encode("utf-8"))
    return user_ok and pass_ok
# Main Processing Function
def process_pdf(pdf_file):
    """Run the full extraction pipeline on an uploaded PDF.

    Pipeline: rasterize pages, OCR-annotate each page image, bundle the
    annotated images into a ZIP, extract structured data with Gemini,
    then render a DOCX report from the extracted JSON.

    Args:
        pdf_file: Path to the uploaded PDF. The input component is declared
            with ``type="filepath"``, so Gradio passes a plain ``str``; a
            file-like object exposing ``.name`` is also accepted for
            compatibility with older Gradio versions.

    Returns:
        tuple: (extracted text path, annotated-images ZIP path,
        extracted-data JSON path, DOCX report path) — one per UI output.

    Raises:
        gr.Error: Wrapping any exception raised during processing so the
            failure is shown in the Gradio UI.
    """
    # BUG FIX: gr.File(type="filepath") yields a str, which has no .name
    # attribute; the original `pdf_file.name` raised AttributeError.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    template_dir = os.path.join(os.getcwd(), "templates")
    temp_dir = os.path.join(os.getcwd(), "temp_processing")
    output_dir = os.path.join(temp_dir, 'output_images')

    # Start from a clean working directory on every run.
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(output_dir, exist_ok=True)

    path_to_data_to_extract = os.path.join(template_dir, "data_to_extract.json")
    # Written page-by-page by save_extracted_text() below — TODO confirm
    # the helper uses this exact filename.
    text_file_path = os.path.join(output_dir, 'extracted_text.txt')

    try:
        # Convert PDF to images and process each page
        images = convert_from_path(pdf_path)
        annotated_images = []
        for i, img in enumerate(images):
            temp_img_path = os.path.join(temp_dir, f'temp_page_{i}.png')
            img.save(temp_img_path)
            blocks, annotated_image_path = process_image(temp_img_path, output_dir, i)
            annotated_images.append(annotated_image_path)
            save_extracted_text(blocks, i + 1, output_dir)

        # Bundle all annotated page images into a single ZIP
        zip_path = os.path.join(temp_dir, "annotated_images.zip")
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for img_path in annotated_images:
                zipf.write(img_path, os.path.basename(img_path))

        # Extract structured fields from the OCR text with Gemini
        extracted_data = extract_data_with_gemini(text_file_path, path_to_data_to_extract)

        # Persist the extracted data as UTF-8 JSON (accents preserved)
        json_path = os.path.join(temp_dir, "extracted_data.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(extracted_data, f, ensure_ascii=False, indent=2)

        # Generate the DOCX report from the extracted JSON
        docx_path = os.path.join(temp_dir, "rapport_extraction.docx")
        generator = RapportGenerator(json_path, docx_path)
        generator.generate_report()

        return text_file_path, zip_path, json_path, docx_path
    except Exception as e:
        # Surface the failure in the Gradio UI instead of a silent 500.
        raise gr.Error(f"Error processing PDF: {str(e)}")
# Gradio Interface
# Custom CSS injected into the app: sets the app-wide font and styles
# buttons with a purple-to-pink gradient.
css = """
.gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
}
.gr-button {
    color: white;
    border-radius: 8px;
    background: linear-gradient(45deg, #7928CA, #FF0080);
    border: none;
}
"""
# Interface wiring: one PDF upload in, four downloadable artifacts out
# (the four return values of process_pdf, in order).
demo = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(
            label="Télécharger un document PDF",
            file_types=[".pdf"],
            # "filepath" makes Gradio pass the upload as a plain str path.
            type="filepath"
        )
    ],
    outputs=[
        gr.File(label="Texte extrait (TXT)"),
        gr.File(label="Images annotées (ZIP)"),
        gr.File(label="Données extraites (JSON)"),
        gr.File(label="Rapport généré (DOCX)")  # New output
    ],
    title="Extraction de texte PDF et création d'un rapport DOCX",
    description="""
    Téléchargez un document PDF pour :
    1. Extraire le contenu textuel
    2. Obtenir des images annotées montrant les blocs de texte détectés
    3. Extraire des données structurées grâce à une analyse IA
    4. Générer un rapport formaté au format DOCX
    Prend en charge les documents multi-pages et les documents juridiques français.
    """,
    css=css,
    examples=[],
    cache_examples=False,
    theme=gr.themes.Soft()
)
# Launch the app
if __name__ == "__main__":
    # BUG FIX: the original chained `.launch()` a second time on the return
    # value of demo.launch(), which is not a Blocks object and would fail
    # (or attempt a duplicate launch). A single launch() call is correct.
    demo.launch(
        debug=False,
        auth=authenticate  # gate the UI behind the env-var credentials
    )