Spaces:
Running
Running
import gradio as gr | |
import google.generativeai as genai | |
import os | |
import json | |
import time | |
import base64 | |
import fitz # Importation correcte pour Hugging Face | |
from PIL import Image | |
import io | |
import tempfile | |
import shutil | |
# Configuration
# The Gemini API key is read from the environment (for example a Hugging Face
# Space secret) so that no credential is stored in the source file.
# NOTE: the key that used to be hard-coded here has been published with the
# source and should be revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
if not GOOGLE_API_KEY:
    print("WARNING: GOOGLE_API_KEY is not set; requests to Gemini will fail.")
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-1.5-flash')
# Interface text (English only).
# User-facing strings, grouped by purpose and looked up by key elsewhere in
# this file (e.g. TEXT["no_data"], TEXT["tabs"][...], TEXT["error"][...]).
TEXT = {
    "title": "Elixir - Document Intelligence",
    "description": "This demo showcases the capabilities of a generative AI model to interpret, understand, and classify any type of document WITHOUT CUSTOMIZATION. For developing a complete, precise, and defined pipeline, please contact martial@lexiapro.fr.",
    # Step-by-step usage instructions.
    "instructions": [
        "1. Upload a PDF document (1-10 pages) such as an invoice, regulatory document, report...",
        "2. Processing by Elixir",
        "3. Transcription of identified sections and elements (without customization)"
    ],
    # Labels for the upload widget, the analyze button and the preview gallery.
    "upload": "📂 Upload your document",
    "analyze": "🔍 Analyze document",
    "preview": "📄 Preview",
    # Titles of the result tabs / cards.
    "tabs": {
        "overview": "📋 Overview",
        "entities": "👥 Entities",
        "values": "💰 Values",
        "dates": "📅 Dates",
        "tables": "📊 Tables",
        "keypoints": "🔑 Key Points",
        "references": "🔗 References",
        "json": "📄 Complete JSON"
    },
    # Placeholder shown when a section has nothing to display.
    "no_data": "No information found",
    "processing": "Processing...",
    # Error messages returned to the user.
    "error": {
        "file_not_found": "File not found",
        "pdf_conversion": "Unable to convert PDF to image",
        "no_info": "No information extracted from PDF pages",
        "too_many_pages": "The PDF has more than 10 pages. Please upload a document with 10 pages or less."
    }
}
# Modern CSS - improved styling
CSS = """ | |
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap'); | |
:root { | |
--primary: #4f46e5; | |
--primary-light: #818cf8; | |
--primary-dark: #3730a3; | |
--secondary: #10b981; | |
--accent: #f59e0b; | |
--dark: #111827; | |
--light: #f9fafb; | |
--gray-50: #f8fafc; | |
--gray-100: #f1f5f9; | |
--gray-200: #e2e8f0; | |
--gray-300: #cbd5e1; | |
--gray-400: #94a3b8; | |
--gray-500: #64748b; | |
--text-primary: #1e293b; | |
--text-secondary: #475569; | |
--shadow-sm: 0 1px 2px 0 rgba(0, 0, 0, 0.05); | |
--shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06); | |
--shadow-md: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05); | |
--radius-sm: 0.25rem; | |
--radius: 0.5rem; | |
--radius-md: 0.75rem; | |
--radius-lg: 1rem; | |
} | |
body, .gradio-container { | |
font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
color: var(--text-primary); | |
background-color: var(--light); | |
line-height: 1.6; | |
} | |
/* Layout principal */ | |
.container { | |
max-width: 1300px; | |
margin: 0 auto; | |
padding: 0 1rem; | |
} | |
.main-content { | |
display: flex; | |
gap: 2rem; | |
align-items: flex-start; | |
} | |
.left-panel { | |
flex: 1; | |
} | |
.right-panel { | |
flex: 2; | |
} | |
/* En-tête */ | |
.header { | |
margin-bottom: 2rem; | |
padding: 0.75rem 1.25rem; | |
background: linear-gradient(135deg, var(--primary-light), var(--primary-dark)); | |
border-radius: var(--radius-lg); | |
box-shadow: var(--shadow-md); | |
position: relative; | |
overflow: hidden; | |
color: white; | |
height: 60px; | |
display: flex; | |
align-items: center; | |
justify-content: center; | |
} | |
.header::before { | |
content: ''; | |
position: absolute; | |
top: -50%; | |
left: -50%; | |
width: 200%; | |
height: 200%; | |
background: radial-gradient(circle, rgba(255,255,255,0.1) 0%, rgba(255,255,255,0) 60%); | |
animation: pulse 15s ease-in-out infinite; | |
z-index: 1; | |
} | |
@keyframes pulse { | |
0% { transform: scale(1); opacity: 0.5; } | |
50% { transform: scale(1.05); opacity: 0.8; } | |
100% { transform: scale(1); opacity: 0.5; } | |
} | |
.header img { | |
max-height: 40px !important; | |
object-fit: contain; | |
position: relative; | |
z-index: 2; | |
} | |
/* Intro card */ | |
.intro-card { | |
background: white; | |
border-radius: var(--radius); | |
box-shadow: var(--shadow); | |
border: 1px solid var(--gray-200); | |
overflow: hidden; | |
margin-bottom: 1.5rem; | |
transition: transform 0.3s ease, box-shadow 0.3s ease; | |
} | |
.intro-card:hover { | |
transform: translateY(-3px); | |
box-shadow: var(--shadow-md); | |
} | |
.intro-header { | |
padding: 1.25rem; | |
border-bottom: 1px solid var(--gray-200); | |
background: linear-gradient(135deg, var(--primary-light), var(--primary-dark)); | |
color: white; | |
font-weight: 600; | |
display: flex; | |
align-items: center; | |
gap: 0.5rem; | |
} | |
.intro-header h3 { | |
margin: 0; | |
font-size: 1.25rem; | |
font-weight: 600; | |
text-shadow: 0 1px 2px rgba(0,0,0,0.1); | |
} | |
.intro-body { | |
padding: 1.5rem; | |
} | |
.intro-description { | |
color: var(--text-primary); | |
line-height: 1.7; | |
font-size: 1.05rem; | |
margin-bottom: 1.5rem; | |
} | |
.contact-links { | |
display: flex; | |
flex-wrap: wrap; | |
gap: 1rem; | |
margin-top: 1.5rem; | |
background: linear-gradient(to right, rgba(79, 70, 229, 0.05), rgba(79, 70, 229, 0.1)); | |
padding: 1.25rem; | |
border-radius: var(--radius); | |
border: 1px solid var(--gray-200); | |
} | |
.contact-link { | |
display: flex; | |
align-items: center; | |
gap: 0.5rem; | |
padding: 0.75rem 1rem; | |
background: white; | |
border-radius: var(--radius); | |
color: var(--primary); | |
text-decoration: none; | |
font-weight: 500; | |
transition: all 0.2s ease; | |
box-shadow: var(--shadow-sm); | |
border: 1px solid var(--gray-200); | |
} | |
.contact-link:hover { | |
transform: translateY(-2px); | |
box-shadow: var(--shadow); | |
color: var(--primary-dark); | |
border-color: var(--primary-light); | |
} | |
/* Accordéon pour workflow */ | |
.accordion { | |
border-radius: var(--radius); | |
overflow: hidden; | |
margin-bottom: 1.5rem; | |
} | |
.accordion-header { | |
background: var(--gray-50); | |
padding: 1.25rem; | |
cursor: pointer; | |
display: flex; | |
align-items: center; | |
justify-content: space-between; | |
font-weight: 600; | |
color: var(--primary); | |
border: 1px solid var(--gray-200); | |
border-radius: var(--radius); | |
transition: all 0.3s ease; | |
} | |
.accordion-header:hover { | |
background: var(--gray-100); | |
} | |
.accordion-header::after { | |
content: "↓"; | |
transition: transform 0.3s ease; | |
} | |
.accordion.active .accordion-header::after { | |
transform: rotate(180deg); | |
} | |
.accordion-content { | |
max-height: 0; | |
overflow: hidden; | |
transition: max-height 0.3s ease; | |
background: white; | |
border: 1px solid var(--gray-200); | |
border-top: 0; | |
border-radius: 0 0 var(--radius) var(--radius); | |
padding: 0 1.25rem; | |
} | |
.accordion.active .accordion-content { | |
max-height: 1000px; | |
padding: 1.25rem; | |
} | |
.workflow-container { | |
text-align: center; | |
} | |
.workflow-container img { | |
max-width: 100%; | |
border-radius: var(--radius); | |
box-shadow: var(--shadow); | |
margin-top: 1rem; | |
} | |
/* Instructions */ | |
.instructions { | |
background: white; | |
padding: 1.5rem; | |
border-radius: var(--radius); | |
border: 1px solid var(--gray-200); | |
box-shadow: var(--shadow); | |
margin-bottom: 2rem; | |
} | |
.instructions h3 { | |
color: var(--primary); | |
margin-top: 0; | |
margin-bottom: 1rem; | |
font-weight: 600; | |
font-size: 1.25rem; | |
display: flex; | |
align-items: center; | |
gap: 0.5rem; | |
} | |
.instructions h3::before { | |
content: '📋'; | |
} | |
.instructions ol { | |
margin: 0; | |
padding-left: 1.5rem; | |
} | |
.instructions li { | |
margin-bottom: 0.75rem; | |
position: relative; | |
padding-left: 0.5rem; | |
} | |
.instructions li:last-child { | |
margin-bottom: 0; | |
} | |
/* Upload section */ | |
.upload-section { | |
background: white; | |
border-radius: var(--radius); | |
box-shadow: var(--shadow); | |
border: 1px solid var(--gray-200); | |
padding: 1.5rem; | |
} | |
/* File input styling */ | |
.file-container { | |
border: 2px dashed var(--primary-light) !important; | |
border-radius: var(--radius) !important; | |
padding: 2rem !important; | |
text-align: center !important; | |
transition: all 0.3s ease !important; | |
background-color: rgba(79, 70, 229, 0.05) !important; | |
cursor: pointer !important; | |
position: relative; | |
} | |
.file-container:hover { | |
background-color: rgba(79, 70, 229, 0.1) !important; | |
} | |
.file-container::before { | |
content: "📄"; | |
font-size: 2rem; | |
display: block; | |
margin-bottom: 0.5rem; | |
} | |
button.primary { | |
background: linear-gradient(135deg, var(--primary), var(--primary-dark)) !important; | |
color: white !important; | |
border: none !important; | |
padding: 0.75rem 1.5rem !important; | |
font-weight: 600 !important; | |
border-radius: var(--radius) !important; | |
transition: all 0.3s ease !important; | |
box-shadow: 0 4px 6px rgba(79, 70, 229, 0.25) !important; | |
width: 100% !important; | |
margin-top: 1rem !important; | |
} | |
button.primary:hover { | |
transform: translateY(-2px) !important; | |
box-shadow: 0 7px 14px rgba(79, 70, 229, 0.3) !important; | |
} | |
/* Results tabs */ | |
.tabs .tab-nav { | |
background-color: var(--gray-50) !important; | |
padding: 0.5rem !important; | |
border-radius: var(--radius) var(--radius) 0 0 !important; | |
border: 1px solid var(--gray-200) !important; | |
border-bottom: none !important; | |
} | |
.tabs .tab-nav button { | |
margin: 0 !important; | |
padding: 0.75rem 1rem !important; | |
font-weight: 500 !important; | |
color: var(--text-secondary) !important; | |
position: relative !important; | |
transition: all 0.3s ease !important; | |
} | |
.tabs .tab-nav button.selected { | |
color: var(--primary) !important; | |
font-weight: 600 !important; | |
} | |
.tabs .tab-nav button.selected::after { | |
content: ''; | |
position: absolute; | |
bottom: -0.5rem; | |
left: 0; | |
width: 100%; | |
height: 3px; | |
background: var(--primary); | |
border-radius: 3px 3px 0 0; | |
} | |
.tabs .tabitem { | |
background: white !important; | |
padding: 1.5rem !important; | |
border-radius: 0 0 var(--radius) var(--radius) !important; | |
border: 1px solid var(--gray-200) !important; | |
box-shadow: var(--shadow) !important; | |
} | |
/* Card components */ | |
.info-card { | |
background: white; | |
padding: 0; | |
border-radius: var(--radius); | |
margin-bottom: 1.5rem; | |
border: 1px solid var(--gray-200); | |
box-shadow: var(--shadow); | |
overflow: hidden; | |
transition: transform 0.2s ease, box-shadow 0.2s ease; | |
} | |
.info-card:hover { | |
transform: translateY(-2px); | |
box-shadow: var(--shadow-md); | |
} | |
.info-card h3 { | |
margin: 0; | |
color: white; | |
font-size: 1.1rem; | |
font-weight: 600; | |
padding: 1rem 1.5rem; | |
background: linear-gradient(135deg, var(--primary-light), var(--primary-dark)); | |
position: relative; | |
} | |
.info-card .content { | |
padding: 1.25rem; | |
} | |
/* Formatage des listes dans les cartes */ | |
.list-container { | |
display: flex; | |
flex-direction: column; | |
gap: 1rem; | |
} | |
.list-item { | |
padding: 1rem; | |
background: var(--gray-50); | |
border-radius: var(--radius); | |
border: 1px solid var(--gray-200); | |
transition: all 0.2s ease; | |
} | |
.list-item:hover { | |
background: white; | |
border-color: var(--primary-light); | |
box-shadow: var(--shadow-sm); | |
} | |
.list-item-header { | |
font-weight: 600; | |
color: var(--primary); | |
margin-bottom: 0.5rem; | |
display: flex; | |
align-items: center; | |
gap: 0.5rem; | |
} | |
.list-item-header::before { | |
content: '•'; | |
color: var(--primary); | |
font-size: 1.5rem; | |
line-height: 1; | |
} | |
.list-item-content { | |
color: var(--text-secondary); | |
font-size: 0.95rem; | |
} | |
/* Améliorations tables */ | |
.tables-container { | |
display: flex; | |
flex-direction: column; | |
gap: 2rem; | |
} | |
.table-wrapper { | |
overflow: hidden; | |
border-radius: var(--radius); | |
box-shadow: var(--shadow); | |
background: white; | |
} | |
.table-wrapper h4 { | |
padding: 1rem; | |
margin: 0; | |
background: linear-gradient(to right, var(--primary-light), var(--primary)); | |
color: white; | |
font-weight: 600; | |
} | |
.table-description { | |
margin: 0; | |
padding: 0.75rem 1rem; | |
background: var(--gray-50); | |
color: var(--text-secondary); | |
border-bottom: 1px solid var(--gray-200); | |
font-size: 0.9rem; | |
font-style: italic; | |
} | |
.data-table { | |
width: 100%; | |
border-collapse: collapse; | |
font-size: 0.95rem; | |
} | |
.data-table th { | |
background: var(--gray-100); | |
padding: 0.75rem 1rem; | |
text-align: left; | |
font-weight: 600; | |
color: var(--primary-dark); | |
border-bottom: 2px solid var(--primary-light); | |
} | |
.data-table td { | |
padding: 0.75rem 1rem; | |
border-bottom: 1px solid var(--gray-200); | |
color: var(--text-secondary); | |
} | |
.data-table tr:last-child td { | |
border-bottom: none; | |
} | |
.data-table tr:nth-child(even) { | |
background-color: var(--gray-50); | |
} | |
.data-table tr:hover { | |
background-color: rgba(79, 70, 229, 0.05); | |
} | |
/* Metadata grid */ | |
.metadata-grid { | |
display: grid; | |
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr)); | |
gap: 1rem; | |
} | |
.metadata-item { | |
background: var(--gray-50); | |
padding: 1rem; | |
border-radius: var(--radius); | |
border: 1px solid var(--gray-200); | |
transition: all 0.2s ease; | |
} | |
.metadata-item:hover { | |
background: white; | |
border-color: var(--primary-light); | |
box-shadow: var(--shadow-sm); | |
} | |
.metadata-item h4 { | |
margin: 0 0 0.5rem 0; | |
color: var(--primary); | |
font-weight: 600; | |
font-size: 0.9rem; | |
text-transform: uppercase; | |
letter-spacing: 0.5px; | |
} | |
.metadata-item p { | |
margin: 0; | |
color: var(--text-primary); | |
font-weight: 500; | |
} | |
/* JSON viewer */ | |
.json-viewer { | |
background: var(--dark); | |
color: #e2e8f0; | |
padding: 1.25rem; | |
border-radius: var(--radius); | |
overflow: auto; | |
font-family: 'Fira Code', 'Courier New', monospace; | |
font-size: 0.9rem; | |
line-height: 1.5; | |
max-height: 400px; | |
white-space: pre-wrap; | |
} | |
/* Loading animation */ | |
.loading-spinner { | |
display: inline-block; | |
width: 50px; | |
height: 50px; | |
border: 3px solid rgba(79, 70, 229, 0.3); | |
border-radius: 50%; | |
border-top-color: var(--primary); | |
animation: spin 1s ease-in-out infinite; | |
} | |
@keyframes spin { | |
to { transform: rotate(360deg); } | |
} | |
/* Error message */ | |
.error { | |
padding: 1rem; | |
background-color: #fee2e2; | |
border: 1px solid #fecaca; | |
border-radius: var(--radius); | |
color: #b91c1c; | |
font-weight: 500; | |
} | |
/* Responsive design */ | |
@media (max-width: 1024px) { | |
.main-content { | |
flex-direction: column; | |
} | |
.left-panel, .right-panel { | |
flex: none; | |
width: 100%; | |
} | |
} | |
""" | |
# Prompt for Gemini, with reinforced instructions for table extraction
GEMINI_PROMPT = """ | |
Analyze this document and extract relevant information in JSON format. Adapt the extraction based on the document type (invoice, contract, report, KID, etc.). | |
Expected response structure: | |
{ | |
"metadata": { | |
"title": "Document title", | |
"date": "Document date", | |
"type": "Document type", | |
"author": "Document author or issuer" | |
}, | |
"entities": [ | |
{ | |
"name": "Entity name", | |
"type": "Entity type (person, organization, etc.)", | |
"role": "Role in the document" | |
} | |
], | |
"values": [ | |
{ | |
"description": "Value description", | |
"value": "Exact value", | |
"unit": "Unit if applicable" | |
} | |
], | |
"dates": [ | |
{ | |
"description": "Date description", | |
"date": "Exact date", | |
"importance": "Importance (high, medium, low)" | |
} | |
], | |
"tables": [ | |
{ | |
"title": "Table title", | |
"description": "Table description", | |
"data": [ | |
{ | |
"column1": "Value in row 1, column 1", | |
"column2": "Value in row 1, column 2", | |
"column3": "Value in row 1, column 3" | |
}, | |
{ | |
"column1": "Value in row 2, column 1", | |
"column2": "Value in row 2, column 2", | |
"column3": "Value in row 2, column 3" | |
} | |
] | |
} | |
], | |
"key_points": [ | |
{ | |
"category": "Key point category", | |
"description": "Detailed description", | |
"importance": "Importance (high, medium, low)" | |
} | |
], | |
"references": [ | |
{ | |
"type": "Reference type", | |
"value": "Reference value" | |
} | |
] | |
} | |
Important instructions: | |
1. First identify the document type and adapt the extraction accordingly | |
2. For tables (this is EXTREMELY important): | |
- Pay special attention to detect and extract ALL tables in the document | |
- Carefully identify tables even if they don't have visible borders or lines | |
- Identify column headers correctly (first row or separate header row) | |
- Extract all rows and all columns with exact cell values | |
- Maintain the same number of columns for each row | |
- Preserve the exact structure of each table | |
- For each table, provide a descriptive title based on content | |
- For each table, include a brief description explaining what the table contains | |
- If a table spans multiple pages, try to reconstruct it as one table | |
- Include ALL data from the table, don't omit any rows or columns | |
3. For values: | |
- Extract amounts, percentages, numbers | |
- Include units when present | |
4. For dates: | |
- Extract all important dates | |
- Include the context of each date | |
5. For entities: | |
- Identify people, organizations, locations | |
- Include their role in the document | |
6. For references: | |
- Extract reference numbers, codes, identifiers | |
7. For key points: | |
- Identify important information based on document type | |
- Categorize them appropriately | |
General rules: | |
- Respond only with JSON, without any additional text | |
- Extract only factual and verifiable information | |
- Be precise with values and dates | |
- If a category is not relevant for the document, leave an empty array | |
- Adapt categories based on document type | |
- Do not make assumptions about missing data | |
""" | |
def create_info_card(title, content):
    """Wrap *content* in the HTML markup of an "info-card" titled *title*.

    Args:
        title: Text placed in the card's <h3> heading.
        content: HTML fragment used as the card body. When falsy, the body
            is a paragraph holding the TEXT["no_data"] placeholder instead.

    Returns:
        The card markup as a string.
    """
    # The placeholder is only built (and TEXT only consulted) for empty content.
    body = content if content else f'<p>{TEXT["no_data"]}</p>'
    return (
        "\n"
        '<div class="info-card">\n'
        f"<h3>{title}</h3>\n"
        '<div class="content">\n'
        f"{body}\n"
        "</div>\n"
        "</div>\n"
    )
def format_list(items, key1, key2):
    """Render a list of records as HTML, showing two fields of each record.

    The records come from the model's JSON answer, i.e. from the content of
    an uploaded document, so every value is HTML-escaped and missing fields
    are tolerated instead of raising KeyError.

    Args:
        items: Sequence of dicts. A non-dict entry is rendered as a header
            with an empty body rather than aborting the whole list.
        key1: Key whose value is shown as the item header.
        key2: Key whose value is shown as the item content.

    Returns:
        An HTML string, or TEXT["no_data"] when *items* is empty.
    """
    # Imported locally so the block does not depend on a file-level import.
    from html import escape

    if not items:
        return TEXT["no_data"]

    def as_text(value):
        # A missing/None field is displayed as an empty string, not "None".
        return "" if value is None else escape(str(value))

    parts = ["<div class='list-container'>"]
    for item in items:
        if isinstance(item, dict):
            header = as_text(item.get(key1))
            content = as_text(item.get(key2))
        else:
            header, content = as_text(item), ""
        parts.append(f"""
<div class='list-item'>
<div class='list-item-header'>{header}</div>
<div class='list-item-content'>{content}</div>
</div>
""")
    parts.append("</div>")
    return "".join(parts)
def _render_table(table):
    """Return the HTML for one extracted table, or "" when it has no rows.

    Raises:
        TypeError: if *table* is not a dict or its 'data' is not a list.
    """
    from html import escape

    def as_text(value):
        # A missing/None cell is displayed as an empty string, not "None".
        return "" if value is None else escape(str(value))

    if not isinstance(table, dict):
        raise TypeError("table entry must be a JSON object")
    rows = table.get('data')
    if not rows:
        return ""
    if not isinstance(rows, (list, tuple)):
        raise TypeError("table 'data' must be a list of rows")

    title = as_text(table.get('title', 'Tableau sans titre'))
    description = as_text(table.get('description', ''))
    parts = [f"""
<div class='table-wrapper'>
<h4>{title}</h4>
<p class='table-description'>{description}</p>
<table class='data-table'>
"""]

    if isinstance(rows[0], dict):
        # Headers are the union of all row keys (first-seen order), so a key
        # that only appears in a later row is not silently dropped.
        headers = list(dict.fromkeys(
            key for row in rows if isinstance(row, dict) for key in row
        ))
        parts.append(
            "<tr>" + "".join(f"<th>{as_text(h)}</th>" for h in headers) + "</tr>"
        )
        for row in rows:
            if not isinstance(row, dict):
                continue
            cells = "".join(f"<td>{as_text(row.get(h, ''))}</td>" for h in headers)
            parts.append(f"<tr>{cells}</tr>")
    else:
        # List-style rows: one <td> per cell; a scalar row becomes one cell.
        for row in rows:
            cells = row if isinstance(row, (list, tuple)) else [row]
            parts.append(
                "<tr>" + "".join(f"<td>{as_text(c)}</td>" for c in cells) + "</tr>"
            )

    parts.append("</table></div>")
    return "".join(parts)


def format_table(table_data):
    """Format the extracted tables as HTML.

    Each table is rendered independently: a malformed table produces an
    error box for that table only, and never leaves unclosed <table>/<div>
    tags behind. All cell values, titles and descriptions are HTML-escaped
    because they originate from the uploaded document.

    Args:
        table_data: List of table dicts with optional 'title' and
            'description' and a 'data' list of rows (dicts or lists).

    Returns:
        An HTML string, or TEXT["no_data"] when there is nothing to show.
    """
    if not table_data:
        return TEXT["no_data"]

    rendered = []
    for table in table_data:
        try:
            block = _render_table(table)
        except Exception as e:
            print(f"Erreur lors du formatage des tableaux: {str(e)}")
            block = """
<div class='error'>
Erreur lors de l'affichage des tableaux. Veuillez vérifier le format JSON.
</div>
"""
        if block:
            rendered.append(block)

    # Every table was empty: behave as if no table had been extracted.
    if not rendered:
        return TEXT["no_data"]
    return "<div class='tables-container'>" + "".join(rendered) + "</div>"
def _strip_code_fence(text):
    """Remove a surrounding Markdown code fence (``` or ```json) from *text*.

    Only the opening fence and the last closing fence are removed, so
    backticks that appear inside the JSON payload itself are preserved.
    """
    text = text.strip()
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
        closing = text.rfind("```")
        if closing != -1:
            text = text[:closing]
    return text.strip()


def process_single_image(image):
    """Send one page image to Gemini and return the parsed JSON extraction.

    Args:
        image: Image object accepted by `model.generate_content` (the callers
            in this file pass PIL images).

    Returns:
        The parsed JSON object as a dict, with every table normalised to have
        'data', 'title' and 'description' keys; or {"error": message} when the
        request fails or the answer is not a JSON object.
    """
    try:
        print("Envoi de l'image à Gemini pour analyse...")
        response = model.generate_content(
            [GEMINI_PROMPT, image],
            generation_config={
                "temperature": 0.1,
                "top_p": 0.8,
                "top_k": 40,
                # A dense page with tables easily exceeds 2048 output tokens,
                # which truncates the JSON and makes it unparseable.
                "max_output_tokens": 8192,
            }
        )
    except Exception as e:
        print(f"Erreur lors de l'appel à Gemini: {str(e)}")
        return {"error": str(e)}

    response_text = ""
    try:
        response_text = response.text.strip()
        print(f"Réponse reçue de Gemini, longueur: {len(response_text)} caractères")
        # Clean up the JSON text (the model often wraps it in a code fence).
        response_text = _strip_code_fence(response_text)
        json_data = json.loads(response_text)
    except json.JSONDecodeError as e:
        print(f"Erreur de décodage JSON: {str(e)}")
        print(f"Contenu problématique: {response_text[:500]}...")
        return {"error": "Erreur de format JSON dans la réponse"}
    except Exception as e:
        print(f"Erreur lors du traitement de la réponse Gemini: {str(e)}")
        return {"error": str(e)}

    # The rest of the pipeline expects a JSON object, not a list or scalar.
    if not isinstance(json_data, dict):
        print(f"Contenu problématique: {response_text[:500]}...")
        return {"error": "Erreur de format JSON dans la réponse"}

    # Check and repair the table entries so the renderer can rely on them.
    tables = json_data.get("tables")
    if isinstance(tables, list):
        cleaned = []
        for i, table in enumerate(tables):
            if not isinstance(table, dict):
                # Malformed entry: nothing can be rendered from it.
                continue
            if not table.get("data"):
                table["data"] = []
            # Make sure the table has a title.
            if not table.get("title"):
                table["title"] = f"Tableau {i+1}"
            # Make sure the table has a description.
            if "description" not in table:
                table["description"] = ""
            cleaned.append(table)
        json_data["tables"] = cleaned
    return json_data
def merge_results(results):
    """Combine several per-page extraction results into a single result.

    Args:
        results: Sequence of result mappings, one per processed page.

    Returns:
        None when *results* is empty; otherwise a dict whose "metadata" is
        the first non-empty metadata found and whose list categories are the
        concatenation, in page order, of the corresponding lists.
    """
    if not results:
        return None

    list_fields = ("entities", "values", "dates", "tables", "key_points", "references")

    # Metadata: the first page that supplies any wins; default is empty.
    first_metadata = next(
        (page["metadata"] for page in results
         if "metadata" in page and page["metadata"]),
        {},
    )
    combined = {"metadata": first_metadata}

    # List categories: append each page's entries in page order.
    for field in list_fields:
        bucket = []
        for page in results:
            if field in page and page[field]:
                bucket.extend(page[field])
        combined[field] = bucket
    return combined
def _process_pdf_pages(pdf_path, progress):
    """Render each page of a PDF, send it to Gemini and merge the results.

    The PDF handle and the temporary working directory are released on every
    exit path (page-limit rejection, success, or exception).
    """
    # Work on a temporary copy in case the upload is moved/modified meanwhile.
    temp_dir = tempfile.mkdtemp()
    try:
        temp_pdf = os.path.join(temp_dir, "temp.pdf")
        shutil.copy2(pdf_path, temp_pdf)
        doc = fitz.open(temp_pdf)
        try:
            page_count = doc.page_count
            if page_count > 10:
                return {"error": TEXT["error"]["too_many_pages"]}
            print(f"Traitement d'un PDF de {page_count} pages")
            results = []
            for i in range(page_count):
                progress((i+1) / page_count, desc=f"{TEXT['processing']} page {i+1}/{page_count}")
                try:
                    page = doc[i]
                    # Render at 2x zoom for better recognition quality.
                    mat = fitz.Matrix(2.0, 2.0)
                    pix = page.get_pixmap(matrix=mat, alpha=False)
                    # Convert the pixmap to a PIL image.
                    pil_img = Image.open(io.BytesIO(pix.tobytes("jpeg")))
                    result = process_single_image(pil_img)
                    if result and "error" not in result:
                        results.append(result)
                        print(f"Page {i+1} traitée avec succès")
                    else:
                        print(f"Pas d'informations extraites de la page {i+1}")
                except Exception as e:
                    # A failing page is skipped; the other pages still count.
                    print(f"Erreur lors du traitement de la page {i+1}: {str(e)}")
        finally:
            doc.close()
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)

    if results:
        return merge_results(results)
    return {"error": TEXT["error"]["no_info"]}


def process_document(file, progress=gr.Progress()):
    """Process an uploaded document and extract its information.

    Args:
        file: Uploaded file, either an object exposing the path as `.name`
            or the path string itself.
        progress: Progress callback, called once per PDF page.

    Returns:
        The extraction result dict, or {"error": message} on failure
        (missing/unsupported file, more than 10 pages, nothing extracted,
        or any exception raised during processing).
    """
    if not file:
        return {"error": TEXT["error"]["file_not_found"]}
    try:
        path = getattr(file, "name", file)
        lowered = path.lower()
        if lowered.endswith('.pdf'):
            try:
                return _process_pdf_pages(path, progress)
            except Exception as e:
                print(f"Erreur lors du traitement du PDF: {str(e)}")
                return {"error": str(e)}
        if lowered.endswith(('.png', '.jpg', '.jpeg')):
            try:
                # The context manager releases the file handle once the
                # request to the model has completed.
                with Image.open(path) as image:
                    return process_single_image(image)
            except Exception as e:
                print(f"Erreur lors du traitement de l'image: {str(e)}")
                return {"error": str(e)}
        return {"error": TEXT["error"]["file_not_found"]}
    except Exception as e:
        print(f"Erreur inattendue dans process_document: {str(e)}")
        return {"error": str(e)}
def update_preview(file):
    """Build the preview gallery content for the uploaded file.

    For a PDF, the first pages (at most 3) are rendered to JPEG files; for an
    image, the file itself is used.

    Args:
        file: Uploaded file, either an object exposing the path as `.name`
            or the path string itself.

    Returns:
        A list of image file paths (empty on error or unsupported format).
    """
    if not file:
        return []
    try:
        path = getattr(file, "name", file)
        lowered = path.lower()
        if lowered.endswith('.pdf'):
            # Work on a temporary copy in case the upload is moved/modified.
            temp_dir = tempfile.mkdtemp()
            try:
                temp_pdf = os.path.join(temp_dir, "temp.pdf")
                shutil.copy2(path, temp_pdf)
                doc = fitz.open(temp_pdf)
                try:
                    # Each call gets its own output directory: fixed file
                    # names in the working directory would be overwritten by
                    # concurrent sessions. The directory is intentionally not
                    # removed here because the returned paths are read after
                    # this function returns.
                    preview_dir = tempfile.mkdtemp(prefix="elixir_preview_")
                    image_paths = []
                    # Only the first 3 pages are previewed.
                    max_pages = min(3, doc.page_count)
                    print(f"PDF a {doc.page_count} pages, prévisualisant {max_pages} pages")
                    for i in range(max_pages):
                        try:
                            page = doc[i]
                            # Render at 2x zoom for better quality.
                            mat = fitz.Matrix(2.0, 2.0)
                            pix = page.get_pixmap(matrix=mat, alpha=False)
                            temp_filename = os.path.join(preview_dir, f"temp_preview_{i}.jpg")
                            pix.save(temp_filename, "jpeg")
                            image_paths.append(temp_filename)
                            print(f"Page {i+1} convertie et sauvegardée dans {temp_filename}")
                        except Exception as e:
                            print(f"Erreur lors du traitement de la page {i+1}: {str(e)}")
                finally:
                    doc.close()
                print(f"Prévisualisation créée avec succès: {len(image_paths)} images")
                return image_paths
            except Exception as e:
                print(f"Erreur lors de la conversion PDF: {str(e)}")
                return []
            finally:
                # Remove the temporary PDF copy on every exit path.
                shutil.rmtree(temp_dir, ignore_errors=True)
        elif lowered.endswith(('.png', '.jpg', '.jpeg')):
            return [path]
        else:
            print(f"Format de fichier non pris en charge: {path}")
            return []
    except Exception as e:
        print(f"Erreur inattendue dans update_preview: {str(e)}")
        return []
def process_and_display(file):
    """Process a document and build the HTML for the eight output panels.

    Text that originates from the document or from an exception (error
    messages, metadata, the JSON dump) is HTML-escaped before being embedded.

    Args:
        file: Uploaded file as received from the file input.

    Returns:
        A list of 8 HTML strings: overview, entities, values, dates, tables,
        key points, references and complete JSON. On error, all 8 entries
        hold the same error box.
    """
    # Imported locally so the block does not depend on a file-level import.
    from html import escape

    def error_panels(message):
        return [f"<div class='error'>{escape(str(message))}</div>"] * 8

    if not file:
        return error_panels(TEXT['error']['file_not_found'])

    result = process_document(file)
    if not isinstance(result, dict):
        # Anything other than a JSON object cannot be displayed.
        return error_panels(TEXT["error"]["no_info"])
    if "error" in result:
        error_msg = result["error"]
        # Accept an error *key* as well as an already formatted message.
        if error_msg in TEXT["error"]:
            error_msg = TEXT["error"][error_msg]
        return error_panels(error_msg)

    # Format metadata as HTML
    metadata = result.get("metadata")
    metadata_parts = ["<div class='metadata-grid'>"]
    if isinstance(metadata, dict) and metadata:
        for key, value in metadata.items():
            shown = "" if value is None else escape(str(value))
            metadata_parts.append(f"""
<div class='metadata-item'>
<h4>{escape(str(key))}</h4>
<p>{shown}</p>
</div>
""")
    else:
        metadata_parts.append(f"<p>{TEXT['no_data']}</p>")
    metadata_parts.append("</div>")
    metadata_html = "".join(metadata_parts)

    # Format JSON data; escaping keeps markup inside values from being
    # interpreted by the browser.
    json_text = json.dumps(result, indent=2, ensure_ascii=False)
    json_html = f"<pre class='json-viewer'>{escape(json_text, quote=False)}</pre>"

    return [
        metadata_html,
        create_info_card(TEXT["tabs"]["entities"], format_list(result.get("entities", []), "name", "role")),
        create_info_card(TEXT["tabs"]["values"], format_list(result.get("values", []), "description", "value")),
        create_info_card(TEXT["tabs"]["dates"], format_list(result.get("dates", []), "description", "date")),
        create_info_card(TEXT["tabs"]["tables"], format_table(result.get("tables", []))),
        create_info_card(TEXT["tabs"]["keypoints"], format_list(result.get("key_points", []), "category", "description")),
        create_info_card(TEXT["tabs"]["references"], format_list(result.get("references", []), "type", "value")),
        json_html
    ]
# Helper that encodes image files as base64
def get_image_base64(file_path):
    """Return the content of *file_path* as a base64-encoded UTF-8 string.

    Any failure (e.g. the file cannot be opened) is reported on stdout and
    an empty string is returned instead.
    """
    try:
        with open(file_path, "rb") as handle:
            raw_bytes = handle.read()
        return base64.b64encode(raw_bytes).decode('utf-8')
    except Exception as e:
        print(f"Erreur lors de l'encodage de l'image {file_path}: {str(e)}")
        return ""
# Header logo: locate the file under ./static next to this script, embed it
# as a base64 data URI and build the header markup.
logo_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static", "elixir-logo-typo.png")
logo_base64 = get_image_base64(logo_path)
logo_html = (
    '<div class="header">\n'
    f'<img src="data:image/png;base64,{logo_base64}" alt="Elixir Logo" style="max-height: 40px; position: relative; z-index: 2;">\n'
    '</div>'
)

# Workflow diagram: stored in the same static directory as the logo.
workflow_path = os.path.join(os.path.dirname(logo_path), "Editor _ Mermaid Chart-2025-04-15-142548.png")
workflow_base64 = get_image_base64(workflow_path)
workflow_html = (
    '<div class="workflow-container">\n'
    f'<img src="data:image/png;base64,{workflow_base64}" alt="Elixir Workflow" style="max-width: 100%; border-radius: 0.5rem;">\n'
    '</div>'
)
# JavaScript for the accordion and other interactive behaviour.
# The snippet toggles the `active` class on the parent of each
# `.accordion-header` when clicked, and adds hover effects to `.card`
# elements.
# NOTE(review): no element with class `card` is defined in the CSS or layout
# visible in this file (only `info-card` / `intro-card`) — confirm whether the
# hover handlers ever match anything.
js_code = """
<script>
document.addEventListener('DOMContentLoaded', function() {
// Accordéon
const accordions = document.querySelectorAll('.accordion-header');
accordions.forEach(accordion => {
accordion.addEventListener('click', function() {
this.parentElement.classList.toggle('active');
});
});
// Animation des cartes au survol
const cards = document.querySelectorAll('.card');
cards.forEach(card => {
card.addEventListener('mouseenter', function() {
this.style.transform = 'translateY(-5px)';
this.style.boxShadow = '0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05)';
});
card.addEventListener('mouseleave', function() {
this.style.transform = 'translateY(0)';
this.style.boxShadow = '0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06)';
});
});
});
</script>
"""
# Gradio interface.
# Layout: header with logo, an introduction row (description + workflow
# diagram), then a working row with instructions/upload/preview on the left
# and the result tabs plus the complete JSON on the right.
# NOTE(review): the indentation of this file was lost in the copy under
# review; the nesting below is reconstructed from the section comments —
# confirm against the deployed layout.
with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
    # NOTE(review): <script> tags injected through gr.HTML may not be executed
    # by the browser — confirm against the Gradio version in use.
    gr.HTML(js_code)  # Add the JavaScript
    # Header with logo
    header = gr.HTML(logo_html)
    # First row: Document Intelligence + How Elixir Works
    with gr.Row(equal_height=True):
        # Document Intelligence on the left
        with gr.Column(scale=1):
            gr.HTML(f"""
<div class="intro-card">
<div class="intro-header">
<h3>📄 Document Intelligence</h3>
</div>
<div class="intro-body">
<div class="intro-description">
{TEXT["description"]}
</div>
<div class="contact-links">
<a href="https://lexiapro.fr/" target="_blank" class="contact-link">
🌐 Visit lexiapro.fr
</a>
<a href="mailto:martial@lexiapro.fr" class="contact-link">
✉️ Contact us
</a>
</div>
</div>
</div>
""")
        # How Elixir Works on the right
        with gr.Column(scale=1):
            gr.HTML(f"""
<div class="intro-card">
<div class="intro-header">
<h3>🔄 How Elixir Works</h3>
</div>
<div class="intro-body">
{workflow_html}
</div>
</div>
""")
    # Second row: working area with input on the left and output on the right
    with gr.Row():
        # Left column: instructions and upload
        with gr.Column(scale=1):
            # Instructions (same wording as TEXT["instructions"], hard-coded here)
            gr.HTML("""
<div class="instructions">
<h3>How to use Elixir</h3>
<ol>
<li>Upload a PDF document (1-10 pages) such as an invoice, regulatory document, report...</li>
<li>Processing by Elixir</li>
<li>Transcription of identified sections and elements (without customization)</li>
</ol>
</div>
""")
            # Upload section
            with gr.Group(elem_classes=["upload-section"]):
                file_input = gr.File(label=TEXT["upload"], file_types=[".pdf", ".png", ".jpg", ".jpeg"], elem_classes=["file-container"])
                submit_btn = gr.Button(TEXT["analyze"], variant="primary", elem_classes=["primary"])
                preview = gr.Gallery(label=TEXT["preview"], show_label=True, elem_id="preview-gallery")
        # Right column: results and JSON
        with gr.Column(scale=1):
            # Result tabs, one HTML view per extracted category
            with gr.Tabs(elem_classes=["tabs"]) as tabs:
                with gr.TabItem(TEXT["tabs"]["overview"]):
                    metadata_view = gr.HTML()
                with gr.TabItem(TEXT["tabs"]["entities"]):
                    entities_view = gr.HTML()
                with gr.TabItem(TEXT["tabs"]["values"]):
                    values_view = gr.HTML()
                with gr.TabItem(TEXT["tabs"]["dates"]):
                    dates_view = gr.HTML()
                with gr.TabItem(TEXT["tabs"]["tables"]):
                    tables_view = gr.HTML()
                with gr.TabItem(TEXT["tabs"]["keypoints"]):
                    keypoints_view = gr.HTML()
                with gr.TabItem(TEXT["tabs"]["references"]):
                    references_view = gr.HTML()
            # Complete JSON below the tabs.
            # NOTE(review): the opening and closing wrapper <div>s are emitted
            # by two separate gr.HTML components, so they presumably do not
            # actually wrap json_view — verify the rendered DOM.
            gr.HTML("""
<div class="intro-card" style="margin-top: 1.5rem;">
<div class="intro-header">
<h3>📄 Complete JSON</h3>
</div>
<div class="intro-body" style="padding: 0.75rem;">
""")
            json_view = gr.HTML()
            gr.HTML("</div></div>")
    # Loading animation: a hidden spinner that the inline script shows when
    # the primary button is clicked and hides once nodes are added under the
    # ".tabs" container.
    loading_indicator = gr.HTML(f"""
<div id="loading" style="display:none; text-align:center; padding: 2rem;">
<div class="loading-spinner"></div>
<p style="margin-top: 1rem; color: var(--primary);">{TEXT['processing']}</p>
</div>
<script>
document.addEventListener('DOMContentLoaded', function() {{
const btn = document.querySelector("button.primary");
const loading = document.getElementById("loading");
if (btn && loading) {{
btn.addEventListener("click", function() {{
loading.style.display = "block";
const observer = new MutationObserver(function(mutations) {{
mutations.forEach(function(mutation) {{
if (mutation.addedNodes.length) {{
loading.style.display = "none";
observer.disconnect();
}}
}});
}});
const resultsContainer = document.querySelector(".tabs");
if (resultsContainer) {{
observer.observe(resultsContainer, {{ childList: true, subtree: true }});
}}
}});
}}
}});
</script>
""")
    # Refresh the preview gallery whenever the selected file changes.
    file_input.change(
        fn=update_preview,
        inputs=file_input,
        outputs=preview
    )
    # Run the extraction and fill the eight output views on click; the order
    # of `outputs` must match the list returned by process_and_display.
    submit_btn.click(
        fn=process_and_display,
        inputs=file_input,
        outputs=[metadata_view, entities_view, values_view, dates_view, tables_view, keypoints_view, references_view, json_view]
    )
# Script entry point: start the Gradio server on all network interfaces,
# port 7860.
# NOTE(review): share=True requests a public share link — confirm this is
# intended for the hosting environment.
if __name__ == "__main__":
    demo.launch(share=True, server_name="0.0.0.0", server_port=7860)