Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from google import genai | |
| import os | |
| import json | |
| import time | |
| import base64 | |
| import fitz # Importation correcte pour Hugging Face | |
| from PIL import Image | |
| import io | |
| import tempfile | |
| import shutil | |
# Configuration
# The Gemini API key is read from the environment (for example a Hugging Face
# Space secret) instead of being committed in source: a key embedded in a
# published file is readable by anyone and must be treated as compromised.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    # Fail fast with an actionable message rather than an opaque API error later.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Define it as an environment variable "
        "(or Space secret) before starting the app."
    )
# NOTE(review): `configure` and `GenerativeModel` belong to the legacy
# `google.generativeai` package, whereas this file imports
# `from google import genai` (the google-genai SDK, built around
# `genai.Client`). Confirm which package is installed -- this mismatch is a
# plausible cause of a startup failure.
genai.configure(api_key=GOOGLE_API_KEY)
# Shared model handle used by process_single_image().
model = genai.GenerativeModel('gemini-2.5-flash')
# Interface text (English only)
# Central table of user-facing strings: page title/description, the numbered
# usage instructions, button and tab labels, and the error messages returned
# by the processing functions.
TEXT = {
    "title": "Elixir - Document Intelligence",
    "description": "This demo showcases the capabilities of a generative AI model to interpret, understand, and classify any type of document WITHOUT CUSTOMIZATION. For developing a complete, precise, and defined pipeline, please contact martial@lexiapro.fr.",
    # Steps shown to the user, in display order.
    "instructions": [
        "1. Upload a PDF document (1-10 pages) such as an invoice, regulatory document, report...",
        "2. Processing by Elixir",
        "3. Transcription of identified sections and elements (without customization)"
    ],
    "upload": "📂 Upload your document",
    "analyze": "🔍 Analyze document",
    "preview": "📄 Preview",
    # Labels of the result tabs; also reused as card titles in process_and_display().
    "tabs": {
        "overview": "📋 Overview",
        "entities": "👥 Entities",
        "values": "💰 Values",
        "dates": "📅 Dates",
        "tables": "📊 Tables",
        "keypoints": "🔑 Key Points",
        "references": "🔗 References",
        "json": "📄 Complete JSON"
    },
    # Placeholder shown when a category has nothing to display.
    "no_data": "No information found",
    "processing": "Processing...",
    # Error messages surfaced to the user.
    "error": {
        "file_not_found": "File not found",
        "pdf_conversion": "Unable to convert PDF to image",
        "no_info": "No information extracted from PDF pages",
        "too_many_pages": "The PDF has more than 10 pages. Please upload a document with 10 pages or less."
    }
}
# Modern CSS - improved styling
# Stylesheet passed to gr.Blocks(css=CSS). It defines the colour/shadow/radius
# custom properties and the classes used by the HTML fragments this module
# generates (info cards, list items, data tables, metadata grid, JSON viewer,
# error box), plus overrides for Gradio's own tabs, buttons and file input.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
:root {
--primary: #4f46e5;
--primary-light: #818cf8;
--primary-dark: #3730a3;
--secondary: #10b981;
--accent: #f59e0b;
--dark: #111827;
--light: #f9fafb;
--gray-50: #f8fafc;
--gray-100: #f1f5f9;
--gray-200: #e2e8f0;
--gray-300: #cbd5e1;
--gray-400: #94a3b8;
--gray-500: #64748b;
--text-primary: #1e293b;
--text-secondary: #475569;
--shadow-sm: 0 1px 2px 0 rgba(0, 0, 0, 0.05);
--shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
--shadow-md: 0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05);
--radius-sm: 0.25rem;
--radius: 0.5rem;
--radius-md: 0.75rem;
--radius-lg: 1rem;
}
body, .gradio-container {
font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
color: var(--text-primary);
background-color: var(--light);
line-height: 1.6;
}
/* Layout principal */
.container {
max-width: 1300px;
margin: 0 auto;
padding: 0 1rem;
}
.main-content {
display: flex;
gap: 2rem;
align-items: flex-start;
}
.left-panel {
flex: 1;
}
.right-panel {
flex: 2;
}
/* En-tête */
.header {
margin-bottom: 2rem;
padding: 0.75rem 1.25rem;
background: linear-gradient(135deg, var(--primary-light), var(--primary-dark));
border-radius: var(--radius-lg);
box-shadow: var(--shadow-md);
position: relative;
overflow: hidden;
color: white;
height: 60px;
display: flex;
align-items: center;
justify-content: center;
}
.header::before {
content: '';
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
background: radial-gradient(circle, rgba(255,255,255,0.1) 0%, rgba(255,255,255,0) 60%);
animation: pulse 15s ease-in-out infinite;
z-index: 1;
}
@keyframes pulse {
0% { transform: scale(1); opacity: 0.5; }
50% { transform: scale(1.05); opacity: 0.8; }
100% { transform: scale(1); opacity: 0.5; }
}
.header img {
max-height: 40px !important;
object-fit: contain;
position: relative;
z-index: 2;
}
/* Intro card */
.intro-card {
background: white;
border-radius: var(--radius);
box-shadow: var(--shadow);
border: 1px solid var(--gray-200);
overflow: hidden;
margin-bottom: 1.5rem;
transition: transform 0.3s ease, box-shadow 0.3s ease;
}
.intro-card:hover {
transform: translateY(-3px);
box-shadow: var(--shadow-md);
}
.intro-header {
padding: 1.25rem;
border-bottom: 1px solid var(--gray-200);
background: linear-gradient(135deg, var(--primary-light), var(--primary-dark));
color: white;
font-weight: 600;
display: flex;
align-items: center;
gap: 0.5rem;
}
.intro-header h3 {
margin: 0;
font-size: 1.25rem;
font-weight: 600;
text-shadow: 0 1px 2px rgba(0,0,0,0.1);
}
.intro-body {
padding: 1.5rem;
}
.intro-description {
color: var(--text-primary);
line-height: 1.7;
font-size: 1.05rem;
margin-bottom: 1.5rem;
}
.contact-links {
display: flex;
flex-wrap: wrap;
gap: 1rem;
margin-top: 1.5rem;
background: linear-gradient(to right, rgba(79, 70, 229, 0.05), rgba(79, 70, 229, 0.1));
padding: 1.25rem;
border-radius: var(--radius);
border: 1px solid var(--gray-200);
}
.contact-link {
display: flex;
align-items: center;
gap: 0.5rem;
padding: 0.75rem 1rem;
background: white;
border-radius: var(--radius);
color: var(--primary);
text-decoration: none;
font-weight: 500;
transition: all 0.2s ease;
box-shadow: var(--shadow-sm);
border: 1px solid var(--gray-200);
}
.contact-link:hover {
transform: translateY(-2px);
box-shadow: var(--shadow);
color: var(--primary-dark);
border-color: var(--primary-light);
}
/* Accordéon pour workflow */
.accordion {
border-radius: var(--radius);
overflow: hidden;
margin-bottom: 1.5rem;
}
.accordion-header {
background: var(--gray-50);
padding: 1.25rem;
cursor: pointer;
display: flex;
align-items: center;
justify-content: space-between;
font-weight: 600;
color: var(--primary);
border: 1px solid var(--gray-200);
border-radius: var(--radius);
transition: all 0.3s ease;
}
.accordion-header:hover {
background: var(--gray-100);
}
.accordion-header::after {
content: "↓";
transition: transform 0.3s ease;
}
.accordion.active .accordion-header::after {
transform: rotate(180deg);
}
.accordion-content {
max-height: 0;
overflow: hidden;
transition: max-height 0.3s ease;
background: white;
border: 1px solid var(--gray-200);
border-top: 0;
border-radius: 0 0 var(--radius) var(--radius);
padding: 0 1.25rem;
}
.accordion.active .accordion-content {
max-height: 1000px;
padding: 1.25rem;
}
.workflow-container {
text-align: center;
}
.workflow-container img {
max-width: 100%;
border-radius: var(--radius);
box-shadow: var(--shadow);
margin-top: 1rem;
}
/* Instructions */
.instructions {
background: white;
padding: 1.5rem;
border-radius: var(--radius);
border: 1px solid var(--gray-200);
box-shadow: var(--shadow);
margin-bottom: 2rem;
}
.instructions h3 {
color: var(--primary);
margin-top: 0;
margin-bottom: 1rem;
font-weight: 600;
font-size: 1.25rem;
display: flex;
align-items: center;
gap: 0.5rem;
}
.instructions h3::before {
content: '📋';
}
.instructions ol {
margin: 0;
padding-left: 1.5rem;
}
.instructions li {
margin-bottom: 0.75rem;
position: relative;
padding-left: 0.5rem;
}
.instructions li:last-child {
margin-bottom: 0;
}
/* Upload section */
.upload-section {
background: white;
border-radius: var(--radius);
box-shadow: var(--shadow);
border: 1px solid var(--gray-200);
padding: 1.5rem;
}
/* File input styling */
.file-container {
border: 2px dashed var(--primary-light) !important;
border-radius: var(--radius) !important;
padding: 2rem !important;
text-align: center !important;
transition: all 0.3s ease !important;
background-color: rgba(79, 70, 229, 0.05) !important;
cursor: pointer !important;
position: relative;
}
.file-container:hover {
background-color: rgba(79, 70, 229, 0.1) !important;
}
.file-container::before {
content: "📄";
font-size: 2rem;
display: block;
margin-bottom: 0.5rem;
}
button.primary {
background: linear-gradient(135deg, var(--primary), var(--primary-dark)) !important;
color: white !important;
border: none !important;
padding: 0.75rem 1.5rem !important;
font-weight: 600 !important;
border-radius: var(--radius) !important;
transition: all 0.3s ease !important;
box-shadow: 0 4px 6px rgba(79, 70, 229, 0.25) !important;
width: 100% !important;
margin-top: 1rem !important;
}
button.primary:hover {
transform: translateY(-2px) !important;
box-shadow: 0 7px 14px rgba(79, 70, 229, 0.3) !important;
}
/* Results tabs */
.tabs .tab-nav {
background-color: var(--gray-50) !important;
padding: 0.5rem !important;
border-radius: var(--radius) var(--radius) 0 0 !important;
border: 1px solid var(--gray-200) !important;
border-bottom: none !important;
}
.tabs .tab-nav button {
margin: 0 !important;
padding: 0.75rem 1rem !important;
font-weight: 500 !important;
color: var(--text-secondary) !important;
position: relative !important;
transition: all 0.3s ease !important;
}
.tabs .tab-nav button.selected {
color: var(--primary) !important;
font-weight: 600 !important;
}
.tabs .tab-nav button.selected::after {
content: '';
position: absolute;
bottom: -0.5rem;
left: 0;
width: 100%;
height: 3px;
background: var(--primary);
border-radius: 3px 3px 0 0;
}
.tabs .tabitem {
background: white !important;
padding: 1.5rem !important;
border-radius: 0 0 var(--radius) var(--radius) !important;
border: 1px solid var(--gray-200) !important;
box-shadow: var(--shadow) !important;
}
/* Card components */
.info-card {
background: white;
padding: 0;
border-radius: var(--radius);
margin-bottom: 1.5rem;
border: 1px solid var(--gray-200);
box-shadow: var(--shadow);
overflow: hidden;
transition: transform 0.2s ease, box-shadow 0.2s ease;
}
.info-card:hover {
transform: translateY(-2px);
box-shadow: var(--shadow-md);
}
.info-card h3 {
margin: 0;
color: white;
font-size: 1.1rem;
font-weight: 600;
padding: 1rem 1.5rem;
background: linear-gradient(135deg, var(--primary-light), var(--primary-dark));
position: relative;
}
.info-card .content {
padding: 1.25rem;
}
/* Formatage des listes dans les cartes */
.list-container {
display: flex;
flex-direction: column;
gap: 1rem;
}
.list-item {
padding: 1rem;
background: var(--gray-50);
border-radius: var(--radius);
border: 1px solid var(--gray-200);
transition: all 0.2s ease;
}
.list-item:hover {
background: white;
border-color: var(--primary-light);
box-shadow: var(--shadow-sm);
}
.list-item-header {
font-weight: 600;
color: var(--primary);
margin-bottom: 0.5rem;
display: flex;
align-items: center;
gap: 0.5rem;
}
.list-item-header::before {
content: '•';
color: var(--primary);
font-size: 1.5rem;
line-height: 1;
}
.list-item-content {
color: var(--text-secondary);
font-size: 0.95rem;
}
/* Améliorations tables */
.tables-container {
display: flex;
flex-direction: column;
gap: 2rem;
}
.table-wrapper {
overflow: hidden;
border-radius: var(--radius);
box-shadow: var(--shadow);
background: white;
}
.table-wrapper h4 {
padding: 1rem;
margin: 0;
background: linear-gradient(to right, var(--primary-light), var(--primary));
color: white;
font-weight: 600;
}
.table-description {
margin: 0;
padding: 0.75rem 1rem;
background: var(--gray-50);
color: var(--text-secondary);
border-bottom: 1px solid var(--gray-200);
font-size: 0.9rem;
font-style: italic;
}
.data-table {
width: 100%;
border-collapse: collapse;
font-size: 0.95rem;
}
.data-table th {
background: var(--gray-100);
padding: 0.75rem 1rem;
text-align: left;
font-weight: 600;
color: var(--primary-dark);
border-bottom: 2px solid var(--primary-light);
}
.data-table td {
padding: 0.75rem 1rem;
border-bottom: 1px solid var(--gray-200);
color: var(--text-secondary);
}
.data-table tr:last-child td {
border-bottom: none;
}
.data-table tr:nth-child(even) {
background-color: var(--gray-50);
}
.data-table tr:hover {
background-color: rgba(79, 70, 229, 0.05);
}
/* Metadata grid */
.metadata-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
gap: 1rem;
}
.metadata-item {
background: var(--gray-50);
padding: 1rem;
border-radius: var(--radius);
border: 1px solid var(--gray-200);
transition: all 0.2s ease;
}
.metadata-item:hover {
background: white;
border-color: var(--primary-light);
box-shadow: var(--shadow-sm);
}
.metadata-item h4 {
margin: 0 0 0.5rem 0;
color: var(--primary);
font-weight: 600;
font-size: 0.9rem;
text-transform: uppercase;
letter-spacing: 0.5px;
}
.metadata-item p {
margin: 0;
color: var(--text-primary);
font-weight: 500;
}
/* JSON viewer */
.json-viewer {
background: var(--dark);
color: #e2e8f0;
padding: 1.25rem;
border-radius: var(--radius);
overflow: auto;
font-family: 'Fira Code', 'Courier New', monospace;
font-size: 0.9rem;
line-height: 1.5;
max-height: 400px;
white-space: pre-wrap;
}
/* Loading animation */
.loading-spinner {
display: inline-block;
width: 50px;
height: 50px;
border: 3px solid rgba(79, 70, 229, 0.3);
border-radius: 50%;
border-top-color: var(--primary);
animation: spin 1s ease-in-out infinite;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
/* Error message */
.error {
padding: 1rem;
background-color: #fee2e2;
border: 1px solid #fecaca;
border-radius: var(--radius);
color: #b91c1c;
font-weight: 500;
}
/* Responsive design */
@media (max-width: 1024px) {
.main-content {
flex-direction: column;
}
.left-panel, .right-panel {
flex: none;
width: 100%;
}
}
"""
# Prompt for Gemini, with strengthened instructions for table extraction.
# Sent together with each page image by process_single_image(); it asks the
# model for a JSON object whose top-level keys (metadata, entities, values,
# dates, tables, key_points, references) are the ones merge_results() and
# process_and_display() read.
GEMINI_PROMPT = """
Analyze this document and extract relevant information in JSON format. Adapt the extraction based on the document type (invoice, contract, report, KID, etc.).
Expected response structure:
{
"metadata": {
"title": "Document title",
"date": "Document date",
"type": "Document type",
"author": "Document author or issuer"
},
"entities": [
{
"name": "Entity name",
"type": "Entity type (person, organization, etc.)",
"role": "Role in the document"
}
],
"values": [
{
"description": "Value description",
"value": "Exact value",
"unit": "Unit if applicable"
}
],
"dates": [
{
"description": "Date description",
"date": "Exact date",
"importance": "Importance (high, medium, low)"
}
],
"tables": [
{
"title": "Table title",
"description": "Table description",
"data": [
{
"column1": "Value in row 1, column 1",
"column2": "Value in row 1, column 2",
"column3": "Value in row 1, column 3"
},
{
"column1": "Value in row 2, column 1",
"column2": "Value in row 2, column 2",
"column3": "Value in row 2, column 3"
}
]
}
],
"key_points": [
{
"category": "Key point category",
"description": "Detailed description",
"importance": "Importance (high, medium, low)"
}
],
"references": [
{
"type": "Reference type",
"value": "Reference value"
}
]
}
Important instructions:
1. First identify the document type and adapt the extraction accordingly
2. For tables (this is EXTREMELY important):
- Pay special attention to detect and extract ALL tables in the document
- Carefully identify tables even if they don't have visible borders or lines
- Identify column headers correctly (first row or separate header row)
- Extract all rows and all columns with exact cell values
- Maintain the same number of columns for each row
- Preserve the exact structure of each table
- For each table, provide a descriptive title based on content
- For each table, include a brief description explaining what the table contains
- If a table spans multiple pages, try to reconstruct it as one table
- Include ALL data from the table, don't omit any rows or columns
3. For values:
- Extract amounts, percentages, numbers
- Include units when present
4. For dates:
- Extract all important dates
- Include the context of each date
5. For entities:
- Identify people, organizations, locations
- Include their role in the document
6. For references:
- Extract reference numbers, codes, identifiers
7. For key points:
- Identify important information based on document type
- Categorize them appropriately
General rules:
- Respond only with JSON, without any additional text
- Extract only factual and verifiable information
- Be precise with values and dates
- If a category is not relevant for the document, leave an empty array
- Adapt categories based on document type
- Do not make assumptions about missing data
"""
def create_info_card(title, content):
    """Wrap pre-rendered HTML in a titled ``info-card`` container.

    Args:
        title: Heading text placed in the card's ``<h3>``.
        content: HTML fragment for the card body. When falsy, the
            ``TEXT["no_data"]`` placeholder paragraph is shown instead.

    Returns:
        The card markup as a string.
    """
    # Both cases share the same shell; only the body line differs.
    body = content if content else f"<p>{TEXT['no_data']}</p>"
    return (
        "\n"
        '<div class="info-card">\n'
        f"<h3>{title}</h3>\n"
        '<div class="content">\n'
        f"{body}\n"
        "</div>\n"
        "</div>\n"
    )
def format_list(items, key1, key2):
    """Render a list of records as HTML ``list-item`` blocks.

    Args:
        items: Iterable of records (normally dicts parsed from the model's
            JSON). Falsy input yields the ``TEXT["no_data"]`` placeholder.
        key1: Key whose value becomes the item header.
        key2: Key whose value becomes the item content.

    Returns:
        An HTML string: a ``list-container`` div with one ``list-item`` per
        record, or the "no data" text when ``items`` is empty.
    """
    from html import escape  # function-scope import keeps the block self-contained

    if not items:
        return TEXT["no_data"]

    def field(record, key):
        # The values come from model output over an uploaded document, so they
        # are untrusted: tolerate missing keys / None and escape before
        # embedding in HTML.
        value = record.get(key, "")
        return escape("" if value is None else str(value))

    rendered = []
    for item in items:
        if isinstance(item, dict):
            header, body = field(item, key1), field(item, key2)
        else:
            # The model occasionally returns bare strings instead of objects;
            # show them rather than crashing.
            header, body = escape(str(item)), ""
        rendered.append(f"""
        <div class='list-item'>
        <div class='list-item-header'>{header}</div>
        <div class='list-item-content'>{body}</div>
        </div>
        """)
    return "<div class='list-container'>" + "".join(rendered) + "</div>"
def format_table(table_data):
    """Render extracted tables as HTML.

    Args:
        table_data: List of table dicts with optional ``title`` and
            ``description`` and a ``data`` list whose rows are either dicts
            (column name -> value) or lists of cell values.

    Returns:
        An HTML string with one ``table-wrapper`` per non-empty table, or the
        ``TEXT["no_data"]`` placeholder when nothing can be shown. A table that
        cannot be rendered is replaced by an error box; the other tables are
        still rendered.
    """
    from html import escape  # function-scope import keeps the block self-contained

    if not table_data:
        return TEXT["no_data"]

    def text(value):
        # Cell/title text originates from model output: escape before embedding.
        return escape("" if value is None else str(value))

    def render(table):
        rows = table['data']
        title = text(table.get('title', 'Tableau sans titre'))
        description = text(table.get('description', ''))
        parts = [f"""
        <div class='table-wrapper'>
        <h4>{title}</h4>
        <p class='table-description'>{description}</p>
        <table class='data-table'>
        """]
        first_row = rows[0]
        if isinstance(first_row, dict):
            # Ordered union of the keys of every row, so a column that first
            # appears after row 1 is not silently dropped.
            headers = list(dict.fromkeys(
                key for row in rows if isinstance(row, dict) for key in row
            ))
            parts.append(
                "<tr>" + "".join(f"<th>{text(h)}</th>" for h in headers) + "</tr>"
            )
            for row in rows:
                cells = row if isinstance(row, dict) else {}
                parts.append(
                    "<tr>"
                    + "".join(f"<td>{text(cells.get(h, ''))}</td>" for h in headers)
                    + "</tr>"
                )
        elif isinstance(first_row, list):
            for row in rows:
                cells = row if isinstance(row, list) else [row]
                parts.append(
                    "<tr>" + "".join(f"<td>{text(c)}</td>" for c in cells) + "</tr>"
                )
        parts.append("</table></div>")
        return "".join(parts)

    rendered = []
    for table in table_data:
        # Each table is rendered in isolation so a malformed one neither leaves
        # unclosed tags behind nor hides the tables that follow it.
        try:
            if not table.get('data'):
                continue
            rendered.append(render(table))
        except (AttributeError, KeyError, IndexError, TypeError) as e:
            print(f"Erreur lors du formatage des tableaux: {str(e)}")
            rendered.append("""
            <div class='error'>
            Erreur lors de l'affichage des tableaux. Veuillez vérifier le format JSON.
            </div>
            """)

    if not rendered:
        return TEXT["no_data"]
    return "<div class='tables-container'>" + "".join(rendered) + "</div>"
def process_single_image(image):
    """Send one image to Gemini and return the parsed extraction result.

    Args:
        image: The page/image object passed to the model alongside
            ``GEMINI_PROMPT``.

    Returns:
        The decoded JSON object (a dict) with its ``tables`` entries
        normalised, or ``{"error": message}`` when the API call fails or the
        response cannot be parsed into a JSON object.
    """
    try:
        print("Envoi de l'image à Gemini pour analyse...")
        response = model.generate_content(
            [GEMINI_PROMPT, image],
            generation_config={
                "temperature": 0.1,
                "top_p": 0.8,
                "top_k": 40,
                # NOTE(review): 2048 output tokens may truncate the JSON on
                # table-heavy pages, which would surface as a JSON decode
                # error below -- confirm this limit is intended.
                "max_output_tokens": 2048,
            }
        )
    except Exception as e:
        print(f"Erreur lors de l'appel à Gemini: {str(e)}")
        return {"error": str(e)}

    response_text = ""
    try:
        response_text = response.text.strip()
        print(f"Réponse reçue de Gemini, longueur: {len(response_text)} caractères")
        # Strip only the surrounding Markdown code fence. A global
        # replace("```", "") would also delete backticks that are part of the
        # extracted content.
        if response_text.startswith("```"):
            response_text = response_text[3:]
            if response_text[:4].lower() == "json":
                response_text = response_text[4:]
            response_text = response_text.strip()
            if response_text.endswith("```"):
                response_text = response_text[:-3].strip()
        json_data = json.loads(response_text)
    except json.JSONDecodeError as e:
        print(f"Erreur de décodage JSON: {str(e)}")
        print(f"Contenu problématique: {response_text[:500]}...")
        return {"error": "Erreur de format JSON dans la réponse"}
    except Exception as e:
        print(f"Erreur lors du traitement de la réponse Gemini: {str(e)}")
        return {"error": str(e)}

    # Callers index the result by key; valid JSON that is not an object
    # (e.g. a bare list) is therefore reported as a format error.
    if not isinstance(json_data, dict):
        print(f"Réponse JSON inattendue (type {type(json_data).__name__})")
        return {"error": "Erreur de format JSON dans la réponse"}

    # Normalise tables so every entry is a dict with data/title/description.
    tables = json_data.get("tables")
    if tables:
        normalised = []
        for i, table in enumerate(tables if isinstance(tables, list) else []):
            if not isinstance(table, dict):
                continue
            if not table.get("data"):
                table["data"] = []
            if not table.get("title"):
                table["title"] = f"Tableau {i+1}"
            table.setdefault("description", "")
            normalised.append(table)
        json_data["tables"] = normalised
    return json_data
def merge_results(results):
    """Combine per-page extraction results into a single result.

    Args:
        results: Sequence of per-page result mappings.

    Returns:
        ``None`` when ``results`` is empty; otherwise a dict whose
        ``metadata`` is the first non-empty metadata found and whose list
        categories are the page lists concatenated in page order.
    """
    if not results:
        return None

    list_fields = ("entities", "values", "dates", "tables", "key_points", "references")

    combined = {"metadata": {}}
    combined.update((field, []) for field in list_fields)

    # Metadata: the first page that actually carries some wins.
    combined["metadata"] = next(
        (page["metadata"] for page in results if "metadata" in page and page["metadata"]),
        {},
    )

    # List categories: append every page's entries, preserving page order.
    for field in list_fields:
        bucket = combined[field]
        for page in results:
            if field in page and page[field]:
                bucket.extend(page[field])

    return combined
def process_document(file, progress=gr.Progress()):
    """Extract structured information from an uploaded PDF or image.

    PDFs are rendered page by page and each page image is sent to
    ``process_single_image``; the per-page results are combined with
    ``merge_results``. Image files (png/jpg/jpeg) are sent directly.

    Args:
        file: Uploaded file object exposing a ``name`` path attribute.
        progress: Gradio progress tracker, updated once per PDF page.

    Returns:
        The extraction dict, or ``{"error": message}`` on any failure
        (missing file, more than 10 pages, nothing extracted, unsupported
        extension, or an unexpected exception).
    """
    if not file:
        return {"error": TEXT["error"]["file_not_found"]}
    try:
        if file.name.lower().endswith('.pdf'):
            # Work on a private copy in case the upload is moved or modified
            # while it is being processed.
            temp_dir = tempfile.mkdtemp()
            try:
                temp_pdf = os.path.join(temp_dir, "temp.pdf")
                shutil.copy2(file.name, temp_pdf)
                results = []
                # The context manager closes the document on every exit path,
                # including the early "too many pages" return and exceptions.
                with fitz.open(temp_pdf) as doc:
                    page_count = doc.page_count
                    if page_count > 10:
                        return {"error": TEXT["error"]["too_many_pages"]}
                    print(f"Traitement d'un PDF de {page_count} pages")
                    for i in range(page_count):
                        progress((i+1) / page_count, desc=f"{TEXT['processing']} page {i+1}/{page_count}")
                        try:
                            page = doc[i]
                            # Render at 2x zoom for better legibility.
                            zoom = 2.0
                            mat = fitz.Matrix(zoom, zoom)
                            pix = page.get_pixmap(matrix=mat, alpha=False)
                            # Convert the pixmap to a PIL image.
                            img_data = pix.tobytes("jpeg")
                            pil_img = Image.open(io.BytesIO(img_data))
                            result = process_single_image(pil_img)
                            if result and "error" not in result:
                                results.append(result)
                                print(f"Page {i+1} traitée avec succès")
                            else:
                                print(f"Pas d'informations extraites de la page {i+1}")
                        except Exception as e:
                            # Best effort: a failing page must not abort the others.
                            print(f"Erreur lors du traitement de la page {i+1}: {str(e)}")
                if results:
                    return merge_results(results)
                return {"error": TEXT["error"]["no_info"]}
            except Exception as e:
                print(f"Erreur lors du traitement du PDF: {str(e)}")
                return {"error": str(e)}
            finally:
                # Single cleanup point, run after the document has been closed.
                shutil.rmtree(temp_dir, ignore_errors=True)
        elif file.name.lower().endswith(('.png', '.jpg', '.jpeg')):
            try:
                # `with` releases the underlying file handle once processed.
                with Image.open(file.name) as image:
                    return process_single_image(image)
            except Exception as e:
                print(f"Erreur lors du traitement de l'image: {str(e)}")
                return {"error": str(e)}
        else:
            return {"error": TEXT["error"]["file_not_found"]}
    except Exception as e:
        print(f"Erreur inattendue dans process_document: {str(e)}")
        return {"error": str(e)}
def update_preview(file):
    """Build the preview image list for an uploaded file.

    For a PDF, up to the first 3 pages are rendered to JPEG files; for an
    image, the file itself is used.

    Args:
        file: Uploaded file object exposing a ``name`` path attribute.

    Returns:
        A list of image file paths; empty when there is no file, the format
        is unsupported, or conversion fails.
    """
    if not file:
        return []
    try:
        if file.name.lower().endswith('.pdf'):
            # Work on a private copy in case the upload is moved or modified
            # while it is being processed.
            temp_dir = tempfile.mkdtemp()
            try:
                temp_pdf = os.path.join(temp_dir, "temp.pdf")
                shutil.copy2(file.name, temp_pdf)
                # Previews get a unique directory of their own. Fixed names in
                # the working directory would let concurrent sessions overwrite
                # (and therefore see) each other's pages. The directory is left
                # in place because the returned paths must outlive this call.
                preview_dir = tempfile.mkdtemp(prefix="preview_")
                image_paths = []
                # The context manager closes the document even if rendering fails.
                with fitz.open(temp_pdf) as doc:
                    # Only preview the first 3 pages.
                    max_pages = min(3, doc.page_count)
                    print(f"PDF a {doc.page_count} pages, prévisualisant {max_pages} pages")
                    for i in range(max_pages):
                        try:
                            page = doc[i]
                            # Render at 2x zoom for better legibility.
                            zoom = 2.0
                            mat = fitz.Matrix(zoom, zoom)
                            pix = page.get_pixmap(matrix=mat, alpha=False)
                            temp_filename = os.path.join(preview_dir, f"temp_preview_{i}.jpg")
                            pix.save(temp_filename, "jpeg")
                            image_paths.append(temp_filename)
                            print(f"Page {i+1} convertie et sauvegardée dans {temp_filename}")
                        except Exception as e:
                            # Best effort: skip a page that fails to render.
                            print(f"Erreur lors du traitement de la page {i+1}: {str(e)}")
                print(f"Prévisualisation créée avec succès: {len(image_paths)} images")
                return image_paths
            except Exception as e:
                print(f"Erreur lors de la conversion PDF: {str(e)}")
                return []
            finally:
                # Remove the working copy of the PDF (not the preview images).
                shutil.rmtree(temp_dir, ignore_errors=True)
        elif file.name.lower().endswith(('.png', '.jpg', '.jpeg')):
            return [file.name]
        else:
            print(f"Format de fichier non pris en charge: {file.name}")
            return []
    except Exception as e:
        print(f"Erreur inattendue dans update_preview: {str(e)}")
        return []
def process_and_display(file):
    """Process a document and build the HTML for the eight result tabs.

    Args:
        file: Uploaded file object, forwarded to ``process_document``.

    Returns:
        A list of 8 HTML strings in tab order: overview (metadata), entities,
        values, dates, tables, key points, references, complete JSON. On
        failure all eight entries contain the same error box.
    """
    from html import escape  # function-scope import keeps the block self-contained

    if not file:
        return [f"<div class='error'>{TEXT['error']['file_not_found']}</div>"] * 8
    result = process_document(file)
    if "error" in result:
        error_msg = result["error"]
        # Accept either an error key of TEXT["error"] or a ready-made message.
        if error_msg in TEXT["error"]:
            error_msg = TEXT["error"][error_msg]
        # Exception text may contain markup-like characters: escape it.
        return [f"<div class='error'>{escape(str(error_msg))}</div>"] * 8

    # Format metadata as HTML. Keys and values come from model output over an
    # uploaded document, so they are escaped before being embedded.
    metadata = result.get("metadata")
    metadata_html = "<div class='metadata-grid'>"
    if isinstance(metadata, dict) and metadata:
        metadata_html += "".join(
            f"""
            <div class='metadata-item'>
            <h4>{escape(str(key))}</h4>
            <p>{escape("" if value is None else str(value))}</p>
            </div>
            """
            for key, value in metadata.items()
        )
    else:
        metadata_html += f"<p>{TEXT['no_data']}</p>"
    metadata_html += "</div>"

    # Format JSON data; escaping keeps a "</pre>" inside the data from
    # breaking out of the viewer.
    json_text = json.dumps(result, indent=2, ensure_ascii=False)
    json_html = f"<pre class='json-viewer'>{escape(json_text, quote=False)}</pre>"

    # One entry per output tab, in display order.
    outputs = [
        metadata_html,
        create_info_card(TEXT["tabs"]["entities"], format_list(result.get("entities", []), "name", "role")),
        create_info_card(TEXT["tabs"]["values"], format_list(result.get("values", []), "description", "value")),
        create_info_card(TEXT["tabs"]["dates"], format_list(result.get("dates", []), "description", "date")),
        create_info_card(TEXT["tabs"]["tables"], format_table(result.get("tables", []))),
        create_info_card(TEXT["tabs"]["keypoints"], format_list(result.get("key_points", []), "category", "description")),
        create_info_card(TEXT["tabs"]["references"], format_list(result.get("references", []), "type", "value")),
        json_html
    ]
    return outputs
# Helper that encodes an image file as base64 text
def get_image_base64(file_path):
    """Return the contents of ``file_path`` as a base64 string.

    Any failure (for example a missing file) is logged with ``print`` and an
    empty string is returned instead of raising.
    """
    try:
        with open(file_path, "rb") as handle:
            raw_bytes = handle.read()
        return str(base64.b64encode(raw_bytes), 'utf-8')
    except Exception as exc:
        print(f"Erreur lors de l'encodage de l'image {file_path}: {str(exc)}")
        return ""
# Paths to the bundled images (in the "static" folder next to this file)
logo_path, workflow_path = (
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "static", asset_name)
    for asset_name in (
        "elixir-logo-typo.png",
        "Editor _ Mermaid Chart-2025-04-15-142548.png",
    )
)
# Encode both images as base64 so they can be embedded as data URIs
logo_base64, workflow_base64 = map(get_image_base64, (logo_path, workflow_path))
# Logo and workflow HTML fragments
logo_html = (
    '<div class="header">\n'
    f'<img src="data:image/png;base64,{logo_base64}" alt="Elixir Logo" style="max-height: 40px; position: relative; z-index: 2;">\n'
    '</div>'
)
workflow_html = (
    '<div class="workflow-container">\n'
    f'<img src="data:image/png;base64,{workflow_base64}" alt="Elixir Workflow" style="max-width: 100%; border-radius: 0.5rem;">\n'
    '</div>'
)
| # JavaScript for the accordion and other interactive behaviour. | |
| # The string holds one complete <script> element that, on DOMContentLoaded, | |
| # toggles the 'active' class on the parent of each '.accordion-header' when clicked | |
| # and applies a lift/shadow effect to '.card' elements on mouseenter/mouseleave. | |
| # NOTE(review): this string is handed to gr.HTML(js_code) in the layout; browsers usually | |
| # do not execute <script> tags inserted as HTML content - confirm these handlers actually run. | |
| js_code = """ | |
| <script> | |
| document.addEventListener('DOMContentLoaded', function() { | |
| // Accordéon | |
| const accordions = document.querySelectorAll('.accordion-header'); | |
| accordions.forEach(accordion => { | |
| accordion.addEventListener('click', function() { | |
| this.parentElement.classList.toggle('active'); | |
| }); | |
| }); | |
| // Animation des cartes au survol | |
| const cards = document.querySelectorAll('.card'); | |
| cards.forEach(card => { | |
| card.addEventListener('mouseenter', function() { | |
| this.style.transform = 'translateY(-5px)'; | |
| this.style.boxShadow = '0 10px 15px -3px rgba(0, 0, 0, 0.1), 0 4px 6px -2px rgba(0, 0, 0, 0.05)'; | |
| }); | |
| card.addEventListener('mouseleave', function() { | |
| this.style.transform = 'translateY(0)'; | |
| this.style.boxShadow = '0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06)'; | |
| }); | |
| }); | |
| }); | |
| </script> | |
| """ | |
| # Improved Gradio interface: page header, an intro row, an upload/preview column, | |
| # result tabs with a JSON view, a loading indicator, and the two event bindings at the end. | |
| with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo: | |
| gr.HTML(js_code) # Add the JavaScript | |
| # Header with logo | |
| header = gr.HTML(logo_html) | |
| # First row: Document Intelligence + How Elixir Works | |
| with gr.Row(equal_height=True): | |
| # Document Intelligence on the left | |
| with gr.Column(scale=1): | |
| gr.HTML(f""" | |
| <div class="intro-card"> | |
| <div class="intro-header"> | |
| <h3>📄 Document Intelligence</h3> | |
| </div> | |
| <div class="intro-body"> | |
| <div class="intro-description"> | |
| {TEXT["description"]} | |
| </div> | |
| <div class="contact-links"> | |
| <a href="https://lexiapro.fr/" target="_blank" class="contact-link"> | |
| 🌐 Visit lexiapro.fr | |
| </a> | |
| <a href="mailto:martial@lexiapro.fr" class="contact-link"> | |
| ✉️ Contact us | |
| </a> | |
| </div> | |
| </div> | |
| </div> | |
| """) | |
| # How Elixir Works on the right | |
| with gr.Column(scale=1): | |
| gr.HTML(f""" | |
| <div class="intro-card"> | |
| <div class="intro-header"> | |
| <h3>🔄 How Elixir Works</h3> | |
| </div> | |
| <div class="intro-body"> | |
| {workflow_html} | |
| </div> | |
| </div> | |
| """) | |
| # Second row: usage interface with input on the left and output on the right | |
| with gr.Row(): | |
| # Left column: instructions and upload | |
| with gr.Column(scale=1): | |
| # Instructions | |
| # NOTE(review): these three items are hard-coded here and repeat TEXT["instructions"]; | |
| # keep both in sync or render the list from TEXT. | |
| gr.HTML(""" | |
| <div class="instructions"> | |
| <h3>How to use Elixir</h3> | |
| <ol> | |
| <li>Upload a PDF document (1-10 pages) such as an invoice, regulatory document, report...</li> | |
| <li>Processing by Elixir</li> | |
| <li>Transcription of identified sections and elements (without customization)</li> | |
| </ol> | |
| </div> | |
| """) | |
| # Upload section | |
| # NOTE(review): file_types also accepts .png/.jpg/.jpeg although the instructions above | |
| # mention PDF only - confirm update_preview / process_and_display handle image uploads. | |
| with gr.Group(elem_classes=["upload-section"]): | |
| file_input = gr.File(label=TEXT["upload"], file_types=[".pdf", ".png", ".jpg", ".jpeg"], elem_classes=["file-container"]) | |
| submit_btn = gr.Button(TEXT["analyze"], variant="primary", elem_classes=["primary"]) | |
| preview = gr.Gallery(label=TEXT["preview"], show_label=True, elem_id="preview-gallery") | |
| # Right column: results and JSON | |
| with gr.Column(scale=1): | |
| # Result tabs (one HTML view per tab) | |
| with gr.Tabs(elem_classes=["tabs"]) as tabs: | |
| with gr.TabItem(TEXT["tabs"]["overview"]): | |
| metadata_view = gr.HTML() | |
| with gr.TabItem(TEXT["tabs"]["entities"]): | |
| entities_view = gr.HTML() | |
| with gr.TabItem(TEXT["tabs"]["values"]): | |
| values_view = gr.HTML() | |
| with gr.TabItem(TEXT["tabs"]["dates"]): | |
| dates_view = gr.HTML() | |
| with gr.TabItem(TEXT["tabs"]["tables"]): | |
| tables_view = gr.HTML() | |
| with gr.TabItem(TEXT["tabs"]["keypoints"]): | |
| keypoints_view = gr.HTML() | |
| with gr.TabItem(TEXT["tabs"]["references"]): | |
| references_view = gr.HTML() | |
| # Complete JSON below the tabs | |
| # NOTE(review): the two <div>s opened in this component are only closed by a separate | |
| # gr.HTML("</div></div>") after json_view; if each gr.HTML renders in its own container, | |
| # json_view is not actually nested inside this card - verify in the browser. | |
| gr.HTML(""" | |
| <div class="intro-card" style="margin-top: 1.5rem;"> | |
| <div class="intro-header"> | |
| <h3>📄 Complete JSON</h3> | |
| </div> | |
| <div class="intro-body" style="padding: 0.75rem;"> | |
| """) | |
| json_view = gr.HTML() | |
| gr.HTML("</div></div>") | |
| # Loading animation: a hidden #loading element plus a script meant to show it when | |
| # "button.primary" is clicked and hide it once nodes are added under ".tabs". | |
| # NOTE(review): <script> tags inserted through gr.HTML are usually not executed by the | |
| # browser - confirm this indicator is ever displayed. | |
| loading_indicator = gr.HTML(f""" | |
| <div id="loading" style="display:none; text-align:center; padding: 2rem;"> | |
| <div class="loading-spinner"></div> | |
| <p style="margin-top: 1rem; color: var(--primary);">{TEXT['processing']}</p> | |
| </div> | |
| <script> | |
| document.addEventListener('DOMContentLoaded', function() {{ | |
| const btn = document.querySelector("button.primary"); | |
| const loading = document.getElementById("loading"); | |
| if (btn && loading) {{ | |
| btn.addEventListener("click", function() {{ | |
| loading.style.display = "block"; | |
| const observer = new MutationObserver(function(mutations) {{ | |
| mutations.forEach(function(mutation) {{ | |
| if (mutation.addedNodes.length) {{ | |
| loading.style.display = "none"; | |
| observer.disconnect(); | |
| }} | |
| }}); | |
| }}); | |
| const resultsContainer = document.querySelector(".tabs"); | |
| if (resultsContainer) {{ | |
| observer.observe(resultsContainer, {{ childList: true, subtree: true }}); | |
| }} | |
| }}); | |
| }} | |
| }}); | |
| </script> | |
| """) | |
| # Refresh the preview gallery whenever the selected file changes | |
| file_input.change( | |
| fn=update_preview, | |
| inputs=file_input, | |
| outputs=preview | |
| ) | |
| # Run the analysis on click; eight output components are listed here, in the order | |
| # overview, entities, values, dates, tables, key points, references, JSON | |
| submit_btn.click( | |
| fn=process_and_display, | |
| inputs=file_input, | |
| outputs=[metadata_view, entities_view, values_view, dates_view, tables_view, keypoints_view, references_view, json_view] | |
| ) | |
# Start the app only when this file is executed as a script
if __name__ == "__main__":
    # Launch options: share link requested, bind address 0.0.0.0, port 7860
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.launch(**launch_options)