Spaces:
Sleeping
Sleeping
Upload 22 files
Browse files- .gitattributes +7 -0
- .github/copilot-instructions.md +23 -0
- .gitignore +44 -0
- Dockerfile +49 -0
- README.md +318 -10
- __pycache__/app.cpython-311.pyc +0 -0
- __pycache__/auth.cpython-311.pyc +0 -0
- __pycache__/cost_tracker.cpython-311.pyc +0 -0
- __pycache__/ocr_invoice.cpython-311.pyc +0 -0
- app.py +438 -0
- cost_tracker.py +113 -0
- invoices.db +0 -0
- ocr_invoice.py +616 -0
- requirements.txt +9 -0
- static/index.html +802 -0
- static/logo.svg +0 -0
- static/uploads/invoice (1).jpg +3 -0
- static/uploads/invoice (2).jpg +3 -0
- static/uploads/invoice (3).jpg +3 -0
- static/uploads/invoice (4).jpg +3 -0
- static/uploads/invoice (5).jpg +3 -0
- static/uploads/invoice (6).jpg +3 -0
- static/uploads/invoice (8).jpg +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
static/uploads/invoice[[:space:]](1).jpg filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
static/uploads/invoice[[:space:]](2).jpg filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
static/uploads/invoice[[:space:]](3).jpg filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
static/uploads/invoice[[:space:]](4).jpg filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
static/uploads/invoice[[:space:]](5).jpg filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
static/uploads/invoice[[:space:]](6).jpg filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
static/uploads/invoice[[:space:]](8).jpg filter=lfs diff=lfs merge=lfs -text
|
.github/copilot-instructions.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Invoice OCR System - Project Setup
|
| 2 |
+
|
| 3 |
+
## Project Type
|
| 4 |
+
Python FastAPI application with Google Document AI OCR + Gemini 2.5 Flash Lite
|
| 5 |
+
|
| 6 |
+
## Requirements
|
| 7 |
+
- Extract invoice data: supplier info, invoice number, dates, line items (code, description, qty, price), totals
|
| 8 |
+
- Web interface with drag-and-drop upload
|
| 9 |
+
- SQLite database for storage
|
| 10 |
+
- Cost tracking
|
| 11 |
+
- Docker deployment for Hugging Face Spaces
|
| 12 |
+
|
| 13 |
+
## Progress
|
| 14 |
+
|
| 15 |
+
- [x] Verify copilot-instructions.md created
|
| 16 |
+
- [ ] Clarify Project Requirements
|
| 17 |
+
- [ ] Scaffold the Project
|
| 18 |
+
- [ ] Customize the Project
|
| 19 |
+
- [ ] Install Required Extensions
|
| 20 |
+
- [ ] Compile the Project
|
| 21 |
+
- [ ] Create and Run Task
|
| 22 |
+
- [ ] Launch the Project
|
| 23 |
+
- [ ] Ensure Documentation is Complete
|
.gitignore
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment files
|
| 2 |
+
.env
|
| 3 |
+
*.env
|
| 4 |
+
|
| 5 |
+
# Google Cloud credentials
|
| 6 |
+
*.json
|
| 7 |
+
!static/*.json
|
| 8 |
+
|
| 9 |
+
# Database
|
| 10 |
+
*.db
|
| 11 |
+
*.sqlite
|
| 12 |
+
|
| 13 |
+
# Uploads and outputs
|
| 14 |
+
uploads/
|
| 15 |
+
*_invoice.json
|
| 16 |
+
|
| 17 |
+
# Python
|
| 18 |
+
__pycache__/
|
| 19 |
+
*.py[cod]
|
| 20 |
+
*$py.class
|
| 21 |
+
*.so
|
| 22 |
+
.Python
|
| 23 |
+
*.egg-info/
|
| 24 |
+
dist/
|
| 25 |
+
build/
|
| 26 |
+
|
| 27 |
+
# IDE
|
| 28 |
+
.vscode/
|
| 29 |
+
.idea/
|
| 30 |
+
*.swp
|
| 31 |
+
*.swo
|
| 32 |
+
*~
|
| 33 |
+
|
| 34 |
+
# OS
|
| 35 |
+
.DS_Store
|
| 36 |
+
Thumbs.db
|
| 37 |
+
|
| 38 |
+
# Logs
|
| 39 |
+
*.log
|
| 40 |
+
usage_costs.json
|
| 41 |
+
|
| 42 |
+
# Local test files
|
| 43 |
+
test_*.py
|
| 44 |
+
temp/
|
Dockerfile
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install system dependencies
|
| 6 |
+
RUN apt-get update && apt-get install -y \
|
| 7 |
+
gcc \
|
| 8 |
+
g++ \
|
| 9 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
+
|
| 11 |
+
# Copy requirements
|
| 12 |
+
COPY requirements.txt .
|
| 13 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 14 |
+
|
| 15 |
+
# Copy application files
|
| 16 |
+
COPY app.py .
|
| 17 |
+
COPY ocr_invoice.py .
|
| 18 |
+
COPY cost_tracker.py .
|
| 19 |
+
COPY auth.py .
|
| 20 |
+
COPY allowed_users.txt .
|
| 21 |
+
COPY static/ ./static/
|
| 22 |
+
|
| 23 |
+
# Create necessary directories with proper permissions
|
| 24 |
+
RUN mkdir -p /tmp/uploads && chmod 777 /tmp/uploads
|
| 25 |
+
RUN mkdir -p /app/credentials && chmod 777 /app/credentials
|
| 26 |
+
|
| 27 |
+
# Expose port
|
| 28 |
+
EXPOSE 7860
|
| 29 |
+
|
| 30 |
+
# Set environment variables
|
| 31 |
+
ENV PYTHONUNBUFFERED=1
|
| 32 |
+
|
| 33 |
+
# Create startup script that handles credentials
|
| 34 |
+
RUN echo '#!/bin/bash\n\
|
| 35 |
+
if [ ! -z "$GOOGLE_APPLICATION_CREDENTIALS_JSON" ]; then\n\
|
| 36 |
+
echo "$GOOGLE_APPLICATION_CREDENTIALS_JSON" > /app/credentials/credentials.json\n\
|
| 37 |
+
export GOOGLE_APPLICATION_CREDENTIALS=/app/credentials/credentials.json\n\
|
| 38 |
+
echo "✓ Created credentials file from environment variable"\n\
|
| 39 |
+
elif [ ! -z "$GOOGLE_APPLICATION_CREDENTIALS" ]; then\n\
|
| 40 |
+
if [ ! -f "$GOOGLE_APPLICATION_CREDENTIALS" ]; then\n\
|
| 41 |
+
echo "$GOOGLE_APPLICATION_CREDENTIALS" > /app/credentials/credentials.json\n\
|
| 42 |
+
export GOOGLE_APPLICATION_CREDENTIALS=/app/credentials/credentials.json\n\
|
| 43 |
+
echo "✓ Created credentials file from GOOGLE_APPLICATION_CREDENTIALS"\n\
|
| 44 |
+
fi\n\
|
| 45 |
+
fi\n\
|
| 46 |
+
exec uvicorn app:app --host 0.0.0.0 --port 7860' > /app/start.sh && chmod +x /app/start.sh
|
| 47 |
+
|
| 48 |
+
# Run the application via startup script
|
| 49 |
+
CMD ["/bin/bash", "/app/start.sh"]
|
README.md
CHANGED
|
@@ -1,10 +1,318 @@
|
|
| 1 |
-
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Invoice OCR System
|
| 3 |
+
emoji: 📄
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
+
app_port: 7860
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# 📄 Invoice OCR System
|
| 13 |
+
|
| 14 |
+
**High-accuracy invoice processing** powered by Google Document AI + Gemini 2.0 Flash
|
| 15 |
+
|
| 16 |
+
🔒 **Access Control**: Restricted to authorized users only via Hugging Face OAuth
|
| 17 |
+
|
| 18 |
+
## 🚀 Features
|
| 19 |
+
|
| 20 |
+
✅ **Complete Invoice Extraction**:
|
| 21 |
+
- Supplier information (name, address, tax ID)
|
| 22 |
+
- Customer/Bill-to information
|
| 23 |
+
- Invoice metadata (number, dates, PO, terms)
|
| 24 |
+
- Line items (code, description, type, qty, unit, price, total)
|
| 25 |
+
- Financial summary (subtotal, tax, total)
|
| 26 |
+
- Payment information
|
| 27 |
+
|
| 28 |
+
✅ **Type Classification**: Automatic categorization (produce, protein, beverage, dairy, grain, condiment, cleaning, packaging, miscellaneous)
|
| 29 |
+
✅ **Validation & Correction**: Mandatory math validation with ±2% tolerance
|
| 30 |
+
✅ **Cost Tracking**: Real-time processing cost calculation (~$0.002 per invoice)
|
| 31 |
+
✅ **Image + Text Mode**: Maximum accuracy with dual validation
|
| 32 |
+
✅ **SQLite Database**: Complete invoice history
|
| 33 |
+
✅ **REST API**: FastAPI backend with 7 endpoints
|
| 34 |
+
✅ **Web Interface**: Drag-and-drop upload with instant results
|
| 35 |
+
✅ **Authentication**: User whitelist for secure access
|
| 36 |
+
|
| 37 |
+
---
|
| 38 |
+
|
| 39 |
+
## 💰 Cost Per Invoice
|
| 40 |
+
|
| 41 |
+
| Component | Cost |
|
| 42 |
+
|-----------|------|
|
| 43 |
+
| Document AI OCR | $0.001500 |
|
| 44 |
+
| Gemini 2.0 Flash Input (~2,000 tokens) | $0.000200 |
|
| 45 |
+
| Gemini 2.0 Flash Output (~800 tokens) | $0.000320 |
|
| 46 |
+
| **TOTAL** | **~$0.002** |
|
| 47 |
+
|
| 48 |
+
**Free Tier**: First 1,000 invoices/month FREE!
|
| 49 |
+
|
| 50 |
+
---
|
| 51 |
+
|
| 52 |
+
## 📋 Setup
|
| 53 |
+
|
| 54 |
+
### 1. Install Dependencies
|
| 55 |
+
```bash
|
| 56 |
+
pip install -r requirements.txt
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
### 2. Configure Environment
|
| 60 |
+
Create a `.env` file:
|
| 61 |
+
```bash
|
| 62 |
+
PROJECT_ID=your_project_id
|
| 63 |
+
LOCATION=eu
|
| 64 |
+
PROCESSOR_ID=your_processor_id
|
| 65 |
+
GEMINI_API_KEY=your_gemini_key
|
| 66 |
+
GOOGLE_APPLICATION_CREDENTIALS=path_to_credentials.json
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
### 3. Run Server
|
| 70 |
+
```bash
|
| 71 |
+
python app.py
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
### 4. Open Browser
|
| 75 |
+
```
|
| 76 |
+
http://localhost:7860
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
---
|
| 80 |
+
|
| 81 |
+
## 🔧 API Endpoints
|
| 82 |
+
|
| 83 |
+
```
|
| 84 |
+
POST /upload # Upload and process invoice
|
| 85 |
+
GET /invoices # List all invoices
|
| 86 |
+
GET /invoices/{id} # Get specific invoice
|
| 87 |
+
DELETE /invoices/{id} # Delete invoice
|
| 88 |
+
GET /stats # Get statistics
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
---
|
| 92 |
+
|
| 93 |
+
## 📊 Invoice Data Structure
|
| 94 |
+
|
| 95 |
+
```json
|
| 96 |
+
{
|
| 97 |
+
"supplier": {
|
| 98 |
+
"name": "Acme Corp",
|
| 99 |
+
"address": "123 Business St",
|
| 100 |
+
"tax_id": "12-3456789"
|
| 101 |
+
},
|
| 102 |
+
"customer": {
|
| 103 |
+
"name": "Customer Inc",
|
| 104 |
+
"address": "456 Client Ave"
|
| 105 |
+
},
|
| 106 |
+
"invoice_details": {
|
| 107 |
+
"invoice_number": "INV-2025-001",
|
| 108 |
+
"invoice_date": "2025-10-15",
|
| 109 |
+
"due_date": "2025-11-15",
|
| 110 |
+
"payment_terms": "Net 30"
|
| 111 |
+
},
|
| 112 |
+
"line_items": [
|
| 113 |
+
{
|
| 114 |
+
"item_code": "PROD-001",
|
| 115 |
+
"description": "Widget Pro",
|
| 116 |
+
"quantity": 10,
|
| 117 |
+
"unit": "pcs",
|
| 118 |
+
"unit_price": 100.00,
|
| 119 |
+
"total_price": 1000.00
|
| 120 |
+
}
|
| 121 |
+
],
|
| 122 |
+
"financial_summary": {
|
| 123 |
+
"subtotal": 1000.00,
|
| 124 |
+
"tax_amount": 100.00,
|
| 125 |
+
"total_amount": 1100.00,
|
| 126 |
+
"currency": "USD"
|
| 127 |
+
}
|
| 128 |
+
}
|
| 129 |
+
```
|
| 130 |
+
|
| 131 |
+
---
|
| 132 |
+
|
| 133 |
+
## 🚀 Deploy to Hugging Face Spaces (with Authentication)
|
| 134 |
+
|
| 135 |
+
### 1. Create Space
|
| 136 |
+
- Go to https://huggingface.co/new-space
|
| 137 |
+
- **Name**: invoice-ocr (or your choice)
|
| 138 |
+
- **SDK**: Docker
|
| 139 |
+
- **Visibility**: Private or Public (authentication works for both)
|
| 140 |
+
- Click "Create Space"
|
| 141 |
+
|
| 142 |
+
### 2. Clone and Add Files
|
| 143 |
+
```bash
|
| 144 |
+
git clone https://huggingface.co/spaces/YOUR_USERNAME/invoice-ocr
|
| 145 |
+
cd invoice-ocr
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
Copy all files to the cloned directory:
|
| 149 |
+
```bash
|
| 150 |
+
# Copy from your local development directory
|
| 151 |
+
cp Dockerfile requirements.txt app.py ocr_invoice.py cost_tracker.py auth.py allowed_users.txt ./
|
| 152 |
+
cp -r static/ ./
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
### 3. Configure User Whitelist
|
| 156 |
+
|
| 157 |
+
**Option A: Using allowed_users.txt file (Recommended)**
|
| 158 |
+
|
| 159 |
+
Edit `allowed_users.txt` and add authorized Hugging Face usernames or email addresses (one per line):
|
| 160 |
+
```
|
| 161 |
+
johndoe
|
| 162 |
+
janedoe
|
| 163 |
+
companyuser1
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
**Option B: Using Environment Variable**
|
| 167 |
+
|
| 168 |
+
In Space settings → Variables & Secrets, add:
|
| 169 |
+
```
|
| 170 |
+
ALLOWED_USERS=johndoe,janedoe,companyuser1
|
| 171 |
+
```
|
| 172 |
+
|
| 173 |
+
### 4. Add Google Cloud Secrets
|
| 174 |
+
|
| 175 |
+
In Space settings → Variables & Secrets, add these as **Secrets**:
|
| 176 |
+
|
| 177 |
+
| Name | Value | Example |
|
| 178 |
+
|------|-------|---------|
|
| 179 |
+
| `PROJECT_ID` | Your GCP project ID | `836634225535` |
|
| 180 |
+
| `LOCATION` | Document AI location | `eu` |
|
| 181 |
+
| `PROCESSOR_ID` | Document AI processor ID | `696ca0c7b4383217` |
|
| 182 |
+
| `GEMINI_API_KEY` | Your Gemini API key | `AIza...` |
|
| 183 |
+
| `GOOGLE_APPLICATION_CREDENTIALS` | **Full JSON content** from service account key | `{"type":"service_account","project_id":"..."}` |
|
| 184 |
+
|
| 185 |
+
⚠️ **Important**: For `GOOGLE_APPLICATION_CREDENTIALS`, paste the **entire JSON file content** as the secret value, not the file path.
|
| 186 |
+
|
| 187 |
+
### 5. Deploy
|
| 188 |
+
```bash
|
| 189 |
+
git add .
|
| 190 |
+
git commit -m "Deploy invoice OCR with authentication"
|
| 191 |
+
git push
|
| 192 |
+
```
|
| 193 |
+
|
| 194 |
+
### 6. Enable Authentication (Critical Step)
|
| 195 |
+
|
| 196 |
+
After deployment, in your Space settings:
|
| 197 |
+
|
| 198 |
+
1. Go to **Settings** → **General**
|
| 199 |
+
2. Scroll to **Sign-in with Hugging Face**
|
| 200 |
+
3. **Enable** the toggle
|
| 201 |
+
4. Select **Restricted** (only whitelisted users can access)
|
| 202 |
+
|
| 203 |
+
This requires users to sign in via Hugging Face OAuth before they can access your Space.
|
| 204 |
+
|
| 205 |
+
### 7. Test Access
|
| 206 |
+
|
| 207 |
+
1. Open your Space URL: `https://huggingface.co/spaces/YOUR_USERNAME/invoice-ocr`
|
| 208 |
+
2. You'll be prompted to sign in with Hugging Face
|
| 209 |
+
3. Only users in your whitelist can access the app
|
| 210 |
+
4. Unauthorized users see an "Access Denied" page
|
| 211 |
+
|
| 212 |
+
---
|
| 213 |
+
|
| 214 |
+
## 🔒 Authentication Details
|
| 215 |
+
|
| 216 |
+
### How It Works
|
| 217 |
+
|
| 218 |
+
1. **Hugging Face OAuth**: Users must sign in with their HF account
|
| 219 |
+
2. **Whitelist Check**: `auth.py` checks username against allowed list
|
| 220 |
+
3. **Protected Endpoints**: Upload, invoice retrieval, stats require authentication
|
| 221 |
+
4. **Public Access**: Only the login page and static files are public
|
| 222 |
+
|
| 223 |
+
### Whitelist Format
|
| 224 |
+
|
| 225 |
+
**allowed_users.txt** (supports BOTH usernames AND emails):
|
| 226 |
+
```
|
| 227 |
+
# Hugging Face usernames (if you know them)
|
| 228 |
+
johndoe
|
| 229 |
+
janedoe
|
| 230 |
+
|
| 231 |
+
# OR email addresses (if you don't know usernames)
|
| 232 |
+
john@company.com
|
| 233 |
+
jane@company.com
|
| 234 |
+
|
| 235 |
+
# You can mix both!
|
| 236 |
+
```
|
| 237 |
+
|
| 238 |
+
**Environment Variables**:
|
| 239 |
+
```
|
| 240 |
+
# Option 1: Usernames
|
| 241 |
+
ALLOWED_USERS=johndoe,janedoe,companyuser1
|
| 242 |
+
|
| 243 |
+
# Option 2: Emails
|
| 244 |
+
ALLOWED_EMAILS=john@company.com,jane@company.com
|
| 245 |
+
|
| 246 |
+
# Or use both!
|
| 247 |
+
```
|
| 248 |
+
|
| 249 |
+
💡 **Don't know the username?** Just use their email address! The system checks both.
|
| 250 |
+
|
| 251 |
+
### Protected Endpoints
|
| 252 |
+
|
| 253 |
+
✅ `POST /upload` - Upload invoice
|
| 254 |
+
✅ `GET /invoices` - List invoices
|
| 255 |
+
✅ `GET /invoices/{id}` - Get invoice details
|
| 256 |
+
✅ `GET /invoices/{id}/debug` - Debug data
|
| 257 |
+
✅ `DELETE /invoices/{id}` - Delete invoice
|
| 258 |
+
✅ `GET /stats` - Statistics
|
| 259 |
+
|
| 260 |
+
❌ `GET /` - Public (serves login page)
|
| 261 |
+
❌ `GET /static/*` - Public (CSS, JS, logo)
|
| 262 |
+
|
| 263 |
+
### Local Development
|
| 264 |
+
|
| 265 |
+
Authentication is **disabled** when running locally (not in HF Space):
|
| 266 |
+
```bash
|
| 267 |
+
python app.py # No auth required on localhost:7860
|
| 268 |
+
```
|
| 269 |
+
|
| 270 |
+
To test authentication locally, set:
|
| 271 |
+
```bash
|
| 272 |
+
export SPACE_ID=test
|
| 273 |
+
export ALLOWED_USERS=your_hf_username
|
| 274 |
+
```
|
| 275 |
+
|
| 276 |
+
---
|
| 277 |
+
|
| 278 |
+
## 📈 Performance
|
| 279 |
+
|
| 280 |
+
- **Processing Time**: 2-4 seconds per invoice
|
| 281 |
+
- **Accuracy**: 99%+ text extraction
|
| 282 |
+
- **Cost**: ~$0.002 per invoice
|
| 283 |
+
- **Free Tier**: 1,000 invoices/month FREE
|
| 284 |
+
|
| 285 |
+
---
|
| 286 |
+
|
| 287 |
+
## 🆚 Invoice OCR vs Receipt OCR
|
| 288 |
+
|
| 289 |
+
| Feature | Receipt OCR | Invoice OCR |
|
| 290 |
+
|---------|-------------|-------------|
|
| 291 |
+
| **Use Case** | Retail receipts | B2B invoices |
|
| 292 |
+
| **Line Items** | Simple items | Item codes + details |
|
| 293 |
+
| **Complexity** | Low | Medium-High |
|
| 294 |
+
| **Tokens** | ~2,000 | ~2,800 |
|
| 295 |
+
| **Cost** | $0.001881 | $0.002020 |
|
| 296 |
+
| **Fields** | 15-20 | 30-40 |
|
| 297 |
+
|
| 298 |
+
---
|
| 299 |
+
|
| 300 |
+
## 🛠️ Technology Stack
|
| 301 |
+
|
| 302 |
+
- **OCR**: Google Document AI OCR
|
| 303 |
+
- **AI**: Gemini 2.0 Flash (image + text)
|
| 304 |
+
- **Backend**: FastAPI + SQLAlchemy
|
| 305 |
+
- **Database**: SQLite
|
| 306 |
+
- **Frontend**: HTML5 + Vanilla JS
|
| 307 |
+
- **Authentication**: Hugging Face OAuth + Whitelist
|
| 308 |
+
- **Deployment**: Docker + Hugging Face Spaces
|
| 309 |
+
|
| 310 |
+
---
|
| 311 |
+
|
| 312 |
+
## 📝 License
|
| 313 |
+
|
| 314 |
+
MIT License
|
| 315 |
+
|
| 316 |
+
---
|
| 317 |
+
|
| 318 |
+
**Built for accurate B2B invoice processing** 🚀
|
__pycache__/app.cpython-311.pyc
ADDED
|
Binary file (20.5 kB). View file
|
|
|
__pycache__/auth.cpython-311.pyc
ADDED
|
Binary file (9.81 kB). View file
|
|
|
__pycache__/cost_tracker.cpython-311.pyc
ADDED
|
Binary file (6.2 kB). View file
|
|
|
__pycache__/ocr_invoice.cpython-311.pyc
ADDED
|
Binary file (27.2 kB). View file
|
|
|
app.py
ADDED
|
@@ -0,0 +1,438 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI Backend for Invoice OCR System
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
import shutil
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import List, Optional
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
|
| 12 |
+
from fastapi import FastAPI, UploadFile, File, HTTPException, Depends, Request
|
| 13 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 14 |
+
from fastapi.staticfiles import StaticFiles
|
| 15 |
+
from fastapi.responses import FileResponse
|
| 16 |
+
from sqlalchemy import create_engine, Column, Integer, String, Float, DateTime, JSON, Text, func
|
| 17 |
+
from sqlalchemy.ext.declarative import declarative_base
|
| 18 |
+
from sqlalchemy.orm import sessionmaker, Session
|
| 19 |
+
|
| 20 |
+
from ocr_invoice import InvoiceOCR
|
| 21 |
+
from cost_tracker import CostTracker
|
| 22 |
+
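# Removed auth for Vercel deployment: auth.py is not imported here, so the routes in this module take no auth dependency. NOTE(review): the README still documents whitelist-protected endpoints; confirm which is intended.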
|
| 23 |
+
|
| 24 |
+
# Initialize FastAPI
|
| 25 |
+
app = FastAPI(title="Invoice OCR API", version="1.0.0")
|
| 26 |
+
|
| 27 |
+
# CORS middleware
|
| 28 |
+
app.add_middleware(
|
| 29 |
+
CORSMiddleware,
|
| 30 |
+
allow_origins=["*"],
|
| 31 |
+
allow_credentials=True,
|
| 32 |
+
allow_methods=["*"],
|
| 33 |
+
allow_headers=["*"],
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
# Database setup
|
| 37 |
+
if os.path.exists("/app"):
|
| 38 |
+
# Hugging Face environment
|
| 39 |
+
DATABASE_URL = "sqlite:////tmp/invoices.db"
|
| 40 |
+
UPLOAD_DIR = Path("/tmp/uploads")
|
| 41 |
+
else:
|
| 42 |
+
# Local environment
|
| 43 |
+
DATABASE_URL = "sqlite:///./invoices.db"
|
| 44 |
+
UPLOAD_DIR = Path("./uploads")
|
| 45 |
+
|
| 46 |
+
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
| 47 |
+
|
| 48 |
+
engine = create_engine(DATABASE_URL, connect_args={"check_same_thread": False})
|
| 49 |
+
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
| 50 |
+
Base = declarative_base()
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# Database Models
|
| 54 |
+
class Invoice(Base):
|
| 55 |
+
__tablename__ = "invoices"
|
| 56 |
+
|
| 57 |
+
id = Column(Integer, primary_key=True, index=True)
|
| 58 |
+
filename = Column(String, index=True)
|
| 59 |
+
|
| 60 |
+
# Supplier info
|
| 61 |
+
supplier_name = Column(String)
|
| 62 |
+
supplier_address = Column(Text)
|
| 63 |
+
|
| 64 |
+
# Customer info
|
| 65 |
+
customer_name = Column(String)
|
| 66 |
+
customer_address = Column(Text)
|
| 67 |
+
|
| 68 |
+
# Invoice details
|
| 69 |
+
invoice_number = Column(String, index=True)
|
| 70 |
+
invoice_date = Column(String)
|
| 71 |
+
due_date = Column(String)
|
| 72 |
+
po_number = Column(String)
|
| 73 |
+
payment_terms = Column(String)
|
| 74 |
+
|
| 75 |
+
# Financial summary
|
| 76 |
+
subtotal = Column(Float)
|
| 77 |
+
tax_amount = Column(Float)
|
| 78 |
+
total_amount = Column(Float)
|
| 79 |
+
currency = Column(String)
|
| 80 |
+
|
| 81 |
+
# Line items (stored as JSON text)
|
| 82 |
+
line_items = Column(Text)
|
| 83 |
+
|
| 84 |
+
# Additional data (stored as JSON text)
|
| 85 |
+
supplier_data = Column(Text)
|
| 86 |
+
customer_data = Column(Text)
|
| 87 |
+
payment_info = Column(Text)
|
| 88 |
+
additional_notes = Column(Text)
|
| 89 |
+
raw_data = Column(Text)
|
| 90 |
+
|
| 91 |
+
# Processing metadata
|
| 92 |
+
processing_cost = Column(Float, default=0.0)
|
| 93 |
+
created_at = Column(DateTime, default=datetime.utcnow)
|
| 94 |
+
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# Create tables
|
| 98 |
+
Base.metadata.create_all(bind=engine)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# Dependency
|
| 102 |
+
def get_db():
|
| 103 |
+
db = SessionLocal()
|
| 104 |
+
try:
|
| 105 |
+
yield db
|
| 106 |
+
finally:
|
| 107 |
+
db.close()
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
# Initialize OCR and Cost Tracker
|
| 111 |
+
ocr_processor = InvoiceOCR(
|
| 112 |
+
project_id=os.getenv("PROJECT_ID"),
|
| 113 |
+
location=os.getenv("LOCATION"),
|
| 114 |
+
processor_id=os.getenv("PROCESSOR_ID"),
|
| 115 |
+
gemini_api_key=os.getenv("GEMINI_API_KEY")
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
cost_tracker = CostTracker()
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
# API Routes
|
| 122 |
+
@app.get("/")
|
| 123 |
+
async def root():
|
| 124 |
+
"""Serve the main HTML page"""
|
| 125 |
+
static_dir = Path(__file__).parent / "static"
|
| 126 |
+
index_file = static_dir / "index.html"
|
| 127 |
+
if index_file.exists():
|
| 128 |
+
return FileResponse(index_file)
|
| 129 |
+
return {"message": "Invoice OCR API", "docs": "/docs"}
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
@app.post("/upload")
|
| 133 |
+
async def upload_invoice(file: UploadFile = File(...), db: Session = Depends(get_db)):
|
| 134 |
+
"""Upload and process an invoice"""
|
| 135 |
+
file_path = None
|
| 136 |
+
try:
|
| 137 |
+
# Save uploaded file
|
| 138 |
+
file_path = UPLOAD_DIR / file.filename
|
| 139 |
+
with open(file_path, "wb") as buffer:
|
| 140 |
+
shutil.copyfileobj(file.file, buffer)
|
| 141 |
+
|
| 142 |
+
# Process with OCR
|
| 143 |
+
print(f"Processing: {file.filename}")
|
| 144 |
+
invoice_data = ocr_processor.process_invoice(str(file_path), save_json=False)
|
| 145 |
+
|
| 146 |
+
if "error" in invoice_data:
|
| 147 |
+
error_msg = invoice_data.get("error", "Unknown error")
|
| 148 |
+
raw_response = invoice_data.get("raw_response", "")
|
| 149 |
+
print(f"⚠ Invoice processing error: {error_msg}")
|
| 150 |
+
if raw_response:
|
| 151 |
+
print(f"Raw response: {raw_response[:500]}")
|
| 152 |
+
raise HTTPException(status_code=500, detail=f"OCR Error: {error_msg}")
|
| 153 |
+
|
| 154 |
+
# Extract metadata for cost calculation
|
| 155 |
+
metadata = invoice_data.pop("_processing_metadata", {})
|
| 156 |
+
raw_text = metadata.get("raw_text", "")
|
| 157 |
+
includes_image = metadata.get("includes_image", True)
|
| 158 |
+
|
| 159 |
+
# Calculate processing cost
|
| 160 |
+
costs = cost_tracker.calculate_invoice_cost(
|
| 161 |
+
input_text=raw_text,
|
| 162 |
+
output_text=json.dumps(invoice_data),
|
| 163 |
+
includes_image=includes_image
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
# Extract data
|
| 167 |
+
supplier = invoice_data.get("supplier", {})
|
| 168 |
+
customer = invoice_data.get("customer", {})
|
| 169 |
+
inv_details = invoice_data.get("invoice_details", {})
|
| 170 |
+
financial = invoice_data.get("financial_summary", {})
|
| 171 |
+
line_items = invoice_data.get("line_items", [])
|
| 172 |
+
|
| 173 |
+
# Calculate totals from line items if not provided
|
| 174 |
+
if line_items:
|
| 175 |
+
calculated_subtotal = sum(item.get("total_price", 0) for item in line_items)
|
| 176 |
+
|
| 177 |
+
# If financial summary is missing or incomplete, calculate it
|
| 178 |
+
if not financial or not isinstance(financial, dict):
|
| 179 |
+
financial = {}
|
| 180 |
+
|
| 181 |
+
# Use calculated subtotal if not provided
|
| 182 |
+
if not financial.get("subtotal"):
|
| 183 |
+
financial["subtotal"] = round(calculated_subtotal, 2)
|
| 184 |
+
|
| 185 |
+
# Calculate tax if not provided (assume 0 if not specified)
|
| 186 |
+
if not financial.get("tax_amount"):
|
| 187 |
+
financial["tax_amount"] = 0.0
|
| 188 |
+
|
| 189 |
+
# Calculate total_amount if not provided
|
| 190 |
+
if not financial.get("total_amount"):
|
| 191 |
+
financial["total_amount"] = round(financial.get("subtotal", 0) + financial.get("tax_amount", 0), 2)
|
| 192 |
+
|
| 193 |
+
# Set currency if not provided
|
| 194 |
+
if not financial.get("currency"):
|
| 195 |
+
financial["currency"] = "EUR"
|
| 196 |
+
|
| 197 |
+
print(f"✓ Financial summary calculated:")
|
| 198 |
+
print(f" Subtotal: {financial.get('subtotal')} (from {len(line_items)} line items)")
|
| 199 |
+
print(f" Tax: {financial.get('tax_amount')}")
|
| 200 |
+
print(f" Total: {financial.get('total_amount')}")
|
| 201 |
+
|
| 202 |
+
# Handle both old format (nested objects) and new format (simple strings)
|
| 203 |
+
# If supplier is a string, convert to object format
|
| 204 |
+
if isinstance(supplier, str):
|
| 205 |
+
supplier = {"name": supplier, "address": "", "phone": "", "email": "", "tax_id": "", "registration_number": ""}
|
| 206 |
+
if isinstance(customer, str):
|
| 207 |
+
customer = {"name": customer, "address": "", "phone": "", "email": ""}
|
| 208 |
+
|
| 209 |
+
# Convert to JSON strings for storage
|
| 210 |
+
line_items_json = json.dumps(line_items)
|
| 211 |
+
supplier_json = json.dumps(supplier)
|
| 212 |
+
customer_json = json.dumps(customer)
|
| 213 |
+
payment_json = json.dumps(invoice_data.get("payment_info", {}))
|
| 214 |
+
raw_json = json.dumps(invoice_data)
|
| 215 |
+
|
| 216 |
+
# Save to database
|
| 217 |
+
db_invoice = Invoice(
|
| 218 |
+
filename=file.filename,
|
| 219 |
+
supplier_name=supplier.get("name", "") if isinstance(supplier, dict) else str(supplier),
|
| 220 |
+
supplier_address=supplier.get("address", "") if isinstance(supplier, dict) else "",
|
| 221 |
+
customer_name=customer.get("name", "") if isinstance(customer, dict) else str(customer),
|
| 222 |
+
customer_address=customer.get("address", "") if isinstance(customer, dict) else "",
|
| 223 |
+
invoice_number=inv_details.get("invoice_number", "") if isinstance(inv_details, dict) else str(invoice_data.get("invoice_number", "")),
|
| 224 |
+
invoice_date=inv_details.get("invoice_date") if isinstance(inv_details, dict) else invoice_data.get("invoice_date") or invoice_data.get("date"),
|
| 225 |
+
due_date=inv_details.get("due_date") if isinstance(inv_details, dict) else None,
|
| 226 |
+
po_number=inv_details.get("po_number") if isinstance(inv_details, dict) else None,
|
| 227 |
+
payment_terms=inv_details.get("payment_terms") if isinstance(inv_details, dict) else None,
|
| 228 |
+
subtotal=financial.get("subtotal") if isinstance(financial, dict) else None,
|
| 229 |
+
tax_amount=financial.get("tax_amount") if isinstance(financial, dict) else None,
|
| 230 |
+
total_amount=financial.get("total_amount") if isinstance(financial, dict) else None,
|
| 231 |
+
currency=financial.get("currency", "") if isinstance(financial, dict) else "EUR",
|
| 232 |
+
line_items=line_items_json,
|
| 233 |
+
supplier_data=supplier_json,
|
| 234 |
+
customer_data=customer_json,
|
| 235 |
+
payment_info=payment_json,
|
| 236 |
+
additional_notes=str(invoice_data.get("additional_notes", "")),
|
| 237 |
+
raw_data=raw_json,
|
| 238 |
+
processing_cost=costs["total"]
|
| 239 |
+
)
|
| 240 |
+
|
| 241 |
+
db.add(db_invoice)
|
| 242 |
+
db.commit()
|
| 243 |
+
db.refresh(db_invoice)
|
| 244 |
+
|
| 245 |
+
# Return response with proper cost structure
|
| 246 |
+
return {
|
| 247 |
+
"success": True,
|
| 248 |
+
"invoice": {
|
| 249 |
+
"id": db_invoice.id,
|
| 250 |
+
"filename": db_invoice.filename,
|
| 251 |
+
"supplier_data": supplier_json,
|
| 252 |
+
"customer_data": customer_json,
|
| 253 |
+
"invoice_details": json.dumps(inv_details),
|
| 254 |
+
"line_items": line_items_json,
|
| 255 |
+
"financial_summary": json.dumps(financial),
|
| 256 |
+
"payment_info": payment_json,
|
| 257 |
+
"additional_notes": db_invoice.additional_notes,
|
| 258 |
+
"processing_cost": db_invoice.processing_cost
|
| 259 |
+
},
|
| 260 |
+
"costs": {
|
| 261 |
+
"total_cost": costs['total'],
|
| 262 |
+
"document_ai_cost": costs['document_ai'],
|
| 263 |
+
"gemini_input_cost": costs['gemini_input'],
|
| 264 |
+
"gemini_output_cost": costs['gemini_output'],
|
| 265 |
+
"gemini_input_tokens": costs['tokens']['input'],
|
| 266 |
+
"gemini_output_tokens": costs['tokens']['output']
|
| 267 |
+
}
|
| 268 |
+
}
|
| 269 |
+
|
| 270 |
+
except HTTPException:
|
| 271 |
+
raise
|
| 272 |
+
except Exception as e:
|
| 273 |
+
import traceback
|
| 274 |
+
print(f"⚠ Error processing invoice:")
|
| 275 |
+
print(traceback.format_exc())
|
| 276 |
+
if file_path and file_path.exists():
|
| 277 |
+
try:
|
| 278 |
+
file_path.unlink()
|
| 279 |
+
except:
|
| 280 |
+
pass
|
| 281 |
+
raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
@app.get("/invoices")
async def get_invoices(limit: int = 10, db: Session = Depends(get_db)):
    """Get list of processed invoices.

    Returns the most recently created invoices first, capped at ``limit``
    rows. Each entry carries the flat summary columns plus three JSON-string
    fields (``invoice_details``, ``financial_summary``, ``supplier_data``).

    Args:
        limit: Maximum number of invoices to return.
        db: Database session injected by FastAPI.

    Returns:
        A list of dictionaries, one per invoice.
    """
    invoices = db.query(Invoice).order_by(Invoice.created_at.desc()).limit(limit).all()

    result = []
    for inv in invoices:
        # line_items is stored as a JSON string. A corrupt or unexpected value
        # must not break the whole listing, so it is counted as zero items.
        try:
            line_items = json.loads(inv.line_items) if inv.line_items else []
            items_count = len(line_items)
        except (TypeError, ValueError):
            # ValueError covers json.JSONDecodeError; TypeError covers a
            # non-string column value or a decoded scalar that has no len().
            # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
            items_count = 0

        # Build invoice_details JSON
        invoice_details_json = json.dumps({
            "invoice_number": inv.invoice_number,
            "invoice_date": inv.invoice_date,
            "due_date": inv.due_date,
            "po_number": inv.po_number,
            "payment_terms": inv.payment_terms
        })

        # Build financial_summary JSON
        financial_summary_json = json.dumps({
            "subtotal": inv.subtotal,
            "tax_amount": inv.tax_amount,
            "total_amount": inv.total_amount,
            "currency": inv.currency
        })

        result.append({
            "id": inv.id,
            "filename": inv.filename,
            "supplier_name": inv.supplier_name,
            "customer_name": inv.customer_name,
            "invoice_number": inv.invoice_number,
            "invoice_date": inv.invoice_date,
            "due_date": inv.due_date,
            "total_amount": inv.total_amount,
            "currency": inv.currency,
            "items_count": items_count,
            "processing_cost": inv.processing_cost,
            "created_at": inv.created_at.isoformat() if inv.created_at else None,
            # Add JSON fields needed by frontend
            "invoice_details": invoice_details_json,
            "financial_summary": financial_summary_json,
            "supplier_data": inv.supplier_data
        })

    return result
|
| 334 |
+
@app.get("/invoices/{invoice_id}/debug")
async def debug_invoice(invoice_id: int, db: Session = Depends(get_db)):
    """Debug endpoint: return the decoded line items stored for one invoice.

    Raises:
        HTTPException: 404 when no invoice has the given id.
    """
    record = db.query(Invoice).filter(Invoice.id == invoice_id).first()
    if not record:
        raise HTTPException(status_code=404, detail="Invoice not found")

    # line_items is stored as a JSON string; an empty/NULL value means no items
    if record.line_items:
        decoded_items = json.loads(record.line_items)
    else:
        decoded_items = []

    return {
        "filename": record.filename,
        "line_items": decoded_items,
        "items_count": len(decoded_items)
    }
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
@app.get("/invoices/{invoice_id}")
async def get_invoice(invoice_id: int, db: Session = Depends(get_db)):
    """Return the stored data for a single invoice plus a cost section.

    Raises:
        HTTPException: 404 when no invoice has the given id.
    """
    record = db.query(Invoice).filter(Invoice.id == invoice_id).first()
    if not record:
        raise HTTPException(status_code=404, detail="Invoice not found")

    # Header fields are stored as separate columns; re-pack them as a JSON string
    header_fields = {
        "invoice_number": record.invoice_number,
        "invoice_date": record.invoice_date,
        "due_date": record.due_date,
        "po_number": record.po_number,
        "payment_terms": record.payment_terms
    }

    # Same for the monetary totals
    money_fields = {
        "subtotal": record.subtotal,
        "tax_amount": record.tax_amount,
        "total_amount": record.total_amount,
        "currency": record.currency
    }

    invoice_payload = {
        "id": record.id,
        "filename": record.filename,
        "supplier_data": record.supplier_data,
        "customer_data": record.customer_data,
        "invoice_details": json.dumps(header_fields),
        "line_items": record.line_items,
        "financial_summary": json.dumps(money_fields),
        "payment_info": record.payment_info,
        "currency": record.currency
    }

    # NOTE(review): only total_cost is read from the stored row; the
    # per-service figures here are fixed literals, not the values computed
    # when the invoice was processed.
    cost_payload = {
        "document_ai_cost": 0.0015,
        "gemini_input_tokens": 0,
        "gemini_input_cost": 0.0,
        "gemini_output_tokens": 0,
        "gemini_output_cost": 0.0,
        "total_cost": record.processing_cost
    }

    return {"invoice": invoice_payload, "costs": cost_payload}
|
| 397 |
+
@app.delete("/invoices/{invoice_id}")
async def delete_invoice(invoice_id: int, db: Session = Depends(get_db)):
    """Remove one invoice row from the database.

    Raises:
        HTTPException: 404 when no invoice has the given id.
    """
    target = db.query(Invoice).filter(Invoice.id == invoice_id).first()
    if not target:
        raise HTTPException(status_code=404, detail="Invoice not found")

    # Delete and persist in one step; only the database row is touched here
    db.delete(target)
    db.commit()

    confirmation = {"success": True, "message": "Invoice deleted"}
    return confirmation
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
@app.get("/stats")
async def get_stats(db: Session = Depends(get_db)):
    """Return aggregate figures: invoice count, summed invoice totals,
    summed processing cost and the average processing cost per invoice."""

    def _column_sum(column):
        # The aggregate returns None when there are no rows; report 0 instead
        return db.query(Invoice).with_entities(func.sum(column)).scalar() or 0

    invoice_count = db.query(Invoice).count()
    amount_sum = _column_sum(Invoice.total_amount)
    cost_sum = _column_sum(Invoice.processing_cost)

    # Guard the division for an empty table
    if invoice_count > 0:
        mean_cost = round(cost_sum / invoice_count, 6)
    else:
        mean_cost = 0

    return {
        "total_invoices": invoice_count,
        "total_invoice_amount": round(amount_sum, 2),
        "total_processing_cost": round(cost_sum, 6),
        "average_cost_per_invoice": mean_cost
    }
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
# Serve the bundled "static" directory (next to this file) under /static,
# but only when that directory is actually present.
static_dir = Path(__file__).parent.joinpath("static")
if static_dir.exists():
    app.mount(
        "/static",
        StaticFiles(directory=str(static_dir)),
        name="static",
    )


# Allow running the API directly with `python app.py`
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )
|
cost_tracker.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Cost tracking utility for Invoice OCR system
|
| 3 |
+
Tracks Document AI and Gemini API usage costs
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Dict, Any
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
import json
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class CostTracker:
    """Track and calculate costs for Document AI + Gemini usage.

    Costs are computed from fixed per-page / per-token prices. A usage log
    (a JSON file named ``usage_costs.json`` in the working directory) is
    loaded on construction and can be written back with ``save_usage``.
    """

    # Pricing (as of 2025)
    # Document AI OCR: $1.50 per 1,000 pages = $0.0015 per page
    DOCUMENT_AI_PER_PAGE = 0.0015
    # Gemini 2.0 Flash (includes text, images, and videos)
    GEMINI_INPUT_PER_TOKEN = 0.10 / 1_000_000   # $0.10 per 1M tokens (input)
    GEMINI_OUTPUT_PER_TOKEN = 0.40 / 1_000_000  # $0.40 per 1M tokens (output)

    # Fallback token counts used only when neither a count nor a text is given
    DEFAULT_INPUT_TOKENS = 2000   # Invoices have more text
    DEFAULT_OUTPUT_TOKENS = 800   # More line items
    # Approximate tokens added to an estimated input when an image is sent
    IMAGE_TOKENS = 258

    def __init__(self):
        self.usage_log = []
        self.log_file = Path("usage_costs.json")
        self.load_usage()

    def estimate_tokens(self, text: str) -> int:
        """Estimate token count (roughly 1 token per 4 characters)."""
        return len(text) // 4

    def calculate_invoice_cost(
        self,
        input_tokens: int = None,
        output_tokens: int = None,
        input_text: str = None,
        output_text: str = None,
        includes_image: bool = True
    ) -> Dict[str, Any]:
        """
        Calculate cost for processing one invoice.

        Args:
            input_tokens: Number of input tokens (if known)
            output_tokens: Number of output tokens (if known)
            input_text: Input text to estimate tokens from
            output_text: Output to estimate tokens from; it is serialised
                with ``json.dumps`` before its length is measured
            includes_image: Whether image is sent to Gemini (adds ~258 tokens
                to an input count that was estimated from ``input_text``)

        Returns:
            Dictionary with the cost breakdown (``document_ai``,
            ``gemini_input``, ``gemini_output``, ``gemini_total``, ``total``)
            and a nested ``tokens`` dictionary (``input``, ``output``,
            ``total``).
        """
        # Estimate tokens if not provided
        if input_tokens is None and input_text:
            input_tokens = self.estimate_tokens(input_text)
            # Add image tokens if image is included
            if includes_image:
                input_tokens += self.IMAGE_TOKENS
        if output_tokens is None and output_text:
            output_tokens = self.estimate_tokens(json.dumps(output_text))

        # Fall back to the defaults only when nothing was provided at all.
        # A plain `or` here would also discard a legitimate count of 0
        # (explicitly passed, or estimated from a very short text).
        if input_tokens is None:
            input_tokens = self.DEFAULT_INPUT_TOKENS
        if output_tokens is None:
            output_tokens = self.DEFAULT_OUTPUT_TOKENS

        # Calculate costs
        docai_cost = self.DOCUMENT_AI_PER_PAGE
        gemini_input_cost = input_tokens * self.GEMINI_INPUT_PER_TOKEN
        gemini_output_cost = output_tokens * self.GEMINI_OUTPUT_PER_TOKEN
        gemini_total = gemini_input_cost + gemini_output_cost
        total_cost = docai_cost + gemini_total

        return {
            "document_ai": docai_cost,
            "gemini_input": gemini_input_cost,
            "gemini_output": gemini_output_cost,
            "gemini_total": gemini_total,
            "total": total_cost,
            "tokens": {
                "input": input_tokens,
                "output": output_tokens,
                "total": input_tokens + output_tokens
            }
        }

    def print_invoice_cost(self, costs: Dict[str, Any]):
        """Pretty print a cost breakdown produced by ``calculate_invoice_cost``."""
        print("\n" + "="*70)
        print("💰 INVOICE PROCESSING COST")
        print("="*70)
        print(f"Document AI OCR: ${costs['document_ai']:.6f}")
        print(f"Gemini Input: ${costs['gemini_input']:.6f} ({costs['tokens']['input']:,} tokens)")
        print(f"Gemini Output: ${costs['gemini_output']:.6f} ({costs['tokens']['output']:,} tokens)")
        print("-" * 70)
        print(f"TOTAL COST: ${costs['total']:.6f} ({costs['tokens']['total']:,} tokens)")
        print("="*70 + "\n")

    def save_usage(self):
        """Save usage log to file (best effort: failures only print a warning)."""
        try:
            with open(self.log_file, 'w', encoding='utf-8') as f:
                json.dump(self.usage_log, f, indent=2)
        except Exception as e:
            print(f"Warning: Could not save usage log: {e}")

    def load_usage(self):
        """Load usage log from file; on any failure start with an empty log."""
        try:
            if self.log_file.exists():
                with open(self.log_file, 'r', encoding='utf-8') as f:
                    self.usage_log = json.load(f)
        except Exception as e:
            print(f"Warning: Could not load usage log: {e}")
            self.usage_log = []
|
invoices.db
ADDED
|
Binary file (65.5 kB). View file
|
|
|
ocr_invoice.py
ADDED
|
@@ -0,0 +1,616 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Enhanced Invoice OCR using Google Document AI + Gemini AI
|
| 3 |
+
Extracts structured invoice data as JSON (accuracy depends on the OCR and model output)
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import json
|
| 8 |
+
import io
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Dict, Any, Optional, List
|
| 11 |
+
from google.cloud import documentai_v1 as documentai
|
| 12 |
+
from google.api_core.client_options import ClientOptions
|
| 13 |
+
import google.generativeai as genai
|
| 14 |
+
from dotenv import load_dotenv
|
| 15 |
+
|
| 16 |
+
# Read settings from a .env file (if any) into the process environment
load_dotenv()

# Configuration
PROJECT_ID = os.environ.get("PROJECT_ID")
LOCATION = os.environ.get("LOCATION")
PROCESSOR_ID = os.environ.get("PROCESSOR_ID")
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
GOOGLE_CREDS = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")

# Warn (without aborting the import) when a required setting is missing or empty
if not (PROJECT_ID and LOCATION and PROCESSOR_ID and GEMINI_API_KEY):
    print("⚠ Warning: Missing environment variables")
    print("Please ensure PROJECT_ID, LOCATION, PROCESSOR_ID, and GEMINI_API_KEY are set in .env file")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class InvoiceOCR:
|
| 35 |
+
"""Enhanced Invoice OCR using Document AI + Gemini"""
|
| 36 |
+
|
| 37 |
+
def __init__(self, project_id: str, location: str, processor_id: str, gemini_api_key: str):
    """Store the processor coordinates and set up the Document AI and Gemini clients."""
    self.project_id = project_id
    self.location = location
    self.processor_id = processor_id
    # Resource name of the processor, as passed to ProcessRequest
    self.processor_name = "projects/{}/locations/{}/processors/{}".format(
        project_id, location, processor_id
    )

    # Document AI client (needs self.location, set above)
    self._init_document_ai()

    # Gemini client
    generation_settings = {
        "temperature": 0.1,
        "top_p": 0.8,
        "top_k": 20,
        "max_output_tokens": 8192,  # Increased for longer invoices
        "response_mime_type": "application/json",  # Request JSON format
    }
    genai.configure(api_key=gemini_api_key)
    self.gemini_model = genai.GenerativeModel(
        'gemini-2.0-flash',
        generation_config=generation_settings
    )

    print("✓ Invoice OCR processor initialized successfully")
|
| 60 |
+
|
| 61 |
+
def _init_document_ai(self):
    """Initialize Document AI client with credentials.

    Credential sources are tried in this order:
      1. GOOGLE_CREDS contains JSON content (starts with ``{``): it is parsed
         and passed to the client as explicit service-account credentials.
      2. GOOGLE_CREDS is the path of an existing file: it is exported as
         GOOGLE_APPLICATION_CREDENTIALS and the client is built without
         explicit credentials.
      3. Otherwise the client is built without explicit credentials.

    Sets ``self.docai_client``.
    """
    # All branches use the same regional endpoint, so build the options once
    opts = ClientOptions(api_endpoint=f"{self.location}-documentai.googleapis.com")

    # Check if GOOGLE_CREDS is JSON content or file path
    if GOOGLE_CREDS and GOOGLE_CREDS.strip().startswith('{'):
        # It's JSON content (from Hugging Face secret)
        try:
            from google.oauth2 import service_account

            creds_dict = json.loads(GOOGLE_CREDS)
            credentials = service_account.Credentials.from_service_account_info(creds_dict)

            self.docai_client = documentai.DocumentProcessorServiceClient(
                client_options=opts,
                credentials=credentials
            )
            print("✓ Using Google Cloud credentials from JSON content")
            return
        except Exception as e:
            # Best effort: report and fall through to the other sources
            print(f"⚠ Could not load credentials from JSON: {e}")

    # Try file path. The value may be a long JSON blob rather than a path, in
    # which case the existence check itself can raise (e.g. "File name too
    # long" on Python < 3.12); treat that as "not a file".
    try:
        creds_file_exists = bool(GOOGLE_CREDS) and Path(GOOGLE_CREDS).exists()
    except (OSError, ValueError):
        creds_file_exists = False

    if creds_file_exists:
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_CREDS
        source_message = f"✓ Using Google Cloud credentials from file: {GOOGLE_CREDS}"
    else:
        # Use default credentials
        source_message = "✓ Using default Google Cloud credentials"

    self.docai_client = documentai.DocumentProcessorServiceClient(client_options=opts)
    print(source_message)
|
| 94 |
+
|
| 95 |
+
def process_with_document_ai(self, file_path: str) -> str:
    """Send the invoice file to the Document AI processor and return the extracted text."""
    # Load the raw bytes first, then work out the MIME type from the path
    payload = Path(file_path).read_bytes()
    detected_mime = self._detect_mime_type(file_path)

    # Wrap the bytes in a process request addressed to the configured processor
    ocr_request = documentai.ProcessRequest(
        name=self.processor_name,
        raw_document=documentai.RawDocument(
            content=payload,
            mime_type=detected_mime
        )
    )

    # Run OCR and hand back only the plain text of the resulting document
    ocr_result = self.docai_client.process_document(request=ocr_request)
    return ocr_result.document.text
|
| 119 |
+
|
| 120 |
+
def process_with_gemini_vision(self, file_path: str, raw_text: str = None) -> Dict[str, Any]:
    """
    Ask Gemini to extract invoice data, sending the image alongside the
    prompt when the file can be read, and the prompt alone otherwise.
    Returns the parsed response.
    """
    prompt = self._create_gemini_prompt(raw_text)
    model = self.gemini_model

    # No usable file: text-only request
    if not (file_path and Path(file_path).exists()):
        reply = model.generate_content(prompt)
        return self._parse_gemini_response(reply.text)

    try:
        # PIL is imported lazily, only when an image is actually sent
        from PIL import Image

        raw_bytes = Path(file_path).read_bytes()
        picture = Image.open(io.BytesIO(raw_bytes))

        # Prompt + image request
        reply = model.generate_content([prompt, picture])
    except Exception as e:
        # Any failure in this path falls back to a text-only request
        print(f"Warning: Could not load image, falling back to text-only: {e}")
        reply = model.generate_content(prompt)

    return self._parse_gemini_response(reply.text)
|
| 148 |
+
|
| 149 |
+
def _create_gemini_prompt(self, raw_text: Optional[str] = None) -> str:
|
| 150 |
+
"""Create a detailed prompt for Gemini to extract invoice data"""
|
| 151 |
+
prompt = """CRITICAL INSTRUCTIONS:
|
| 152 |
+
You are a multilingual AI specialized in extracting and structuring invoice data from text and image.
|
| 153 |
+
You must keep all field names in English but preserve content in the same language as the document (e.g., Italian item descriptions).
|
| 154 |
+
|
| 155 |
+
Your task: return valid JSON with this EXACT structure:
|
| 156 |
+
{
|
| 157 |
+
"supplier": {"name": "...", "address": "...", "phone": "...", "email": "...", "tax_id": "..."},
|
| 158 |
+
"customer": {"name": "...", "address": "...", "phone": "...", "email": "..."},
|
| 159 |
+
"invoice_details": {
|
| 160 |
+
"invoice_number": "...", // REQUIRED: Invoice number (e.g., "FT 123/2024", "INV-001")
|
| 161 |
+
"invoice_date": "YYYY-MM-DD", // REQUIRED: Invoice date in ISO format
|
| 162 |
+
"due_date": "YYYY-MM-DD", // Payment due date if shown
|
| 163 |
+
"po_number": "...", // Purchase order number if shown
|
| 164 |
+
"payment_terms": "..." // Payment terms if shown (e.g., "30 days", "Net 15")
|
| 165 |
+
},
|
| 166 |
+
"line_items": [...],
|
| 167 |
+
"financial_summary": {...}
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
**CRITICAL**: ALWAYS extract invoice_number and invoice_date. These are typically at the TOP of the invoice.
|
| 171 |
+
Look for labels like: "Invoice", "Fattura", "N.", "Nr.", "Date", "Data", "Del", etc.
|
| 172 |
+
|
| 173 |
+
---
|
| 174 |
+
|
| 175 |
+
📘 STEP 1: EXTRACT INVOICE HEADER (MOST IMPORTANT)
|
| 176 |
+
1. **Invoice Number**: Look at the top of the document for:
|
| 177 |
+
- "Fattura N." / "Invoice No." / "N." / "Nr." / "Numero Fattura"
|
| 178 |
+
- Usually near the top, often bold or prominent
|
| 179 |
+
- Extract the full number (e.g., "FT 123/2024", "INV-001", "2024/123")
|
| 180 |
+
|
| 181 |
+
2. **Invoice Date**: Look for:
|
| 182 |
+
- "Data" / "Date" / "Del" / "Data fattura" / "Invoice Date"
|
| 183 |
+
- Usually near the invoice number
|
| 184 |
+
- Convert to YYYY-MM-DD format (e.g., "09/04/2025" → "2025-04-09")
|
| 185 |
+
|
| 186 |
+
3. **Due Date**: Look for:
|
| 187 |
+
- "Scadenza" / "Due Date" / "Data scadenza"
|
| 188 |
+
- Convert to YYYY-MM-DD format
|
| 189 |
+
|
| 190 |
+
NEVER leave invoice_number or invoice_date empty if visible in the image!
|
| 191 |
+
|
| 192 |
+
---
|
| 193 |
+
|
| 194 |
+
📘 STEP 2: Read the TABLE STRUCTURE
|
| 195 |
+
1. Detect columns such as "Q.tà", "Quantità", "UM", "Prezzo", "Importo", "Totale".
|
| 196 |
+
2. Use these to align each value.
|
| 197 |
+
3. Extract numeric values exactly as printed, respecting decimal commas or dots.
|
| 198 |
+
4. Confirm values visually in the image (columns, alignment).
|
| 199 |
+
|
| 200 |
+
---
|
| 201 |
+
|
| 202 |
+
📗 STEP 3: Quantity Logic
|
| 203 |
+
If the quantity column is missing, blurred, or unclear:
|
| 204 |
+
1. Compute **quantity = total_price ÷ unit_price** .
|
| 205 |
+
2. Verify this matches the product's description (e.g. "5 KG", "12 PZ", "3 LT").
|
| 206 |
+
3. If both visible and computed quantities differ, prefer the one that visually aligns in the image.
|
| 207 |
+
|
| 208 |
+
---
|
| 209 |
+
|
| 210 |
+
📕 STEP 4: Infer the "type" in the SAME LANGUAGE as the invoice
|
| 211 |
+
Use the product name to classify into a natural, short category term in that language.
|
| 212 |
+
|
| 213 |
+
**Italian examples:**
|
| 214 |
+
- carne, pollo, manzo, prosciutto, salsiccia, pesce, gamberoni, scampi → `"carne"` or `"pesce"`
|
| 215 |
+
- pomodoro, cicoria, patate, verdure, funghi, frutta → `"vegetale"`
|
| 216 |
+
- latte, panna, burro, formaggio → `"latticino"`
|
| 217 |
+
- farina, riso, pasta, zucchero, sale, spezie → `"dispensa"`
|
| 218 |
+
- bottiglia, cartone, imballo, contenitore, alluminio → `"imballaggio"`
|
| 219 |
+
- acqua, bibita, vino → `"bevanda"`
|
| 220 |
+
If nothing fits, use `"altro"`.
|
| 221 |
+
|
| 222 |
+
Return this field exactly as one short lowercase word in the invoice language.
|
| 223 |
+
|
| 224 |
+
---
|
| 225 |
+
|
| 226 |
+
🔵 STEP 5: Validation and Correction (MANDATORY - DO NOT SKIP!)
|
| 227 |
+
**THIS IS THE MOST CRITICAL STEP - YOU MUST VALIDATE AND CORRECT EVERY LINE ITEM!**
|
| 228 |
+
|
| 229 |
+
For EVERY line item, perform this validation:
|
| 230 |
+
1. **Calculate**: expected_total = quantity × unit_price
|
| 231 |
+
2. **Compare**: Is expected_total ≈ total_price? (within ±2% tolerance)
|
| 232 |
+
3. **If NOT matching**:
|
| 233 |
+
- **RECALCULATE quantity**: quantity = total_price ÷ unit_price
|
| 234 |
+
- Round to 2 decimal places
|
| 235 |
+
- **REPLACE the old quantity with this corrected value**
|
| 236 |
+
|
| 237 |
+
**EXAMPLE**:
|
| 238 |
+
- Extracted: quantity=5, unit_price=10.50, total_price=42.00
|
| 239 |
+
- Check: 5 × 10.50 = 52.50 (NOT ≈ 42.00) ❌ WRONG!
|
| 240 |
+
- Correct: quantity = 42.00 ÷ 10.50 = 4.0 ✓
|
| 241 |
+
- Output: quantity=4.0, unit_price=10.50, total_price=42.00
|
| 242 |
+
|
| 243 |
+
**YOU MUST DO THIS FOR EVERY SINGLE LINE ITEM!**
|
| 244 |
+
Never output a line where quantity × unit_price ≠ total_price.
|
| 245 |
+
The math MUST be perfect: `quantity * unit_price = total_price` (within 2% tolerance).
|
| 246 |
+
|
| 247 |
+
---
|
| 248 |
+
|
| 249 |
+
💰 STEP 6: Extract Financial Summary (TAX/IVA)
|
| 250 |
+
Look for tax information on the invoice. It may be labeled as:
|
| 251 |
+
- **IVA** (Italian)
|
| 252 |
+
- **VAT** (English)
|
| 253 |
+
- **Tax**, **Imposta**, **Tasse**
|
| 254 |
+
- **TVA** (French)
|
| 255 |
+
- Any line showing tax percentage (e.g., "IVA 22%", "VAT 20%")
|
| 256 |
+
|
| 257 |
+
Extract:
|
| 258 |
+
- **subtotal**: Sum before tax (may be labeled "Imponibile", "Subtotal", "Net Amount")
|
| 259 |
+
- **tax_amount**: The tax value (IVA amount, not percentage)
|
| 260 |
+
- **total_amount**: Final total including tax ("Totale", "Total", "Importo Totale")
|
| 261 |
+
|
| 262 |
+
If tax is not explicitly shown, set tax_amount to 0.
|
| 263 |
+
|
| 264 |
+
---
|
| 265 |
+
|
| 266 |
+
📗 STEP 7: Output Rules
|
| 267 |
+
- Return ONLY valid JSON, no text or explanations.
|
| 268 |
+
- Numbers must use "." as decimal separator.
|
| 269 |
+
- Ensure each line's `quantity * unit_price ≈ total_price`.
|
| 270 |
+
- Include financial_summary with subtotal, tax_amount, and total_amount.
|
| 271 |
+
- **ALWAYS include invoice_details with invoice_number and invoice_date!**
|
| 272 |
+
|
| 273 |
+
Example output:
|
| 274 |
+
{
|
| 275 |
+
"supplier": {
|
| 276 |
+
"name": "DAC S.p.A.",
|
| 277 |
+
"address": "Via Roma 123, Milano",
|
| 278 |
+
"phone": "+39 02 1234567",
|
| 279 |
+
"email": "info@dac.it",
|
| 280 |
+
"tax_id": "IT12345678901"
|
| 281 |
+
},
|
| 282 |
+
"customer": {
|
| 283 |
+
"name": "Restaurant ABC",
|
| 284 |
+
"address": "Via Verdi 45, Roma",
|
| 285 |
+
"phone": "+39 06 7654321",
|
| 286 |
+
"email": "abc@restaurant.it"
|
| 287 |
+
},
|
| 288 |
+
"invoice_details": {
|
| 289 |
+
"invoice_number": "FT 123/2024",
|
| 290 |
+
"invoice_date": "2025-04-09",
|
| 291 |
+
"due_date": "2025-05-09",
|
| 292 |
+
"po_number": "PO-2024-001",
|
| 293 |
+
"payment_terms": "30 days"
|
| 294 |
+
},
|
| 295 |
+
"line_items": [
|
| 296 |
+
{
|
| 297 |
+
"item_code": "53747",
|
| 298 |
+
"description": "POLLO PETTO GR 600 X 3/4 F S/V IT.",
|
| 299 |
+
"type": "carne",
|
| 300 |
+
"quantity": 1,
|
| 301 |
+
"unit": "KG",
|
| 302 |
+
"unit_price": 7.20,
|
| 303 |
+
"total_price": 7.20
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"item_code": "88240",
|
| 307 |
+
"description": "CICORIA F.DORO CUBO K.2,5 FOGLIA PIÙ GEL",
|
| 308 |
+
"type": "vegetale",
|
| 309 |
+
"quantity": 4,
|
| 310 |
+
"unit": "PZ",
|
| 311 |
+
"unit_price": 6.36,
|
| 312 |
+
"total_price": 25.44
|
| 313 |
+
}
|
| 314 |
+
],
|
| 315 |
+
"financial_summary": {
|
| 316 |
+
"subtotal": 32.64,
|
| 317 |
+
"tax_amount": 7.18,
|
| 318 |
+
"total_amount": 39.82,
|
| 319 |
+
"currency": "EUR"
|
| 320 |
+
}
|
| 321 |
+
}
|
| 322 |
+
"""
|
| 323 |
+
|
| 324 |
+
return prompt
|
| 325 |
+
|
| 326 |
+
def _parse_gemini_response(self, response_text: str) -> Dict[str, Any]:
|
| 327 |
+
"""Parse JSON from Gemini response with robust error handling"""
|
| 328 |
+
try:
|
| 329 |
+
# Remove markdown code blocks if present
|
| 330 |
+
text = response_text.strip()
|
| 331 |
+
|
| 332 |
+
# Remove various markdown formats
|
| 333 |
+
if text.startswith("```json"):
|
| 334 |
+
text = text[7:]
|
| 335 |
+
elif text.startswith("```JSON"):
|
| 336 |
+
text = text[7:]
|
| 337 |
+
elif text.startswith("```"):
|
| 338 |
+
text = text[3:]
|
| 339 |
+
|
| 340 |
+
if text.endswith("```"):
|
| 341 |
+
text = text[:-3]
|
| 342 |
+
|
| 343 |
+
text = text.strip()
|
| 344 |
+
|
| 345 |
+
# Find JSON object if there's extra text
|
| 346 |
+
if not text.startswith("{"):
|
| 347 |
+
start = text.find("{")
|
| 348 |
+
if start != -1:
|
| 349 |
+
text = text[start:]
|
| 350 |
+
|
| 351 |
+
# Try to parse incrementally - find first valid complete JSON object
|
| 352 |
+
# This handles cases where Gemini adds garbage after the JSON
|
| 353 |
+
brace_count = 0
|
| 354 |
+
in_string = False
|
| 355 |
+
escape_next = False
|
| 356 |
+
|
| 357 |
+
for i, char in enumerate(text):
|
| 358 |
+
# Handle string state to avoid counting braces inside strings
|
| 359 |
+
if char == '"' and not escape_next:
|
| 360 |
+
in_string = not in_string
|
| 361 |
+
elif char == '\\' and not escape_next:
|
| 362 |
+
escape_next = True
|
| 363 |
+
continue
|
| 364 |
+
|
| 365 |
+
escape_next = False
|
| 366 |
+
|
| 367 |
+
# Count braces only outside strings
|
| 368 |
+
if not in_string:
|
| 369 |
+
if char == '{':
|
| 370 |
+
brace_count += 1
|
| 371 |
+
elif char == '}':
|
| 372 |
+
brace_count -= 1
|
| 373 |
+
if brace_count == 0:
|
| 374 |
+
# Found complete JSON object
|
| 375 |
+
text = text[:i + 1]
|
| 376 |
+
break
|
| 377 |
+
|
| 378 |
+
# If we still have unbalanced braces, try to repair
|
| 379 |
+
if brace_count != 0:
|
| 380 |
+
print("⚠ JSON appears truncated, attempting repair...")
|
| 381 |
+
# Find last complete item and truncate there
|
| 382 |
+
last_complete_item = text.rfind(' }')
|
| 383 |
+
if last_complete_item != -1:
|
| 384 |
+
# Truncate to last complete item
|
| 385 |
+
text = text[:last_complete_item + 5]
|
| 386 |
+
# Close the line_items array and main object
|
| 387 |
+
text += '\n ],\n "financial_summary": {},\n "payment_info": {},\n "additional_notes": ""\n}'
|
| 388 |
+
else:
|
| 389 |
+
# Count and add missing brackets
|
| 390 |
+
open_braces = text.count("{")
|
| 391 |
+
close_braces = text.count("}")
|
| 392 |
+
open_brackets = text.count("[")
|
| 393 |
+
close_brackets = text.count("]")
|
| 394 |
+
|
| 395 |
+
if open_brackets > close_brackets:
|
| 396 |
+
text += "]" * (open_brackets - close_brackets)
|
| 397 |
+
if open_braces > close_braces:
|
| 398 |
+
text += "}" * (open_braces - close_braces)
|
| 399 |
+
|
| 400 |
+
# Parse JSON
|
| 401 |
+
invoice_data = json.loads(text)
|
| 402 |
+
|
| 403 |
+
# Validate structure
|
| 404 |
+
if not isinstance(invoice_data, dict):
|
| 405 |
+
raise ValueError("Response is not a JSON object")
|
| 406 |
+
|
| 407 |
+
# Ensure required fields exist and fix empty dicts/lists
|
| 408 |
+
required_fields = {
|
| 409 |
+
"supplier": {},
|
| 410 |
+
"customer": {},
|
| 411 |
+
"invoice_details": {},
|
| 412 |
+
"line_items": [],
|
| 413 |
+
"financial_summary": {},
|
| 414 |
+
"payment_info": {},
|
| 415 |
+
"additional_notes": ""
|
| 416 |
+
}
|
| 417 |
+
|
| 418 |
+
for field, default in required_fields.items():
|
| 419 |
+
if field not in invoice_data:
|
| 420 |
+
invoice_data[field] = default
|
| 421 |
+
elif invoice_data[field] is None:
|
| 422 |
+
invoice_data[field] = default
|
| 423 |
+
|
| 424 |
+
return invoice_data
|
| 425 |
+
|
| 426 |
+
except json.JSONDecodeError as e:
|
| 427 |
+
print(f"⚠ JSON parsing error: {e}")
|
| 428 |
+
print(f"Error at position {e.pos}")
|
| 429 |
+
print(f"Response text (first 2000 chars):\n{response_text[:2000]}")
|
| 430 |
+
print(f"Response text (last 500 chars):\n...{response_text[-500:]}")
|
| 431 |
+
return {
|
| 432 |
+
"error": f"Failed to parse JSON: {str(e)}",
|
| 433 |
+
"raw_response": response_text,
|
| 434 |
+
"supplier": {},
|
| 435 |
+
"customer": {},
|
| 436 |
+
"invoice_details": {},
|
| 437 |
+
"line_items": [],
|
| 438 |
+
"financial_summary": {},
|
| 439 |
+
"payment_info": {},
|
| 440 |
+
"additional_notes": "Parse error occurred"
|
| 441 |
+
}
|
| 442 |
+
except Exception as e:
|
| 443 |
+
print(f"⚠ Unexpected error parsing response: {e}")
|
| 444 |
+
import traceback
|
| 445 |
+
traceback.print_exc()
|
| 446 |
+
return {"error": str(e), "raw_response": response_text[:2000]}
|
| 447 |
+
|
| 448 |
+
def _detect_mime_type(self, file_path: str) -> str:
|
| 449 |
+
"""Detect MIME type from file extension"""
|
| 450 |
+
extension = Path(file_path).suffix.lower()
|
| 451 |
+
|
| 452 |
+
mime_types = {
|
| 453 |
+
'.pdf': 'application/pdf',
|
| 454 |
+
'.jpg': 'image/jpeg',
|
| 455 |
+
'.jpeg': 'image/jpeg',
|
| 456 |
+
'.png': 'image/png',
|
| 457 |
+
'.tiff': 'image/tiff',
|
| 458 |
+
'.tif': 'image/tiff'
|
| 459 |
+
}
|
| 460 |
+
|
| 461 |
+
return mime_types.get(extension, 'application/octet-stream')
|
| 462 |
+
|
| 463 |
+
def _validate_and_correct_quantities(self, line_items: list) -> list:
|
| 464 |
+
"""
|
| 465 |
+
Double-check and correct quantities for all line items.
|
| 466 |
+
Ensures: quantity × unit_price ≈ total_price (within 2% tolerance)
|
| 467 |
+
If incorrect, recalculates quantity = total_price ÷ unit_price
|
| 468 |
+
"""
|
| 469 |
+
if not line_items:
|
| 470 |
+
return line_items
|
| 471 |
+
|
| 472 |
+
corrected_items = []
|
| 473 |
+
corrections_made = 0
|
| 474 |
+
|
| 475 |
+
for item in line_items:
|
| 476 |
+
try:
|
| 477 |
+
quantity = float(item.get('quantity', 0))
|
| 478 |
+
unit_price = float(item.get('unit_price', 0))
|
| 479 |
+
total_price = float(item.get('total_price', 0))
|
| 480 |
+
|
| 481 |
+
# Skip if any value is 0 or missing
|
| 482 |
+
if quantity == 0 or unit_price == 0 or total_price == 0:
|
| 483 |
+
corrected_items.append(item)
|
| 484 |
+
continue
|
| 485 |
+
|
| 486 |
+
# Calculate expected total
|
| 487 |
+
expected_total = round(quantity * unit_price, 2)
|
| 488 |
+
|
| 489 |
+
# Check if within 2% tolerance
|
| 490 |
+
tolerance = 0.02 * total_price
|
| 491 |
+
difference = abs(expected_total - total_price)
|
| 492 |
+
|
| 493 |
+
if difference > tolerance:
|
| 494 |
+
# Math is wrong! Recalculate quantity
|
| 495 |
+
correct_quantity = round(total_price / unit_price, 2)
|
| 496 |
+
|
| 497 |
+
print(f" ⚠ Correcting quantity for '{item.get('description', 'Unknown')[:40]}':")
|
| 498 |
+
print(f" Old: qty={quantity} × {unit_price} = {expected_total} (expected {total_price})")
|
| 499 |
+
print(f" New: qty={correct_quantity} × {unit_price} = {total_price} ✓")
|
| 500 |
+
|
| 501 |
+
# Update the quantity
|
| 502 |
+
item['quantity'] = correct_quantity
|
| 503 |
+
corrections_made += 1
|
| 504 |
+
|
| 505 |
+
corrected_items.append(item)
|
| 506 |
+
|
| 507 |
+
except (ValueError, TypeError, ZeroDivisionError) as e:
|
| 508 |
+
# If there's an error, keep the original item
|
| 509 |
+
print(f" ⚠ Could not validate item: {e}")
|
| 510 |
+
corrected_items.append(item)
|
| 511 |
+
|
| 512 |
+
if corrections_made > 0:
|
| 513 |
+
print(f" ✓ Corrected {corrections_made} quantity values")
|
| 514 |
+
else:
|
| 515 |
+
print(f" ✓ All quantities verified - no corrections needed")
|
| 516 |
+
|
| 517 |
+
return corrected_items
|
| 518 |
+
|
| 519 |
+
def process_invoice(
    self,
    file_path: str,
    output_json_path: Optional[str] = None,
    save_json: bool = True
) -> Dict[str, Any]:
    """
    Run the full extraction pipeline for one invoice file.

    The file is first passed to ``process_with_document_ai`` to obtain raw
    text, then to ``process_with_gemini_vision`` together with that text.
    When the result carries no ``"error"`` key, its line items are run
    through ``_validate_and_correct_quantities`` and a
    ``_processing_metadata`` entry (raw text, its length, image flag) is
    attached.

    Args:
        file_path: Path to invoice image/PDF
        output_json_path: Where to write the JSON result. When omitted, a
            name derived from the input file's stem is used (under /tmp
            if an /app directory exists, otherwise the working directory).
        save_json: Write the result to disk when True (set False for API usage)

    Returns:
        Structured invoice data
    """
    print(f"Processing invoice: {file_path}")

    # Stage 1: OCR text extraction
    print("Step 1: Extracting text with Document AI...")
    raw_text = self.process_with_document_ai(file_path)
    print(f"Document AI extracted {len(raw_text)} characters")

    # Stage 2: structured interpretation
    print("Step 2: Analyzing with Gemini AI for perfect interpretation...")
    invoice_data = self.process_with_gemini_vision(file_path, raw_text)

    if "error" in invoice_data:
        print("⚠ Extraction encountered issues.")
    else:
        # Stage 3: arithmetic sanity check on the extracted line items
        if "line_items" in invoice_data:
            print("Step 3: Double-checking quantities (backend validation)...")
            checked_items = self._validate_and_correct_quantities(
                invoice_data.get("line_items", [])
            )
            invoice_data["line_items"] = checked_items

        # Metadata consumed later for cost tracking
        item_count = len(invoice_data.get("line_items", []))
        print(f"✓ Extraction complete! Found {item_count} line items.")
        invoice_data["_processing_metadata"] = {
            "raw_text_length": len(raw_text),
            "raw_text": raw_text,
            "includes_image": True
        }

    if not save_json:
        return invoice_data

    # Resolve the output location only when the caller did not supply one.
    if not output_json_path:
        file_stem = Path(file_path).stem
        directory_prefix = "/tmp/" if os.path.exists("/app") else ""
        output_json_path = f"{directory_prefix}{file_stem}_invoice.json"

    try:
        with open(output_json_path, 'w', encoding='utf-8') as handle:
            json.dump(invoice_data, handle, indent=2, ensure_ascii=False)
        print(f"✓ Results saved to: {output_json_path}")
    except (PermissionError, OSError) as e:
        # Saving is best-effort; the parsed data is still returned.
        print(f"⚠ Warning: Could not save JSON file: {e}")

    return invoice_data
|
| 583 |
+
|
| 584 |
+
|
| 585 |
+
def main():
    """Command-line entry point: process one invoice and print the result.

    Expects the invoice path as the first argument and an optional output
    JSON path as the second. Exits with status 1 when no path is given.
    """
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python ocr_invoice.py <path_to_invoice_image>")
        sys.exit(1)

    input_file = cli_args[0]
    output_file = cli_args[1] if len(cli_args) > 1 else None

    # Build the processor from the module-level configuration values.
    ocr = InvoiceOCR(
        project_id=PROJECT_ID,
        location=LOCATION,
        processor_id=PROCESSOR_ID,
        gemini_api_key=GEMINI_API_KEY
    )

    # Run the pipeline
    invoice_data = ocr.process_invoice(input_file, output_file)

    # Pretty-print the structured result between separator rules
    rule = "=" * 70
    print("\n" + rule)
    print("INVOICE OCR RESULTS")
    print(rule)
    print(json.dumps(invoice_data, indent=2, ensure_ascii=False))
    print(rule)
|
| 613 |
+
|
| 614 |
+
|
| 615 |
+
if __name__ == "__main__":
|
| 616 |
+
main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.104.0
|
| 2 |
+
uvicorn[standard]>=0.24.0
|
| 3 |
+
python-multipart>=0.0.6
|
| 4 |
+
sqlalchemy>=2.0.0
|
| 5 |
+
google-cloud-documentai>=2.20.0
|
| 6 |
+
google-api-core>=2.11.0
|
| 7 |
+
google-generativeai>=0.8.0
|
| 8 |
+
pillow>=10.0.0
|
| 9 |
+
python-dotenv>=1.0.0
|
static/index.html
ADDED
|
@@ -0,0 +1,802 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>Invoice OCR System</title>
|
| 7 |
+
<style>
|
| 8 |
+
* {
|
| 9 |
+
margin: 0;
|
| 10 |
+
padding: 0;
|
| 11 |
+
box-sizing: border-box;
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
body {
|
| 15 |
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
|
| 16 |
+
background: linear-gradient(135deg, #1e3a8a 0%, #1e40af 50%, #2563eb 100%);
|
| 17 |
+
min-height: 100vh;
|
| 18 |
+
padding: 20px;
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
.container {
|
| 22 |
+
max-width: 1400px;
|
| 23 |
+
margin: 0 auto;
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
header {
|
| 27 |
+
text-align: center;
|
| 28 |
+
color: white;
|
| 29 |
+
margin-bottom: 30px;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
.logo {
|
| 33 |
+
width: 120px;
|
| 34 |
+
height: 120px;
|
| 35 |
+
margin: 0 auto 20px;
|
| 36 |
+
animation: float 3s ease-in-out infinite;
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
@keyframes float {
|
| 40 |
+
0%, 100% { transform: translateY(0px); }
|
| 41 |
+
50% { transform: translateY(-10px); }
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
h1 {
|
| 45 |
+
font-size: 2.5em;
|
| 46 |
+
margin-bottom: 10px;
|
| 47 |
+
text-shadow: 2px 2px 4px rgba(0,0,0,0.2);
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
.subtitle {
|
| 51 |
+
font-size: 1.1em;
|
| 52 |
+
opacity: 0.9;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
.card {
|
| 56 |
+
background: white;
|
| 57 |
+
border-radius: 15px;
|
| 58 |
+
padding: 30px;
|
| 59 |
+
box-shadow: 0 10px 40px rgba(0,0,0,0.1);
|
| 60 |
+
margin-bottom: 20px;
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
.upload-zone {
|
| 64 |
+
border: 3px dashed #3b82f6;
|
| 65 |
+
border-radius: 10px;
|
| 66 |
+
padding: 60px 20px;
|
| 67 |
+
text-align: center;
|
| 68 |
+
cursor: pointer;
|
| 69 |
+
transition: all 0.3s ease;
|
| 70 |
+
background: #eff6ff;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
.upload-zone:hover {
|
| 74 |
+
border-color: #1e40af;
|
| 75 |
+
background: #dbeafe;
|
| 76 |
+
transform: translateY(-2px);
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
.upload-zone.drag-over {
|
| 80 |
+
border-color: #1e3a8a;
|
| 81 |
+
background: #bfdbfe;
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
.upload-icon {
|
| 85 |
+
font-size: 48px;
|
| 86 |
+
margin-bottom: 15px;
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
.upload-text {
|
| 90 |
+
font-size: 1.2em;
|
| 91 |
+
color: #1e40af;
|
| 92 |
+
font-weight: 600;
|
| 93 |
+
margin-bottom: 8px;
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
.upload-subtext {
|
| 97 |
+
color: #666;
|
| 98 |
+
font-size: 0.9em;
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
#file-input {
|
| 102 |
+
display: none;
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
.btn {
|
| 106 |
+
background: linear-gradient(135deg, #1e3a8a 0%, #2563eb 100%);
|
| 107 |
+
color: white;
|
| 108 |
+
border: none;
|
| 109 |
+
padding: 12px 30px;
|
| 110 |
+
border-radius: 8px;
|
| 111 |
+
font-size: 1em;
|
| 112 |
+
font-weight: 600;
|
| 113 |
+
cursor: pointer;
|
| 114 |
+
transition: all 0.3s ease;
|
| 115 |
+
display: inline-block;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
.btn:hover {
|
| 119 |
+
transform: translateY(-2px);
|
| 120 |
+
box-shadow: 0 5px 15px rgba(37, 99, 235, 0.4);
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
.btn:disabled {
|
| 124 |
+
opacity: 0.5;
|
| 125 |
+
cursor: not-allowed;
|
| 126 |
+
transform: none;
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
.loading {
|
| 130 |
+
display: none;
|
| 131 |
+
text-align: center;
|
| 132 |
+
padding: 40px;
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
.spinner {
|
| 136 |
+
border: 4px solid #f3f3f3;
|
| 137 |
+
border-top: 4px solid #2563eb;
|
| 138 |
+
border-radius: 50%;
|
| 139 |
+
width: 50px;
|
| 140 |
+
height: 50px;
|
| 141 |
+
animation: spin 1s linear infinite;
|
| 142 |
+
margin: 0 auto 20px;
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
@keyframes spin {
|
| 146 |
+
0% { transform: rotate(0deg); }
|
| 147 |
+
100% { transform: rotate(360deg); }
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
.results {
|
| 151 |
+
display: none;
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
.section-title {
|
| 155 |
+
font-size: 1.3em;
|
| 156 |
+
color: #333;
|
| 157 |
+
margin-bottom: 15px;
|
| 158 |
+
padding-bottom: 10px;
|
| 159 |
+
border-bottom: 2px solid #2563eb;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
.info-grid {
|
| 163 |
+
display: grid;
|
| 164 |
+
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
| 165 |
+
gap: 15px;
|
| 166 |
+
margin-bottom: 25px;
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
.info-item {
|
| 170 |
+
background: #eff6ff;
|
| 171 |
+
padding: 15px;
|
| 172 |
+
border-radius: 8px;
|
| 173 |
+
border-left: 4px solid #2563eb;
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
.info-label {
|
| 177 |
+
font-size: 0.85em;
|
| 178 |
+
color: #666;
|
| 179 |
+
text-transform: uppercase;
|
| 180 |
+
letter-spacing: 0.5px;
|
| 181 |
+
margin-bottom: 5px;
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
.info-value {
|
| 185 |
+
font-size: 1.1em;
|
| 186 |
+
color: #333;
|
| 187 |
+
font-weight: 600;
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
.line-items-table {
|
| 191 |
+
width: 100%;
|
| 192 |
+
border-collapse: collapse;
|
| 193 |
+
margin-top: 15px;
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
.line-items-table th {
|
| 197 |
+
background: #1e40af;
|
| 198 |
+
color: white;
|
| 199 |
+
padding: 12px;
|
| 200 |
+
text-align: left;
|
| 201 |
+
font-weight: 600;
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
.line-items-table td {
|
| 205 |
+
padding: 12px;
|
| 206 |
+
border-bottom: 1px solid #e0e0e0;
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
.line-items-table tr:hover {
|
| 210 |
+
background: #eff6ff;
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
.cost-breakdown {
|
| 214 |
+
background: linear-gradient(135deg, #1e3a8a 0%, #2563eb 100%);
|
| 215 |
+
color: white;
|
| 216 |
+
padding: 20px;
|
| 217 |
+
border-radius: 10px;
|
| 218 |
+
margin-top: 20px;
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
.cost-item {
|
| 222 |
+
display: flex;
|
| 223 |
+
justify-content: space-between;
|
| 224 |
+
padding: 8px 0;
|
| 225 |
+
border-bottom: 1px solid rgba(255,255,255,0.2);
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
.cost-item:last-child {
|
| 229 |
+
border-bottom: none;
|
| 230 |
+
font-size: 1.2em;
|
| 231 |
+
font-weight: 700;
|
| 232 |
+
padding-top: 12px;
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
.invoice-list {
|
| 236 |
+
margin-top: 30px;
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
.invoice-card {
|
| 240 |
+
background: #eff6ff;
|
| 241 |
+
padding: 20px;
|
| 242 |
+
border-radius: 10px;
|
| 243 |
+
margin-bottom: 15px;
|
| 244 |
+
border-left: 5px solid #2563eb;
|
| 245 |
+
cursor: pointer;
|
| 246 |
+
transition: all 0.3s ease;
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
.invoice-card:hover {
|
| 250 |
+
background: #dbeafe;
|
| 251 |
+
transform: translateX(5px);
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
.invoice-header {
|
| 255 |
+
display: flex;
|
| 256 |
+
justify-content: space-between;
|
| 257 |
+
align-items: center;
|
| 258 |
+
margin-bottom: 10px;
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
.invoice-number {
|
| 262 |
+
font-size: 1.2em;
|
| 263 |
+
font-weight: 700;
|
| 264 |
+
color: #1e40af;
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
.invoice-amount {
|
| 268 |
+
font-size: 1.3em;
|
| 269 |
+
font-weight: 700;
|
| 270 |
+
color: #1e3a8a;
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
.invoice-meta {
|
| 274 |
+
display: flex;
|
| 275 |
+
gap: 20px;
|
| 276 |
+
font-size: 0.9em;
|
| 277 |
+
color: #666;
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
.stats-grid {
|
| 281 |
+
display: grid;
|
| 282 |
+
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
| 283 |
+
gap: 20px;
|
| 284 |
+
margin-bottom: 30px;
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
.stat-card {
|
| 288 |
+
background: linear-gradient(135deg, #1e3a8a 0%, #2563eb 100%);
|
| 289 |
+
color: white;
|
| 290 |
+
padding: 25px;
|
| 291 |
+
border-radius: 12px;
|
| 292 |
+
text-align: center;
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
.stat-value {
|
| 296 |
+
font-size: 2.5em;
|
| 297 |
+
font-weight: 700;
|
| 298 |
+
margin-bottom: 5px;
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
.stat-label {
|
| 302 |
+
font-size: 0.9em;
|
| 303 |
+
opacity: 0.9;
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
.empty-state {
|
| 307 |
+
text-align: center;
|
| 308 |
+
padding: 40px;
|
| 309 |
+
color: #999;
|
| 310 |
+
}
|
| 311 |
+
|
| 312 |
+
.delete-btn {
|
| 313 |
+
background: #dc3545;
|
| 314 |
+
color: white;
|
| 315 |
+
border: none;
|
| 316 |
+
padding: 8px 16px;
|
| 317 |
+
border-radius: 6px;
|
| 318 |
+
cursor: pointer;
|
| 319 |
+
font-size: 0.9em;
|
| 320 |
+
transition: all 0.3s ease;
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
.delete-btn:hover {
|
| 324 |
+
background: #c82333;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
.view-btn {
|
| 328 |
+
background: #2563eb;
|
| 329 |
+
color: white;
|
| 330 |
+
border: none;
|
| 331 |
+
padding: 8px 16px;
|
| 332 |
+
border-radius: 6px;
|
| 333 |
+
cursor: pointer;
|
| 334 |
+
font-size: 0.9em;
|
| 335 |
+
transition: all 0.3s ease;
|
| 336 |
+
margin-left: 10px;
|
| 337 |
+
}
|
| 338 |
+
|
| 339 |
+
.view-btn:hover {
|
| 340 |
+
background: #1e40af;
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
.invoice-card {
|
| 344 |
+
transition: transform 0.2s ease, box-shadow 0.2s ease;
|
| 345 |
+
}
|
| 346 |
+
|
| 347 |
+
.invoice-card:hover {
|
| 348 |
+
transform: translateY(-2px);
|
| 349 |
+
box-shadow: 0 8px 16px rgba(0,0,0,0.15);
|
| 350 |
+
}
|
| 351 |
+
|
| 352 |
+
.alert {
|
| 353 |
+
padding: 15px 20px;
|
| 354 |
+
border-radius: 8px;
|
| 355 |
+
margin-bottom: 20px;
|
| 356 |
+
display: none;
|
| 357 |
+
}
|
| 358 |
+
|
| 359 |
+
.alert-success {
|
| 360 |
+
background: #d4edda;
|
| 361 |
+
color: #155724;
|
| 362 |
+
border-left: 5px solid #28a745;
|
| 363 |
+
}
|
| 364 |
+
|
| 365 |
+
.alert-error {
|
| 366 |
+
background: #f8d7da;
|
| 367 |
+
color: #721c24;
|
| 368 |
+
border-left: 5px solid #dc3545;
|
| 369 |
+
}
|
| 370 |
+
</style>
|
| 371 |
+
</head>
|
| 372 |
+
<body>
|
| 373 |
+
<div class="container">
|
| 374 |
+
<header>
|
| 375 |
+
<img src="/static/logo.svg" alt="Invoice OCR Logo" class="logo">
|
| 376 |
+
<h1>📄 Invoice OCR System</h1>
|
| 377 |
+
<p class="subtitle">Powered by Google Document AI + Gemini 2.0 Flash</p>
|
| 378 |
+
</header>
|
| 379 |
+
|
| 380 |
+
<div id="alert-success" class="alert alert-success"></div>
|
| 381 |
+
<div id="alert-error" class="alert alert-error"></div>
|
| 382 |
+
|
| 383 |
+
<!-- Upload Section -->
|
| 384 |
+
<div class="card">
|
| 385 |
+
<div class="upload-zone" id="upload-zone">
|
| 386 |
+
<div class="upload-icon">📤</div>
|
| 387 |
+
<div class="upload-text">Drop invoice here or click to upload</div>
|
| 388 |
+
<div class="upload-subtext">Supports JPG, PNG, PDF</div>
|
| 389 |
+
<input type="file" id="file-input" accept="image/*,.pdf">
|
| 390 |
+
</div>
|
| 391 |
+
<div class="loading" id="loading">
|
| 392 |
+
<div class="spinner"></div>
|
| 393 |
+
<p>Processing invoice with AI...</p>
|
| 394 |
+
<p style="color: #666; font-size: 0.9em; margin-top: 10px;">This may take 2-4 seconds</p>
|
| 395 |
+
</div>
|
| 396 |
+
</div>
|
| 397 |
+
|
| 398 |
+
<!-- Results Section -->
|
| 399 |
+
<div id="results" class="results">
|
| 400 |
+
<div class="card">
|
| 401 |
+
<h2 class="section-title">✅ Invoice Processed Successfully</h2>
|
| 402 |
+
|
| 403 |
+
<!-- Supplier Information -->
|
| 404 |
+
<h3 class="section-title" style="margin-top: 25px;">📦 Supplier Information</h3>
|
| 405 |
+
<div class="info-grid" id="supplier-info"></div>
|
| 406 |
+
|
| 407 |
+
<!-- Customer Information -->
|
| 408 |
+
<h3 class="section-title" style="margin-top: 25px;">👤 Customer Information</h3>
|
| 409 |
+
<div class="info-grid" id="customer-info"></div>
|
| 410 |
+
|
| 411 |
+
<!-- Invoice Details -->
|
| 412 |
+
<h3 class="section-title" style="margin-top: 25px;">📋 Invoice Details</h3>
|
| 413 |
+
<div class="info-grid" id="invoice-details"></div>
|
| 414 |
+
|
| 415 |
+
<!-- Line Items -->
|
| 416 |
+
<h3 class="section-title" style="margin-top: 25px;">📊 Line Items</h3>
|
| 417 |
+
<div style="overflow-x: auto;">
|
| 418 |
+
<table class="line-items-table" id="line-items-table">
|
| 419 |
+
<thead>
|
| 420 |
+
<tr>
|
| 421 |
+
<th>Item Code</th>
|
| 422 |
+
<th>Description</th>
|
| 423 |
+
<th>Type</th>
|
| 424 |
+
<th>Quantity</th>
|
| 425 |
+
<th>Unit</th>
|
| 426 |
+
<th>Unit Price</th>
|
| 427 |
+
<th>Total</th>
|
| 428 |
+
</tr>
|
| 429 |
+
</thead>
|
| 430 |
+
<tbody id="line-items-body"></tbody>
|
| 431 |
+
</table>
|
| 432 |
+
</div>
|
| 433 |
+
|
| 434 |
+
<!-- Financial Summary -->
|
| 435 |
+
<h3 class="section-title" style="margin-top: 25px;">💰 Financial Summary</h3>
|
| 436 |
+
<div class="info-grid" id="financial-summary"></div>
|
| 437 |
+
|
| 438 |
+
<!-- Payment Information -->
|
| 439 |
+
<h3 class="section-title" style="margin-top: 25px;">🏦 Payment Information</h3>
|
| 440 |
+
<div class="info-grid" id="payment-info"></div>
|
| 441 |
+
|
| 442 |
+
<!-- Cost Breakdown -->
|
| 443 |
+
<div class="cost-breakdown">
|
| 444 |
+
<h3 style="margin-bottom: 15px;">💵 Processing Cost</h3>
|
| 445 |
+
<div class="cost-item">
|
| 446 |
+
<span>Document AI OCR:</span>
|
| 447 |
+
<span id="cost-docai">$0.00000</span>
|
| 448 |
+
</div>
|
| 449 |
+
<div class="cost-item">
|
| 450 |
+
<span>Gemini Input (<span id="tokens-input">0</span> tokens):</span>
|
| 451 |
+
<span id="cost-gemini-input">$0.00000</span>
|
| 452 |
+
</div>
|
| 453 |
+
<div class="cost-item">
|
| 454 |
+
<span>Gemini Output (<span id="tokens-output">0</span> tokens):</span>
|
| 455 |
+
<span id="cost-gemini-output">$0.00000</span>
|
| 456 |
+
</div>
|
| 457 |
+
<div class="cost-item">
|
| 458 |
+
<span>Total Processing Cost:</span>
|
| 459 |
+
<span id="cost-total">$0.00000</span>
|
| 460 |
+
</div>
|
| 461 |
+
</div>
|
| 462 |
+
|
| 463 |
+
<div style="text-align: center; margin-top: 20px;">
|
| 464 |
+
<button class="btn" onclick="resetUpload()">Process Another Invoice</button>
|
| 465 |
+
</div>
|
| 466 |
+
</div>
|
| 467 |
+
</div>
|
| 468 |
+
|
| 469 |
+
<!-- Statistics -->
|
| 470 |
+
<div class="card">
|
| 471 |
+
<h2 class="section-title">📈 Statistics</h2>
|
| 472 |
+
<div class="stats-grid" id="stats-grid">
|
| 473 |
+
<div class="stat-card">
|
| 474 |
+
<div class="stat-value" id="stat-total">0</div>
|
| 475 |
+
<div class="stat-label">Total Invoices</div>
|
| 476 |
+
</div>
|
| 477 |
+
<div class="stat-card">
|
| 478 |
+
<div class="stat-value" id="stat-cost">$0.00</div>
|
| 479 |
+
<div class="stat-label">Total Cost</div>
|
| 480 |
+
</div>
|
| 481 |
+
<div class="stat-card">
|
| 482 |
+
<div class="stat-value" id="stat-avg">$0.00</div>
|
| 483 |
+
<div class="stat-label">Average Cost</div>
|
| 484 |
+
</div>
|
| 485 |
+
</div>
|
| 486 |
+
</div>
|
| 487 |
+
|
| 488 |
+
<!-- Invoice History -->
|
| 489 |
+
<div class="card">
|
| 490 |
+
<h2 class="section-title">📚 Invoice History</h2>
|
| 491 |
+
<div id="invoice-list" class="invoice-list"></div>
|
| 492 |
+
</div>
|
| 493 |
+
</div>
|
| 494 |
+
|
| 495 |
+
<script>
|
| 496 |
+
// Cached references to the elements the upload flow shows and hides.
const uploadZone = document.getElementById('upload-zone');
const fileInput = document.getElementById('file-input');
const loading = document.getElementById('loading');
const results = document.getElementById('results');

// Clicking anywhere in the zone opens the hidden file picker.
uploadZone.addEventListener('click', () => fileInput.click());

// Drag and drop handlers
uploadZone.addEventListener('dragover', (e) => {
    // preventDefault is required for the element to accept a drop.
    e.preventDefault();
    uploadZone.classList.add('drag-over');
});

uploadZone.addEventListener('dragleave', () => {
    uploadZone.classList.remove('drag-over');
});

uploadZone.addEventListener('drop', (e) => {
    e.preventDefault();
    uploadZone.classList.remove('drag-over');
    // Only the first dropped file is processed.
    const file = e.dataTransfer.files[0];
    if (file) handleFileUpload(file);
});

fileInput.addEventListener('change', (e) => {
    const file = e.target.files[0];
    // Clear the selection so that choosing the same file again still fires
    // `change`; the File object captured above stays usable.
    e.target.value = '';
    if (file) handleFileUpload(file);
});
|
| 524 |
+
|
| 525 |
+
/**
 * Upload one invoice file to `/upload` and render the outcome.
 *
 * While the request is in flight the drop zone and results panel are hidden
 * and the loading indicator is shown. On success the extracted invoice is
 * displayed and the history and statistics are refreshed. On failure the
 * upload UI is restored and an error alert is shown.
 *
 * @param {File} file - The file to send as the `file` multipart field.
 * @returns {Promise<void>} Never rejects; failures are reported through the alert.
 */
async function handleFileUpload(file) {
    const formData = new FormData();
    formData.append('file', file);

    uploadZone.style.display = 'none';
    loading.style.display = 'block';
    results.style.display = 'none';
    hideAlerts();

    try {
        const response = await fetch('/upload', {
            method: 'POST',
            body: formData
        });

        // An error response is not guaranteed to carry JSON (for example an
        // HTML error page), so a parse failure must not mask the HTTP status.
        let data = null;
        try {
            data = await response.json();
        } catch (parseError) {
            data = null;
        }

        if (!response.ok) {
            const detail = data && typeof data.detail === 'string' ? data.detail : '';
            throw new Error(detail || `Processing failed (HTTP ${response.status})`);
        }
        if (!data) {
            throw new Error('Processing failed');
        }

        displayResults(data);
        showAlert('Invoice processed successfully!', 'success');
        loadInvoices();
        loadStats();
    } catch (error) {
        // Order matters: resetUpload() hides every alert, so the UI has to be
        // restored first and the error shown afterwards, otherwise the
        // message disappears immediately.
        resetUpload();
        showAlert(`Error: ${error.message}`, 'error');
    }
}
|
| 555 |
+
|
| 556 |
+
/**
 * Render one processed invoice and its processing costs into the results panel.
 *
 * @param {Object} data - Response payload.
 * @param {Object} data.invoice - Invoice record. `supplier_data`,
 *   `customer_data`, `invoice_details`, `line_items`, `financial_summary` and
 *   `payment_info` are JSON strings (already-parsed values are accepted too);
 *   `currency` is an optional currency code, defaulting to 'EUR'.
 * @param {Object} [data.costs] - Processing-cost figures; missing figures are
 *   shown as 'N/A'.
 * @throws {Error} If `data.invoice` is missing.
 */
function displayResults(data) {
    // Validate before touching the page so a bad payload does not leave an
    // empty results panel on screen.
    if (!data || !data.invoice) {
        throw new Error('Response does not contain invoice data');
    }
    const invoice = data.invoice;

    // True for values that should be rendered as "not provided".
    const isMissing = (value) => value === null || value === undefined || value === '';

    // Escape text before it is interpolated into an innerHTML template; the
    // values come from the server response, not from this page.
    const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => ({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;'
    }[ch]));

    // Decode one JSON-encoded section; returns null when absent or invalid.
    const parseSection = (raw) => {
        if (isMissing(raw)) return null;
        if (typeof raw !== 'string') return raw;
        try {
            return JSON.parse(raw);
        } catch (parseError) {
            console.error('Could not parse invoice section:', parseError);
            return null;
        }
    };

    // Convert a number or numeric string to a finite number, else null.
    const toNumber = (value) => {
        if (isMissing(value)) return null;
        if (typeof value === 'string' && value.trim() === '') return null;
        const parsed = Number(value);
        return Number.isFinite(parsed) ? parsed : null;
    };

    // "<code> 12.34", or '-' when there is no usable amount. A real 0 is kept.
    const money = (value, code) => {
        const amount = toNumber(value);
        return amount === null ? '-' : `${code} ${amount.toFixed(2)}`;
    };

    // Escaped cell text, or '-' for missing values. A real 0 is kept.
    const cellText = (value) => (isMissing(value) ? '-' : escapeHtml(value));

    // Clear a container and fill it with label/value rows from one section.
    const fillSection = (containerId, raw, fields) => {
        const container = document.getElementById(containerId);
        container.innerHTML = '';
        const section = parseSection(raw);
        if (section && typeof section === 'object') {
            fields.forEach(([label, key]) => addInfoItem(container, label, section[key]));
        }
    };

    loading.style.display = 'none';
    results.style.display = 'block';

    // Supplier Information
    fillSection('supplier-info', invoice.supplier_data, [
        ['Company Name', 'name'],
        ['Address', 'address'],
        ['Phone', 'phone'],
        ['Email', 'email'],
        ['Tax ID', 'tax_id']
    ]);

    // Customer Information
    fillSection('customer-info', invoice.customer_data, [
        ['Company Name', 'name'],
        ['Address', 'address'],
        ['Phone', 'phone'],
        ['Email', 'email']
    ]);

    // Invoice Details
    fillSection('invoice-details', invoice.invoice_details, [
        ['Invoice Number', 'invoice_number'],
        ['Invoice Date', 'invoice_date'],
        ['Due Date', 'due_date'],
        ['PO Number', 'po_number'],
        ['Payment Terms', 'payment_terms']
    ]);

    // Line Items
    const lineItemsBody = document.getElementById('line-items-body');
    lineItemsBody.innerHTML = '';
    const currency = invoice.currency || 'EUR';
    const typeColors = {
        'produce': '#10b981',
        'protein': '#ef4444',
        'beverage': '#3b82f6',
        'dairy': '#f59e0b',
        'grain': '#8b5cf6',
        'condiment': '#f97316',
        'cleaning': '#06b6d4',
        'packaging': '#6b7280',
        'miscellaneous': '#9ca3af'
    };
    const items = parseSection(invoice.line_items);
    if (Array.isArray(items)) {
        items.forEach((item) => {
            if (!item || typeof item !== 'object') return;

            // Colour-coded badge for the item category; unknown categories
            // fall back to grey.
            let typeHtml = '-';
            if (!isMissing(item.type)) {
                const typeKey = String(item.type);
                const color = Object.prototype.hasOwnProperty.call(typeColors, typeKey)
                    ? typeColors[typeKey]
                    : '#6b7280';
                typeHtml = `<span style="background: ${color}; color: white; padding: 4px 8px; border-radius: 4px; font-size: 0.85em; font-weight: 600;">${escapeHtml(typeKey.toUpperCase())}</span>`;
            }

            const row = document.createElement('tr');
            row.innerHTML = `
                <td>${cellText(item.item_code)}</td>
                <td>${cellText(item.description)}</td>
                <td>${typeHtml}</td>
                <td>${cellText(item.quantity)}</td>
                <td>${cellText(item.unit)}</td>
                <td>${escapeHtml(money(item.unit_price, currency))}</td>
                <td><strong>${escapeHtml(money(item.total_price, currency))}</strong></td>
            `;
            lineItemsBody.appendChild(row);
        });
    }

    // Financial Summary
    const financialSummary = document.getElementById('financial-summary');
    financialSummary.innerHTML = '';
    const summary = parseSection(invoice.financial_summary);
    if (summary && typeof summary === 'object') {
        const currencySymbol = summary.currency || currency;
        addInfoItem(financialSummary, 'Subtotal', money(summary.subtotal, currencySymbol));
        addInfoItem(financialSummary, 'Tax Amount', money(summary.tax_amount, currencySymbol));
        addInfoItem(financialSummary, 'Total Amount', money(summary.total_amount, currencySymbol));
        addInfoItem(financialSummary, 'Currency', currencySymbol);
    }

    // Payment Information
    fillSection('payment-info', invoice.payment_info, [
        ['Bank Name', 'bank_name'],
        ['Account Number', 'account_number'],
        ['IBAN', 'iban'],
        ['SWIFT/BIC', 'swift_code']
    ]);

    // Cost Breakdown. Every field is overwritten, even when a figure is
    // missing, so values from a previously displayed invoice never linger.
    const costs = data.costs || {};
    const usd = (value) => {
        const amount = toNumber(value);
        return amount === null ? 'N/A' : '$' + amount.toFixed(6);
    };
    const count = (value) => (isMissing(value) ? 'N/A' : String(value));
    const setText = (elementId, text) => {
        document.getElementById(elementId).textContent = text;
    };
    setText('cost-docai', usd(costs.document_ai_cost));
    setText('tokens-input', count(costs.gemini_input_tokens));
    setText('cost-gemini-input', usd(costs.gemini_input_cost));
    setText('tokens-output', count(costs.gemini_output_tokens));
    setText('cost-gemini-output', usd(costs.gemini_output_cost));
    setText('cost-total', usd(costs.total_cost));
}
|
| 664 |
+
|
| 665 |
+
/**
 * Append one label/value row to an info container.
 *
 * Label and value are written with `textContent`, so text that came from the
 * server is displayed literally and never parsed as markup.
 *
 * @param {Element} container - Element that receives the new `.info-item`.
 * @param {string} label - Caption shown for the field.
 * @param {*} value - Field value; null, undefined and '' are shown as "N/A".
 */
function addInfoItem(container, label, value) {
    // Show "N/A" for missing values instead of hiding the field. A numeric 0
    // is a real value and is displayed as-is.
    const isMissing = value === null || value === undefined || value === '';
    const displayValue = isMissing ? 'N/A' : String(value);

    const item = document.createElement('div');
    item.className = 'info-item';

    const labelEl = document.createElement('div');
    labelEl.className = 'info-label';
    labelEl.textContent = label;

    const valueEl = document.createElement('div');
    valueEl.className = 'info-value';
    valueEl.textContent = displayValue;

    item.appendChild(labelEl);
    item.appendChild(valueEl);
    container.appendChild(item);
}
|
| 676 |
+
|
| 677 |
+
/**
 * Fetch the stored invoices from `/invoices` and rebuild the history list.
 *
 * Each invoice becomes a card showing supplier, total, invoice number and
 * date, with Delete and View Details buttons. Clicking the card itself also
 * opens the invoice. Failures are logged to the console.
 *
 * @returns {Promise<void>} Never rejects.
 */
async function loadInvoices() {
    // Escape text before it is interpolated into the card template; the
    // values come from the server response, not from this page.
    const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => ({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;'
    }[ch]));

    // Decode one JSON column. Always returns an object, so a single invoice
    // with an empty or invalid column cannot abort rendering of the list.
    const parseSection = (raw) => {
        if (raw === null || raw === undefined || raw === '') return {};
        if (typeof raw !== 'string') return typeof raw === 'object' ? raw : {};
        try {
            const parsed = JSON.parse(raw);
            return parsed && typeof parsed === 'object' ? parsed : {};
        } catch (parseError) {
            console.error('Could not parse invoice section:', parseError);
            return {};
        }
    };

    try {
        const response = await fetch('/invoices');
        if (!response.ok) {
            throw new Error(`Invoice list request failed (HTTP ${response.status})`);
        }
        const invoices = await response.json();
        if (!Array.isArray(invoices)) {
            throw new Error('Invoice list response is not an array');
        }

        const invoiceList = document.getElementById('invoice-list');
        invoiceList.innerHTML = '';

        if (invoices.length === 0) {
            invoiceList.innerHTML = '<div class="empty-state">No invoices processed yet</div>';
            return;
        }

        invoices.forEach((invoice) => {
            const details = parseSection(invoice.invoice_details);
            const summary = parseSection(invoice.financial_summary);
            const supplier = parseSection(invoice.supplier_data);
            const currency = invoice.currency || summary.currency || 'EUR';

            // Accept numbers and numeric strings; anything else shows 0.00.
            const total = Number(summary.total_amount);
            const totalText = Number.isFinite(total) ? total.toFixed(2) : '0.00';

            const card = document.createElement('div');
            card.className = 'invoice-card';
            card.style.cursor = 'pointer';
            card.innerHTML = `
                <div class="invoice-header">
                    <div class="invoice-number">${escapeHtml(supplier.name || 'Unknown Supplier')}</div>
                    <div class="invoice-amount">${escapeHtml(currency)} ${totalText}</div>
                </div>
                <div class="invoice-meta">
                    <span>📄 ${escapeHtml(details.invoice_number || 'N/A')}</span>
                    <span>📅 ${escapeHtml(details.invoice_date || 'N/A')}</span>
                </div>
                <div style="margin-top: 10px;">
                    <button class="delete-btn" id="delete-${escapeHtml(invoice.id)}">Delete</button>
                    <button class="view-btn" id="view-${escapeHtml(invoice.id)}">View Details</button>
                </div>
            `;

            invoiceList.appendChild(card);

            // Look the buttons up inside this card rather than by document-wide
            // id. stopPropagation keeps a button click from also triggering the
            // card-level click handler below.
            card.querySelector('.delete-btn').addEventListener('click', (e) => {
                e.stopPropagation();
                deleteInvoice(invoice.id);
            });

            card.querySelector('.view-btn').addEventListener('click', (e) => {
                e.stopPropagation();
                viewInvoice(invoice.id);
            });

            // Make card clickable to view details
            card.onclick = () => viewInvoice(invoice.id);
        });
    } catch (error) {
        console.error('Error loading invoices:', error);
    }
}
|
| 733 |
+
|
| 734 |
+
/**
 * Fetch aggregate figures from `/stats` and update the three statistic cards.
 *
 * All display strings are computed before the page is touched, so a failed or
 * malformed response leaves the previously shown figures intact. Failures are
 * logged to the console.
 *
 * @returns {Promise<void>} Never rejects.
 */
async function loadStats() {
    // Missing or non-numeric figures are shown as zero.
    const toNumber = (value) => {
        const parsed = Number(value);
        return Number.isFinite(parsed) ? parsed : 0;
    };

    try {
        const response = await fetch('/stats');
        if (!response.ok) {
            throw new Error(`Stats request failed (HTTP ${response.status})`);
        }
        const stats = await response.json();
        if (!stats || typeof stats !== 'object') {
            throw new Error('Stats response is not an object');
        }

        const totalText = String(toNumber(stats.total_invoices));
        const costText = '$' + toNumber(stats.total_processing_cost).toFixed(4);
        const averageText = '$' + toNumber(stats.average_cost_per_invoice).toFixed(6);

        document.getElementById('stat-total').textContent = totalText;
        document.getElementById('stat-cost').textContent = costText;
        document.getElementById('stat-avg').textContent = averageText;
    } catch (error) {
        console.error('Error loading stats:', error);
    }
}
|
| 745 |
+
|
| 746 |
+
/**
 * Load one stored invoice from `/invoices/<id>` and show it in the results panel.
 *
 * @param {number|string} id - Identifier of the invoice to load.
 * @returns {Promise<void>} Never rejects; failures are reported through the alert.
 */
async function viewInvoice(id) {
    try {
        const response = await fetch(`/invoices/${id}`);

        // An error response may not carry JSON; do not let a parse failure
        // hide the HTTP status.
        let data = null;
        try {
            data = await response.json();
        } catch (parseError) {
            data = null;
        }

        // Stop here on an error payload so it never reaches displayResults(),
        // which would fail with an unrelated property-access error.
        if (!response.ok || !data) {
            const detail = data && typeof data.detail === 'string' ? data.detail : '';
            throw new Error(detail || `Failed to load invoice (HTTP ${response.status})`);
        }

        displayResults(data);
        // Scroll to results
        document.getElementById('results').scrollIntoView({ behavior: 'smooth' });
        showAlert('Invoice loaded successfully', 'success');
    } catch (error) {
        showAlert(`Error: ${error.message}`, 'error');
    }
}
|
| 758 |
+
|
| 759 |
+
/**
 * Ask for confirmation, then delete the invoice with the given id.
 *
 * A successful delete shows a success alert and refreshes the history list
 * and the statistics. Any failure is reported through the error alert.
 *
 * @param {number|string} id - Identifier of the invoice to delete.
 */
async function deleteInvoice(id) {
    const confirmed = confirm('Are you sure you want to delete this invoice?');
    if (!confirmed) {
        return;
    }

    try {
        const reply = await fetch(`/invoices/${id}`, { method: 'DELETE' });

        // Guard clause: a non-2xx status goes straight to the catch handler.
        if (!reply.ok) {
            throw new Error('Delete failed');
        }

        showAlert('Invoice deleted successfully', 'success');
        loadInvoices();
        loadStats();
    } catch (err) {
        showAlert(`Error: ${err.message}`, 'error');
    }
}
|
| 775 |
+
|
| 776 |
+
/**
 * Return the page to its initial "ready for upload" state: the drop zone is
 * visible, the loading indicator and results panel are hidden, the file input
 * is cleared and all alerts are dismissed.
 */
function resetUpload() {
    // Clearing the input lets the same file be selected again.
    fileInput.value = '';

    for (const panel of [loading, results]) {
        panel.style.display = 'none';
    }
    uploadZone.style.display = 'block';

    hideAlerts();
}
|
| 783 |
+
|
| 784 |
+
/**
 * Show a single alert banner that hides itself after five seconds.
 *
 * @param {string} message - Text to display.
 * @param {string} type - Banner suffix; the element `alert-<type>` is used
 *   (this file calls it with 'success' and 'error').
 */
function showAlert(message, type) {
    hideAlerts();

    // Cancel the countdown started by a previous alert. Without this, an
    // older timer would hide the new banner before its own five seconds elapsed.
    if (showAlert.hideTimer) {
        clearTimeout(showAlert.hideTimer);
        showAlert.hideTimer = null;
    }

    const banner = document.getElementById(`alert-${type}`);
    if (!banner) {
        // Unknown alert type: report it instead of dereferencing null.
        console.error(`No alert element for type "${type}":`, message);
        return;
    }

    banner.textContent = message;
    banner.style.display = 'block';

    // The timer handle is kept on the function object so no additional
    // module-level variable is needed.
    showAlert.hideTimer = setTimeout(() => {
        showAlert.hideTimer = null;
        hideAlerts();
    }, 5000);
}
|
| 791 |
+
|
| 792 |
+
/**
 * Hide both alert banners (success and error).
 */
function hideAlerts() {
    for (const bannerId of ['alert-success', 'alert-error']) {
        document.getElementById(bannerId).style.display = 'none';
    }
}
|
| 796 |
+
|
| 797 |
+
// Populate the invoice history and the statistics cards as soon as the
// script runs. Both loaders are started without waiting for each other.
for (const refresh of [loadInvoices, loadStats]) {
    refresh();
}
|
| 800 |
+
</script>
|
| 801 |
+
</body>
|
| 802 |
+
</html>
|
static/logo.svg
ADDED
|
|
static/uploads/invoice (1).jpg
ADDED
|
Git LFS Details
|
static/uploads/invoice (2).jpg
ADDED
|
Git LFS Details
|
static/uploads/invoice (3).jpg
ADDED
|
Git LFS Details
|
static/uploads/invoice (4).jpg
ADDED
|
Git LFS Details
|
static/uploads/invoice (5).jpg
ADDED
|
Git LFS Details
|
static/uploads/invoice (6).jpg
ADDED
|
Git LFS Details
|
static/uploads/invoice (8).jpg
ADDED
|
Git LFS Details
|