|
|
|
"""
|
|
OCR Backend API with Azure Document Intelligence - Cleaned and Optimized
|
|
Supports file uploads, URL processing, and web scraping fallback
|
|
"""
|
|
|
|
import os
|
|
import io
|
|
import requests
|
|
import numpy as np
|
|
import logging
|
|
from typing import Optional, List, Dict, Any
|
|
from urllib.parse import urlparse, urljoin
|
|
from pathlib import Path
|
|
import mimetypes
|
|
|
|
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from pydantic import BaseModel, HttpUrl
|
|
import uvicorn
|
|
|
|
|
|
# Load the service settings. The project-wide ``configs`` package is preferred;
# when it cannot be imported, settings are read from environment variables
# (after loading a .env file via python-dotenv).
#
# The status messages below are plain text: the decorative glyphs that used to
# prefix them were corrupted by an encoding round-trip, and one of the corrupted
# bytes split the first string literal across two lines.
try:
    from configs import get_config

    config = get_config().ocr
    print("Using unified configuration")
except ImportError:
    print("Unified config not available, using fallback configuration")
    from dotenv import load_dotenv

    load_dotenv()

    class FallbackConfig:
        """Environment-driven settings used when ``configs`` is unavailable."""

        # Server settings.
        HOST = os.getenv("HOST", "0.0.0.0")
        PORT = int(os.getenv("OCR_PORT", "8400"))
        DEBUG = os.getenv("DEBUG", "True").lower() == "true"

        # Azure Document Intelligence credentials (empty string when unset).
        AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT", "")
        AZURE_DOCUMENT_INTELLIGENCE_KEY = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY", "")

        # Web-scraping settings.
        MAX_IMAGES_PER_PAGE = int(os.getenv("MAX_IMAGES_PER_PAGE", "10"))
        REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "30"))
        USER_AGENT = os.getenv("USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")

        # Upload size limit in bytes (50 MiB).
        MAX_FILE_SIZE = 50 * 1024 * 1024

    config = FallbackConfig()
|
|
|
|
from azure.core.credentials import AzureKeyCredential
|
|
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
|
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
|
from azure.core.exceptions import HttpResponseError
|
|
|
|
from bs4 import BeautifulSoup
|
|
from PIL import Image
|
|
|
|
|
|
# Logging: timestamped records at INFO level and above.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# The ASGI application; debug mode follows the loaded configuration.
app = FastAPI(
    version="2.0.0",
    title="OCR Backend API",
    description="OCR service with Azure Document Intelligence, supporting file uploads, URLs, and web scraping",
    debug=config.DEBUG,
)

# Fully permissive CORS policy.
# NOTE(review): wildcard origins combined with allow_credentials=True is
# discouraged by the FastAPI CORS documentation - confirm whether credentialed
# cross-origin requests are really needed, or list the allowed origins.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_origins=["*"],
    allow_headers=["*"],
    allow_methods=["*"],
)
|
|
|
|
|
|
class URLRequest(BaseModel):
    """Request body for URL-based OCR."""

    # Address of the document or web page to process.
    url: HttpUrl
    # When the URL is scraped as a web page, also OCR the images found on it.
    extract_images: bool = True
|
|
|
|
class OCRResponse(BaseModel):
    """Response body returned by the OCR endpoints."""

    # True when processing succeeded; on failure ``error`` carries the reason.
    success: bool
    # Full extracted text.
    content: str
    # One dictionary per page; the keys depend on how the source was processed.
    pages: List[Dict[str, Any]]
    # How the content was obtained: "direct_url", "file_upload" or "web_scraped".
    source_type: str
    source_url: Optional[str] = None
    # Declared so the flag computed by format_ocr_result() is kept when the
    # response is built with OCRResponse(**result); without this field the key
    # is silently discarded as an unknown attribute.
    handwritten_detected: Optional[bool] = None
    error: Optional[str] = None
|
|
|
|
class WebScrapingResult(BaseModel):
    """Outcome of scraping a web page: its text plus OCR of its images."""

    # Visible text extracted from the page HTML.
    text_content: str
    # Absolute URLs of the <img> sources discovered on the page.
    images_found: List[str]
    # One entry per image that produced non-empty OCR text.
    ocr_results: List[Dict[str, Any]]
|
|
|
|
|
|
def format_bounding_box(bounding_box):
    """Render polygon coordinates as a ``"[x, y], [x, y], ..."`` string.

    Args:
        bounding_box: Sequence of alternating x/y coordinates (list, tuple
            or NumPy array), or ``None``.

    Returns:
        The coordinate pairs joined by ``", "``, or ``"N/A"`` when the input
        is missing, empty, or cannot be split into (x, y) pairs.
    """
    if bounding_box is None:
        return "N/A"

    coords = np.asarray(bounding_box)
    # Size-based checks instead of ``not bounding_box``: truth-testing a NumPy
    # array with more than one element raises, and an odd number of values
    # cannot be reshaped into pairs.
    if coords.size == 0 or coords.size % 2 != 0:
        return "N/A"

    return ", ".join("[{}, {}]".format(x, y) for x, y in coords.reshape(-1, 2))
|
|
|
|
def is_supported_file_type(content_type: str, filename: str = "") -> bool:
    """Check whether a file can be sent to OCR.

    Args:
        content_type: MIME type, optionally followed by parameters such as
            ``"; charset=binary"`` (as found in HTTP ``Content-Type``
            headers). May be empty or ``None``.
        filename: Optional file name whose extension is used as a fallback
            when the MIME type is not recognised.

    Returns:
        ``True`` when either the MIME type or the file extension denotes a
        supported PDF or image format, ``False`` otherwise.
    """
    supported_types = {
        'application/pdf',
        'image/jpeg',
        'image/jpg',
        'image/png',
        'image/tiff',
        'image/bmp',
        'image/gif',
    }
    supported_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp', '.gif'}

    # Compare only the media type itself; header parameters after ';' and
    # surrounding whitespace must not defeat the lookup.
    media_type = (content_type or "").split(";", 1)[0].strip().lower()
    if media_type in supported_types:
        return True

    # Fall back to the file extension when a name is available.
    if filename:
        return Path(filename).suffix.lower() in supported_extensions

    return False
|
|
|
|
def get_document_intelligence_client():
    """Build an Azure Document Intelligence client from the loaded settings.

    Returns:
        A ``DocumentIntelligenceClient`` authenticated with the configured key.

    Raises:
        HTTPException: 500 when the endpoint or key is empty or still set to
            its template placeholder value.
    """
    endpoint = config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
    api_key = config.AZURE_DOCUMENT_INTELLIGENCE_KEY

    # An empty value and the template placeholder both count as "not configured".
    endpoint_unset = endpoint in ("", "YOUR_FORM_RECOGNIZER_ENDPOINT")
    key_unset = api_key in ("", "YOUR_FORM_RECOGNIZER_KEY")
    if endpoint_unset or key_unset:
        raise HTTPException(
            status_code=500,
            detail="Azure Document Intelligence credentials not configured"
        )

    credential = AzureKeyCredential(api_key)
    return DocumentIntelligenceClient(endpoint=endpoint, credential=credential)
|
|
|
|
async def process_ocr_from_url(url: str) -> Dict[str, Any]:
    """Run the "prebuilt-read" model on a document reachable by URL.

    Args:
        url: Address of the document; it is passed to the service as
            ``url_source``.

    Returns:
        The standardized result dictionary built by ``format_ocr_result``
        with ``source_type="direct_url"``.

    Raises:
        HTTPException: 400 when the Azure service rejects the request, 500
            for any other failure. An HTTPException raised while creating
            the client is passed through unchanged.
    """
    try:
        client = get_document_intelligence_client()

        logger.info(f"Processing OCR from URL: {url}")
        # NOTE(review): this is the synchronous SDK client, so the two calls
        # below appear to block the event loop until the analysis finishes -
        # consider offloading them to a worker thread.
        poller = client.begin_analyze_document(
            "prebuilt-read",
            AnalyzeDocumentRequest(url_source=url)
        )
        result = poller.result()

        return format_ocr_result(result, "direct_url", url)

    except HTTPException:
        # Already a well-formed API error (e.g. credentials not configured);
        # re-wrapping it below would garble its detail message.
        raise
    except HttpResponseError as e:
        logger.error(f"Azure OCR error for URL {url}: {e}")
        raise HTTPException(status_code=400, detail=f"OCR processing failed: {e}") from e
    except Exception as e:
        logger.error(f"Unexpected error processing URL {url}: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}") from e
|
|
|
|
async def process_ocr_from_bytes(file_bytes: bytes, filename: str = "") -> Dict[str, Any]:
    """Run the "prebuilt-read" model on raw file content.

    Args:
        file_bytes: Content of the document; it is passed to the service as
            ``bytes_source``.
        filename: Name of the file, used for log messages only.

    Returns:
        The standardized result dictionary built by ``format_ocr_result``
        with ``source_type="file_upload"``.

    Raises:
        HTTPException: 400 when the Azure service rejects the request, 500
            for any other failure. An HTTPException raised while creating
            the client is passed through unchanged.
    """
    try:
        client = get_document_intelligence_client()

        logger.info(f"Processing OCR from file: {filename} ({len(file_bytes)} bytes)")
        # NOTE(review): this is the synchronous SDK client, so the two calls
        # below appear to block the event loop until the analysis finishes -
        # consider offloading them to a worker thread.
        poller = client.begin_analyze_document(
            "prebuilt-read",
            AnalyzeDocumentRequest(bytes_source=file_bytes)
        )
        result = poller.result()

        return format_ocr_result(result, "file_upload", filename)

    except HTTPException:
        # Already a well-formed API error (e.g. credentials not configured);
        # re-wrapping it below would garble its detail message.
        raise
    except HttpResponseError as e:
        logger.error(f"Azure OCR error for file {filename}: {e}")
        raise HTTPException(status_code=400, detail=f"OCR processing failed: {e}") from e
    except Exception as e:
        logger.error(f"Unexpected error processing file {filename}: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}") from e
|
|
|
|
def format_ocr_result(result, source_type: str, source_identifier: str = "") -> Dict[str, Any]:
    """Convert an analysis result into the service's standard response dict.

    Args:
        result: Analysis result object; ``pages``, ``content`` and ``styles``
            are read from it and each may be absent or ``None``.
        source_type: Origin label stored in the response (for example
            ``"direct_url"`` or ``"file_upload"``).
        source_identifier: URL or file name of the source. It is only
            reported (as ``source_url``) when ``source_type`` is
            ``"direct_url"``.

    Returns:
        A dictionary with the keys ``success``, ``content``, ``pages``,
        ``source_type``, ``source_url``, ``handwritten_detected`` and
        ``error``.
    """
    pages_data = []

    # ``pages`` may be missing or None (e.g. nothing was recognised); treat
    # that as "no pages" instead of failing on iteration.
    for page in getattr(result, 'pages', None) or []:
        page_data = {
            "page_number": page.page_number,
            "width": page.width,
            "height": page.height,
            "unit": page.unit,
            "lines": [],
            "words": []
        }

        # Text lines, numbered from zero in the order they are reported.
        for line_idx, line in enumerate(getattr(page, 'lines', None) or []):
            page_data["lines"].append({
                "line_number": line_idx,
                "content": line.content,
                "bounding_box": format_bounding_box(line.polygon) if hasattr(line, 'polygon') else "N/A"
            })

        # Individual words with their recognition confidence when available.
        for word in getattr(page, 'words', None) or []:
            page_data["words"].append({
                "content": word.content,
                "confidence": getattr(word, 'confidence', None)
            })

        pages_data.append(page_data)

    # A single handwritten style span is enough to flag the document.
    handwritten_detected = any(
        getattr(style, 'is_handwritten', False)
        for style in getattr(result, 'styles', None) or []
    )

    return {
        "success": True,
        # Always a string: a None content would break ``.strip()`` in callers
        # and the ``str`` field of the response model.
        "content": getattr(result, 'content', None) or "",
        "pages": pages_data,
        "source_type": source_type,
        "source_url": source_identifier if source_type == "direct_url" else None,
        "handwritten_detected": handwritten_detected,
        "error": None
    }
|
|
|
|
async def scrape_web_content(url: str, extract_images: bool = True) -> WebScrapingResult:
    """Download a web page, extract its text and optionally OCR its images.

    Args:
        url: Address of the page to scrape.
        extract_images: When true, up to ``config.MAX_IMAGES_PER_PAGE``
            ``<img>`` tags are inspected and supported images are sent to OCR.

    Returns:
        A ``WebScrapingResult`` with the page text, the image URLs that were
        considered, and the OCR output of images that yielded text.

    Raises:
        HTTPException: 400 when the page cannot be downloaded, 500 for any
            other failure. Failures on individual images are logged and
            skipped.
    """
    try:
        headers = {
            'User-Agent': config.USER_AGENT
        }

        logger.info(f"Scraping web content from: {url}")
        response = requests.get(url, headers=headers, timeout=config.REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove script and stylesheet bodies first; otherwise their source
        # code ends up in the extracted page text.
        for non_content_tag in soup(['script', 'style']):
            non_content_tag.decompose()

        text_content = soup.get_text(separator=' ', strip=True)

        images_found = []
        ocr_results = []

        if extract_images:
            for img in soup.find_all('img')[:config.MAX_IMAGES_PER_PAGE]:
                img_src = img.get('src')
                if not img_src:
                    continue

                # Resolve relative sources against the page URL.
                img_url = urljoin(url, img_src)

                # Inline ``data:`` images and other non-HTTP schemes cannot be
                # probed with a HEAD request, so they are not candidates.
                if urlparse(img_url).scheme not in ('http', 'https'):
                    continue

                images_found.append(img_url)

                try:
                    # Follow redirects so the content type of the final
                    # resource is inspected, not that of a redirect response.
                    img_response = requests.head(
                        img_url, headers=headers, timeout=10, allow_redirects=True
                    )
                    content_type = img_response.headers.get('content-type', '')

                    if is_supported_file_type(content_type):
                        ocr_result = await process_ocr_from_url(img_url)
                        # Keep only images that actually contained text.
                        if ocr_result['content'].strip():
                            ocr_results.append({
                                "image_url": img_url,
                                "ocr_content": ocr_result['content'],
                                "pages": ocr_result['pages']
                            })
                except Exception as e:
                    # Best effort: one bad image must not fail the whole page.
                    logger.warning(f"Failed to process image {img_url}: {e}")
                    continue

        return WebScrapingResult(
            text_content=text_content,
            images_found=images_found,
            ocr_results=ocr_results
        )

    except requests.RequestException as e:
        logger.error(f"Failed to scrape URL {url}: {e}")
        raise HTTPException(status_code=400, detail=f"Failed to scrape URL: {e}") from e
    except Exception as e:
        logger.error(f"Unexpected error scraping URL {url}: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error during web scraping: {e}") from e
|
|
|
|
def check_url_is_direct_file(url: str) -> tuple[bool, str]:
    """Probe a URL with a HEAD request to see whether it is an OCR-able file.

    The decision is based on the response's ``Content-Type`` and on a file
    name taken from ``Content-Disposition`` or, failing that, from the last
    segment of the URL path.

    Args:
        url: Address to probe; redirects are followed.

    Returns:
        ``(is_file, content_type)`` where ``content_type`` is the lower-cased
        header value. ``(False, "")`` when the probe fails for any reason.
    """
    try:
        headers = {
            'User-Agent': config.USER_AGENT
        }

        response = requests.head(url, headers=headers, timeout=10, allow_redirects=True)
        content_type = response.headers.get('content-type', '').lower()

        content_disposition = response.headers.get('content-disposition', '')
        filename = ""
        if 'filename=' in content_disposition:
            # Take only the filename parameter's own value: stop at the next
            # ';' and strip whitespace and surrounding quotes, so that e.g.
            # 'attachment; filename="a.pdf"; size=10' yields 'a.pdf'.
            raw_value = content_disposition.split('filename=', 1)[1]
            filename = raw_value.split(';', 1)[0].strip().strip('"\'')

        # No usable header value: fall back to the URL path.
        if not filename:
            filename = Path(urlparse(url).path).name

        is_file = is_supported_file_type(content_type, filename)
        return is_file, content_type

    except Exception as e:
        # Best effort: an unreachable URL is simply "not a direct file".
        logger.warning(f"Failed to check URL {url}: {e}")
        return False, ""
|
|
|
|
|
|
@app.get("/")
async def root():
    """Describe the service: version, feature flags and configured limits."""
    endpoint = config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
    api_key = config.AZURE_DOCUMENT_INTELLIGENCE_KEY

    # Azure counts as available only when both values are set and neither is
    # still the template placeholder.
    azure_di_available = bool(
        endpoint
        and api_key
        and endpoint != "YOUR_FORM_RECOGNIZER_ENDPOINT"
        and api_key != "YOUR_FORM_RECOGNIZER_KEY"
    )

    features = {
        "file_upload": True,
        "url_processing": True,
        "web_scraping": True,
        "azure_document_intelligence": azure_di_available,
        "supported_formats": ["PDF", "JPEG", "PNG", "TIFF", "BMP", "GIF"],
    }
    limits = {
        "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
        "max_images_per_page": config.MAX_IMAGES_PER_PAGE,
        "request_timeout_seconds": config.REQUEST_TIMEOUT,
    }

    return {
        "message": "OCR Backend API",
        "version": "2.0.0",
        "status": "operational",
        "features": features,
        "limits": limits,
    }
|
|
|
|
@app.get("/health")
async def health_check():
    """Report service health and the state of the Azure configuration."""
    endpoint = config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
    api_key = config.AZURE_DOCUMENT_INTELLIGENCE_KEY

    credentials_present = bool(
        endpoint
        and api_key
        and endpoint != "YOUR_FORM_RECOGNIZER_ENDPOINT"
        and api_key != "YOUR_FORM_RECOGNIZER_KEY"
    )

    if not credentials_present:
        azure_di_status = "not_configured"
    else:
        # Only client construction is exercised here; no request is sent.
        try:
            get_document_intelligence_client()
        except Exception as e:
            azure_di_status = f"error: {str(e)[:100]}"
        else:
            azure_di_status = "configured"

    configuration = {
        "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
        "max_images_per_page": config.MAX_IMAGES_PER_PAGE,
        "request_timeout": config.REQUEST_TIMEOUT,
        "endpoint_configured": bool(endpoint),
        "key_configured": bool(api_key),
    }

    return {
        "status": "healthy",
        "service": "OCR Backend API",
        "version": "2.0.0",
        "azure_document_intelligence": azure_di_status,
        "configuration": configuration,
    }
|
|
|
|
@app.post("/ocr/upload", response_model=OCRResponse)
async def ocr_upload_file(file: UploadFile = File(...)):
    """Run OCR on an uploaded file.

    Responds with 400 for an unsupported type or an oversized file, and with
    500 for unexpected failures.
    """
    type_is_supported = is_supported_file_type(file.content_type, file.filename)
    if not type_is_supported:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type: {file.content_type}. Supported types: PDF, JPEG, PNG, TIFF, BMP, GIF"
        )

    try:
        payload = await file.read()

        # The size limit is enforced after the upload has been read in full.
        too_large = len(payload) > config.MAX_FILE_SIZE
        if too_large:
            raise HTTPException(
                status_code=400,
                detail=f"File too large. Maximum size: {config.MAX_FILE_SIZE / (1024*1024):.0f}MB"
            )

        ocr_fields = await process_ocr_from_bytes(payload, file.filename)
        return OCRResponse(**ocr_fields)

    except HTTPException:
        # API errors raised above or by the OCR helper keep their status code.
        raise
    except Exception as e:
        logger.error(f"Unexpected error processing uploaded file: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
|
|
|
|
@app.post("/ocr/url", response_model=OCRResponse)
async def ocr_from_url(request: URLRequest):
    """Run OCR on a URL: directly for files, via web scraping for pages."""
    url_str = str(request.url)

    # A HEAD probe decides whether the URL is a document/image or a web page.
    is_direct_file, content_type = check_url_is_direct_file(url_str)

    if is_direct_file:
        try:
            direct_result = await process_ocr_from_url(url_str)
            return OCRResponse(**direct_result)
        except HTTPException:
            raise
        except Exception as e:
            # Any non-HTTP failure falls through to the scraping path below.
            logger.error(f"Failed to process direct file URL: {e}")

    try:
        scraped = await scrape_web_content(url_str, request.extract_images)

        combined_content = scraped.text_content

        # Append the OCR text of every image after the page text.
        if scraped.ocr_results:
            pieces = ["\n\n--- OCR from Images ---\n"]
            for image_entry in scraped.ocr_results:
                pieces.append(f"\nImage: {image_entry['image_url']}\n")
                pieces.append(image_entry['ocr_content'] + "\n")
            combined_content += "".join(pieces)

        # The scraped page is reported as a single synthetic page.
        summary_page = {
            "page_number": 1,
            "content_type": "web_scraped",
            "text_content": scraped.text_content,
            "images_found": len(scraped.images_found),
            "ocr_results": len(scraped.ocr_results)
        }

        return OCRResponse(
            success=True,
            content=combined_content,
            pages=[summary_page],
            source_type="web_scraped",
            source_url=url_str,
            error=None
        )

    except HTTPException:
        raise
    except Exception as e:
        # Unexpected failures are reported in the body instead of as an HTTP error.
        logger.error(f"Failed to process URL {url_str}: {e}")
        return OCRResponse(
            success=False,
            content="",
            pages=[],
            source_type="web_scraped",
            source_url=url_str,
            error=str(e)
        )
|
|
|
|
@app.post("/ocr/analyze")
async def analyze_document(
    file: Optional[UploadFile] = File(None),
    url: Optional[str] = Form(None),
    extract_images: bool = Form(True)
):
    """Unified analysis endpoint accepting either a file upload or a URL.

    Args:
        file: Uploaded document or image.
        url: Address of a document or web page.
        extract_images: Forwarded to the URL flow; controls whether images
            on a scraped page are sent to OCR.

    Returns:
        The OCR result as a plain dictionary.

    Raises:
        HTTPException: 400 when neither or both inputs are given, when the
            file type is unsupported, the file is too large, or the URL is
            not valid; 500 for unexpected failures.
    """
    if not file and not url:
        raise HTTPException(status_code=400, detail="Either file or URL must be provided")

    if file and url:
        raise HTTPException(status_code=400, detail="Provide either file or URL, not both")

    try:
        if file:
            if not is_supported_file_type(file.content_type, file.filename):
                raise HTTPException(
                    status_code=400,
                    detail=f"Unsupported file type: {file.content_type}"
                )

            file_bytes = await file.read()

            if len(file_bytes) > config.MAX_FILE_SIZE:
                raise HTTPException(
                    status_code=400,
                    detail=f"File too large. Maximum size: {config.MAX_FILE_SIZE / (1024*1024):.0f}MB"
                )

            result = await process_ocr_from_bytes(file_bytes, file.filename)
            return result

        else:
            # The URL arrives as a plain form string; a malformed value is a
            # client error and must not surface as a 500. Pydantic's
            # validation error is a ValueError subclass.
            try:
                url_request = URLRequest(url=url, extract_images=extract_images)
            except ValueError as e:
                raise HTTPException(status_code=400, detail=f"Invalid URL: {e}") from e

            response = await ocr_from_url(url_request)
            # ``model_dump`` is the Pydantic v2 name; ``dict`` is kept as the
            # fallback for Pydantic v1.
            if hasattr(response, "model_dump"):
                return response.model_dump()
            return response.dict()

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Unexpected error in analyze_document: {e}")
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}") from e
|
|
|
|
|
|
@app.get("/supported-formats")
async def get_supported_formats():
    """List the accepted file formats, MIME types and size limits."""
    document_formats = ["PDF"]
    image_formats = ["JPEG", "JPG", "PNG", "TIFF", "TIF", "BMP", "GIF"]
    mime_types = [
        "application/pdf",
        "image/jpeg",
        "image/jpg",
        "image/png",
        "image/tiff",
        "image/bmp",
        "image/gif",
    ]

    return {
        "supported_formats": {
            "documents": document_formats,
            "images": image_formats,
        },
        "content_types": mime_types,
        "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
        "max_images_per_page": config.MAX_IMAGES_PER_PAGE,
    }
|
|
|
|
@app.get("/config")
async def get_configuration():
    """Expose the active (non-secret) service settings for debugging."""
    # Only reports whether both credential values are non-empty; the values
    # themselves are never returned.
    credentials_set = bool(
        config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and
        config.AZURE_DOCUMENT_INTELLIGENCE_KEY
    )

    settings = {
        "host": config.HOST,
        "port": config.PORT,
        "debug": config.DEBUG,
        "max_file_size_mb": config.MAX_FILE_SIZE / (1024 * 1024),
        "max_images_per_page": config.MAX_IMAGES_PER_PAGE,
        "request_timeout": config.REQUEST_TIMEOUT,
        "azure_di_configured": credentials_set,
    }

    return {
        "service": "OCR Backend API",
        "version": "2.0.0",
        "configuration": settings,
    }
|
|
|
|
if __name__ == "__main__":
    # Startup banner. The messages are plain text: the decorative glyphs that
    # used to prefix them were corrupted by an encoding round-trip, and one of
    # the corrupted bytes split an f-string literal across two lines.
    print("Loading OCR service configuration...")
    print(f"Will start server on {config.HOST}:{config.PORT}")
    azure_state = "Configured" if config.AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT else "Not configured"
    print(f"Azure Document Intelligence: {azure_state}")
    print(f"Max file size: {config.MAX_FILE_SIZE / (1024*1024):.0f}MB")

    # NOTE(review): the import string assumes this module is importable as
    # ``ocr_service`` - confirm it matches the file name.
    uvicorn.run(
        "ocr_service:app",
        host=config.HOST,
        port=config.PORT,
        reload=config.DEBUG,
        log_level="info"
    )