# Dipan04's picture
# Deploy Invoice Digitization Agent
# 8a859a8
"""
HF Agent client with proper environment variable support.
"""
import httpx
import os
import time
import logging
from typing import Dict, Optional, Tuple
from pathlib import Path
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Agent endpoints. Each value is read once, at import time, from the
# environment variable of the same name; the literal is the fallback.
TEXT_EXTRACTOR_URL = os.environ.get(
    'TEXT_EXTRACTOR_URL',
    'https://point9-extract-text-and-table.hf.space/api/text',
)
TABLE_EXTRACTOR_URL = os.environ.get(
    'TABLE_EXTRACTOR_URL',
    'https://point9-extract-text-and-table.hf.space/api/tables',
)
NER_URL = os.environ.get('NER_URL', 'https://point9-ner.hf.space/api/ner')
CLASSIFY_URL = os.environ.get('CLASSIFY_URL', 'https://point9-classify.hf.space/api/classify')
# Optional: an empty value means no summarizer endpoint is configured.
SUMMARIZER_URL = os.environ.get('SUMMARIZER_URL', '')

# Bearer token for the Authorization header; empty means no token.
AGENT_BEARER_TOKEN = os.environ.get('AGENT_BEARER_TOKEN', '')
# Request timeout in whole seconds. int() raises ValueError at import time
# if the environment value is not an integer literal.
AGENT_TIMEOUT_SECONDS = int(os.environ.get('AGENT_TIMEOUT_SECONDS', '30'))
def get_headers() -> Dict:
    """Build the request headers for agent calls.

    Returns:
        A dict holding a single ``Authorization: Bearer <token>`` entry when
        ``AGENT_BEARER_TOKEN`` is non-empty, otherwise an empty dict.
    """
    if not AGENT_BEARER_TOKEN:
        return {}
    return {'Authorization': f'Bearer {AGENT_BEARER_TOKEN}'}
def call_agent_with_retry(
    url: str,
    files: Optional[Dict] = None,
    data: Optional[Dict] = None,
    json: Optional[Dict] = None,
    max_retries: int = 1
) -> Tuple[bool, Optional[Dict], Optional[str]]:
    """POST to an agent endpoint, retrying on rate limits and errors.

    The request body is chosen by priority: a multipart upload when
    ``files`` is non-empty, otherwise a JSON body when ``json`` is given
    (including an empty dict), otherwise a form post of ``data``.

    Retry policy: HTTP 429 waits 2 seconds, a timeout or any other
    exception waits 1 second. Any other non-200 status fails immediately
    without a retry.

    Args:
        url: Endpoint to POST to.
        files: Multipart file fields, forwarded to httpx as ``files=``.
        data: Form fields; sent alongside ``files`` or on their own.
        json: JSON-serialisable body. The name shadows the stdlib module
            inside this function and is kept for caller compatibility.
        max_retries: Number of extra attempts after the first one.

    Returns:
        ``(success, payload, error)``. On HTTP 200 this is
        ``(True, <decoded JSON body>, None)``; in every other case it is
        ``(False, None, <error message>)``. Nothing is raised from the
        request itself; failures are reported through the tuple.
    """
    headers = get_headers()
    for attempt in range(max_retries + 1):
        try:
            with httpx.Client(timeout=AGENT_TIMEOUT_SECONDS) as client:
                if files:
                    response = client.post(url, headers=headers, files=files, data=data)
                elif json is not None:
                    # Identity check, not truthiness: an explicit empty
                    # JSON body must still be sent as JSON.
                    response = client.post(url, headers=headers, json=json)
                else:
                    response = client.post(url, headers=headers, data=data)
                if response.status_code == 200:
                    # A body that is not valid JSON raises here and is
                    # handled by the generic handler below.
                    return True, response.json(), None
                if response.status_code != 429:
                    return False, None, f"HTTP {response.status_code}: {response.text[:200]}"
                error, delay = "Rate limited", 2
        except httpx.TimeoutException:
            error, delay = f"Timeout after {AGENT_TIMEOUT_SECONDS}s", 1
        except Exception as e:
            # Deliberate catch-all: this function reports every failure
            # through its return tuple rather than by raising.
            error, delay = str(e), 1
        if attempt >= max_retries:
            return False, None, error
        logger.warning(
            "Agent call to %s failed (%s); retry %d of %d in %ds",
            url, error, attempt + 1, max_retries, delay,
        )
        time.sleep(delay)
    # Only reachable when max_retries is negative and no attempt was made.
    return False, None, "Max retries exceeded"
def extract_text_from_file(file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
    """Upload a file to the text-extraction agent and return its text.

    The file is always sent with the ``application/pdf`` content type.

    Returns:
        ``(True, text, None)`` when the agent returns at least 10
        non-whitespace-trimmed characters under the ``text`` key;
        ``(False, None, message)`` otherwise, including when the file
        cannot be opened or any other exception occurs.
    """
    try:
        with open(file_path, 'rb') as handle:
            upload = {'file': (file_path.name, handle, 'application/pdf')}
            form = {'filename': file_path.name}
            ok, payload, err = call_agent_with_retry(TEXT_EXTRACTOR_URL, files=upload, data=form)
        if not ok or not payload:
            return False, None, err or "Text extraction failed"
        extracted = payload.get('text', '')
        # Anything shorter than 10 characters after trimming is treated
        # as a failed extraction.
        if extracted and len(extracted.strip()) >= 10:
            return True, extracted, None
        return False, None, "No text extracted"
    except Exception as exc:
        return False, None, str(exc)
def extract_tables_from_file(file_path: Path) -> Tuple[bool, Optional[list], Optional[str]]:
    """Upload a file to the table-extraction agent and return its tables.

    The file is always sent with the ``application/pdf`` content type.

    Returns:
        ``(True, tables, None)`` on success, where ``tables`` is the value
        under the response's ``tables`` key (an empty list when the key is
        absent); ``(False, None, message)`` otherwise, including when the
        file cannot be opened or any other exception occurs.
    """
    try:
        with open(file_path, 'rb') as handle:
            upload = {'file': (file_path.name, handle, 'application/pdf')}
            form = {'filename': file_path.name}
            ok, payload, err = call_agent_with_retry(TABLE_EXTRACTOR_URL, files=upload, data=form)
        if not ok or not payload:
            return False, None, err or "Table extraction failed"
        return True, payload.get('tables', []), None
    except Exception as exc:
        return False, None, str(exc)
def extract_entities_from_text(text: str) -> Tuple[bool, Optional[list], Optional[str]]:
    """Send text to the NER agent and return the entities it reports.

    Returns:
        ``(True, entities, None)`` on success, where ``entities`` is the
        value under the response's ``entities`` key (an empty list when
        the key is absent); ``(False, None, message)`` otherwise.
    """
    try:
        ok, payload, err = call_agent_with_retry(NER_URL, json={'text': text})
        if not ok or not payload:
            return False, None, err or "NER failed"
        return True, payload.get('entities', []), None
    except Exception as exc:
        return False, None, str(exc)
def classify_document(text: str) -> Tuple[bool, Optional[Dict], Optional[str]]:
    """Send the start of a document to the classifier agent.

    Only the first 2000 characters of ``text`` are submitted.

    Returns:
        ``(True, response, None)`` with the agent's decoded response on
        success; ``(False, None, message)`` otherwise.
    """
    try:
        snippet = text[:2000]
        ok, payload, err = call_agent_with_retry(CLASSIFY_URL, json={'text': snippet})
        if not ok or not payload:
            return False, None, err or "Classification failed"
        return True, payload, None
    except Exception as exc:
        return False, None, str(exc)
def summarize_text(text: str) -> Tuple[bool, Optional[str], Optional[str]]:
    """Summarize text through the optional summarizer agent.

    When ``SUMMARIZER_URL`` is empty the step is skipped and reported as a
    success with no summary. Otherwise the first 5000 characters of
    ``text`` are submitted.

    Returns:
        ``(True, None, None)`` when no summarizer is configured;
        ``(True, summary, None)`` on success, where ``summary`` is the
        value under the response's ``summary`` key (an empty string when
        the key is absent); ``(False, None, message)`` otherwise.
    """
    if SUMMARIZER_URL:
        try:
            excerpt = text[:5000]
            ok, payload, err = call_agent_with_retry(SUMMARIZER_URL, json={'text': excerpt})
            if not ok or not payload:
                return False, None, err or "Summarization failed"
            return True, payload.get('summary', ''), None
        except Exception as exc:
            return False, None, str(exc)
    # Summarization is optional: no endpoint configured is not an error.
    return True, None, None