Spaces:
Sleeping
Sleeping
| """ | |
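| Client for the Hugging Face (HF) Space agents - text/table extraction, NER, classification and optional summarization - with endpoints, bearer token and timeout configured through environment variables. | |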
| """ | |
| import httpx | |
| import os | |
| import time | |
| import logging | |
| from typing import Dict, Optional, Tuple | |
| from pathlib import Path | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Agent endpoints, resolved once at import time. Each one can be overridden
# through the environment variable of the same name.
TEXT_EXTRACTOR_URL = os.environ.get(
    'TEXT_EXTRACTOR_URL',
    'https://point9-extract-text-and-table.hf.space/api/text',
)
TABLE_EXTRACTOR_URL = os.environ.get(
    'TABLE_EXTRACTOR_URL',
    'https://point9-extract-text-and-table.hf.space/api/tables',
)
NER_URL = os.environ.get('NER_URL', 'https://point9-ner.hf.space/api/ner')
CLASSIFY_URL = os.environ.get('CLASSIFY_URL', 'https://point9-classify.hf.space/api/classify')
# Optional: summarize_text skips the agent call when this is empty.
SUMMARIZER_URL = os.environ.get('SUMMARIZER_URL', '')

# Bearer token added to request headers when non-empty.
AGENT_BEARER_TOKEN = os.environ.get('AGENT_BEARER_TOKEN', '')
# Per-request timeout in seconds; the value must parse as an integer.
AGENT_TIMEOUT_SECONDS = int(os.environ.get('AGENT_TIMEOUT_SECONDS', '30'))
def get_headers() -> Dict:
    """Build the HTTP headers sent with every agent request.

    Returns:
        A dict with a ``Bearer`` ``Authorization`` header when
        ``AGENT_BEARER_TOKEN`` is non-empty, otherwise an empty dict.
    """
    if not AGENT_BEARER_TOKEN:
        return {}
    return {'Authorization': f'Bearer {AGENT_BEARER_TOKEN}'}
def call_agent_with_retry(
    url: str,
    files: Optional[Dict] = None,
    data: Optional[Dict] = None,
    json: Optional[Dict] = None,
    max_retries: int = 1
) -> Tuple[bool, Optional[Dict], Optional[str]]:
    """POST to an agent endpoint, retrying transient failures.

    The request body is chosen in this order: a multipart upload when
    ``files`` is non-empty (``data`` is sent as the accompanying form
    fields), a JSON body when ``json`` is not ``None``, otherwise a form
    post of ``data``.

    Retry policy: HTTP 429 waits 2 seconds; timeouts and any other exception
    (including a 200 response whose body is not valid JSON) wait 1 second.
    Any other non-200 status is returned immediately without retrying.

    Args:
        url: Endpoint to POST to.
        files: Multipart file mapping passed to ``httpx``.
        data: Form fields.
        json: JSON-serialisable payload.
        max_retries: Number of additional attempts after the first one.

    Returns:
        ``(True, parsed_json, None)`` on HTTP 200, otherwise
        ``(False, None, error_message)``.
    """
    headers = get_headers()

    for attempt in range(max_retries + 1):
        try:
            with httpx.Client(timeout=AGENT_TIMEOUT_SECONDS) as client:
                if files:
                    response = client.post(url, headers=headers, files=files, data=data)
                elif json is not None:
                    # `is not None` rather than truthiness, so an explicit
                    # empty payload ({}) is still sent as a JSON body.
                    response = client.post(url, headers=headers, json=json)
                else:
                    response = client.post(url, headers=headers, data=data)

                if response.status_code == 200:
                    return True, response.json(), None
                if response.status_code != 429:
                    # Other HTTP errors are treated as permanent.
                    return False, None, f"HTTP {response.status_code}: {response.text[:200]}"
                error, delay = "Rate limited", 2
        except httpx.TimeoutException:
            error, delay = f"Timeout after {AGENT_TIMEOUT_SECONDS}s", 1
        except Exception as e:
            error, delay = str(e), 1

        # Shared tail for every retryable failure.
        if attempt >= max_retries:
            return False, None, error
        logger.warning(
            "Agent call to %s failed on attempt %d/%d (%s); retrying in %ds",
            url, attempt + 1, max_retries + 1, error, delay,
        )
        time.sleep(delay)

    # Only reachable when max_retries is negative and the loop never runs.
    return False, None, "Max retries exceeded"
def extract_text_from_file(file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
    """Upload a file to the text-extraction agent and return its text.

    Args:
        file_path: Location of the file to upload. A plain string or other
            path-like value is accepted and converted to ``Path``.

    Returns:
        ``(True, text, None)`` when the agent returns text that is at least
        10 characters long after stripping whitespace; otherwise
        ``(False, None, error_message)``. Any ``Exception`` raised along the
        way is reported through the error message instead of propagating.
    """
    try:
        # Coerce first so str paths work; without this they fail on `.name`
        # with an opaque "'str' object has no attribute 'name'" message.
        path = Path(file_path)
        with open(path, 'rb') as f:
            # NOTE(review): the part is always labelled application/pdf -
            # confirm the service does not depend on it for other file types.
            files = {'file': (path.name, f, 'application/pdf')}
            data = {'filename': path.name}
            success, response, error = call_agent_with_retry(TEXT_EXTRACTOR_URL, files=files, data=data)

        if not (success and response):
            return False, None, error or "Text extraction failed"

        text = response.get('text', '')
        # Empty or near-empty output counts as a failed extraction.
        if not text or len(text.strip()) < 10:
            return False, None, "No text extracted"
        return True, text, None
    except Exception as e:
        return False, None, str(e)
def extract_tables_from_file(file_path: Path) -> Tuple[bool, Optional[list], Optional[str]]:
    """Upload a file to the table-extraction agent and return its tables.

    Args:
        file_path: Location of the file to upload. A plain string or other
            path-like value is accepted and converted to ``Path``.

    Returns:
        ``(True, tables, None)`` on success, where ``tables`` is the
        response's ``'tables'`` value (``[]`` if the key is absent);
        otherwise ``(False, None, error_message)``. Any ``Exception`` raised
        along the way is reported through the error message instead of
        propagating.
    """
    try:
        # Coerce first so str paths work; without this they fail on `.name`
        # with an opaque "'str' object has no attribute 'name'" message.
        path = Path(file_path)
        with open(path, 'rb') as f:
            # NOTE(review): the part is always labelled application/pdf -
            # confirm the service does not depend on it for other file types.
            files = {'file': (path.name, f, 'application/pdf')}
            data = {'filename': path.name}
            success, response, error = call_agent_with_retry(TABLE_EXTRACTOR_URL, files=files, data=data)

        if not (success and response):
            return False, None, error or "Table extraction failed"
        return True, response.get('tables', []), None
    except Exception as e:
        return False, None, str(e)
def extract_entities_from_text(text: str) -> Tuple[bool, Optional[list], Optional[str]]:
    """Send ``text`` to the NER agent and return the entities it reports.

    Returns:
        ``(True, entities, None)`` on success, where ``entities`` is the
        response's ``'entities'`` value (``[]`` if the key is absent);
        otherwise ``(False, None, error_message)``.
    """
    try:
        ok, payload, failure = call_agent_with_retry(NER_URL, json={'text': text})
        if not (ok and payload):
            return False, None, failure or "NER failed"
        return True, payload.get('entities', []), None
    except Exception as exc:
        # Report unexpected failures through the error slot.
        return False, None, str(exc)
def classify_document(text: str) -> Tuple[bool, Optional[Dict], Optional[str]]:
    """Send the start of ``text`` to the classifier agent.

    Only the first 2000 characters are submitted.

    Returns:
        ``(True, response, None)`` with the agent's parsed response on
        success; otherwise ``(False, None, error_message)``.
    """
    try:
        excerpt = text[:2000]
        ok, payload, failure = call_agent_with_retry(CLASSIFY_URL, json={'text': excerpt})
        if not (ok and payload):
            return False, None, failure or "Classification failed"
        return True, payload, None
    except Exception as exc:
        # Report unexpected failures through the error slot.
        return False, None, str(exc)
def summarize_text(text: str) -> Tuple[bool, Optional[str], Optional[str]]:
    """Request a summary from the optional summarizer agent.

    Only the first 5000 characters of ``text`` are submitted. When
    ``SUMMARIZER_URL`` is empty the agent is not called and the result is
    ``(True, None, None)``.

    Returns:
        ``(True, summary, None)`` on success, where ``summary`` is the
        response's ``'summary'`` value (``''`` if the key is absent);
        otherwise ``(False, None, error_message)``.
    """
    # Summarization is optional: with no URL configured, succeed with no summary.
    if not SUMMARIZER_URL:
        return True, None, None

    try:
        excerpt = text[:5000]
        ok, payload, failure = call_agent_with_retry(SUMMARIZER_URL, json={'text': excerpt})
        if not (ok and payload):
            return False, None, failure or "Summarization failed"
        return True, payload.get('summary', ''), None
    except Exception as exc:
        # Report unexpected failures through the error slot.
        return False, None, str(exc)