| | """ |
| | Finance Entity Extractor - Professional Inference Module. |
| | |
| | Provides structured API with JSON schema enforcement for |
| | extracting financial entities from Indian banking emails. |
| | |
| | Author: Ranjit Behera |
| | License: MIT |
| | Version: 0.8.0 |
| | |
| | Example: |
| | >>> from inference import FinanceExtractor |
| | >>> extractor = FinanceExtractor() |
| | >>> result = extractor.extract("Rs.2500.00 debited from account 3545...") |
| | >>> print(result.amount) # "2500.00" |
| | """ |
| |
|
| | import json |
| | import re |
| | from dataclasses import dataclass, asdict, field |
| | from typing import Optional, Dict, Any, List |
| | from enum import Enum |
| |
|
| |
|
class TransactionType(str, Enum):
    """
    Closed set of transaction-type labels.

    The ``str`` mixin makes every member compare equal to its plain
    string value (``TransactionType.DEBIT == "debit"``).
    """

    CREDIT = "credit"
    DEBIT = "debit"
    # Third label alongside credit/debit; how callers use it is not
    # visible in this module.
    UNKNOWN = "unknown"
| |
|
| |
|
class ExtractionFormat(str, Enum):
    """
    Input formats accepted by the extractor.

    Members are ``str`` subclasses, so each one compares equal to its
    lowercase string value (``ExtractionFormat.GPAY == "gpay"``).
    """

    EMAIL = "email"
    BANK_STATEMENT = "bank_statement"
    PHONEPE = "phonepe"
    GPAY = "gpay"
    PAYTM = "paytm"
| |
|
| |
|
@dataclass
class FinanceEntity:
    """
    One financial record extracted from a piece of text.

    Every attribute defaults to None, which means "not found".
    ``raw_response`` holds the unparsed model output; it is hidden from
    ``repr()`` and left out of ``to_dict()`` / ``to_json()``.
    """

    amount: Optional[str] = None
    type: Optional[str] = None
    date: Optional[str] = None
    account: Optional[str] = None
    reference: Optional[str] = None
    merchant: Optional[str] = None
    category: Optional[str] = None
    bank: Optional[str] = None
    raw_response: Optional[str] = field(default=None, repr=False)

    def to_dict(self) -> Dict[str, Any]:
        """Return the populated public fields as a plain dictionary.

        Fields whose value is None and the internal ``raw_response``
        field are omitted.
        """
        return {
            name: value
            for name, value in asdict(self).items()
            if not (value is None or name == 'raw_response')
        }

    def to_json(self) -> str:
        """Serialize ``to_dict()`` as a JSON string indented by two spaces."""
        payload = self.to_dict()
        return json.dumps(payload, indent=2)

    def is_valid(self) -> bool:
        """Return True when both ``amount`` and ``type`` are present."""
        return not (self.amount is None or self.type is None)

    def __str__(self) -> str:
        """Render the entity as its JSON representation."""
        return self.to_json()
| |
|
| |
|
def build_prompt(text: str, format_type: ExtractionFormat = ExtractionFormat.EMAIL) -> str:
    """
    Assemble the extraction prompt for the model.

    The wording and layout are the format the model was trained on;
    changing them will degrade extraction quality.

    Args:
        text: The input text (email body, statement row, etc.)
        format_type: The type of input format

    Returns:
        The prompt string: an optional ``[FORMAT] `` tag, the fixed
        instruction line, the input text, and the output request.
    """
    # Every non-email format is announced with a "[NAME] " tag built from
    # the member name; email (and anything unrecognised) gets no tag.
    tagged_formats = (
        ExtractionFormat.BANK_STATEMENT,
        ExtractionFormat.PHONEPE,
        ExtractionFormat.GPAY,
        ExtractionFormat.PAYTM,
    )
    tags = {fmt: f"[{fmt.name}] " for fmt in tagged_formats}
    tag = tags.get(format_type, "")

    prompt_lines = [
        f"{tag}Extract financial entities from this email:",
        "",
        f"{text}",
        "",
        "Extract: amount, type, date, account, reference, merchant, category",
        "Output JSON:",
    ]
    return "\n".join(prompt_lines)
| |
|
| |
|
def parse_json_response(response: str) -> Dict[str, Any]:
    """
    Parse the first JSON object found in a model response.

    Handles various response formats:
    - Clean JSON: {"amount": "500"}
    - Markdown JSON: ```json {"amount": "500"} ```
    - Conversational: "Here is the data: {..." with trailing chatter
    - Nested objects: {"amount": "500", "meta": {"bank": "HDFC"}}

    Only JSON *objects* are accepted. A response that is valid JSON but
    not an object (a bare string or number, or a list with no object in
    it) yields an empty dict, so callers can always use ``.get``.

    Args:
        response: Raw model output string

    Returns:
        Parsed dictionary or empty dict if parsing fails
    """
    if not isinstance(response, str):
        return {}

    text = response.strip()
    if not text:
        return {}

    # Fast path: the whole response is one JSON document.
    try:
        whole = json.loads(text)
    except json.JSONDecodeError:
        whole = None
    if isinstance(whole, dict):
        return whole

    # Fallback: try to decode an object starting at each "{" in turn.
    # raw_decode stops at the end of the first complete value, so code
    # fences, leading prose and trailing chatter are ignored, and nested
    # objects are returned whole rather than as their innermost part.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(candidate, dict):
                return candidate
        start = text.find("{", start + 1)

    return {}
| |
|
| |
|
def validate_entity(data: Dict[str, Any]) -> FinanceEntity:
    """
    Validate and normalize extracted entity data.

    Normalization rules:
    - ``type`` is kept only when it is a string equal (ignoring case and
      surrounding whitespace) to "credit" or "debit"; anything else,
      including a missing or null value, becomes None.
    - ``amount`` has thousands separators (commas) removed and is kept
      only when the result is a plain decimal number: optional sign,
      digits, at most one decimal point. Empty, zero-valued numeric and
      malformed amounts such as "1.2.3" or "12-34" become None.
    - ``account`` and ``reference`` are converted to strings when
      present and truthy, otherwise None.
    - ``date``, ``merchant``, ``category`` and ``bank`` are passed
      through unchanged.

    Args:
        data: Raw parsed dictionary

    Returns:
        Validated FinanceEntity object
    """
    if not isinstance(data, dict):
        # A failed or non-object parse yields an all-None entity
        # instead of an AttributeError on .get().
        data = {}

    # The model may emit null or a non-string for "type"; only call
    # string methods on actual strings.
    raw_type = data.get('type')
    txn_type = raw_type.strip().lower() if isinstance(raw_type, str) else None
    if txn_type not in ('credit', 'debit'):
        txn_type = None

    amount = None
    raw_amount = data.get('amount')
    if raw_amount:
        candidate = str(raw_amount).replace(',', '').strip()
        # Validate the text as a whole. Stripping every '.' and '-'
        # before checking would let strings like "1.2.3" through.
        if re.fullmatch(r'[-+]?(?:\d+\.?\d*|\.\d+)', candidate):
            amount = candidate

    raw_account = data.get('account')
    raw_reference = data.get('reference')

    return FinanceEntity(
        amount=amount,
        type=txn_type,
        date=data.get('date'),
        account=str(raw_account) if raw_account else None,
        reference=str(raw_reference) if raw_reference else None,
        merchant=data.get('merchant'),
        category=data.get('category'),
        bank=data.get('bank'),
    )
| |
|
| |
|
class FinanceExtractor:
    """
    High-level API for financial entity extraction.

    Wraps an MLX language model: builds the prompt, runs generation,
    parses the JSON in the reply and returns a validated FinanceEntity.
    The model is loaded lazily on the first call to ``extract``.

    Example:
        >>> extractor = FinanceExtractor()
        >>> result = extractor.extract(
        ...     "Rs.2500.00 debited from account 3545 to VPA swiggy@ybl"
        ... )
        >>> print(result.amount)  # "2500.00"
        >>> print(result.to_json())
    """

    # Hugging Face repository loaded when no explicit model_path is given.
    DEFAULT_MODEL = "Ranjit0034/finance-entity-extractor"

    def __init__(self, model_path: Optional[str] = None, adapter_path: Optional[str] = None):
        """
        Initialize the extractor without loading the model.

        Args:
            model_path: Path to base model (default: ``DEFAULT_MODEL``
                from HuggingFace)
            adapter_path: Path to LoRA adapters; when omitted no
                ``adapter_path`` is passed to the loader
        """
        self.model_path = model_path
        self.adapter_path = adapter_path
        self._model = None
        self._tokenizer = None

    def _load_model(self) -> None:
        """
        Lazy load model and tokenizer on first use.

        ``model_path`` and ``adapter_path`` are honoured independently:
        a custom model is loaded even without adapters, and adapters
        given without a model path are applied to ``DEFAULT_MODEL``.

        Raises:
            ImportError: If ``mlx_lm`` is not installed.
        """
        if self._model is not None:
            return

        try:
            from mlx_lm import load
        except ImportError as err:
            # Chain the original error so the real import failure
            # (e.g. a missing native dependency) stays visible.
            raise ImportError(
                "mlx_lm is required for MLX inference. "
                "Install with: pip install mlx-lm>=0.19.0"
            ) from err

        source = self.model_path or self.DEFAULT_MODEL
        if self.adapter_path:
            self._model, self._tokenizer = load(
                source,
                adapter_path=self.adapter_path,
            )
        else:
            self._model, self._tokenizer = load(source)

    def extract(
        self,
        text: str,
        format_type: ExtractionFormat = ExtractionFormat.EMAIL,
        max_tokens: int = 200,
    ) -> FinanceEntity:
        """
        Extract financial entities from text.

        Args:
            text: Input text (email body, statement row, etc.)
            format_type: Type of input format
            max_tokens: Maximum tokens to generate

        Returns:
            FinanceEntity with extracted data; ``raw_response`` holds
            the unparsed model output. If no JSON object can be parsed
            from the reply, all extracted fields are None.

        Raises:
            ImportError: If ``mlx_lm`` is not installed.
        """
        # Load first so a missing mlx_lm surfaces the install hint
        # rather than a bare ImportError from the line below.
        self._load_model()

        from mlx_lm import generate

        prompt = build_prompt(text, format_type)
        response = generate(
            self._model,
            self._tokenizer,
            prompt=prompt,
            max_tokens=max_tokens,
        )

        data = parse_json_response(response)
        entity = validate_entity(data)
        entity.raw_response = response

        return entity

    def extract_batch(
        self,
        texts: List[str],
        format_type: ExtractionFormat = ExtractionFormat.EMAIL,
        max_tokens: int = 200,
    ) -> List[FinanceEntity]:
        """
        Extract entities from multiple texts, one ``extract`` call each.

        Args:
            texts: List of input texts
            format_type: Type of input format, applied to every text
            max_tokens: Maximum tokens to generate per text

        Returns:
            List of FinanceEntity objects, in the same order as ``texts``
        """
        return [
            self.extract(text, format_type, max_tokens=max_tokens)
            for text in texts
        ]
| |
|
| |
|
| | |
# Shared extractor used by extract(); created on first use so that
# importing this module never loads the model.
_DEFAULT_EXTRACTOR = None


def _get_default_extractor():
    """Return the shared FinanceExtractor, creating it on first call."""
    global _DEFAULT_EXTRACTOR
    if _DEFAULT_EXTRACTOR is None:
        _DEFAULT_EXTRACTOR = FinanceExtractor()
    return _DEFAULT_EXTRACTOR


def extract(text: str, format_type: str = "email") -> Dict[str, Any]:
    """
    Simple extraction function.

    Uses one shared ``FinanceExtractor`` for all calls, so the model is
    loaded once per process rather than once per call. The shared
    instance (and its model) stays alive for the life of the process.

    Args:
        text: Input text to extract from
        format_type: One of "email", "bank_statement", "phonepe", "gpay",
            "paytm" (case-insensitive). Unrecognised values fall back
            to "email".

    Returns:
        Dictionary with extracted entities; fields that were not found
        are omitted.

    Example:
        >>> from inference import extract
        >>> result = extract("Rs.500 debited from A/c 1234")
        >>> print(result["amount"])  # "500"
    """
    # Look the format up by enum value, so the enum is the single list
    # of supported formats.
    try:
        fmt = ExtractionFormat(format_type.lower())
    except ValueError:
        fmt = ExtractionFormat.EMAIL

    entity = _get_default_extractor().extract(text, fmt)
    return entity.to_dict()
| |
|
| |
|
if __name__ == "__main__":
    # Offline demo: shows the prompt that would be sent and runs the
    # parse/validate pipeline on a canned response. No model is loaded.
    demo_email = """
    HDFC BANK Dear Customer,
    Rs.2500.00 has been debited from account 3545 to VPA swiggy@ybl
    SWIGGY INDIA on 28-12-25.
    Your UPI transaction reference number is 534567891234.
    """

    banner = "=" * 60
    print(banner)
    print("Finance Entity Extractor v0.8.0 - Demo")
    print(banner)
    print(f"\nInput:\n{demo_email.strip()}")

    print("\nBuilding prompt...")
    prompt = build_prompt(demo_email)
    # Only the first 200 characters of the prompt are shown.
    print(f"Prompt:\n{prompt[:200]}...")

    # Stand-in for the model's reply.
    mock_response = '''{"amount": "2500.00", "type": "debit", "date": "28-12-25", "account": "3545", "reference": "534567891234", "merchant": "swiggy", "category": "food"}'''

    print("\nParsing response...")
    entity = validate_entity(parse_json_response(mock_response))

    print("\nExtracted Entity:")
    print(entity.to_json())
    print(f"\nValid: {entity.is_valid()}")
| |
|