finance-entity-extractor / scripts /create_benchmark.py
Ranjit Behera
FinEE v1.0 - Finance Entity Extractor
dcc24f8
"""
Create Held-Out Benchmark from Real Emails.
Extracts 100 real financial emails from the MBOX file,
ensures they were NOT used in training, and creates a
benchmark for measuring real-world performance.
Author: Ranjit Behera
"""
import json
import random
import re
from pathlib import Path
# Input/output locations, all relative to the current working directory.
# CORPUS_FILE:    JSON Lines file of extracted financial emails (input).
# TRAIN_FILE:     JSON Lines training set, read to exclude overlaps (input).
# BENCHMARK_FILE: JSON benchmark written by create_benchmark() (output).
CORPUS_FILE = Path("data", "corpus", "emails", "financial_emails.jsonl")
TRAIN_FILE = Path("data", "training", "train.jsonl")
BENCHMARK_FILE = Path("data", "benchmark", "real_emails_benchmark.json")
def load_corpus(path=None):
    """Load the extracted financial emails from a JSON Lines file.

    Args:
        path: Optional path of the corpus file. When omitted, the
            module-level ``CORPUS_FILE`` is read.

    Returns:
        A list of dicts, one per line that parses as a JSON object, in
        file order. Blank lines, malformed lines and lines holding a
        non-object JSON value are skipped (best-effort load).

    Raises:
        OSError: If the file cannot be opened.
    """
    corpus_path = CORPUS_FILE if path is None else path
    emails = []
    # Explicit UTF-8 so non-ASCII content (e.g. the rupee sign) is decoded
    # the same way on every platform instead of using the locale default.
    with open(corpus_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Only malformed JSON is tolerated; other errors propagate.
                continue
            # Callers use dict access (``email.get``), so drop other values.
            if isinstance(record, dict):
                emails.append(record)
    return emails
def load_training_texts(path=None):
    """Load fingerprints of the training samples to exclude from the benchmark.

    Each fingerprint is the first 100 characters of a record's ``'text'``
    field.

    Args:
        path: Optional path of the training JSON Lines file. When omitted,
            the module-level ``TRAIN_FILE`` is read.

    Returns:
        A set of fingerprint strings. Blank or malformed lines, non-object
        JSON values, and records whose ``'text'`` is missing, empty or not
        a string contribute nothing.

    Raises:
        OSError: If the file cannot be opened.
    """
    train_path = TRAIN_FILE if path is None else path
    texts = set()
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(train_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Only malformed JSON is tolerated; other errors propagate.
                continue
            if not isinstance(data, dict):
                continue
            text = data.get('text', '')
            # An empty fingerprint identifies nothing and would only
            # inflate the reported number of training samples.
            if not isinstance(text, str) or not text:
                continue
            # Get first 100 chars as fingerprint
            texts.add(text[:100])
    return texts
def _first_group(patterns, text):
    """Return group 1 of the first pattern (in list order) found in *text*, else ''."""
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1)
    return ''


def extract_entities_from_email(email_body: str) -> dict:
    """Auto-extract entities from email text for labeling.

    Keyword and regex heuristics only; the output is meant as a first-pass
    label to be verified by a human.

    Args:
        email_body: Plain-text body of the email.

    Returns:
        A dict with the keys ``amount``, ``type``, ``date``, ``account``,
        ``reference``, ``merchant`` and ``bank``. Every value is a string;
        an entity that could not be found is ``''``. ``amount`` has its
        thousands separators removed, ``type`` is ``'debit'`` or
        ``'credit'``, and ``merchant``/``bank`` are lower-case.
    """
    entities = {
        'amount': '',
        'type': '',
        'date': '',
        'account': '',
        'reference': '',
        'merchant': '',
        'bank': ''
    }
    # Nothing to extract from a missing or non-text body.
    if not isinstance(email_body, str) or not email_body:
        return entities

    text = email_body
    text_lower = text.lower()

    # Detect bank: first entry (in this order) with a keyword in the text.
    # Substring match on purpose so that e.g. "hdfcbank" is recognised.
    bank_keywords = (
        ('hdfc', ('hdfc',)),
        ('icici', ('icici',)),
        ('sbi', ('sbi',)),
        ('axis', ('axis',)),
        ('kotak', ('kotak',)),
        ('phonepe', ('phonepe',)),
        ('gpay', ('gpay', 'google pay')),
        ('paytm', ('paytm',)),
    )
    for bank, keywords in bank_keywords:
        if any(keyword in text_lower for keyword in keywords):
            entities['bank'] = bank
            break

    # Detect type. Whole-word matching: a plain substring test would treat
    # "consent"/"present" as "sent" and "unpaid" as "paid", mislabelling
    # credit alerts as debits.
    if re.search(r'\b(?:debited|sent|paid)\b', text_lower):
        entities['type'] = 'debit'
    elif re.search(r'\b(?:credited|received)\b', text_lower):
        entities['type'] = 'credit'

    # Extract amount - various patterns.
    # The number must start with a digit and only takes a '.' when decimals
    # follow, so "Rs 250." yields "250" rather than "250." and a lone comma
    # is never captured. \b keeps "Rs"/"INR" from matching inside words
    # such as "users 5".
    number = r'(\d[\d,]*(?:\.\d+)?)'
    amount_patterns = [
        r'\bRs\.?\s*' + number,
        r'\bINR\s*' + number,
        r'₹\s*' + number,
        number + r'\s*has been (?:debited|credited)'
    ]
    entities['amount'] = _first_group(amount_patterns, text).replace(',', '')

    # Extract account (four digits following an account marker or X mask)
    account_patterns = [
        r'account\s*(?:no\.?|number|#|XX|X)?\s*(\d{4})',
        r'A/c\s*(?:XX|X)?(\d{4})',
        r'a/c\s*(\d{4})',
        r'XX(\d{4})',
        r'X(\d{4})'
    ]
    entities['account'] = _first_group(account_patterns, text)

    # Extract date - various formats; a date introduced by "on" wins.
    date_patterns = [
        r'on\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})',
        r'(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})',
        r'(\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{4})',
    ]
    entities['date'] = _first_group(date_patterns, text)

    # Extract reference
    ref_patterns = [
        r'(?:UPI\s*)?(?:Ref(?:erence)?(?:\s*(?:No|Number|#|:))?\.?\s*:?\s*)(\d{10,})',
        r'transaction reference number is\s*(\d+)',
        r'Txn[:\s]*(\d+)',
    ]
    entities['reference'] = _first_group(ref_patterns, text)

    # Extract merchant from VPA ("VPA <handle>@<psp> <Payee Name> ...").
    # - The handle/PSP may contain '.' and '-' (e.g. "shop.pay@okhdfcbank").
    # - The name is matched lazily and ends at a following " on" (as in
    #   "... SWIGGY on 01-01-24"), at the end of the line, or at the first
    #   non-letter, so trailing words are not swallowed into the merchant.
    # Case-sensitive on purpose: "VPA" and the name's first letter must be
    # upper-case.
    vpa_match = re.search(
        r'VPA\s+[\w.\-]+@[\w.\-]+\s+'
        r'([A-Z][A-Za-z \t]+?)'
        r'(?=[ \t]+on\b|[ \t]*(?:[^A-Za-z \t]|$))',
        text,
    )
    if vpa_match:
        entities['merchant'] = vpa_match.group(1).strip().lower()
    else:
        # Try common merchants (first name in this order found in the text)
        merchants = ['swiggy', 'zomato', 'amazon', 'flipkart', 'uber', 'ola', 'rapido', 'bigbasket', 'blinkit', 'zepto']
        for m in merchants:
            if m in text_lower:
                entities['merchant'] = m
                break

    return entities
def create_benchmark(n_samples=100):
    """Create held-out benchmark from real emails.

    Loads the corpus, keeps emails that look like transaction alerts and
    whose fingerprint is not in the training set, auto-labels them, draws a
    reproducible random sample, writes it to ``BENCHMARK_FILE`` as JSON and
    prints a summary.

    Args:
        n_samples: Maximum number of emails to sample. Fewer are returned
            when there are not enough candidates; negative values are
            treated as 0.

    Returns:
        The list of benchmark samples (dicts with ``text``, ``subject``,
        ``sender``, ``date``, ``expected_entities``, ``id``,
        ``auto_labeled`` and ``verified``).
    """
    # Must equal the prefix length used by load_training_texts().
    fingerprint_len = 100

    print("=" * 60)
    print("📊 CREATING HELD-OUT BENCHMARK")
    print("=" * 60)

    # Load data
    print(f"\n1. Loading corpus from {CORPUS_FILE}...")
    corpus = load_corpus()
    print(f" Found {len(corpus)} financial emails")

    print("\n2. Loading training data to exclude...")
    train_texts = load_training_texts()
    print(f" Found {len(train_texts)} training samples to exclude")

    # Filter for transaction emails
    print("\n3. Filtering for transaction emails...")
    candidates = []
    excluded = 0
    for email in corpus:
        if not isinstance(email, dict):
            continue
        # A JSON null (or other non-text) body is treated as empty.
        body = email.get('body') or ''
        # Skip if too short
        if not isinstance(body, str) or len(body) < 50:
            continue

        # Must have transaction keywords
        body_lower = body.lower()
        has_transaction = any(x in body_lower for x in ['debited', 'credited', 'received', 'sent'])
        has_amount = any(x in body_lower for x in ['rs.', 'rs ', 'inr', '₹'])
        if not (has_transaction and has_amount):
            continue

        # Held-out guarantee: drop emails whose fingerprint appears in the
        # training set. Previously the fingerprints were loaded but never
        # compared against the corpus.
        # NOTE(review): assumes a training record's 'text' starts with the
        # raw email body; if training samples wrap the body in a prompt
        # template these prefixes will never match -- confirm the format.
        if body[:fingerprint_len] in train_texts:
            excluded += 1
            continue

        # Auto-extract entities
        entities = extract_entities_from_email(body)
        candidates.append({
            'text': body,
            'subject': email.get('subject', ''),
            'sender': email.get('sender', ''),
            'date': email.get('date', ''),
            'expected_entities': entities
        })
    print(f" Found {len(candidates)} transaction emails")
    print(f" Excluded {excluded} emails already present in training data")

    # Sample randomly
    print(f"\n4. Sampling {n_samples} emails for benchmark...")
    # A private generator seeded with 42 draws the same sample as
    # random.seed(42) + random.sample() without reseeding the global RNG.
    rng = random.Random(42)  # Reproducible
    sample_size = max(0, min(n_samples, len(candidates)))
    benchmark = rng.sample(candidates, sample_size)

    # Add IDs
    for i, sample in enumerate(benchmark):
        sample['id'] = i + 1
        sample['auto_labeled'] = True
        sample['verified'] = False

    # Save benchmark. ensure_ascii=False writes characters such as the
    # rupee sign verbatim, so the file encoding must be pinned to UTF-8.
    BENCHMARK_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(BENCHMARK_FILE, 'w', encoding='utf-8') as f:
        json.dump(benchmark, f, indent=2, ensure_ascii=False)
    print(f"\n✅ Benchmark saved to {BENCHMARK_FILE}")
    print(f" Total samples: {len(benchmark)}")

    # Stats
    banks = {}
    for s in benchmark:
        # The 'bank' key always exists and is '' when undetected, so fall
        # back on emptiness rather than on a missing key.
        bank = s['expected_entities'].get('bank') or 'unknown'
        banks[bank] = banks.get(bank, 0) + 1
    print("\n📊 Benchmark by Bank:")
    for bank, count in sorted(banks.items()):
        print(f" {bank.upper():10} {count}")

    # Show sample
    print("\n" + "=" * 60)
    print("📧 SAMPLE EMAIL FROM BENCHMARK:")
    print("=" * 60)
    if benchmark:
        sample = benchmark[0]
        print(f"Subject: {sample.get('subject', 'N/A')}")
        print(f"Text: {sample['text'][:300]}...")
        print("\nAuto-extracted entities:")
        for k, v in sample['expected_entities'].items():
            if v:
                print(f" {k}: {v}")
    return benchmark
if __name__ == "__main__":
    # Run as a script: build the default 100-email benchmark.
    create_benchmark(n_samples=100)