| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
import os
import sys

# Make this module's own directory importable so the sibling
# ``form_classifier`` module can be imported by its bare name below.
_mnb_dir = os.path.dirname(os.path.abspath(__file__))
if _mnb_dir not in sys.path:
    sys.path.insert(0, _mnb_dir)

# The parent directory is inserted afterwards at index 0, so it ends up
# ahead of ``_mnb_dir`` on the search path.
_root_dir = os.path.dirname(_mnb_dir)
if _root_dir not in sys.path:
    sys.path.insert(0, _root_dir)

# The trained classifier is optional: when it cannot be imported the module
# still loads and callers fall back to keyword matching.
try:
    from form_classifier import DocumentClassifier
    _HAVE_DOC_CLASSIFIER = True
except ImportError:
    _HAVE_DOC_CLASSIFIER = False
|
|
|
|
| |
| |
# Keyword table for the Certifications-page fallback classifier.
# Maps a form code to the phrases whose (case-insensitive) presence in the
# OCR text counts as one vote for that form.
# NOTE: key order matters -- _keyword_classify_form() picks the winner with
# max(), which returns the first key on a tie, so "form102" wins ties.
_FORM_KEYWORDS = {
    "form102": [
        "Municipal Form No. 102",
        "Municipal Form No.102",
        "Certificate of Live Birth",
        "live birth",
        "name of child",
        "date of birth",
        "place of birth",
        "birth certificate",
        "mother",
        "father",
        "infant",
        "newborn",
        "attendant at birth",
    ],
    "form103": [
        "Municipal Form No. 103",
        "Municipal Form No.103",
        "Certificate of Death",
        "death certificate",
        "name of deceased",
        "date of death",
        "place of death",
        "cause of death",
        "burial",
        "deceased",
        "immediate cause",
    ],
    "form97": [
        "Municipal Form No. 97",
        "Municipal Form No.97",
        "Certificate of Marriage",
        "marriage certificate",
        "name of husband",
        "name of wife",
        "date of marriage",
        "place of marriage",
        "solemnizing officer",
        "contracting parties",
        "witnesses",
    ],
}
|
|
| |
# Keyword table for Form 90 sex routing (role label -> phrases that vote for
# it). The leading space in " male" / " female" is significant: it keeps
# "male" from matching inside the word "female".
# NOTE: key order matters -- _keyword_classify_sex() resolves ties with
# max(), which returns the first key, so "GROOM" wins ties.
_SEX_KEYWORDS = {
    "GROOM": [
        "sex: male",
        "sex male",
        "2. sex: male",
        " male",
        "sex m",
    ],
    "BRIDE": [
        "sex: female",
        "sex female",
        "2. sex: female",
        " female",
        "sex f",
    ],
}
|
|
def _keyword_classify_form(text: str, keywords: "dict | None" = None) -> str:
    """Keyword fallback for Certifications page classification.

    Each form code scores one point per keyword found (case-insensitively)
    in ``text``; the code with the highest score is returned.

    Args:
        text: OCR text of the uploaded page. ``None`` or an empty string is
            treated as text with no matches.
        keywords: Optional ``{form_code: [keyword, ...]}`` table. Defaults
            to the module-level ``_FORM_KEYWORDS``.

    Returns:
        The winning form code. On a tie (including no matches at all) the
        first code in the table wins.
    """
    table = _FORM_KEYWORDS if keywords is None else keywords
    # OCR output breaks lines and doubles spaces at arbitrary places, so
    # collapse every whitespace run to a single space; otherwise multi-word
    # keywords such as "certificate of live birth" never match.
    haystack = " ".join((text or "").lower().split())
    scores = {
        code: sum(1 for kw in kws if kw.lower() in haystack)
        for code, kws in table.items()
    }
    # max() returns the first maximal key, so table order breaks ties.
    return max(scores, key=scores.get)
|
|
def _keyword_classify_sex(text: str, keywords: "dict | None" = None) -> str:
    """Keyword-based sex classifier for Form 90 routing.

    Each role scores one point per keyword found (case-insensitively) in
    ``text``; the role with the highest score is returned.

    Args:
        text: OCR text of the birth certificate. ``None`` or an empty
            string is treated as text with no matches.
        keywords: Optional ``{role: [keyword, ...]}`` table. Defaults to
            the module-level ``_SEX_KEYWORDS``.

    Returns:
        The winning role label. On a tie (including no matches at all) the
        first role in the table wins.
    """
    table = _SEX_KEYWORDS if keywords is None else keywords
    # Collapse whitespace runs so "SEX:\n Male" still matches "sex: male".
    # The leading space gives boundary keywords such as " male" something to
    # match against when the word is the very first token of the text.
    haystack = " " + " ".join((text or "").lower().split())
    scores = {
        role: sum(1 for kw in kws if kw.lower() in haystack)
        for role, kws in table.items()
    }
    # max() returns the first maximal key, so table order breaks ties.
    return max(scores, key=scores.get)
|
|
|
|
| |
# Maps a classifier form code to the NER hint string returned by
# MNBClassifier.get_ner_hint().
_FORM_CODE_TO_HINT = {
    "form102": "birth",
    "form103": "death",
    "form97": "marriage",
}
|
|
|
|
class MNBClassifier:
    """
    MNB Classifier for the Civil Registry Digitization System.

    PATH A — Certifications Page:
        mnb = MNBClassifier()
        form_code = mnb.classify_form_type(ocr_text)
        # → 'form102' | 'form103' | 'form97'

        hint = mnb.get_ner_hint(ocr_text)
        # → 'birth' | 'death' | 'marriage'

        result = mnb.classify_full(ocr_text)
        # → {'label': 'Form 102 - Certificate of Live Birth',
        #    'form_code': 'form102', 'confidence': 0.97, 'probabilities': {...}}

    PATH B — Application for Marriage License Page (Form 90):
        sex_role = mnb.classify_sex(ocr_text)
        # → 'GROOM' (Male birth cert) | 'BRIDE' (Female birth cert)

    When the trained ``DocumentClassifier`` is unavailable, PATH A falls
    back to keyword matching. PATH B always uses keyword matching.
    """

    def __init__(self, model_dir: str = "models"):
        """Load the trained document classifier if it is available.

        Args:
            model_dir: Directory passed to ``DocumentClassifier``.

        ``self._doc_clf`` stays ``None`` (keyword fallback) when
        ``form_classifier`` could not be imported or when the classifier
        raises ``FileNotFoundError``. Other exceptions propagate.
        """
        self._doc_clf = None
        if _HAVE_DOC_CLASSIFIER:
            try:
                self._doc_clf = DocumentClassifier(model_dir=model_dir)
                print(f" [MNB] Loaded DocumentClassifier from {model_dir}/")
            except FileNotFoundError as e:
                print(f" [MNB] {e}")
                print(" [MNB] Using keyword fallback — run: python mnb/form_classifier.py")
        else:
            print(" [MNB] form_classifier.py not found — using keyword fallback")

    def classify_form_type(self, ocr_text: str) -> str:
        """
        Certifications page: identify which form was uploaded.

        Returns: 'form102' | 'form103' | 'form97'
        """
        if self._doc_clf is not None:
            return self._doc_clf.predict(ocr_text)["form_code"]
        return _keyword_classify_form(ocr_text)

    def classify_full(self, ocr_text: str) -> dict:
        """
        Certifications page: full result with confidence scores.

        With the trained classifier loaded, returns its ``predict()`` result
        unchanged, e.g.:
            {
                'label': 'Form 102 - Certificate of Live Birth',
                'form_code': 'form102',
                'confidence': 0.97,
                'probabilities': { ... }
            }

        In keyword-fallback mode the dict has the same keys, but keyword
        matching yields no calibrated score: 'label' is the bare form code,
        'confidence' is always 1.0 and 'probabilities' is one-hot.
        """
        if self._doc_clf is not None:
            return self._doc_clf.predict(ocr_text)
        winner = _keyword_classify_form(ocr_text)
        return {
            "label": winner,
            "form_code": winner,
            "confidence": 1.0,
            "probabilities": {k: (1.0 if k == winner else 0.0) for k in _FORM_KEYWORDS},
        }

    def get_ner_hint(self, ocr_text: str) -> str:
        """
        Returns NER hint string for bridge.py:
            'birth' | 'death' | 'marriage'

        An unrecognised form code maps to 'birth'.
        """
        code = self.classify_form_type(ocr_text)
        return _FORM_CODE_TO_HINT.get(code, "birth")

    def classify_sex(self, ocr_text: str) -> str:
        """
        Form 90 upload page only.
        Reads the SEX field on a PSA/NSO birth certificate.

        Returns: 'GROOM' (Male) | 'BRIDE' (Female)
        """
        return _keyword_classify_sex(ocr_text)

    def classify_sex_proba(self, ocr_text: str) -> dict:
        """
        Returns per-role scores for sex classification.

        The scores are one-hot, not graded probabilities: the winning role
        gets 1.0 and the other 0.0, e.g. {'GROOM': 1.0, 'BRIDE': 0.0}.
        """
        winner = _keyword_classify_sex(ocr_text)
        return {k: (1.0 if k == winner else 0.0) for k in _SEX_KEYWORDS}
|
|
|
|
| |
# Manual smoke test: run this file directly to exercise both classification
# paths against a few hand-written OCR samples and print pass/fail marks.
if __name__ == "__main__":
    mnb = MNBClassifier()

    print("\n ── PATH A: Certifications Page Tests ──")
    # (ocr_text, expected_form_code) pairs.
    cert_tests = [
        (
            "Municipal Form No. 102 Certificate of Live Birth "
            "Name of child Maria Santos Date of birth 01/15/1990 "
            "Place of birth Brgy. San Jose Tarlac City "
            "Name of mother Lani Santos Name of father Jose Santos "
            "Sex Female birth certificate infant",
            "form102",
        ),
        (
            "Municipal Form No.102 Certificate of Live Birth "
            "PSA Child Juan Dela Cruz born 03/22/1985 Capas Tarlac "
            "mother Rosa father Pedro Sex Male",
            "form102",
        ),
        (
            "Municipal Form No. 103 Certificate of Death "
            "Name of deceased Pedro Reyes Date of death 03/22/2020 "
            "Cause of death Cardiac Arrest death certificate burial",
            "form103",
        ),
        (
            "Municipal Form No.103 Certificate of Death "
            "Deceased Ana Torres died 07/04/2000 Pneumonia burial permit",
            "form103",
        ),
        (
            "Municipal Form No. 97 Certificate of Marriage "
            "Name of husband Carlos Bautista Name of wife Ana Torres "
            "Date of marriage 07/04/2005 solemnizing officer witnesses",
            "form97",
        ),
        (
            "Municipal Form No.97 Certificate of Marriage "
            "Husband Jose Santos wife Maria Reyes married 11/30/1995 "
            "contracting parties",
            "form97",
        ),
    ]

    for text, expected in cert_tests:
        result = mnb.classify_full(text)
        mark = "✅" if result["form_code"] == expected else "❌"
        print(f" {mark} Expected={expected:<8} Got={result['form_code']:<8} "
              f"Confidence={result['confidence']:.1%} ({result['label']})")

    print("\n ── PATH B: Form 90 Marriage License — Sex Routing Tests ──")
    # (ocr_text, expected_role) pairs.
    sex_tests = [
        (
            "Municipal Form No.102 Certificate of Live Birth PSA "
            "CHILD (First): Juan Dela Cruz SEX: Male "
            "Date of Birth March 15 1990 Mother Maria Dela Cruz",
            "GROOM",
        ),
        (
            "Municipal Form No.102 Certificate of Live Birth NSO "
            "CHILD (First): Ana Santos SEX: Female "
            "Date of Birth August 21 1995 Mother Gloria Santos",
            "BRIDE",
        ),
    ]
    for text, expected in sex_tests:
        pred = mnb.classify_sex(text)
        mark = "✅" if pred == expected else "❌"
        print(f" {mark} Expected={expected} Got={pred}")
|
|