Spaces:

Rogue2003
/

Receipt_Agent

Sleeping

App Files Files Community

Raghu committed on Dec 7, 2025

Commit

6fe5290

1 Parent(s): b8f0f36

Improve LayoutLM total detection: add confidence scores, validate against OCR text, use OCR fallback when LayoutLM misses total

Browse files

Files changed (1) hide show

app.py +111 -18

app.py CHANGED Viewed

@@ -876,6 +876,7 @@ class LayoutLMFieldExtractor:
         return words, boxes
     def predict_fields(self, image, ocr_results=None):
         if self.model is None:
             self.load()
@@ -901,40 +902,122 @@ class LayoutLMFieldExtractor:
         with torch.no_grad():
             outputs = self.model(**encoding)
             logits = outputs.logits[0]
             preds = logits.argmax(-1).cpu().tolist()
             tokens = self.processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].cpu())
         entities = {"VENDOR": [], "DATE": [], "TOTAL": [], "TIME": []}
-        current = {"label": None, "tokens": []}
-        for token, pred in zip(tokens, preds):
             label = self.id2label.get(pred, "O")
             if token in ["[PAD]", "[CLS]", "[SEP]"]:
                 continue
             if label.startswith("B-"):
-                # flush previous
                 if current["label"] and current["tokens"]:
-                    entities[current["label"]].append(" ".join(current["tokens"]))
-                current = {"label": label[2:], "tokens": [token]}
             elif label.startswith("I-") and current["label"] == label[2:]:
                 current["tokens"].append(token)
             else:
                 if current["label"] and current["tokens"]:
-                    entities[current["label"]].append(" ".join(current["tokens"]))
-                current = {"label": None, "tokens": []}
-        if current["label"] and current["tokens"]:
-            entities[current["label"]].append(" ".join(current["tokens"]))
-        def pick_first(key):
-            vals = entities.get(key, [])
-            return vals[0].replace("▁", " ").strip() if vals else None
-        return {
-            "vendor": pick_first("VENDOR"),
-            "date": pick_first("DATE"),
-            "total": pick_first("TOTAL"),
-            "time": pick_first("TIME"),
-        }
 # ============================================================================
@@ -1157,6 +1240,13 @@ def process_receipt(image):
                 if not ocr_val and layoutlm_val:
                     # OCR didn't find it, use LayoutLM
                     fields[field_name] = layoutlm_val
                 elif ocr_val and layoutlm_val and field_name == 'total':
                     # For total: validate LayoutLM against OCR text
                     ocr_text = ' '.join([r['text'] for r in ocr_results])
@@ -1170,6 +1260,9 @@ def process_receipt(image):
                     else:
                         # LayoutLM doesn't match OCR, trust OCR (more reliable)
                         fields['total'] = ocr_val
                 elif ocr_val and layoutlm_val and field_name != 'total':
                     # For other fields, prefer LayoutLM if it's longer/more complete
                     if len(str(layoutlm_val)) > len(str(ocr_val)):

         return words, boxes
     def predict_fields(self, image, ocr_results=None):
+        """Predict fields with confidence scores and improved total extraction."""
         if self.model is None:
             self.load()
         with torch.no_grad():
             outputs = self.model(**encoding)
             logits = outputs.logits[0]
+            # Get softmax probabilities for confidence
+            probs = torch.softmax(logits, dim=-1)
             preds = logits.argmax(-1).cpu().tolist()
+            probs_np = probs.cpu().numpy()
             tokens = self.processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].cpu())
+        # Extract entities with confidence scores
         entities = {"VENDOR": [], "DATE": [], "TOTAL": [], "TIME": []}
+        entity_confidences = {"VENDOR": [], "DATE": [], "TOTAL": [], "TIME": []}
+        entity_positions = {"VENDOR": [], "DATE": [], "TOTAL": [], "TIME": []}
+        current = {"label": None, "tokens": [], "start_idx": None}
+        for idx, (token, pred) in enumerate(zip(tokens, preds)):
             label = self.id2label.get(pred, "O")
+            conf = float(probs_np[idx, pred])
             if token in ["[PAD]", "[CLS]", "[SEP]"]:
                 continue
             if label.startswith("B-"):
+                # Flush previous
                 if current["label"] and current["tokens"]:
+                    entity_text = " ".join(current["tokens"]).replace("▁", " ").strip()
+                    entities[current["label"]].append(entity_text)
+                    entity_confidences[current["label"]].append(conf)
+                    entity_positions[current["label"]].append(current["start_idx"])
+                current = {"label": label[2:], "tokens": [token], "start_idx": idx}
             elif label.startswith("I-") and current["label"] == label[2:]:
                 current["tokens"].append(token)
             else:
                 if current["label"] and current["tokens"]:
+                    entity_text = " ".join(current["tokens"]).replace("▁", " ").strip()
+                    entities[current["label"]].append(entity_text)
+                    entity_confidences[current["label"]].append(conf)
+                    entity_positions[current["label"]].append(current["start_idx"])
+                current = {"label": None, "tokens": [], "start_idx": None}
+        if current["label"] and current["tokens"]:
+            entity_text = " ".join(current["tokens"]).replace("▁", " ").strip()
+            entities[current["label"]].append(entity_text)
+            entity_confidences[current["label"]].append(conf)
+            entity_positions[current["label"]].append(current["start_idx"])
+        # Smart field selection with confidence and position awareness
+        result = {}
+        # Vendor: prefer first high-confidence result
+        if entities["VENDOR"]:
+            best_vendor_idx = max(range(len(entities["VENDOR"])),
+                                 key=lambda i: entity_confidences["VENDOR"][i])
+            if entity_confidences["VENDOR"][best_vendor_idx] > 0.3:
+                result["vendor"] = entities["VENDOR"][best_vendor_idx]
+        # Date: prefer first high-confidence result
+        if entities["DATE"]:
+            best_date_idx = max(range(len(entities["DATE"])),
+                               key=lambda i: entity_confidences["DATE"][i])
+            if entity_confidences["DATE"][best_date_idx] > 0.3:
+                result["date"] = entities["DATE"][best_date_idx]
+        # Time: prefer first high-confidence result
+        if entities["TIME"]:
+            best_time_idx = max(range(len(entities["TIME"])),
+                                key=lambda i: entity_confidences["TIME"][i])
+            if entity_confidences["TIME"][best_time_idx] > 0.3:
+                result["time"] = entities["TIME"][best_time_idx]
+        # Total: improved extraction - look for amounts near "TOTAL" keyword in OCR
+        if entities["TOTAL"]:
+            # Get all total candidates with confidence
+            total_candidates = [(entities["TOTAL"][i], entity_confidences["TOTAL"][i],
+                                entity_positions["TOTAL"][i])
+                               for i in range(len(entities["TOTAL"]))]
+            # If OCR results available, validate against OCR text
+            if ocr_results:
+                ocr_text = ' '.join([r['text'] for r in ocr_results]).upper()
+                ocr_lines = [r['text'] for r in ocr_results]
+                # Find amounts near "TOTAL" keyword
+                best_total = None
+                best_conf = 0
+                for total_val, conf, pos in total_candidates:
+                    # Clean the total value
+                    total_clean = str(total_val).replace('$', '').replace(',', '').replace('.', '').strip()
+                    # Check if this total appears near "TOTAL" keyword in OCR
+                    for i, line in enumerate(ocr_lines):
+                        line_upper = line.upper()
+                        if 'TOTAL' in line_upper or 'AMOUNT DUE' in line_upper:
+                            # Check this line and next 2 lines for the amount
+                            search_text = ' '.join(ocr_lines[i:min(i+3, len(ocr_lines))])
+                            search_clean = search_text.replace('$', '').replace(',', '').replace('.', '')
+                            if total_clean in search_clean:
+                                # Found near TOTAL keyword - high confidence
+                                if conf > best_conf:
+                                    best_total = total_val
+                                    best_conf = conf
+                                break
+                if best_total:
+                    result["total"] = best_total
+                else:
+                    # Fallback: use highest confidence total
+                    best_idx = max(range(len(total_candidates)), key=lambda i: total_candidates[i][1])
+                    if total_candidates[best_idx][1] > 0.3:
+                        result["total"] = total_candidates[best_idx][0]
+            else:
+                # No OCR, use highest confidence
+                best_idx = max(range(len(total_candidates)), key=lambda i: total_candidates[i][1])
+                if total_candidates[best_idx][1] > 0.3:
+                    result["total"] = total_candidates[best_idx][0]
+        return result
 # ============================================================================
                 if not ocr_val and layoutlm_val:
                     # OCR didn't find it, use LayoutLM
                     fields[field_name] = layoutlm_val
+                elif ocr_val and not layoutlm_val:
+                    # LayoutLM didn't find it, but OCR did - use OCR (especially for total)
+                    if field_name == 'total':
+                        fields[field_name] = ocr_val
+                    else:
+                        # For other fields, prefer OCR if LayoutLM missed it
+                        fields[field_name] = ocr_val
                 elif ocr_val and layoutlm_val and field_name == 'total':
                     # For total: validate LayoutLM against OCR text
                     ocr_text = ' '.join([r['text'] for r in ocr_results])
                     else:
                         # LayoutLM doesn't match OCR, trust OCR (more reliable)
                         fields['total'] = ocr_val
+                elif ocr_val and not layoutlm_val and field_name == 'total':
+                    # LayoutLM didn't find total, but OCR did - use OCR
+                    fields['total'] = ocr_val
                 elif ocr_val and layoutlm_val and field_name != 'total':
                     # For other fields, prefer LayoutLM if it's longer/more complete
                     if len(str(layoutlm_val)) > len(str(ocr_val)):