Spaces:

VladRet2026
/

ConvertAudioToJSON

Sleeping

App Files Files

Vlad Juracovschi committed 19 days ago

Commit

8b892b9

1 Parent(s): 81b7609

OptimizedUserSearch

Browse files

Files changed (1) hide show

extractors/user_extractor.py +59 -1

extractors/user_extractor.py CHANGED Viewed

@@ -22,6 +22,38 @@ class ExpenseUserExtractor:
         self.threshold = threshold
         self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
         self.user_matcher = ExpenseSupplierExtractor(suppliers=users)
     def extract(
         self,
@@ -45,8 +77,32 @@ class ExpenseUserExtractor:
                 }
             return payload
         match = self.user_matcher.extract(
-            text=text,
             date_phrase=date_phrase,
             excluded_phrases=[supplier_phrase] if supplier_phrase else None,
             debug=debug,
@@ -66,6 +122,8 @@ class ExpenseUserExtractor:
                 "threshold": self.threshold,
                 "excluded_supplier_phrase": supplier_phrase,
                 "normalized_text": normalized_text,
                 "matcher_debug": match.get("supplier_debug"),
             }

         self.threshold = threshold
         self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
         self.user_matcher = ExpenseSupplierExtractor(suppliers=users)
+        self.user_terms = {
+            token
+            for user in users
+            for token in normalize_text(user).split()
+            if token and len(token) > 1 and not token.isdigit()
+        }
+    def _is_user_like_token(self, token: str, similarity_threshold: float = 0.50) -> bool:
+        if token in self.user_terms:
+            return True
+        return self.user_matcher.lexical_support(token) >= similarity_threshold
+    def _build_user_candidate_text(
+        self,
+        normalized_text: str,
+        supplier_phrase: str | None,
+        date_phrase: str | None,
+    ) -> tuple[str, list[str]]:
+        excluded_tokens: set[str] = set(self.user_matcher.noise_terms)
+        if supplier_phrase:
+            excluded_tokens.update(normalize_text(supplier_phrase).split())
+        if date_phrase:
+            excluded_tokens.update(normalize_text(date_phrase).split())
+        user_like_tokens: list[str] = []
+        for token in normalized_text.split():
+            if token in excluded_tokens or token.isdigit() or len(token) <= 1:
+                continue
+            if self._is_user_like_token(token):
+                user_like_tokens.append(token)
+        return " ".join(user_like_tokens), user_like_tokens
     def extract(
         self,
                 }
             return payload
+        candidate_text, user_like_tokens = self._build_user_candidate_text(
+            normalized_text=normalized_text,
+            supplier_phrase=supplier_phrase,
+            date_phrase=date_phrase,
+        )
+        if not candidate_text:
+            payload = {
+                "user": None,
+                "user_score": None,
+                "matched_user_phrase": None,
+            }
+            if debug:
+                payload["user_debug"] = {
+                    "mode": "supplier-matcher",
+                    "threshold": self.threshold,
+                    "excluded_supplier_phrase": supplier_phrase,
+                    "normalized_text": normalized_text,
+                    "candidate_text": candidate_text,
+                    "user_like_tokens": user_like_tokens,
+                    "matcher_debug": None,
+                }
+            return payload
         match = self.user_matcher.extract(
+            text=candidate_text,
             date_phrase=date_phrase,
             excluded_phrases=[supplier_phrase] if supplier_phrase else None,
             debug=debug,
                 "threshold": self.threshold,
                 "excluded_supplier_phrase": supplier_phrase,
                 "normalized_text": normalized_text,
+                "candidate_text": candidate_text,
+                "user_like_tokens": user_like_tokens,
                 "matcher_debug": match.get("supplier_debug"),
             }