Commit 0780f6f
1 Parent(s): 44706f3
CheckOptimizedUserSearchWIthMorph

extractors/user_extractor.py CHANGED (+29 -13)
@@ -12,14 +12,15 @@ from extractors.supplier_extractor import ExpenseSupplierExtractor, normalize_te
 class ExpenseUserExtractor:
     """Looks up the user with the same fuzzy matcher as the supplier."""

+    MIN_LEXICAL_SUPPORT = 0.40
+    MIN_LEXICAL_WITH_PERSON = 0.30
+
     def __init__(
         self,
         users: list[str],
         suppliers: list[str],
-        model: Any = None,
         threshold: float = 0.25,
     ) -> None:
-        self.users = users
         self.threshold = threshold
         self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
         self.user_matcher = ExpenseSupplierExtractor(suppliers=users)
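The constructor no longer keeps a model argument or a self.users copy; the user list only feeds the internal fuzzy matcher. A minimal usage sketch with the new signature (the user and supplier values below are made up for illustration):

extractor = ExpenseUserExtractor(
    users=["Иванов Пётр", "Петрова Анна"],        # hypothetical user directory
    suppliers=["ООО Ромашка", "Яндекс Такси"],    # hypothetical supplier directory
    threshold=0.25,
)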
@@ -41,7 +42,9 @@
         )

         # Keep the low threshold for names, but do not let common nouns slip through.
-        accepted = lexical >=
+        accepted = lexical >= self.MIN_LEXICAL_SUPPORT or (
+            has_person_grammeme and lexical >= self.MIN_LEXICAL_WITH_PERSON
+        )
         return accepted, lexical, has_person_grammeme

     def _build_user_candidate_text(
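The acceptance rule is now two-tiered. A small self-contained sketch of the same logic, using the thresholds from the new class constants (the helper function below is only for illustration and is not part of the module):

def accept(lexical: float, has_person_grammeme: bool) -> bool:
    # A plain token needs lexical support of at least 0.40 (MIN_LEXICAL_SUPPORT);
    # a token the morphological analyzer tags with a person grammeme passes from 0.30.
    return lexical >= 0.40 or (has_person_grammeme and lexical >= 0.30)

assert accept(0.45, False)      # strong lexical support alone is enough
assert accept(0.32, True)       # weaker support passes only with a person grammeme
assert not accept(0.32, False)  # a common word without a person grammeme is rejected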
@@ -49,7 +52,8 @@
         normalized_text: str,
         supplier_phrase: str | None,
         date_phrase: str | None,
-
+        include_debug: bool = False,
+    ) -> tuple[str, list[str], list[dict[str, Any]] | None]:
         excluded_tokens: set[str] = set(self.user_matcher.noise_terms)
         if supplier_phrase:
             excluded_tokens.update(normalize_text(supplier_phrase).split())
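_build_user_candidate_text now takes include_debug and returns the per-token debug list (or None) as a third element. Assuming the surrounding code is otherwise unchanged, a call on the extractor from the earlier sketch would look roughly like this (the input strings are invented for illustration):

candidate_text, candidate_tokens, token_debug = extractor._build_user_candidate_text(
    normalized_text="оплата ивану петрову за такси",
    supplier_phrase="яндекс такси",
    date_phrase=None,
    include_debug=True,
)
# token_debug is a list of per-token dicts when include_debug=True, otherwise None.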
@@ -58,17 +62,18 @@
         excluded_tokens.update(self.supplier_terms)

         candidate_tokens: list[str] = []
-        candidate_debug: list[dict[str, Any]] = []
+        candidate_debug: list[dict[str, Any]] | None = [] if include_debug else None
         for token in normalized_text.split():
             if token in excluded_tokens or token.isdigit() or len(token) <= 1:
                 continue
             accepted, lexical, has_person_grammeme = self._looks_like_person_token(token)
-            candidate_debug
-
-
-
-
-
+            if candidate_debug is not None:
+                candidate_debug.append({
+                    "token": token,
+                    "lexical_support": round(lexical, 4),
+                    "has_person_grammeme": has_person_grammeme,
+                    "accepted": accepted,
+                })
             if accepted:
                 candidate_tokens.append(token)

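When debug collection is on, each entry appended to candidate_debug has this shape (the values below are illustrative):

{
    "token": "петрову",
    "lexical_support": 0.3125,
    "has_person_grammeme": True,
    "accepted": True,
}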
@@ -100,6 +105,7 @@
             normalized_text=normalized_text,
             supplier_phrase=supplier_phrase,
             date_phrase=date_phrase,
+            include_debug=debug,
         )

         if not candidate_text:
@@ -112,11 +118,16 @@
            payload["user_debug"] = {
                "mode": "supplier-matcher",
                "threshold": self.threshold,
+                "rules": {
+                    "min_lexical_support": self.MIN_LEXICAL_SUPPORT,
+                    "min_lexical_with_person_grammeme": self.MIN_LEXICAL_WITH_PERSON,
+                    "morph_enabled": self.morph is not None,
+                },
                "excluded_supplier_phrase": supplier_phrase,
                "normalized_text": normalized_text,
                "candidate_text": candidate_text,
                "candidate_tokens": candidate_tokens,
-                "candidate_token_debug": candidate_debug,
+                "candidate_token_debug": candidate_debug or [],
                "matcher_debug": None,
            }
            return payload
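Taken together, when no candidate text survives the filtering, the debug payload now reports the active rules alongside the token-level trace. A rough sketch of the resulting structure (all values are illustrative and assume this is the early-return branch):

payload["user_debug"] = {
    "mode": "supplier-matcher",
    "threshold": 0.25,
    "rules": {
        "min_lexical_support": 0.40,
        "min_lexical_with_person_grammeme": 0.30,
        "morph_enabled": True,
    },
    "excluded_supplier_phrase": "яндекс такси",
    "normalized_text": "оплата яндекс такси 12 июня",
    "candidate_text": "",
    "candidate_tokens": [],
    "candidate_token_debug": [
        {"token": "оплата", "lexical_support": 0.0, "has_person_grammeme": False, "accepted": False},
    ],
    "matcher_debug": None,  # the match branch fills this from match.get("supplier_debug")
}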
@@ -140,11 +151,16 @@
            payload["user_debug"] = {
                "mode": "supplier-matcher",
                "threshold": self.threshold,
+                "rules": {
+                    "min_lexical_support": self.MIN_LEXICAL_SUPPORT,
+                    "min_lexical_with_person_grammeme": self.MIN_LEXICAL_WITH_PERSON,
+                    "morph_enabled": self.morph is not None,
+                },
                "excluded_supplier_phrase": supplier_phrase,
                "normalized_text": normalized_text,
                "candidate_text": candidate_text,
                "candidate_tokens": candidate_tokens,
-                "candidate_token_debug": candidate_debug,
+                "candidate_token_debug": candidate_debug or [],
                "matcher_debug": match.get("supplier_debug"),
            }
