Spaces:
Running
Running
Commit ·
bd1a487
1
Parent(s): 0780f6f
CreatedSeparatedUserSearch
Browse files- app.py +2 -2
- extractors/user_extractor.py +75 -38
app.py
CHANGED
|
@@ -428,7 +428,7 @@ def health():
|
|
| 428 |
@app.get("/test-data")
|
| 429 |
def test_data():
|
| 430 |
"""Тестирует извлечение данных из текста без использования Whisper."""
|
| 431 |
-
debug = (request.args.get("debug")).strip().lower() == "1"
|
| 432 |
extractor = build_default_pipeline(suppliers=TEST_SUPPLIERS, users=TEST_USERS)
|
| 433 |
|
| 434 |
started = time.time()
|
|
@@ -472,7 +472,7 @@ def process_audio():
|
|
| 472 |
|
| 473 |
audio = request.files.get("audio")
|
| 474 |
mode = (request.form.get("mode") or "expense").strip()
|
| 475 |
-
debug = (request.args.get("debug")
|
| 476 |
context = parse_context(request.form.get("context"))
|
| 477 |
|
| 478 |
if audio is None:
|
|
|
|
| 428 |
@app.get("/test-data")
|
| 429 |
def test_data():
|
| 430 |
"""Тестирует извлечение данных из текста без использования Whisper."""
|
| 431 |
+
debug = (request.args.get("debug") or "").strip().lower() == "1"
|
| 432 |
extractor = build_default_pipeline(suppliers=TEST_SUPPLIERS, users=TEST_USERS)
|
| 433 |
|
| 434 |
started = time.time()
|
|
|
|
| 472 |
|
| 473 |
audio = request.files.get("audio")
|
| 474 |
mode = (request.form.get("mode") or "expense").strip()
|
| 475 |
+
debug = (request.args.get("debug") or "") == "1"
|
| 476 |
context = parse_context(request.form.get("context"))
|
| 477 |
|
| 478 |
if audio is None:
|
extractors/user_extractor.py
CHANGED
|
@@ -36,10 +36,10 @@ class ExpenseUserExtractor:
|
|
| 36 |
has_person_grammeme = False
|
| 37 |
if self.morph is not None:
|
| 38 |
parses = self.morph.parse(token)
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
|
| 44 |
# Сохраняем низкий порог для имён, но не пропускаем нарицательные слова.
|
| 45 |
accepted = lexical >= self.MIN_LEXICAL_SUPPORT or (
|
|
@@ -79,6 +79,72 @@ class ExpenseUserExtractor:
|
|
| 79 |
|
| 80 |
return " ".join(candidate_tokens), candidate_tokens, candidate_debug
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
def extract(
|
| 83 |
self,
|
| 84 |
text: str,
|
|
@@ -114,42 +180,13 @@ class ExpenseUserExtractor:
|
|
| 114 |
"user_score": None,
|
| 115 |
"matched_user_phrase": None,
|
| 116 |
}
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
"threshold": self.threshold,
|
| 121 |
-
"rules": {
|
| 122 |
-
"min_lexical_support": self.MIN_LEXICAL_SUPPORT,
|
| 123 |
-
"min_lexical_with_person_grammeme": self.MIN_LEXICAL_WITH_PERSON,
|
| 124 |
-
"morph_enabled": self.morph is not None,
|
| 125 |
-
},
|
| 126 |
-
"excluded_supplier_phrase": supplier_phrase,
|
| 127 |
-
"normalized_text": normalized_text,
|
| 128 |
-
"candidate_text": candidate_text,
|
| 129 |
-
"candidate_tokens": candidate_tokens,
|
| 130 |
-
"candidate_token_debug": candidate_debug or [],
|
| 131 |
-
"matcher_debug": None,
|
| 132 |
-
}
|
| 133 |
-
return payload
|
| 134 |
-
|
| 135 |
-
match = self.user_matcher.extract(
|
| 136 |
-
text=candidate_text,
|
| 137 |
-
date_phrase=date_phrase,
|
| 138 |
-
excluded_phrases=[supplier_phrase] if supplier_phrase else None,
|
| 139 |
-
debug=debug,
|
| 140 |
-
score_threshold=self.threshold,
|
| 141 |
-
combined_threshold=self.threshold,
|
| 142 |
-
)
|
| 143 |
-
|
| 144 |
-
payload = {
|
| 145 |
-
"user": match.get("supplier"),
|
| 146 |
-
"user_score": match.get("supplier_score"),
|
| 147 |
-
"matched_user_phrase": match.get("matched_supplier_phrase"),
|
| 148 |
-
}
|
| 149 |
|
| 150 |
if debug:
|
| 151 |
payload["user_debug"] = {
|
| 152 |
-
"mode": "
|
| 153 |
"threshold": self.threshold,
|
| 154 |
"rules": {
|
| 155 |
"min_lexical_support": self.MIN_LEXICAL_SUPPORT,
|
|
@@ -161,7 +198,7 @@ class ExpenseUserExtractor:
|
|
| 161 |
"candidate_text": candidate_text,
|
| 162 |
"candidate_tokens": candidate_tokens,
|
| 163 |
"candidate_token_debug": candidate_debug or [],
|
| 164 |
-
"matcher_debug":
|
| 165 |
}
|
| 166 |
|
| 167 |
return payload
|
|
|
|
| 36 |
has_person_grammeme = False
|
| 37 |
if self.morph is not None:
|
| 38 |
parses = self.morph.parse(token)
|
| 39 |
+
if parses:
|
| 40 |
+
has_person_grammeme = bool(
|
| 41 |
+
{"Name", "Surn", "Patr"}.intersection(set(parses[0].tag.grammemes))
|
| 42 |
+
)
|
| 43 |
|
| 44 |
# Сохраняем низкий порог для имён, но не пропускаем нарицательные слова.
|
| 45 |
accepted = lexical >= self.MIN_LEXICAL_SUPPORT or (
|
|
|
|
| 79 |
|
| 80 |
return " ".join(candidate_tokens), candidate_tokens, candidate_debug
|
| 81 |
|
| 82 |
+
def _match_user_from_candidates(
|
| 83 |
+
self,
|
| 84 |
+
candidate_tokens: list[str],
|
| 85 |
+
include_debug: bool = False,
|
| 86 |
+
) -> tuple[dict[str, Any], dict[str, Any] | None]:
|
| 87 |
+
phrases: list[str] = []
|
| 88 |
+
seen: set[str] = set()
|
| 89 |
+
max_words = self.user_matcher.max_words
|
| 90 |
+
for i in range(len(candidate_tokens)):
|
| 91 |
+
for j in range(i + 1, min(i + 1 + max_words, len(candidate_tokens) + 1)):
|
| 92 |
+
phrase = " ".join(candidate_tokens[i:j])
|
| 93 |
+
if phrase not in seen:
|
| 94 |
+
seen.add(phrase)
|
| 95 |
+
phrases.append(phrase)
|
| 96 |
+
|
| 97 |
+
best_row: dict[str, Any] | None = None
|
| 98 |
+
debug_rows: list[dict[str, Any]] = []
|
| 99 |
+
for phrase in phrases:
|
| 100 |
+
row = self.user_matcher.score_phrase(phrase)
|
| 101 |
+
score = float(row.get("score", -1.0))
|
| 102 |
+
support = self.user_matcher.lexical_support(phrase)
|
| 103 |
+
combined = 0.75 * score + 0.25 * support
|
| 104 |
+
|
| 105 |
+
if include_debug:
|
| 106 |
+
debug_rows.append({
|
| 107 |
+
"phrase": phrase,
|
| 108 |
+
"supplier": row.get("supplier"),
|
| 109 |
+
"score": round(score, 4),
|
| 110 |
+
"support": round(support, 4),
|
| 111 |
+
"combined": round(combined, 4),
|
| 112 |
+
})
|
| 113 |
+
|
| 114 |
+
if score >= self.threshold or combined >= self.threshold:
|
| 115 |
+
enriched = {
|
| 116 |
+
"user": row.get("supplier"),
|
| 117 |
+
"user_score": round(score, 4) if score >= 0 else None,
|
| 118 |
+
"matched_user_phrase": phrase,
|
| 119 |
+
"combined": combined,
|
| 120 |
+
}
|
| 121 |
+
if best_row is None or combined > float(best_row.get("combined", -1.0)):
|
| 122 |
+
best_row = enriched
|
| 123 |
+
|
| 124 |
+
if best_row is None:
|
| 125 |
+
match_payload = {
|
| 126 |
+
"user": None,
|
| 127 |
+
"user_score": None,
|
| 128 |
+
"matched_user_phrase": None,
|
| 129 |
+
}
|
| 130 |
+
else:
|
| 131 |
+
match_payload = {
|
| 132 |
+
"user": best_row.get("user"),
|
| 133 |
+
"user_score": best_row.get("user_score"),
|
| 134 |
+
"matched_user_phrase": best_row.get("matched_user_phrase"),
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
match_debug = None
|
| 138 |
+
if include_debug:
|
| 139 |
+
match_debug = {
|
| 140 |
+
"phrases_count": len(phrases),
|
| 141 |
+
"score_threshold": self.threshold,
|
| 142 |
+
"combined_threshold": self.threshold,
|
| 143 |
+
"top_candidates": sorted(debug_rows, key=lambda item: item["combined"], reverse=True)[:8],
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
return match_payload, match_debug
|
| 147 |
+
|
| 148 |
def extract(
|
| 149 |
self,
|
| 150 |
text: str,
|
|
|
|
| 180 |
"user_score": None,
|
| 181 |
"matched_user_phrase": None,
|
| 182 |
}
|
| 183 |
+
match_debug = None
|
| 184 |
+
else:
|
| 185 |
+
payload, match_debug = self._match_user_from_candidates(candidate_tokens, include_debug=debug)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
if debug:
|
| 188 |
payload["user_debug"] = {
|
| 189 |
+
"mode": "user-matcher",
|
| 190 |
"threshold": self.threshold,
|
| 191 |
"rules": {
|
| 192 |
"min_lexical_support": self.MIN_LEXICAL_SUPPORT,
|
|
|
|
| 198 |
"candidate_text": candidate_text,
|
| 199 |
"candidate_tokens": candidate_tokens,
|
| 200 |
"candidate_token_debug": candidate_debug or [],
|
| 201 |
+
"matcher_debug": match_debug,
|
| 202 |
}
|
| 203 |
|
| 204 |
return payload
|