Spaces:
Running
Running
Commit ·
bd1a487
1
Parent(s): 0780f6f
CreatedSeparatedUserSearch
Browse files- app.py +2 -2
- extractors/user_extractor.py +75 -38
app.py
CHANGED
|
@@ -428,7 +428,7 @@ def health():
|
|
| 428 |
@app.get("/test-data")
|
| 429 |
def test_data():
|
| 430 |
"""Тестирует извлечение данных из текста без использования Whisper."""
|
| 431 |
-
debug = (request.args.get("debug")).strip().lower() == "1"
|
| 432 |
extractor = build_default_pipeline(suppliers=TEST_SUPPLIERS, users=TEST_USERS)
|
| 433 |
|
| 434 |
started = time.time()
|
|
@@ -472,7 +472,7 @@ def process_audio():
|
|
| 472 |
|
| 473 |
audio = request.files.get("audio")
|
| 474 |
mode = (request.form.get("mode") or "expense").strip()
|
| 475 |
-
debug = (request.args.get("debug")
|
| 476 |
context = parse_context(request.form.get("context"))
|
| 477 |
|
| 478 |
if audio is None:
|
|
|
|
| 428 |
@app.get("/test-data")
|
| 429 |
def test_data():
|
| 430 |
"""Тестирует извлечение данных из текста без использования Whisper."""
|
| 431 |
+
debug = (request.args.get("debug") or "").strip().lower() == "1"
|
| 432 |
extractor = build_default_pipeline(suppliers=TEST_SUPPLIERS, users=TEST_USERS)
|
| 433 |
|
| 434 |
started = time.time()
|
|
|
|
| 472 |
|
| 473 |
audio = request.files.get("audio")
|
| 474 |
mode = (request.form.get("mode") or "expense").strip()
|
| 475 |
+
debug = (request.args.get("debug") or "") == "1"
|
| 476 |
context = parse_context(request.form.get("context"))
|
| 477 |
|
| 478 |
if audio is None:
|
extractors/user_extractor.py
CHANGED
|
@@ -36,10 +36,10 @@ class ExpenseUserExtractor:
|
|
| 36 |
has_person_grammeme = False
|
| 37 |
if self.morph is not None:
|
| 38 |
parses = self.morph.parse(token)
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
|
| 44 |
# Сохраняем низкий порог для имён, но не пропускаем нарицательные слова.
|
| 45 |
accepted = lexical >= self.MIN_LEXICAL_SUPPORT or (
|
|
@@ -79,6 +79,72 @@ class ExpenseUserExtractor:
|
|
| 79 |
|
| 80 |
return " ".join(candidate_tokens), candidate_tokens, candidate_debug
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
def extract(
|
| 83 |
self,
|
| 84 |
text: str,
|
|
@@ -114,42 +180,13 @@ class ExpenseUserExtractor:
|
|
| 114 |
"user_score": None,
|
| 115 |
"matched_user_phrase": None,
|
| 116 |
}
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
"threshold": self.threshold,
|
| 121 |
-
"rules": {
|
| 122 |
-
"min_lexical_support": self.MIN_LEXICAL_SUPPORT,
|
| 123 |
-
"min_lexical_with_person_grammeme": self.MIN_LEXICAL_WITH_PERSON,
|
| 124 |
-
"morph_enabled": self.morph is not None,
|
| 125 |
-
},
|
| 126 |
-
"excluded_supplier_phrase": supplier_phrase,
|
| 127 |
-
"normalized_text": normalized_text,
|
| 128 |
-
"candidate_text": candidate_text,
|
| 129 |
-
"candidate_tokens": candidate_tokens,
|
| 130 |
-
"candidate_token_debug": candidate_debug or [],
|
| 131 |
-
"matcher_debug": None,
|
| 132 |
-
}
|
| 133 |
-
return payload
|
| 134 |
-
|
| 135 |
-
match = self.user_matcher.extract(
|
| 136 |
-
text=candidate_text,
|
| 137 |
-
date_phrase=date_phrase,
|
| 138 |
-
excluded_phrases=[supplier_phrase] if supplier_phrase else None,
|
| 139 |
-
debug=debug,
|
| 140 |
-
score_threshold=self.threshold,
|
| 141 |
-
combined_threshold=self.threshold,
|
| 142 |
-
)
|
| 143 |
-
|
| 144 |
-
payload = {
|
| 145 |
-
"user": match.get("supplier"),
|
| 146 |
-
"user_score": match.get("supplier_score"),
|
| 147 |
-
"matched_user_phrase": match.get("matched_supplier_phrase"),
|
| 148 |
-
}
|
| 149 |
|
| 150 |
if debug:
|
| 151 |
payload["user_debug"] = {
|
| 152 |
-
"mode": "
|
| 153 |
"threshold": self.threshold,
|
| 154 |
"rules": {
|
| 155 |
"min_lexical_support": self.MIN_LEXICAL_SUPPORT,
|
|
@@ -161,7 +198,7 @@ class ExpenseUserExtractor:
|
|
| 161 |
"candidate_text": candidate_text,
|
| 162 |
"candidate_tokens": candidate_tokens,
|
| 163 |
"candidate_token_debug": candidate_debug or [],
|
| 164 |
-
"matcher_debug":
|
| 165 |
}
|
| 166 |
|
| 167 |
return payload
|
|
|
|
| 36 |
has_person_grammeme = False
|
| 37 |
if self.morph is not None:
|
| 38 |
parses = self.morph.parse(token)
|
| 39 |
+
if parses:
|
| 40 |
+
has_person_grammeme = bool(
|
| 41 |
+
{"Name", "Surn", "Patr"}.intersection(set(parses[0].tag.grammemes))
|
| 42 |
+
)
|
| 43 |
|
| 44 |
# Сохраняем низкий порог для имён, но не пропускаем нарицательные слова.
|
| 45 |
accepted = lexical >= self.MIN_LEXICAL_SUPPORT or (
|
|
|
|
| 79 |
|
| 80 |
return " ".join(candidate_tokens), candidate_tokens, candidate_debug
|
| 81 |
|
| 82 |
+
def _match_user_from_candidates(
|
| 83 |
+
self,
|
| 84 |
+
candidate_tokens: list[str],
|
| 85 |
+
include_debug: bool = False,
|
| 86 |
+
) -> tuple[dict[str, Any], dict[str, Any] | None]:
|
| 87 |
+
phrases: list[str] = []
|
| 88 |
+
seen: set[str] = set()
|
| 89 |
+
max_words = self.user_matcher.max_words
|
| 90 |
+
for i in range(len(candidate_tokens)):
|
| 91 |
+
for j in range(i + 1, min(i + 1 + max_words, len(candidate_tokens) + 1)):
|
| 92 |
+
phrase = " ".join(candidate_tokens[i:j])
|
| 93 |
+
if phrase not in seen:
|
| 94 |
+
seen.add(phrase)
|
| 95 |
+
phrases.append(phrase)
|
| 96 |
+
|
| 97 |
+
best_row: dict[str, Any] | None = None
|
| 98 |
+
debug_rows: list[dict[str, Any]] = []
|
| 99 |
+
for phrase in phrases:
|
| 100 |
+
row = self.user_matcher.score_phrase(phrase)
|
| 101 |
+
score = float(row.get("score", -1.0))
|
| 102 |
+
support = self.user_matcher.lexical_support(phrase)
|
| 103 |
+
combined = 0.75 * score + 0.25 * support
|
| 104 |
+
|
| 105 |
+
if include_debug:
|
| 106 |
+
debug_rows.append({
|
| 107 |
+
"phrase": phrase,
|
| 108 |
+
"supplier": row.get("supplier"),
|
| 109 |
+
"score": round(score, 4),
|
| 110 |
+
"support": round(support, 4),
|
| 111 |
+
"combined": round(combined, 4),
|
| 112 |
+
})
|
| 113 |
+
|
| 114 |
+
if score >= self.threshold or combined >= self.threshold:
|
| 115 |
+
enriched = {
|
| 116 |
+
"user": row.get("supplier"),
|
| 117 |
+
"user_score": round(score, 4) if score >= 0 else None,
|
| 118 |
+
"matched_user_phrase": phrase,
|
| 119 |
+
"combined": combined,
|
| 120 |
+
}
|
| 121 |
+
if best_row is None or combined > float(best_row.get("combined", -1.0)):
|
| 122 |
+
best_row = enriched
|
| 123 |
+
|
| 124 |
+
if best_row is None:
|
| 125 |
+
match_payload = {
|
| 126 |
+
"user": None,
|
| 127 |
+
"user_score": None,
|
| 128 |
+
"matched_user_phrase": None,
|
| 129 |
+
}
|
| 130 |
+
else:
|
| 131 |
+
match_payload = {
|
| 132 |
+
"user": best_row.get("user"),
|
| 133 |
+
"user_score": best_row.get("user_score"),
|
| 134 |
+
"matched_user_phrase": best_row.get("matched_user_phrase"),
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
match_debug = None
|
| 138 |
+
if include_debug:
|
| 139 |
+
match_debug = {
|
| 140 |
+
"phrases_count": len(phrases),
|
| 141 |
+
"score_threshold": self.threshold,
|
| 142 |
+
"combined_threshold": self.threshold,
|
| 143 |
+
"top_candidates": sorted(debug_rows, key=lambda item: item["combined"], reverse=True)[:8],
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
return match_payload, match_debug
|
| 147 |
+
|
| 148 |
def extract(
|
| 149 |
self,
|
| 150 |
text: str,
|
|
|
|
| 180 |
"user_score": None,
|
| 181 |
"matched_user_phrase": None,
|
| 182 |
}
|
| 183 |
+
match_debug = None
|
| 184 |
+
else:
|
| 185 |
+
payload, match_debug = self._match_user_from_candidates(candidate_tokens, include_debug=debug)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
if debug:
|
| 188 |
payload["user_debug"] = {
|
| 189 |
+
"mode": "user-matcher",
|
| 190 |
"threshold": self.threshold,
|
| 191 |
"rules": {
|
| 192 |
"min_lexical_support": self.MIN_LEXICAL_SUPPORT,
|
|
|
|
| 198 |
"candidate_text": candidate_text,
|
| 199 |
"candidate_tokens": candidate_tokens,
|
| 200 |
"candidate_token_debug": candidate_debug or [],
|
| 201 |
+
"matcher_debug": match_debug,
|
| 202 |
}
|
| 203 |
|
| 204 |
return payload
|