VladRet2026 committed on
Commit
bd1a487
·
1 Parent(s): 0780f6f

CreatedSepareatedUserSearch

Browse files
Files changed (2) hide show
  1. app.py +2 -2
  2. extractors/user_extractor.py +75 -38
app.py CHANGED
@@ -428,7 +428,7 @@ def health():
428
  @app.get("/test-data")
429
  def test_data():
430
  """Тестирует извлечение данных из текста без использования Whisper."""
431
- debug = (request.args.get("debug")).strip().lower() == "1"
432
  extractor = build_default_pipeline(suppliers=TEST_SUPPLIERS, users=TEST_USERS)
433
 
434
  started = time.time()
@@ -472,7 +472,7 @@ def process_audio():
472
 
473
  audio = request.files.get("audio")
474
  mode = (request.form.get("mode") or "expense").strip()
475
- debug = (request.args.get("debug") == "1" or "")
476
  context = parse_context(request.form.get("context"))
477
 
478
  if audio is None:
 
428
  @app.get("/test-data")
429
  def test_data():
430
  """Тестирует извлечение данных из текста без использования Whisper."""
431
+ debug = (request.args.get("debug") or "").strip().lower() == "1"
432
  extractor = build_default_pipeline(suppliers=TEST_SUPPLIERS, users=TEST_USERS)
433
 
434
  started = time.time()
 
472
 
473
  audio = request.files.get("audio")
474
  mode = (request.form.get("mode") or "expense").strip()
475
+ debug = (request.args.get("debug") or "") == "1"
476
  context = parse_context(request.form.get("context"))
477
 
478
  if audio is None:
extractors/user_extractor.py CHANGED
@@ -36,10 +36,10 @@ class ExpenseUserExtractor:
36
  has_person_grammeme = False
37
  if self.morph is not None:
38
  parses = self.morph.parse(token)
39
- has_person_grammeme = any(
40
- {"Name", "Surn", "Patr"}.intersection(set(parse.tag.grammemes))
41
- for parse in parses
42
- )
43
 
44
  # Сохраняем низкий порог для имён, но не пропускаем нарицательные слова.
45
  accepted = lexical >= self.MIN_LEXICAL_SUPPORT or (
@@ -79,6 +79,72 @@ class ExpenseUserExtractor:
79
 
80
  return " ".join(candidate_tokens), candidate_tokens, candidate_debug
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def extract(
83
  self,
84
  text: str,
@@ -114,42 +180,13 @@ class ExpenseUserExtractor:
114
  "user_score": None,
115
  "matched_user_phrase": None,
116
  }
117
- if debug:
118
- payload["user_debug"] = {
119
- "mode": "supplier-matcher",
120
- "threshold": self.threshold,
121
- "rules": {
122
- "min_lexical_support": self.MIN_LEXICAL_SUPPORT,
123
- "min_lexical_with_person_grammeme": self.MIN_LEXICAL_WITH_PERSON,
124
- "morph_enabled": self.morph is not None,
125
- },
126
- "excluded_supplier_phrase": supplier_phrase,
127
- "normalized_text": normalized_text,
128
- "candidate_text": candidate_text,
129
- "candidate_tokens": candidate_tokens,
130
- "candidate_token_debug": candidate_debug or [],
131
- "matcher_debug": None,
132
- }
133
- return payload
134
-
135
- match = self.user_matcher.extract(
136
- text=candidate_text,
137
- date_phrase=date_phrase,
138
- excluded_phrases=[supplier_phrase] if supplier_phrase else None,
139
- debug=debug,
140
- score_threshold=self.threshold,
141
- combined_threshold=self.threshold,
142
- )
143
-
144
- payload = {
145
- "user": match.get("supplier"),
146
- "user_score": match.get("supplier_score"),
147
- "matched_user_phrase": match.get("matched_supplier_phrase"),
148
- }
149
 
150
  if debug:
151
  payload["user_debug"] = {
152
- "mode": "supplier-matcher",
153
  "threshold": self.threshold,
154
  "rules": {
155
  "min_lexical_support": self.MIN_LEXICAL_SUPPORT,
@@ -161,7 +198,7 @@ class ExpenseUserExtractor:
161
  "candidate_text": candidate_text,
162
  "candidate_tokens": candidate_tokens,
163
  "candidate_token_debug": candidate_debug or [],
164
- "matcher_debug": match.get("supplier_debug"),
165
  }
166
 
167
  return payload
 
36
  has_person_grammeme = False
37
  if self.morph is not None:
38
  parses = self.morph.parse(token)
39
+ if parses:
40
+ has_person_grammeme = bool(
41
+ {"Name", "Surn", "Patr"}.intersection(set(parses[0].tag.grammemes))
42
+ )
43
 
44
  # Сохраняем низкий порог для имён, но не пропускаем нарицательные слова.
45
  accepted = lexical >= self.MIN_LEXICAL_SUPPORT or (
 
79
 
80
  return " ".join(candidate_tokens), candidate_tokens, candidate_debug
81
 
82
+ def _match_user_from_candidates(
83
+ self,
84
+ candidate_tokens: list[str],
85
+ include_debug: bool = False,
86
+ ) -> tuple[dict[str, Any], dict[str, Any] | None]:
87
+ phrases: list[str] = []
88
+ seen: set[str] = set()
89
+ max_words = self.user_matcher.max_words
90
+ for i in range(len(candidate_tokens)):
91
+ for j in range(i + 1, min(i + 1 + max_words, len(candidate_tokens) + 1)):
92
+ phrase = " ".join(candidate_tokens[i:j])
93
+ if phrase not in seen:
94
+ seen.add(phrase)
95
+ phrases.append(phrase)
96
+
97
+ best_row: dict[str, Any] | None = None
98
+ debug_rows: list[dict[str, Any]] = []
99
+ for phrase in phrases:
100
+ row = self.user_matcher.score_phrase(phrase)
101
+ score = float(row.get("score", -1.0))
102
+ support = self.user_matcher.lexical_support(phrase)
103
+ combined = 0.75 * score + 0.25 * support
104
+
105
+ if include_debug:
106
+ debug_rows.append({
107
+ "phrase": phrase,
108
+ "supplier": row.get("supplier"),
109
+ "score": round(score, 4),
110
+ "support": round(support, 4),
111
+ "combined": round(combined, 4),
112
+ })
113
+
114
+ if score >= self.threshold or combined >= self.threshold:
115
+ enriched = {
116
+ "user": row.get("supplier"),
117
+ "user_score": round(score, 4) if score >= 0 else None,
118
+ "matched_user_phrase": phrase,
119
+ "combined": combined,
120
+ }
121
+ if best_row is None or combined > float(best_row.get("combined", -1.0)):
122
+ best_row = enriched
123
+
124
+ if best_row is None:
125
+ match_payload = {
126
+ "user": None,
127
+ "user_score": None,
128
+ "matched_user_phrase": None,
129
+ }
130
+ else:
131
+ match_payload = {
132
+ "user": best_row.get("user"),
133
+ "user_score": best_row.get("user_score"),
134
+ "matched_user_phrase": best_row.get("matched_user_phrase"),
135
+ }
136
+
137
+ match_debug = None
138
+ if include_debug:
139
+ match_debug = {
140
+ "phrases_count": len(phrases),
141
+ "score_threshold": self.threshold,
142
+ "combined_threshold": self.threshold,
143
+ "top_candidates": sorted(debug_rows, key=lambda item: item["combined"], reverse=True)[:8],
144
+ }
145
+
146
+ return match_payload, match_debug
147
+
148
  def extract(
149
  self,
150
  text: str,
 
180
  "user_score": None,
181
  "matched_user_phrase": None,
182
  }
183
+ match_debug = None
184
+ else:
185
+ payload, match_debug = self._match_user_from_candidates(candidate_tokens, include_debug=debug)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  if debug:
188
  payload["user_debug"] = {
189
+ "mode": "user-matcher",
190
  "threshold": self.threshold,
191
  "rules": {
192
  "min_lexical_support": self.MIN_LEXICAL_SUPPORT,
 
198
  "candidate_text": candidate_text,
199
  "candidate_tokens": candidate_tokens,
200
  "candidate_token_debug": candidate_debug or [],
201
+ "matcher_debug": match_debug,
202
  }
203
 
204
  return payload