Vlad Juracovschi committed on
Commit
44706f3
·
1 Parent(s): 8b892b9

OptimizedUserSearchWithMorph

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. extractors/user_extractor.py +41 -20
app.py CHANGED
@@ -472,7 +472,7 @@ def process_audio():
472
 
473
  audio = request.files.get("audio")
474
  mode = (request.form.get("mode") or "expense").strip()
475
- debug = ((request.form.get("debug") or request.args.get("debug") or "").strip().lower() in {"1", "true", "yes"})
476
  context = parse_context(request.form.get("context"))
477
 
478
  if audio is None:
 
472
 
473
  audio = request.files.get("audio")
474
  mode = (request.form.get("mode") or "expense").strip()
475
+ debug = (request.args.get("debug") == "1" or "")
476
  context = parse_context(request.form.get("context"))
477
 
478
  if audio is None:
extractors/user_extractor.py CHANGED
@@ -3,6 +3,7 @@
3
  from __future__ import annotations
4
 
5
  import re
 
6
  from typing import Any
7
 
8
  from extractors.supplier_extractor import ExpenseSupplierExtractor, normalize_text
@@ -22,38 +23,56 @@ class ExpenseUserExtractor:
22
  self.threshold = threshold
23
  self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
24
  self.user_matcher = ExpenseSupplierExtractor(suppliers=users)
25
- self.user_terms = {
26
- token
27
- for user in users
28
- for token in normalize_text(user).split()
29
- if token and len(token) > 1 and not token.isdigit()
30
- }
31
-
32
- def _is_user_like_token(self, token: str, similarity_threshold: float = 0.50) -> bool:
33
- if token in self.user_terms:
34
- return True
35
- return self.user_matcher.lexical_support(token) >= similarity_threshold
 
 
 
 
 
 
 
 
 
36
 
37
  def _build_user_candidate_text(
38
  self,
39
  normalized_text: str,
40
  supplier_phrase: str | None,
41
  date_phrase: str | None,
42
- ) -> tuple[str, list[str]]:
43
  excluded_tokens: set[str] = set(self.user_matcher.noise_terms)
44
  if supplier_phrase:
45
  excluded_tokens.update(normalize_text(supplier_phrase).split())
46
  if date_phrase:
47
  excluded_tokens.update(normalize_text(date_phrase).split())
 
48
 
49
- user_like_tokens: list[str] = []
 
50
  for token in normalized_text.split():
51
  if token in excluded_tokens or token.isdigit() or len(token) <= 1:
52
  continue
53
- if self._is_user_like_token(token):
54
- user_like_tokens.append(token)
55
-
56
- return " ".join(user_like_tokens), user_like_tokens
 
 
 
 
 
 
 
57
 
58
  def extract(
59
  self,
@@ -77,7 +96,7 @@ class ExpenseUserExtractor:
77
  }
78
  return payload
79
 
80
- candidate_text, user_like_tokens = self._build_user_candidate_text(
81
  normalized_text=normalized_text,
82
  supplier_phrase=supplier_phrase,
83
  date_phrase=date_phrase,
@@ -96,7 +115,8 @@ class ExpenseUserExtractor:
96
  "excluded_supplier_phrase": supplier_phrase,
97
  "normalized_text": normalized_text,
98
  "candidate_text": candidate_text,
99
- "user_like_tokens": user_like_tokens,
 
100
  "matcher_debug": None,
101
  }
102
  return payload
@@ -123,7 +143,8 @@ class ExpenseUserExtractor:
123
  "excluded_supplier_phrase": supplier_phrase,
124
  "normalized_text": normalized_text,
125
  "candidate_text": candidate_text,
126
- "user_like_tokens": user_like_tokens,
 
127
  "matcher_debug": match.get("supplier_debug"),
128
  }
129
 
 
3
  from __future__ import annotations
4
 
5
  import re
6
+ import importlib
7
  from typing import Any
8
 
9
  from extractors.supplier_extractor import ExpenseSupplierExtractor, normalize_text
 
23
  self.threshold = threshold
24
  self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
25
  self.user_matcher = ExpenseSupplierExtractor(suppliers=users)
26
+ self.morph: Any = None
27
+ try:
28
+ pymorphy3_module = importlib.import_module("pymorphy3")
29
+ self.morph = pymorphy3_module.MorphAnalyzer()
30
+ except Exception:
31
+ self.morph = None
32
+
33
+ def _looks_like_person_token(self, token: str) -> tuple[bool, float, bool]:
34
+ lexical = self.user_matcher.lexical_support(token)
35
+ has_person_grammeme = False
36
+ if self.morph is not None:
37
+ parses = self.morph.parse(token)
38
+ has_person_grammeme = any(
39
+ {"Name", "Surn", "Patr"}.intersection(set(parse.tag.grammemes))
40
+ for parse in parses
41
+ )
42
+
43
+ # Сохраняем низкий порог для имён, но не пропускаем нарицательные слова.
44
+ accepted = lexical >= 0.40 or (has_person_grammeme and lexical >= 0.30)
45
+ return accepted, lexical, has_person_grammeme
46
 
47
  def _build_user_candidate_text(
48
  self,
49
  normalized_text: str,
50
  supplier_phrase: str | None,
51
  date_phrase: str | None,
52
+ ) -> tuple[str, list[str], list[dict[str, Any]]]:
53
  excluded_tokens: set[str] = set(self.user_matcher.noise_terms)
54
  if supplier_phrase:
55
  excluded_tokens.update(normalize_text(supplier_phrase).split())
56
  if date_phrase:
57
  excluded_tokens.update(normalize_text(date_phrase).split())
58
+ excluded_tokens.update(self.supplier_terms)
59
 
60
+ candidate_tokens: list[str] = []
61
+ candidate_debug: list[dict[str, Any]] = []
62
  for token in normalized_text.split():
63
  if token in excluded_tokens or token.isdigit() or len(token) <= 1:
64
  continue
65
+ accepted, lexical, has_person_grammeme = self._looks_like_person_token(token)
66
+ candidate_debug.append({
67
+ "token": token,
68
+ "lexical_support": round(lexical, 4),
69
+ "has_person_grammeme": has_person_grammeme,
70
+ "accepted": accepted,
71
+ })
72
+ if accepted:
73
+ candidate_tokens.append(token)
74
+
75
+ return " ".join(candidate_tokens), candidate_tokens, candidate_debug
76
 
77
  def extract(
78
  self,
 
96
  }
97
  return payload
98
 
99
+ candidate_text, candidate_tokens, candidate_debug = self._build_user_candidate_text(
100
  normalized_text=normalized_text,
101
  supplier_phrase=supplier_phrase,
102
  date_phrase=date_phrase,
 
115
  "excluded_supplier_phrase": supplier_phrase,
116
  "normalized_text": normalized_text,
117
  "candidate_text": candidate_text,
118
+ "candidate_tokens": candidate_tokens,
119
+ "candidate_token_debug": candidate_debug,
120
  "matcher_debug": None,
121
  }
122
  return payload
 
143
  "excluded_supplier_phrase": supplier_phrase,
144
  "normalized_text": normalized_text,
145
  "candidate_text": candidate_text,
146
+ "candidate_tokens": candidate_tokens,
147
+ "candidate_token_debug": candidate_debug,
148
  "matcher_debug": match.get("supplier_debug"),
149
  }
150