VladRet2026 committed on
Commit
0780f6f
·
1 Parent(s): 44706f3

CheckOptimizedUserSearchWIthMorph

Browse files
Files changed (1) hide show
  1. extractors/user_extractor.py +29 -13
extractors/user_extractor.py CHANGED
@@ -12,14 +12,15 @@ from extractors.supplier_extractor import ExpenseSupplierExtractor, normalize_te
12
  class ExpenseUserExtractor:
13
  """Ищет пользователя тем же fuzzy-matcher, что и поставщика."""
14
 
 
 
 
15
  def __init__(
16
  self,
17
  users: list[str],
18
  suppliers: list[str],
19
- model: Any = None,
20
  threshold: float = 0.25,
21
  ) -> None:
22
- self.users = users
23
  self.threshold = threshold
24
  self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
25
  self.user_matcher = ExpenseSupplierExtractor(suppliers=users)
@@ -41,7 +42,9 @@ class ExpenseUserExtractor:
41
  )
42
 
43
  # Сохраняем низкий порог для имён, но не пропускаем нарицательные слова.
44
- accepted = lexical >= 0.40 or (has_person_grammeme and lexical >= 0.30)
 
 
45
  return accepted, lexical, has_person_grammeme
46
 
47
  def _build_user_candidate_text(
@@ -49,7 +52,8 @@ class ExpenseUserExtractor:
49
  normalized_text: str,
50
  supplier_phrase: str | None,
51
  date_phrase: str | None,
52
- ) -> tuple[str, list[str], list[dict[str, Any]]]:
 
53
  excluded_tokens: set[str] = set(self.user_matcher.noise_terms)
54
  if supplier_phrase:
55
  excluded_tokens.update(normalize_text(supplier_phrase).split())
@@ -58,17 +62,18 @@ class ExpenseUserExtractor:
58
  excluded_tokens.update(self.supplier_terms)
59
 
60
  candidate_tokens: list[str] = []
61
- candidate_debug: list[dict[str, Any]] = []
62
  for token in normalized_text.split():
63
  if token in excluded_tokens or token.isdigit() or len(token) <= 1:
64
  continue
65
  accepted, lexical, has_person_grammeme = self._looks_like_person_token(token)
66
- candidate_debug.append({
67
- "token": token,
68
- "lexical_support": round(lexical, 4),
69
- "has_person_grammeme": has_person_grammeme,
70
- "accepted": accepted,
71
- })
 
72
  if accepted:
73
  candidate_tokens.append(token)
74
 
@@ -100,6 +105,7 @@ class ExpenseUserExtractor:
100
  normalized_text=normalized_text,
101
  supplier_phrase=supplier_phrase,
102
  date_phrase=date_phrase,
 
103
  )
104
 
105
  if not candidate_text:
@@ -112,11 +118,16 @@ class ExpenseUserExtractor:
112
  payload["user_debug"] = {
113
  "mode": "supplier-matcher",
114
  "threshold": self.threshold,
 
 
 
 
 
115
  "excluded_supplier_phrase": supplier_phrase,
116
  "normalized_text": normalized_text,
117
  "candidate_text": candidate_text,
118
  "candidate_tokens": candidate_tokens,
119
- "candidate_token_debug": candidate_debug,
120
  "matcher_debug": None,
121
  }
122
  return payload
@@ -140,11 +151,16 @@ class ExpenseUserExtractor:
140
  payload["user_debug"] = {
141
  "mode": "supplier-matcher",
142
  "threshold": self.threshold,
 
 
 
 
 
143
  "excluded_supplier_phrase": supplier_phrase,
144
  "normalized_text": normalized_text,
145
  "candidate_text": candidate_text,
146
  "candidate_tokens": candidate_tokens,
147
- "candidate_token_debug": candidate_debug,
148
  "matcher_debug": match.get("supplier_debug"),
149
  }
150
 
 
12
  class ExpenseUserExtractor:
13
  """Ищет пользователя тем же fuzzy-matcher, что и поставщика."""
14
 
15
+ MIN_LEXICAL_SUPPORT = 0.40
16
+ MIN_LEXICAL_WITH_PERSON = 0.30
17
+
18
  def __init__(
19
  self,
20
  users: list[str],
21
  suppliers: list[str],
 
22
  threshold: float = 0.25,
23
  ) -> None:
 
24
  self.threshold = threshold
25
  self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
26
  self.user_matcher = ExpenseSupplierExtractor(suppliers=users)
 
42
  )
43
 
44
  # Сохраняем низкий порог для имён, но не пропускаем нарицательные слова.
45
+ accepted = lexical >= self.MIN_LEXICAL_SUPPORT or (
46
+ has_person_grammeme and lexical >= self.MIN_LEXICAL_WITH_PERSON
47
+ )
48
  return accepted, lexical, has_person_grammeme
49
 
50
  def _build_user_candidate_text(
 
52
  normalized_text: str,
53
  supplier_phrase: str | None,
54
  date_phrase: str | None,
55
+ include_debug: bool = False,
56
+ ) -> tuple[str, list[str], list[dict[str, Any]] | None]:
57
  excluded_tokens: set[str] = set(self.user_matcher.noise_terms)
58
  if supplier_phrase:
59
  excluded_tokens.update(normalize_text(supplier_phrase).split())
 
62
  excluded_tokens.update(self.supplier_terms)
63
 
64
  candidate_tokens: list[str] = []
65
+ candidate_debug: list[dict[str, Any]] | None = [] if include_debug else None
66
  for token in normalized_text.split():
67
  if token in excluded_tokens or token.isdigit() or len(token) <= 1:
68
  continue
69
  accepted, lexical, has_person_grammeme = self._looks_like_person_token(token)
70
+ if candidate_debug is not None:
71
+ candidate_debug.append({
72
+ "token": token,
73
+ "lexical_support": round(lexical, 4),
74
+ "has_person_grammeme": has_person_grammeme,
75
+ "accepted": accepted,
76
+ })
77
  if accepted:
78
  candidate_tokens.append(token)
79
 
 
105
  normalized_text=normalized_text,
106
  supplier_phrase=supplier_phrase,
107
  date_phrase=date_phrase,
108
+ include_debug=debug,
109
  )
110
 
111
  if not candidate_text:
 
118
  payload["user_debug"] = {
119
  "mode": "supplier-matcher",
120
  "threshold": self.threshold,
121
+ "rules": {
122
+ "min_lexical_support": self.MIN_LEXICAL_SUPPORT,
123
+ "min_lexical_with_person_grammeme": self.MIN_LEXICAL_WITH_PERSON,
124
+ "morph_enabled": self.morph is not None,
125
+ },
126
  "excluded_supplier_phrase": supplier_phrase,
127
  "normalized_text": normalized_text,
128
  "candidate_text": candidate_text,
129
  "candidate_tokens": candidate_tokens,
130
+ "candidate_token_debug": candidate_debug or [],
131
  "matcher_debug": None,
132
  }
133
  return payload
 
151
  payload["user_debug"] = {
152
  "mode": "supplier-matcher",
153
  "threshold": self.threshold,
154
+ "rules": {
155
+ "min_lexical_support": self.MIN_LEXICAL_SUPPORT,
156
+ "min_lexical_with_person_grammeme": self.MIN_LEXICAL_WITH_PERSON,
157
+ "morph_enabled": self.morph is not None,
158
+ },
159
  "excluded_supplier_phrase": supplier_phrase,
160
  "normalized_text": normalized_text,
161
  "candidate_text": candidate_text,
162
  "candidate_tokens": candidate_tokens,
163
+ "candidate_token_debug": candidate_debug or [],
164
  "matcher_debug": match.get("supplier_debug"),
165
  }
166