Commit 0780f6f
1 Parent(s): 44706f3
CheckOptimizedUserSearchWIthMorph

extractors/user_extractor.py CHANGED (+29 -13)
@@ -12,14 +12,15 @@ from extractors.supplier_extractor import ExpenseSupplierExtractor, normalize_te
 class ExpenseUserExtractor:
     """Looks up the user with the same fuzzy matcher as the supplier."""

+    MIN_LEXICAL_SUPPORT = 0.40
+    MIN_LEXICAL_WITH_PERSON = 0.30
+
     def __init__(
         self,
         users: list[str],
         suppliers: list[str],
-        model: Any = None,
         threshold: float = 0.25,
     ) -> None:
-        self.users = users
         self.threshold = threshold
         self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
         self.user_matcher = ExpenseSupplierExtractor(suppliers=users)
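The constructor no longer keeps a model argument or a self.users copy; the user list only feeds the internal fuzzy matcher. A minimal usage sketch with the new signature (the user and supplier values below are made up for illustration):

extractor = ExpenseUserExtractor(
    users=["Иванов Пётр", "Петрова Анна"],        # hypothetical user directory
    suppliers=["ООО Ромашка", "Яндекс Такси"],    # hypothetical supplier directory
    threshold=0.25,
)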
@@ -41,7 +42,9 @@
         )

         # Keep the low threshold for names, but do not let common nouns slip through.
-        accepted = lexical >=
+        accepted = lexical >= self.MIN_LEXICAL_SUPPORT or (
+            has_person_grammeme and lexical >= self.MIN_LEXICAL_WITH_PERSON
+        )
         return accepted, lexical, has_person_grammeme

     def _build_user_candidate_text(
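The acceptance rule is now two-tiered. A small self-contained sketch of the same logic, using the thresholds from the new class constants (the helper function below is only for illustration and is not part of the module):

def accept(lexical: float, has_person_grammeme: bool) -> bool:
    # A plain token needs lexical support of at least 0.40 (MIN_LEXICAL_SUPPORT);
    # a token the morphological analyzer tags with a person grammeme passes from 0.30.
    return lexical >= 0.40 or (has_person_grammeme and lexical >= 0.30)

assert accept(0.45, False)      # strong lexical support alone is enough
assert accept(0.32, True)       # weaker support passes only with a person grammeme
assert not accept(0.32, False)  # a common word without a person grammeme is rejected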
@@ -49,7 +52,8 @@
         normalized_text: str,
         supplier_phrase: str | None,
         date_phrase: str | None,
-
+        include_debug: bool = False,
+    ) -> tuple[str, list[str], list[dict[str, Any]] | None]:
         excluded_tokens: set[str] = set(self.user_matcher.noise_terms)
         if supplier_phrase:
             excluded_tokens.update(normalize_text(supplier_phrase).split())
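_build_user_candidate_text now takes include_debug and returns the per-token debug list (or None) as a third element. Assuming the surrounding code is otherwise unchanged, a call on the extractor from the earlier sketch would look roughly like this (the input strings are invented for illustration):

candidate_text, candidate_tokens, token_debug = extractor._build_user_candidate_text(
    normalized_text="оплата ивану петрову за такси",
    supplier_phrase="яндекс такси",
    date_phrase=None,
    include_debug=True,
)
# token_debug is a list of per-token dicts when include_debug=True, otherwise None.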
@@ -58,17 +62,18 @@
         excluded_tokens.update(self.supplier_terms)

         candidate_tokens: list[str] = []
-        candidate_debug: list[dict[str, Any]] = []
+        candidate_debug: list[dict[str, Any]] | None = [] if include_debug else None
         for token in normalized_text.split():
             if token in excluded_tokens or token.isdigit() or len(token) <= 1:
                 continue
             accepted, lexical, has_person_grammeme = self._looks_like_person_token(token)
-            candidate_debug
-
-
-
-
-
+            if candidate_debug is not None:
+                candidate_debug.append({
+                    "token": token,
+                    "lexical_support": round(lexical, 4),
+                    "has_person_grammeme": has_person_grammeme,
+                    "accepted": accepted,
+                })
             if accepted:
                 candidate_tokens.append(token)

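When debug collection is on, each entry appended to candidate_debug has this shape (the values below are illustrative):

{
    "token": "петрову",
    "lexical_support": 0.3125,
    "has_person_grammeme": True,
    "accepted": True,
}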
@@ -100,6 +105,7 @@
             normalized_text=normalized_text,
             supplier_phrase=supplier_phrase,
             date_phrase=date_phrase,
+            include_debug=debug,
         )

         if not candidate_text:
@@ -112,11 +118,16 @@
            payload["user_debug"] = {
                "mode": "supplier-matcher",
                "threshold": self.threshold,
+                "rules": {
+                    "min_lexical_support": self.MIN_LEXICAL_SUPPORT,
+                    "min_lexical_with_person_grammeme": self.MIN_LEXICAL_WITH_PERSON,
+                    "morph_enabled": self.morph is not None,
+                },
                "excluded_supplier_phrase": supplier_phrase,
                "normalized_text": normalized_text,
                "candidate_text": candidate_text,
                "candidate_tokens": candidate_tokens,
-                "candidate_token_debug": candidate_debug,
+                "candidate_token_debug": candidate_debug or [],
                "matcher_debug": None,
            }
            return payload
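Taken together, when no candidate text survives the filtering, the debug payload now reports the active rules alongside the token-level trace. A rough sketch of the resulting structure (all values are illustrative and assume this is the early-return branch):

payload["user_debug"] = {
    "mode": "supplier-matcher",
    "threshold": 0.25,
    "rules": {
        "min_lexical_support": 0.40,
        "min_lexical_with_person_grammeme": 0.30,
        "morph_enabled": True,
    },
    "excluded_supplier_phrase": "яндекс такси",
    "normalized_text": "оплата яндекс такси 12 июня",
    "candidate_text": "",
    "candidate_tokens": [],
    "candidate_token_debug": [
        {"token": "оплата", "lexical_support": 0.0, "has_person_grammeme": False, "accepted": False},
    ],
    "matcher_debug": None,  # the match branch fills this from match.get("supplier_debug")
}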
@@ -140,11 +151,16 @@
            payload["user_debug"] = {
                "mode": "supplier-matcher",
                "threshold": self.threshold,
+                "rules": {
+                    "min_lexical_support": self.MIN_LEXICAL_SUPPORT,
+                    "min_lexical_with_person_grammeme": self.MIN_LEXICAL_WITH_PERSON,
+                    "morph_enabled": self.morph is not None,
+                },
                "excluded_supplier_phrase": supplier_phrase,
                "normalized_text": normalized_text,
                "candidate_text": candidate_text,
                "candidate_tokens": candidate_tokens,
-                "candidate_token_debug": candidate_debug,
+                "candidate_token_debug": candidate_debug or [],
                "matcher_debug": match.get("supplier_debug"),
            }
