Spaces:
Sleeping
Sleeping
Vlad Juracovschi committed on
Commit ·
8b892b9
1
Parent(s): 81b7609
OptimizedUserSearch
Browse files: extractors/user_extractor.py +59 -1
extractors/user_extractor.py
CHANGED
|
@@ -22,6 +22,38 @@ class ExpenseUserExtractor:
|
|
| 22 |
self.threshold = threshold
|
| 23 |
self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
|
| 24 |
self.user_matcher = ExpenseSupplierExtractor(suppliers=users)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
def extract(
|
| 27 |
self,
|
|
@@ -45,8 +77,32 @@ class ExpenseUserExtractor:
|
|
| 45 |
}
|
| 46 |
return payload
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
match = self.user_matcher.extract(
|
| 49 |
-
text=
|
| 50 |
date_phrase=date_phrase,
|
| 51 |
excluded_phrases=[supplier_phrase] if supplier_phrase else None,
|
| 52 |
debug=debug,
|
|
@@ -66,6 +122,8 @@ class ExpenseUserExtractor:
|
|
| 66 |
"threshold": self.threshold,
|
| 67 |
"excluded_supplier_phrase": supplier_phrase,
|
| 68 |
"normalized_text": normalized_text,
|
|
|
|
|
|
|
| 69 |
"matcher_debug": match.get("supplier_debug"),
|
| 70 |
}
|
| 71 |
|
|
|
|
| 22 |
self.threshold = threshold
|
| 23 |
self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
|
| 24 |
self.user_matcher = ExpenseSupplierExtractor(suppliers=users)
|
| 25 |
+
self.user_terms = {
|
| 26 |
+
token
|
| 27 |
+
for user in users
|
| 28 |
+
for token in normalize_text(user).split()
|
| 29 |
+
if token and len(token) > 1 and not token.isdigit()
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
def _is_user_like_token(self, token: str, similarity_threshold: float = 0.50) -> bool:
|
| 33 |
+
if token in self.user_terms:
|
| 34 |
+
return True
|
| 35 |
+
return self.user_matcher.lexical_support(token) >= similarity_threshold
|
| 36 |
+
|
| 37 |
+
def _build_user_candidate_text(
|
| 38 |
+
self,
|
| 39 |
+
normalized_text: str,
|
| 40 |
+
supplier_phrase: str | None,
|
| 41 |
+
date_phrase: str | None,
|
| 42 |
+
) -> tuple[str, list[str]]:
|
| 43 |
+
excluded_tokens: set[str] = set(self.user_matcher.noise_terms)
|
| 44 |
+
if supplier_phrase:
|
| 45 |
+
excluded_tokens.update(normalize_text(supplier_phrase).split())
|
| 46 |
+
if date_phrase:
|
| 47 |
+
excluded_tokens.update(normalize_text(date_phrase).split())
|
| 48 |
+
|
| 49 |
+
user_like_tokens: list[str] = []
|
| 50 |
+
for token in normalized_text.split():
|
| 51 |
+
if token in excluded_tokens or token.isdigit() or len(token) <= 1:
|
| 52 |
+
continue
|
| 53 |
+
if self._is_user_like_token(token):
|
| 54 |
+
user_like_tokens.append(token)
|
| 55 |
+
|
| 56 |
+
return " ".join(user_like_tokens), user_like_tokens
|
| 57 |
|
| 58 |
def extract(
|
| 59 |
self,
|
|
|
|
| 77 |
}
|
| 78 |
return payload
|
| 79 |
|
| 80 |
+
candidate_text, user_like_tokens = self._build_user_candidate_text(
|
| 81 |
+
normalized_text=normalized_text,
|
| 82 |
+
supplier_phrase=supplier_phrase,
|
| 83 |
+
date_phrase=date_phrase,
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
if not candidate_text:
|
| 87 |
+
payload = {
|
| 88 |
+
"user": None,
|
| 89 |
+
"user_score": None,
|
| 90 |
+
"matched_user_phrase": None,
|
| 91 |
+
}
|
| 92 |
+
if debug:
|
| 93 |
+
payload["user_debug"] = {
|
| 94 |
+
"mode": "supplier-matcher",
|
| 95 |
+
"threshold": self.threshold,
|
| 96 |
+
"excluded_supplier_phrase": supplier_phrase,
|
| 97 |
+
"normalized_text": normalized_text,
|
| 98 |
+
"candidate_text": candidate_text,
|
| 99 |
+
"user_like_tokens": user_like_tokens,
|
| 100 |
+
"matcher_debug": None,
|
| 101 |
+
}
|
| 102 |
+
return payload
|
| 103 |
+
|
| 104 |
match = self.user_matcher.extract(
|
| 105 |
+
text=candidate_text,
|
| 106 |
date_phrase=date_phrase,
|
| 107 |
excluded_phrases=[supplier_phrase] if supplier_phrase else None,
|
| 108 |
debug=debug,
|
|
|
|
| 122 |
"threshold": self.threshold,
|
| 123 |
"excluded_supplier_phrase": supplier_phrase,
|
| 124 |
"normalized_text": normalized_text,
|
| 125 |
+
"candidate_text": candidate_text,
|
| 126 |
+
"user_like_tokens": user_like_tokens,
|
| 127 |
"matcher_debug": match.get("supplier_debug"),
|
| 128 |
}
|
| 129 |
|