Spaces:
Sleeping
Sleeping
VladGeekPro committed on
Commit ·
cf7b5c2
1
Parent(s): 027e68b
AgainFixedUserAndSupplierProblem
Browse files
extractors/supplier_extractor.py
CHANGED
|
@@ -296,6 +296,8 @@ class ExpenseSupplierExtractor:
|
|
| 296 |
date_phrase: str | None = None,
|
| 297 |
excluded_phrases: list[str] | None = None,
|
| 298 |
debug: bool = False,
|
|
|
|
|
|
|
| 299 |
) -> dict[str, Any]:
|
| 300 |
"""
|
| 301 |
Извлекает поставщика из текста.
|
|
@@ -305,11 +307,12 @@ class ExpenseSupplierExtractor:
|
|
| 305 |
date_phrase: Фраза даты для исключения
|
| 306 |
excluded_phrases: Дополнительные фразы для исключения
|
| 307 |
debug: Включить отладочную информацию
|
|
|
|
|
|
|
| 308 |
|
| 309 |
Returns:
|
| 310 |
Словарь с supplier, supplier_score, matched_supplier_phrase
|
| 311 |
"""
|
| 312 |
-
threshold = 0.50
|
| 313 |
excluded_tokens: set[str] = set()
|
| 314 |
if date_phrase:
|
| 315 |
excluded_tokens.update(normalize_text(date_phrase).split())
|
|
@@ -333,7 +336,7 @@ class ExpenseSupplierExtractor:
|
|
| 333 |
if len(token) > 1:
|
| 334 |
tokens.append(token)
|
| 335 |
|
| 336 |
-
tokens = [t for t in tokens if len(t) > 1 and t not in excluded_tokens]
|
| 337 |
|
| 338 |
phrases: list[str] = []
|
| 339 |
seen: set[str] = set()
|
|
@@ -367,7 +370,7 @@ class ExpenseSupplierExtractor:
|
|
| 367 |
})
|
| 368 |
|
| 369 |
enriched = {**row, "combined": combined}
|
| 370 |
-
passes = score >=
|
| 371 |
if passes and (supplier not in best_by_supplier or combined > float(best_by_supplier[supplier].get("combined", -1.0))):
|
| 372 |
best_by_supplier[supplier] = enriched
|
| 373 |
|
|
@@ -405,6 +408,8 @@ class ExpenseSupplierExtractor:
|
|
| 405 |
"tokens": tokens,
|
| 406 |
"phrases_count": len(phrases),
|
| 407 |
"excluded_tokens": sorted(excluded_tokens)[:80],
|
|
|
|
|
|
|
| 408 |
"top_candidates": top_candidates,
|
| 409 |
}
|
| 410 |
|
|
|
|
| 296 |
date_phrase: str | None = None,
|
| 297 |
excluded_phrases: list[str] | None = None,
|
| 298 |
debug: bool = False,
|
| 299 |
+
score_threshold: float = 0.50,
|
| 300 |
+
combined_threshold: float = 0.48,
|
| 301 |
) -> dict[str, Any]:
|
| 302 |
"""
|
| 303 |
Извлекает поставщика из текста.
|
|
|
|
| 307 |
date_phrase: Фраза даты для исключения
|
| 308 |
excluded_phrases: Дополнительные фразы для исключения
|
| 309 |
debug: Включить отладочную информацию
|
| 310 |
+
score_threshold: Минимальный raw-score для принятия совпадения
|
| 311 |
+
combined_threshold: Минимальный combined-score для принятия совпадения
|
| 312 |
|
| 313 |
Returns:
|
| 314 |
Словарь с supplier, supplier_score, matched_supplier_phrase
|
| 315 |
"""
|
|
|
|
| 316 |
excluded_tokens: set[str] = set()
|
| 317 |
if date_phrase:
|
| 318 |
excluded_tokens.update(normalize_text(date_phrase).split())
|
|
|
|
| 336 |
if len(token) > 1:
|
| 337 |
tokens.append(token)
|
| 338 |
|
| 339 |
+
tokens = [t for t in tokens if (len(t) > 1 or t.isdigit()) and t not in excluded_tokens]
|
| 340 |
|
| 341 |
phrases: list[str] = []
|
| 342 |
seen: set[str] = set()
|
|
|
|
| 370 |
})
|
| 371 |
|
| 372 |
enriched = {**row, "combined": combined}
|
| 373 |
+
passes = score >= score_threshold or combined >= combined_threshold
|
| 374 |
if passes and (supplier not in best_by_supplier or combined > float(best_by_supplier[supplier].get("combined", -1.0))):
|
| 375 |
best_by_supplier[supplier] = enriched
|
| 376 |
|
|
|
|
| 408 |
"tokens": tokens,
|
| 409 |
"phrases_count": len(phrases),
|
| 410 |
"excluded_tokens": sorted(excluded_tokens)[:80],
|
| 411 |
+
"score_threshold": score_threshold,
|
| 412 |
+
"combined_threshold": combined_threshold,
|
| 413 |
"top_candidates": top_candidates,
|
| 414 |
}
|
| 415 |
|
extractors/user_extractor.py
CHANGED
|
@@ -50,13 +50,14 @@ class ExpenseUserExtractor:
|
|
| 50 |
date_phrase=date_phrase,
|
| 51 |
excluded_phrases=[supplier_phrase] if supplier_phrase else None,
|
| 52 |
debug=debug,
|
|
|
|
|
|
|
| 53 |
)
|
| 54 |
|
| 55 |
-
score = match.get("supplier_score")
|
| 56 |
payload = {
|
| 57 |
-
"user": match.get("supplier")
|
| 58 |
-
"user_score":
|
| 59 |
-
"matched_user_phrase": match.get("matched_supplier_phrase")
|
| 60 |
}
|
| 61 |
|
| 62 |
if debug:
|
|
|
|
| 50 |
date_phrase=date_phrase,
|
| 51 |
excluded_phrases=[supplier_phrase] if supplier_phrase else None,
|
| 52 |
debug=debug,
|
| 53 |
+
score_threshold=self.threshold,
|
| 54 |
+
combined_threshold=self.threshold,
|
| 55 |
)
|
| 56 |
|
|
|
|
| 57 |
payload = {
|
| 58 |
+
"user": match.get("supplier"),
|
| 59 |
+
"user_score": match.get("supplier_score"),
|
| 60 |
+
"matched_user_phrase": match.get("matched_supplier_phrase"),
|
| 61 |
}
|
| 62 |
|
| 63 |
if debug:
|