VladGeekPro committed on
Commit
cf7b5c2
·
1 Parent(s): 027e68b

AgainFixedUserAndSupplierProblem

Browse files
extractors/supplier_extractor.py CHANGED
@@ -296,6 +296,8 @@ class ExpenseSupplierExtractor:
296
  date_phrase: str | None = None,
297
  excluded_phrases: list[str] | None = None,
298
  debug: bool = False,
 
 
299
  ) -> dict[str, Any]:
300
  """
301
  Извлекает поставщика из текста.
@@ -305,11 +307,12 @@ class ExpenseSupplierExtractor:
305
  date_phrase: Фраза даты для исключения
306
  excluded_phrases: Дополнительные фразы для исключения
307
  debug: Включить отладочную информацию
 
 
308
 
309
  Returns:
310
  Словарь с supplier, supplier_score, matched_supplier_phrase
311
  """
312
- threshold = 0.50
313
  excluded_tokens: set[str] = set()
314
  if date_phrase:
315
  excluded_tokens.update(normalize_text(date_phrase).split())
@@ -333,7 +336,7 @@ class ExpenseSupplierExtractor:
333
  if len(token) > 1:
334
  tokens.append(token)
335
 
336
- tokens = [t for t in tokens if len(t) > 1 and t not in excluded_tokens]
337
 
338
  phrases: list[str] = []
339
  seen: set[str] = set()
@@ -367,7 +370,7 @@ class ExpenseSupplierExtractor:
367
  })
368
 
369
  enriched = {**row, "combined": combined}
370
- passes = score >= threshold or combined >= 0.48
371
  if passes and (supplier not in best_by_supplier or combined > float(best_by_supplier[supplier].get("combined", -1.0))):
372
  best_by_supplier[supplier] = enriched
373
 
@@ -405,6 +408,8 @@ class ExpenseSupplierExtractor:
405
  "tokens": tokens,
406
  "phrases_count": len(phrases),
407
  "excluded_tokens": sorted(excluded_tokens)[:80],
 
 
408
  "top_candidates": top_candidates,
409
  }
410
 
 
296
  date_phrase: str | None = None,
297
  excluded_phrases: list[str] | None = None,
298
  debug: bool = False,
299
+ score_threshold: float = 0.50,
300
+ combined_threshold: float = 0.48,
301
  ) -> dict[str, Any]:
302
  """
303
  Извлекает поставщика из текста.
 
307
  date_phrase: Фраза даты для исключения
308
  excluded_phrases: Дополнительные фразы для исключения
309
  debug: Включить отладочную информацию
310
+ score_threshold: Минимальный raw-score для принятия совпадения
311
+ combined_threshold: Минимальный combined-score для принятия совпадения
312
 
313
  Returns:
314
  Словарь с supplier, supplier_score, matched_supplier_phrase
315
  """
 
316
  excluded_tokens: set[str] = set()
317
  if date_phrase:
318
  excluded_tokens.update(normalize_text(date_phrase).split())
 
336
  if len(token) > 1:
337
  tokens.append(token)
338
 
339
+ tokens = [t for t in tokens if (len(t) > 1 or t.isdigit()) and t not in excluded_tokens]
340
 
341
  phrases: list[str] = []
342
  seen: set[str] = set()
 
370
  })
371
 
372
  enriched = {**row, "combined": combined}
373
+ passes = score >= score_threshold or combined >= combined_threshold
374
  if passes and (supplier not in best_by_supplier or combined > float(best_by_supplier[supplier].get("combined", -1.0))):
375
  best_by_supplier[supplier] = enriched
376
 
 
408
  "tokens": tokens,
409
  "phrases_count": len(phrases),
410
  "excluded_tokens": sorted(excluded_tokens)[:80],
411
+ "score_threshold": score_threshold,
412
+ "combined_threshold": combined_threshold,
413
  "top_candidates": top_candidates,
414
  }
415
 
extractors/user_extractor.py CHANGED
@@ -50,13 +50,14 @@ class ExpenseUserExtractor:
50
  date_phrase=date_phrase,
51
  excluded_phrases=[supplier_phrase] if supplier_phrase else None,
52
  debug=debug,
 
 
53
  )
54
 
55
- score = match.get("supplier_score")
56
  payload = {
57
- "user": match.get("supplier") if score is not None and score >= self.threshold else None,
58
- "user_score": score if score is not None and score >= self.threshold else None,
59
- "matched_user_phrase": match.get("matched_supplier_phrase") if score is not None and score >= self.threshold else None,
60
  }
61
 
62
  if debug:
 
50
  date_phrase=date_phrase,
51
  excluded_phrases=[supplier_phrase] if supplier_phrase else None,
52
  debug=debug,
53
+ score_threshold=self.threshold,
54
+ combined_threshold=self.threshold,
55
  )
56
 
 
57
  payload = {
58
+ "user": match.get("supplier"),
59
+ "user_score": match.get("supplier_score"),
60
+ "matched_user_phrase": match.get("matched_supplier_phrase"),
61
  }
62
 
63
  if debug: