Vlad Juracovschi committed on
Commit
8b892b9
·
1 Parent(s): 81b7609

OptimizedUserSearch

Browse files
Files changed (1) hide show
  1. extractors/user_extractor.py +59 -1
extractors/user_extractor.py CHANGED
@@ -22,6 +22,38 @@ class ExpenseUserExtractor:
22
  self.threshold = threshold
23
  self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
24
  self.user_matcher = ExpenseSupplierExtractor(suppliers=users)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  def extract(
27
  self,
@@ -45,8 +77,32 @@ class ExpenseUserExtractor:
45
  }
46
  return payload
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  match = self.user_matcher.extract(
49
- text=text,
50
  date_phrase=date_phrase,
51
  excluded_phrases=[supplier_phrase] if supplier_phrase else None,
52
  debug=debug,
@@ -66,6 +122,8 @@ class ExpenseUserExtractor:
66
  "threshold": self.threshold,
67
  "excluded_supplier_phrase": supplier_phrase,
68
  "normalized_text": normalized_text,
 
 
69
  "matcher_debug": match.get("supplier_debug"),
70
  }
71
 
 
22
  self.threshold = threshold
23
  self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
24
  self.user_matcher = ExpenseSupplierExtractor(suppliers=users)
25
+ self.user_terms = {
26
+ token
27
+ for user in users
28
+ for token in normalize_text(user).split()
29
+ if token and len(token) > 1 and not token.isdigit()
30
+ }
31
+
32
+ def _is_user_like_token(self, token: str, similarity_threshold: float = 0.50) -> bool:
33
+ if token in self.user_terms:
34
+ return True
35
+ return self.user_matcher.lexical_support(token) >= similarity_threshold
36
+
37
+ def _build_user_candidate_text(
38
+ self,
39
+ normalized_text: str,
40
+ supplier_phrase: str | None,
41
+ date_phrase: str | None,
42
+ ) -> tuple[str, list[str]]:
43
+ excluded_tokens: set[str] = set(self.user_matcher.noise_terms)
44
+ if supplier_phrase:
45
+ excluded_tokens.update(normalize_text(supplier_phrase).split())
46
+ if date_phrase:
47
+ excluded_tokens.update(normalize_text(date_phrase).split())
48
+
49
+ user_like_tokens: list[str] = []
50
+ for token in normalized_text.split():
51
+ if token in excluded_tokens or token.isdigit() or len(token) <= 1:
52
+ continue
53
+ if self._is_user_like_token(token):
54
+ user_like_tokens.append(token)
55
+
56
+ return " ".join(user_like_tokens), user_like_tokens
57
 
58
  def extract(
59
  self,
 
77
  }
78
  return payload
79
 
80
+ candidate_text, user_like_tokens = self._build_user_candidate_text(
81
+ normalized_text=normalized_text,
82
+ supplier_phrase=supplier_phrase,
83
+ date_phrase=date_phrase,
84
+ )
85
+
86
+ if not candidate_text:
87
+ payload = {
88
+ "user": None,
89
+ "user_score": None,
90
+ "matched_user_phrase": None,
91
+ }
92
+ if debug:
93
+ payload["user_debug"] = {
94
+ "mode": "supplier-matcher",
95
+ "threshold": self.threshold,
96
+ "excluded_supplier_phrase": supplier_phrase,
97
+ "normalized_text": normalized_text,
98
+ "candidate_text": candidate_text,
99
+ "user_like_tokens": user_like_tokens,
100
+ "matcher_debug": None,
101
+ }
102
+ return payload
103
+
104
  match = self.user_matcher.extract(
105
+ text=candidate_text,
106
  date_phrase=date_phrase,
107
  excluded_phrases=[supplier_phrase] if supplier_phrase else None,
108
  debug=debug,
 
122
  "threshold": self.threshold,
123
  "excluded_supplier_phrase": supplier_phrase,
124
  "normalized_text": normalized_text,
125
+ "candidate_text": candidate_text,
126
+ "user_like_tokens": user_like_tokens,
127
  "matcher_debug": match.get("supplier_debug"),
128
  }
129