VladGeekPro committed on
Commit 0bd0146 · 1 Parent(s): f654298

SolvedCommitConflicts

Files changed (1)
  1. app.py +0 -931
app.py CHANGED
@@ -62,937 +62,6 @@ def get_whisper_model() -> Any:
      return _WHISPER_MODEL


- <<<<<<< HEAD
- =======
- def normalize_text(text: str) -> str:
-     text = unicodedata.normalize("NFKD", text.lower())
-     text = "".join(ch for ch in text if not unicodedata.combining(ch))
-     return re.sub(r"[^\w\s]", "", text).strip()
-
-
- def tokenize_text(text: str) -> list[str]:
-     return normalize_text(text).split()
-
-
- def lemmatize_word(word: str) -> str:
-     return MORPH.parse(word)[0].normal_form if re.fullmatch(r"[а-я]+", word) else word
-
-
- def lemmatize_text(text: str) -> list[str]:
-     return [lemmatize_word(word) for word in tokenize_text(text)]
-
-
- def variants(text: str) -> list[str]:
-     base = normalize_text(text)
-     result = [base]
-
-     for schema in (iuliia.WIKIPEDIA, iuliia.MOSMETRO, iuliia.ALA_LC):
-         try:
-             v = normalize_text(schema.translate(base))
-             if v and v not in result:
-                 result.append(v)
-         except Exception:
-             pass
-
-     for v in list(result):
-         core = " ".join(w for w in v.split() if len(w) > 1 and any(ch.isalpha() for ch in w))
-         core = normalize_text(core)
-         if core and core not in result:
-             result.insert(0, core)
-
-     return result
-
-
- def token_alignment_score(phrase_variant: str, candidate_tokens: list[str]) -> float:
-     phrase_tokens = [t for t in phrase_variant.split() if len(t) > 2]
-     if not phrase_tokens or not candidate_tokens:
-         return 0.0
-     best_scores = []
-     for pt in phrase_tokens:
-         best = 0.0
-         for ct in candidate_tokens:
-             sim = Levenshtein.normalized_similarity(pt, ct)
-             if sim > best:
-                 best = sim
-         best_scores.append(best)
-     return sum(best_scores) / len(best_scores)
-
-
- def length_penalty(phrase_len: int, candidate_len: int) -> float:
-     if phrase_len == 0 or candidate_len == 0:
-         return 0.0
-     ratio = min(phrase_len, candidate_len) / max(phrase_len, candidate_len)
-     if ratio >= 0.80:
-         return 1.0
-     if ratio >= 0.60:
-         return 0.90
-     if ratio >= 0.40:
-         return 0.70
-     return 0.50
-
-
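For intuition: length_penalty maps the length ratio of a phrase/candidate pair to a score multiplier in four tiers. A minimal worked example (assuming the function above is importable from app.py):

    # ratio = min(phrase_len, candidate_len) / max(phrase_len, candidate_len)
    length_penalty(10, 10)  # ratio 1.00 -> 1.0 (no penalty)
    length_penalty(6, 10)   # ratio 0.60 -> 0.90
    length_penalty(4, 10)   # ratio 0.40 -> 0.70
    length_penalty(2, 10)   # ratio 0.20 -> 0.50 (strong length mismatch)
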
- def canonicalize_for_similarity(text: str) -> str:
-     t = normalize_text(text).replace(" ", "")
-     replacements = (
-         ("sch", "sh"),
-         ("tch", "ch"),
-         ("dzh", "j"),
-         ("zh", "j"),
-         ("sh", "s"),
-         ("ch", "c"),
-         ("kh", "h"),
-         ("ph", "f"),
-         ("ck", "k"),
-         ("qu", "k"),
-         ("q", "k"),
-         ("w", "v"),
-         ("x", "ks"),
-         ("ts", "z"),
-         ("tz", "z"),
-     )
-     for src, dst in replacements:
-         t = t.replace(src, dst)
-     return re.sub(r"(.)\1+", r"\1", t)
-
-
- def phonetic_similarity(left: str, right: str) -> float:
-     l = canonicalize_for_similarity(left)
-     r = canonicalize_for_similarity(right)
-     if not l or not r:
-         return 0.0
-     char = fuzz.ratio(l, r) / 100.0
-     lev = Levenshtein.normalized_similarity(l, r)
-     return 0.50 * char + 0.50 * lev
-
-
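canonicalize_for_similarity folds common Cyrillic-to-Latin transliteration digraphs and collapses doubled letters, so spelling variants of one name converge before comparison; phonetic_similarity then averages character-ratio and Levenshtein similarity over the canonical forms. A small sketch with hypothetical inputs:

    canonicalize_for_similarity("kauffland")  # doubled "ff" collapses -> "kaufland"
    canonicalize_for_similarity("zhuk")       # digraph "zh" -> "j", giving "juk"
    phonetic_similarity("zhuk", "juk")        # both canonicalize to "juk" -> 1.0
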
- @dataclass(frozen=True)
- class ParsedDate:
-     date_iso: str
-     matched_expression: Optional[str]
-
-
- @dataclass(frozen=True)
- class Token:
-     original: str
-     normalized: str
-     raw_lemma: str
-     lemma: str
-     lemma_correction: Optional[str]
-     start: int
-     end: int
-     lemma_start: int
-     lemma_end: int
-
-
- WORD_RE = re.compile(r"[0-9]+(?:[./-][0-9]+)*|[а-яё]+", re.IGNORECASE)
-
-
- class UniversalDateParser:
-     MONTHS = {
-         "январь": 1, "февраль": 2, "март": 3, "апрель": 4, "май": 5, "июнь": 6,
-         "июль": 7, "август": 8, "сентябрь": 9, "октябрь": 10, "ноябрь": 11, "декабрь": 12,
-     }
-     WEEKDAYS = {
-         "понедельник": 0, "вторник": 1, "среда": 2, "четверг": 3,
-         "пятница": 4, "суббота": 5, "воскресенье": 6,
-     }
-     DIRECT_RELATIVE = {"послезавтра": 2, "позавчера": -2, "сегодня": 0, "вчера": -1, "завтра": 1}
-     ORDINAL_DAYS = {
-         "первый": 1, "второй": 2, "третий": 3, "четвертый": 4, "пятый": 5, "шестой": 6,
-         "седьмой": 7, "восьмой": 8, "девятый": 9, "десятый": 10, "одиннадцатый": 11,
-         "двенадцатый": 12, "тринадцатый": 13, "четырнадцатый": 14, "пятнадцатый": 15,
-         "шестнадцатый": 16, "семнадцатый": 17, "восемнадцатый": 18, "девятнадцатый": 19,
-         "двадцатый": 20, "двадцать первый": 21, "двадцать второй": 22, "двадцать третий": 23,
-         "двадцать четвертый": 24, "двадцать пятый": 25, "двадцать шестой": 26,
-         "двадцать седьмой": 27, "двадцать восьмой": 28, "двадцать девятый": 29,
-         "тридцатый": 30, "тридцать первый": 31,
-     }
-     NUMBER_WORDS = {
-         "ноль": 0, "один": 1, "два": 2, "три": 3, "четыре": 4, "пять": 5, "шесть": 6,
-         "семь": 7, "восемь": 8, "девять": 9, "десять": 10, "одиннадцать": 11,
-         "двенадцать": 12, "тринадцать": 13, "четырнадцать": 14, "пятнадцать": 15,
-         "шестнадцать": 16, "семнадцать": 17, "восемнадцать": 18, "девятнадцать": 19,
-         "двадцать": 20, "тридцать": 30,
-     }
-     FUTURE_HINTS = ("завтра", "послезавтра", "через", "быть", "заплатить", "следующий", "последующий")
-     PAST_HINTS = ("вчера", "позавчера", "назад", "прошлый", "предыдущий", "оплатить", "купить", "заказать")
-
-     DIRECT_RELATIVE_RE = re.compile(r"(?<!\S)(послезавтра|позавчера|сегодня|вчера|завтра)(?!\S)")
-     WEEK_RELATIVE_RE = re.compile(
-         r"(?<!\S)на (?P<which>следующий|последующий|прошлый|предыдущий|этот) неделя"
-         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)"
-     )
-     QUANTITY_RELATIVE_RE = re.compile(
-         r"(?<!\S)(?P<number>\d+|[а-яё]+(?: [а-яё]+)?) "
-         r"(?P<unit>месяц|неделя|день) "
-         r"(?P<ago>назад)"
-         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
-         re.IGNORECASE,
-     )
-     FORWARD_QUANTITY_RE = re.compile(
-         r"(?<!\S)(?P<through>через) "
-         r"(?P<number>\d+|[а-яё]+(?: [а-яё]+)?) "
-         r"(?P<unit>месяц|неделя|день)"
-         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
-         re.IGNORECASE,
-     )
-     FORWARD_SINGLE_UNIT_RE = re.compile(
-         r"(?<!\S)(?P<through>через) "
-         r"(?P<unit>месяц|неделя|день)"
-         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
-         re.IGNORECASE,
-     )
-     TEXTUAL_ABSOLUTE_RE = re.compile(
-         r"(?<!\S)(?P<day>\d{1,2}|[а-яё]+(?: [а-яё]+)?) "
-         r"(?P<month>январь|февраль|март|апрель|май|июнь|июль|август|сентябрь|октябрь|ноябрь|декабрь)"
-         r"(?: (?P<year>\d{4}))?(?!\S)",
-         re.IGNORECASE,
-     )
-     PERIOD_EDGE_RE = re.compile(
-         r"(?<!\S)(?:в )?(?P<edge>начало|конец) (?P<which>этот|следующий|последующий|прошлый|предыдущий) (?P<unit>неделя|месяц)(?!\S)",
-         re.IGNORECASE,
-     )
-
-     @classmethod
-     def temporal_vocabulary(cls) -> set[str]:
-         vocab: set[str] = set()
-         vocab.update(cls.MONTHS)
-         vocab.update(cls.WEEKDAYS)
-         vocab.update(cls.DIRECT_RELATIVE)
-         vocab.update(cls.ORDINAL_DAYS)
-         vocab.update(cls.NUMBER_WORDS)
-         vocab.update({
-             "неделя", "месяц", "день", "назад", "через", "начало", "конец", "на", "в", "во",
-             "этот", "прошлый", "предыдущий", "следующий", "последующий",
-         })
-         return vocab
-
-     @staticmethod
-     def similarity(left: str, right: str) -> float:
-         return difflib.SequenceMatcher(None, left, right).ratio()
-
-     @classmethod
-     def pick_temporal_correction(cls, normalized: str, raw_lemma: str) -> tuple[str, Optional[str]]:
-         vocab = cls.temporal_vocabulary()
-         if raw_lemma in vocab or not normalized.isalpha() or len(normalized) < 5:
-             return raw_lemma, None
-
-         candidates = list(difflib.get_close_matches(normalized, list(vocab), n=4, cutoff=0.74))
-         candidates.extend(difflib.get_close_matches(raw_lemma, list(vocab), n=4, cutoff=0.74))
-         candidates = list(dict.fromkeys(candidates))
-         if not candidates:
-             return raw_lemma, None
-
-         best = max(candidates, key=lambda item: max(cls.similarity(normalized, item), cls.similarity(raw_lemma, item)))
-         best_score = max(cls.similarity(normalized, best), cls.similarity(raw_lemma, best))
-         return (best, f"{raw_lemma}->{best}") if best_score >= 0.80 else (raw_lemma, None)
-
-     @staticmethod
-     def normalize_word(word: str) -> str:
-         return word.lower().replace("ё", "е")
-
-     @classmethod
-     def lemmatize(cls, word: str) -> str:
-         return MORPH.parse(word)[0].normal_form if word.isalpha() else word
-
-     @classmethod
-     def tokenize(cls, text: str) -> list[Token]:
-         tokens: list[Token] = []
-         lemma_cursor = 0
-
-         for match in WORD_RE.finditer(text):
-             original = match.group(0)
-             normalized = cls.normalize_word(original)
-             raw_lemma = cls.lemmatize(normalized)
-             lemma, correction = cls.pick_temporal_correction(normalized, raw_lemma)
-             lemma_start = lemma_cursor
-             lemma_end = lemma_start + len(lemma)
-             tokens.append(Token(original, normalized, raw_lemma, lemma, correction, match.start(), match.end(), lemma_start, lemma_end))
-             lemma_cursor = lemma_end + 1
-
-         return tokens
-
-     @staticmethod
-     def lemma_text(tokens: list[Token]) -> str:
-         return " ".join(token.lemma for token in tokens)
-
-     @staticmethod
-     def surface_text(text: str, tokens: list[Token], start_idx: int, end_idx: int) -> str:
-         return text[tokens[start_idx].start:tokens[end_idx].end].strip() if tokens else ""
-
-     @staticmethod
-     def lemma_span_to_token_range(tokens: list[Token], span: tuple[int, int]) -> Optional[tuple[int, int]]:
-         start_char, end_char = span
-         start_idx = end_idx = None
-
-         for idx, token in enumerate(tokens):
-             if start_idx is None and token.lemma_start <= start_char < token.lemma_end:
-                 start_idx = idx
-             if token.lemma_start < end_char <= token.lemma_end:
-                 end_idx = idx
-                 break
-
-         return (start_idx, end_idx) if start_idx is not None and end_idx is not None else None
-
-     @classmethod
-     def make_parsed_date(cls, text: str, tokens: list[Token], match, parsed_date: date) -> Optional[ParsedDate]:
-         token_span = cls.lemma_span_to_token_range(tokens, match.span())
-         if token_span is None:
-             return None
-         return ParsedDate(parsed_date.isoformat(), cls.surface_text(text, tokens, token_span[0], token_span[1]))
-
-     @classmethod
-     def parse_number_phrase(cls, phrase: str) -> Optional[int]:
-         phrase = phrase.strip()
-         if not phrase:
-             return None
-         if phrase.isdigit():
-             return int(phrase)
-
-         parts = phrase.split()
-         if len(parts) == 1:
-             return cls.NUMBER_WORDS.get(parts[0])
-         if len(parts) == 2 and parts[0] in {"двадцать", "тридцать"}:
-             base = cls.NUMBER_WORDS.get(parts[0])
-             addon = cls.NUMBER_WORDS.get(parts[1])
-             if base is not None and addon is not None and 1 <= addon <= 9:
-                 return base + addon
-         return None
-
-     @classmethod
-     def parse_day_phrase(cls, phrase: str) -> Optional[int]:
-         if phrase.isdigit():
-             value = int(phrase)
-             return value if 1 <= value <= 31 else None
-         return cls.ORDINAL_DAYS.get(phrase.strip())
-
-     @staticmethod
-     def shift_months(value: date, months: int) -> date:
-         month_index = value.month - 1 + months
-         year = value.year + month_index // 12
-         month = month_index % 12 + 1
-         day = min(value.day, calendar.monthrange(year, month)[1])
-         return date(year, month, day)
-
-     @staticmethod
-     def parse_numeric_absolute(tokens: list[Token]) -> Optional[ParsedDate]:
-         for token in tokens:
-             separator = "." if "." in token.original else "-" if "-" in token.original else "/" if "/" in token.original else None
-             if separator is None:
-                 continue
-
-             parts = token.original.split(separator)
-             if len(parts) != 3 or not all(part.isdigit() for part in parts):
-                 continue
-
-             try:
-                 if len(parts[0]) == 4:
-                     parsed = date(int(parts[0]), int(parts[1]), int(parts[2]))
-                 elif len(parts[2]) == 4:
-                     parsed = date(int(parts[2]), int(parts[1]), int(parts[0]))
-                 else:
-                     continue
-                 return ParsedDate(parsed.isoformat(), token.original)
-             except ValueError:
-                 continue
-
-         return None
-
-     @classmethod
-     def parse_textual_absolute(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
-         lemma_text = cls.lemma_text(tokens)
-         for match in cls.TEXTUAL_ABSOLUTE_RE.finditer(lemma_text):
-             day = cls.parse_day_phrase(match.group("day"))
-             month = cls.MONTHS.get(match.group("month"))
-             if day is None or month is None:
-                 continue
-
-             year = int(match.group("year")) if match.group("year") else reference_date.year
-             try:
-                 parsed = date(year, month, day)
-             except ValueError:
-                 continue
-
-             result = cls.make_parsed_date(text, tokens, match, parsed)
-             if result is not None:
-                 return result
-
-         return None
-
-     @classmethod
-     def parse_direct_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
-         lemma_text = cls.lemma_text(tokens)
-         match = cls.DIRECT_RELATIVE_RE.search(lemma_text)
-         if not match:
-             return None
-
-         parsed = reference_date + timedelta(days=cls.DIRECT_RELATIVE[match.group(1)])
-         return cls.make_parsed_date(text, tokens, match, parsed)
-
-     @staticmethod
-     def week_monday(value: date) -> date:
-         return value - timedelta(days=value.weekday())
-
-     @classmethod
-     def parse_week_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
-         lemma_text = cls.lemma_text(tokens)
-         match = cls.WEEK_RELATIVE_RE.search(lemma_text)
-         if not match:
-             return None
-
-         offsets = {"следующий": 7, "последующий": 7, "прошлый": -7, "предыдущий": -7, "этот": 0}
-         anchor = reference_date + timedelta(days=offsets[match.group("which")])
-
-         if match.group("weekday"):
-             anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])
-
-         return cls.make_parsed_date(text, tokens, match, anchor)
-
-     @classmethod
-     def parse_period_edge(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
-         lemma_text = cls.lemma_text(tokens)
-         match = cls.PERIOD_EDGE_RE.search(lemma_text)
-         if not match:
-             return None
-
-         edge, which, unit = match.group("edge"), match.group("which"), match.group("unit")
-
-         if unit == "неделя":
-             offsets = {"прошлый": -7, "предыдущий": -7, "этот": 0, "следующий": 7, "последующий": 7}
-             monday = cls.week_monday(reference_date + timedelta(days=offsets[which]))
-             parsed_date = monday if edge == "начало" else monday + timedelta(days=6)
-         else:
-             month_offset = {"прошлый": -1, "предыдущий": -1, "этот": 0, "следующий": 1, "последующий": 1}[which]
-             shifted = cls.shift_months(date(reference_date.year, reference_date.month, 1), month_offset)
-             parsed_date = shifted if edge == "начало" else date(shifted.year, shifted.month, calendar.monthrange(shifted.year, shifted.month)[1])
-
-         return cls.make_parsed_date(text, tokens, match, parsed_date)
-
-     @classmethod
-     def parse_quantity_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
-         lemma_text = cls.lemma_text(tokens)
-
-         for regex, direction in ((cls.QUANTITY_RELATIVE_RE, -1), (cls.FORWARD_QUANTITY_RE, 1)):
-             for match in regex.finditer(lemma_text):
-                 number = cls.parse_number_phrase(match.group("number"))
-                 if number is None:
-                     continue
-
-                 unit = match.group("unit")
-                 if unit == "месяц":
-                     anchor = cls.shift_months(reference_date, direction * number)
-                 else:
-                     days = number * 7 if unit == "неделя" else number
-                     anchor = reference_date + timedelta(days=direction * days)
-
-                 if match.group("weekday"):
-                     anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])
-
-                 result = cls.make_parsed_date(text, tokens, match, anchor)
-                 if result is not None:
-                     return result
-
-         for match in cls.FORWARD_SINGLE_UNIT_RE.finditer(lemma_text):
-             unit = match.group("unit")
-             if unit == "месяц":
-                 anchor = cls.shift_months(reference_date, 1)
-             else:
-                 days = 7 if unit == "неделя" else 1
-                 anchor = reference_date + timedelta(days=days)
-
-             if match.group("weekday"):
-                 anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])
-
-             result = cls.make_parsed_date(text, tokens, match, anchor)
-             if result is not None:
-                 return result
-
-         return None
-
-     @classmethod
-     def preference_for_text(cls, tokens: list[Token]) -> str:
-         lemmas = [token.lemma for token in tokens]
-         future = sum(1 for hint in cls.FUTURE_HINTS if hint in lemmas)
-         past = sum(1 for hint in cls.PAST_HINTS if hint in lemmas)
-         return "future" if future > past else "past"
-
-     @staticmethod
-     def choose_best(matches: list[tuple[str, datetime]]) -> tuple[str, datetime]:
-         return sorted(matches, key=lambda item: (len(item[0]), -item[1].timestamp()), reverse=True)[0]
-
-     def parse(self, text: str, reference_date: date) -> Optional[ParsedDate]:
-         tokens = self.tokenize(text)
-
-         for parser in (
-             lambda: self.parse_numeric_absolute(tokens),
-             lambda: self.parse_textual_absolute(text, tokens, reference_date),
-             lambda: self.parse_direct_relative(text, tokens, reference_date),
-             lambda: self.parse_week_relative(text, tokens, reference_date),
-             lambda: self.parse_period_edge(text, tokens, reference_date),
-             lambda: self.parse_quantity_relative(text, tokens, reference_date),
-         ):
-             parsed = parser()
-             if parsed is not None:
-                 return parsed
-
-         normalized = " ".join(token.normalized for token in tokens)
-         relative_base = datetime.combine(reference_date, datetime.min.time()).replace(hour=12)
-         result = search_dates(
-             normalized,
-             languages=["ru"],
-             settings={
-                 "RELATIVE_BASE": relative_base,
-                 "PREFER_DATES_FROM": self.preference_for_text(tokens),
-                 "STRICT_PARSING": False,
-                 "REQUIRE_PARTS": [],
-                 "NORMALIZE": True,
-                 "RETURN_AS_TIMEZONE_AWARE": False,
-                 "DATE_ORDER": "DMY",
-             },
-         )
-
-         filtered: list[tuple[str, datetime]] = []
-         for matched, value in result or []:
-             if isinstance(value, datetime) and not matched.strip().isdigit() and 2020 <= value.year <= 2100:
-                 filtered.append((matched.strip(), value))
-
-         if not filtered:
-             return None
-
-         matched_expression, value = self.choose_best(filtered)
-         return ParsedDate(date_iso=value.date().isoformat(), matched_expression=matched_expression)
-
-
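One detail of UniversalDateParser worth calling out: shift_months clamps the day to the target month's length, so month arithmetic never raises on short months. Worked examples (the values follow directly from the code above):

    from datetime import date

    UniversalDateParser.shift_months(date(2025, 1, 31), 1)   # -> date(2025, 2, 28), day clamped
    UniversalDateParser.shift_months(date(2025, 1, 31), -2)  # -> date(2024, 11, 30)
    UniversalDateParser.shift_months(date(2025, 5, 15), 12)  # -> date(2026, 5, 15)
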
- class ExpenseDateExtractor:
-     def __init__(self) -> None:
-         self.parser = UniversalDateParser()
-
-     def extract(self, text: str, reference_date: str | date | None = None) -> dict[str, Any]:
-         ref_date = self.to_date(reference_date or date.today().isoformat())
-         parsed = self.parser.parse(text=text, reference_date=ref_date)
-
-         return {
-             "date": datetime.strptime(parsed.date_iso, "%Y-%m-%d").strftime("%d.%m.%Y") if parsed else None,
-             "date_iso": parsed.date_iso if parsed else None,
-             "matched_date_phrase": parsed.matched_expression if parsed else None,
-         }
-
-     @staticmethod
-     def to_date(value: str | date) -> date:
-         return value if isinstance(value, date) else datetime.strptime(value, "%Y-%m-%d").date()
-
-
- # Date parser backend: "natasha" (recommended) or "legacy"
- DATE_PARSER_MODE = os.getenv("DATE_PARSER_MODE", "natasha")
-
- def get_date_extractor():
-     """
-     Returns the date extractor.
-     - natasha: best for Russian (default)
-     - legacy: the old ExpenseDateExtractor code
-     """
-     if DATE_PARSER_MODE == "natasha":
-         return NatashaDateExtractor()
-     return ExpenseDateExtractor()
-
-
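A minimal usage sketch for the factory above. The phrase is hypothetical, and the commented result is what the legacy ExpenseDateExtractor path defined above would produce (DIRECT_RELATIVE maps "позавчера" to -2 days); NatashaDateExtractor is assumed to expose the same extract() signature and dict shape:

    extractor = get_date_extractor()
    result = extractor.extract("купил хлеб позавчера", reference_date="2025-03-10")
    # legacy path: {"date": "08.03.2025", "date_iso": "2025-03-08",
    #               "matched_date_phrase": "позавчера"}
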
- class ExpenseUserExtractor:
-     def __init__(self, users: list[str], suppliers: list[str], model: SentenceTransformer, threshold: float = 0.6) -> None:
-         self.users = users
-         self.model = model
-         self.threshold = threshold
-         self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
-         self.user_terms = [normalize_text(user) for user in users]
-         self.user_embeddings = model.encode(
-             [f"passage: {user}" for user in self.user_terms],
-             convert_to_tensor=True,
-             normalize_embeddings=True,
-         )
-
-     def extract(self, text: str, supplier_phrase: str | None = None, date_phrase: str | None = None) -> dict[str, Any]:
-         excluded_tokens: set[str] = set()
-         if supplier_phrase:
-             excluded_tokens.update(normalize_text(supplier_phrase).split())
-         if date_phrase:
-             excluded_tokens.update(normalize_text(date_phrase).split())
-
-         best_user = None
-         best_score = -1.0
-         best_phrase = None
-
-         for word in lemmatize_text(text):
-             if len(word) < 3:
-                 continue
-             if word in excluded_tokens or word in self.supplier_terms:
-                 continue
-
-             query_emb = self.model.encode(
-                 f"query: {word}",
-                 convert_to_tensor=True,
-                 normalize_embeddings=True,
-             )
-             similarities = torch.cosine_similarity(query_emb.unsqueeze(0), self.user_embeddings, dim=1)
-             idx = int(torch.argmax(similarities))
-             score = similarities[idx].item()
-
-             if score > best_score:
-                 best_score = score
-                 best_user = self.users[idx]
-                 best_phrase = word
-
-         if best_score >= self.threshold:
-             return {
-                 "user": best_user,
-                 "user_score": round(best_score, 4),
-                 "matched_user_phrase": best_phrase,
-             }
-
-         if re.search(r"(?<!\S)я(?!\S)", normalize_text(text), re.IGNORECASE):
-             return {
-                 "user": "Я",
-                 "user_score": 1.0,
-                 "matched_user_phrase": "я",
-             }
-
-         return {
-             "user": None,
-             "user_score": None,
-             "matched_user_phrase": None,
-         }
-
-
- class ExpenseSupplierExtractor:
-     def __init__(self, suppliers: list[str]) -> None:
-         self.suppliers = suppliers
-         self.sup_norm = [normalize_text(s) for s in suppliers]
-         self.sup_tokens = [s.split() for s in self.sup_norm]
-         self.sup_num_sets = [self.numeric_tokens(s) for s in self.sup_norm]
-         self.sup_number_tokens = {token for supplier in self.sup_tokens for token in supplier if token.isdigit()}
-         self.supplier_lexicon = [
-             token
-             for token in sorted({tok for tokens in self.sup_tokens for tok in tokens})
-             if token and not token.isdigit()
-         ]
-         self.tfidf = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5))
-         self.sup_mat = self.tfidf.fit_transform(self.sup_norm)
-         self.max_words = max(len(s.split()) for s in self.sup_norm)
-         self.variant_cache: dict[str, list[str]] = {}
-         self.lexical_token_cache: dict[str, float] = {}
-         self.phrase_support_cache: dict[str, float] = {}
-         self.noise_terms = {
-             "за", "на", "из", "для", "под", "над", "при", "без", "и", "или",
-             "купил", "купила", "купили", "покупка", "заказал", "заказала", "заказали",
-             "оплатил", "оплатила", "оплатили", "заплатил", "заплатила", "заплатили",
-             "был", "была", "было", "были", "утром", "днем", "днём", "вечером", "ночью",
-             "товар", "товары", "продукт", "продукты", "десерт", "еда",
-             "лей", "лея", "леи", "целых", "сотых", "сом", "сомов", "руб", "рублей", "грн", "usd", "eur",
-         }
-         self.noise_terms.update(UniversalDateParser.temporal_vocabulary())
-
-     @staticmethod
-     def numeric_tokens(text: str) -> set[str]:
-         return set(re.findall(r"\d+", text))
-
-     def cached_variants(self, text: str) -> list[str]:
-         key = normalize_text(text)
-         cached = self.variant_cache.get(key)
-         if cached is None:
-             cached = variants(key)
-             self.variant_cache[key] = cached
-         return cached
-
-     @staticmethod
-     def split_words(text: str) -> list[str]:
-         return [w for w in normalize_text(text).split() if w]
-
-     @classmethod
-     def is_supplier_extension(cls, base_supplier: str, extended_supplier: str) -> bool:
-         base_tokens = cls.split_words(base_supplier)
-         extended_tokens = cls.split_words(extended_supplier)
-         return len(base_tokens) < len(extended_tokens) and extended_tokens[:len(base_tokens)] == base_tokens
-
-     @classmethod
-     def phrase_token_count(cls, phrase: str | None) -> int:
-         return len(cls.split_words(phrase or ""))
-
-     @classmethod
-     def resolve_overlapping_suppliers(cls, ranking: list[dict[str, Any]]) -> dict[str, Any]:
-         if not ranking:
-             return {"supplier": None, "score": -1.0, "phrase": None}
-
-         best = ranking[0]
-         best_combined = float(best.get("combined", best.get("score", -1.0)))
-         best_phrase_len = cls.phrase_token_count(best.get("phrase"))
-
-         for alt in ranking[1:]:
-             if not cls.is_supplier_extension(str(best.get("supplier") or ""), str(alt.get("supplier") or "")):
-                 continue
-
-             alt_combined = float(alt.get("combined", alt.get("score", -1.0)))
-             alt_phrase_len = cls.phrase_token_count(alt.get("phrase"))
-
-             if alt_phrase_len > best_phrase_len and alt_combined >= best_combined - 0.15:
-                 best = alt
-                 best_combined = alt_combined
-                 best_phrase_len = alt_phrase_len
-
-         return best
-
-     @staticmethod
-     def numeric_compatibility_multiplier(phrase_nums: set[str], candidate_nums: set[str]) -> float:
-         if not phrase_nums and not candidate_nums:
-             return 1.0
-         if phrase_nums == candidate_nums:
-             return 1.08
-         if phrase_nums and candidate_nums:
-             return 1.03 if phrase_nums & candidate_nums else 0.80
-         return 0.82
-
-     def lexical_support(self, phrase: str) -> float:
-         tokens = [token for token in normalize_text(phrase).split() if token and not token.isdigit()]
-         if not tokens or not self.supplier_lexicon:
-             return 0.0
-
-         support_scores: list[float] = []
-         for token in tokens:
-             cached = self.lexical_token_cache.get(token)
-             if cached is not None:
-                 support_scores.append(cached)
-                 continue
-
-             best = 0.0
-             for token_variant in self.cached_variants(token):
-                 for lex in self.supplier_lexicon:
-                     lev = Levenshtein.normalized_similarity(token_variant, lex)
-                     phon = phonetic_similarity(token_variant, lex)
-                     sim = max(lev, phon)
-                     if sim > best:
-                         best = sim
-
-             self.lexical_token_cache[token] = best
-             support_scores.append(best)
-
-         return sum(support_scores) / len(support_scores)
-
-     def score_phrase(self, phrase: str) -> dict[str, Any]:
-         vs = self.cached_variants(phrase)
-         q = self.tfidf.transform(vs)
-         tf = cosine_similarity(q, self.sup_mat)
-
-         best: dict[str, Any] = {"supplier": None, "score": -1.0, "phrase": phrase, "variant": ""}
-         for i, cand in enumerate(self.sup_norm):
-             local = -1.0
-             local_variant = ""
-             candidate_nums = self.sup_num_sets[i]
-             for j, v in enumerate(vs):
-                 char = fuzz.ratio(v, cand) / 100.0
-                 tf_val = float(tf[j, i])
-                 penalty = length_penalty(len(v), len(cand))
-                 phon = phonetic_similarity(v, cand)
-                 phrase_nums = self.numeric_tokens(v)
-
-                 if len(v.split()) == 1 and len(cand.split()) == 1:
-                     lev = Levenshtein.normalized_similarity(v, cand)
-                     val = (0.45 * lev + 0.25 * char + 0.10 * tf_val + 0.20 * phon) * penalty
-                 else:
-                     align = token_alignment_score(v, self.sup_tokens[i])
-                     tok = fuzz.token_set_ratio(v, cand) / 100.0
-                     val = (0.30 * char + 0.20 * tok + 0.10 * tf_val + 0.20 * align + 0.20 * phon) * penalty
-
-                 compact_v = v.replace(" ", "")
-                 compact_cand = cand.replace(" ", "")
-                 compact_char = fuzz.ratio(compact_v, compact_cand) / 100.0
-                 compact_lev = Levenshtein.normalized_similarity(compact_v, compact_cand)
-                 compact_phon = phonetic_similarity(compact_v, compact_cand)
-                 compact = max(compact_char, compact_lev, compact_phon)
-                 if compact > 0.55:
-                     val = max(val, compact * penalty)
-
-                 val *= self.numeric_compatibility_multiplier(phrase_nums, candidate_nums)
-
-                 if val > local:
-                     local = val
-                     local_variant = v
-
-             if local > best["score"]:
-                 best = {"supplier": self.suppliers[i], "score": local, "phrase": phrase, "variant": local_variant}
-         return best
-
-     def extract(self, text: str, date_phrase: str | None = None, debug: bool = False) -> dict[str, Any]:
-         threshold = 0.50
-         excluded_tokens: set[str] = set()
-         if date_phrase:
-             excluded_tokens.update(normalize_text(date_phrase).split())
-         excluded_tokens.update(self.noise_terms)
-
-         raw_tokens = normalize_text(text).split()
-         tokens: list[str] = []
-         for token in raw_tokens:
-             if token in excluded_tokens:
-                 continue
-
-             if token.isdigit():
-                 if token in self.sup_number_tokens:
-                     tokens.append(token)
-
-                 if tokens and len(token) <= 3 and len(tokens[-1]) >= 4 and tokens[-1].isalpha():
-                     tokens.append(f"{tokens[-1]}{token}")
-                 continue
-
-             if len(token) > 1:
-                 tokens.append(token)
-
-         tokens = [t for t in tokens if len(t) > 1 and t not in excluded_tokens]
-
-         phrases: list[str] = []
-         seen: set[str] = set()
-         for i in range(len(tokens)):
-             for j in range(i + 1, min(i + 1 + self.max_words, len(tokens) + 1)):
-                 p = " ".join(tokens[i:j])
-                 if p not in seen:
-                     seen.add(p)
-                     phrases.append(p)
-
-         results = [self.score_phrase(p) for p in phrases]
-         candidate_rows: list[dict[str, Any]] = []
-         best_by_supplier: dict[str, dict[str, Any]] = {}
-         for row in results:
-             supplier = row["supplier"]
-             score = float(row.get("score", -1.0))
-             phrase = str(row.get("phrase") or "")
-             support = self.phrase_support_cache.get(phrase)
-             if support is None:
-                 support = self.lexical_support(phrase)
-                 self.phrase_support_cache[phrase] = support
-             combined = 0.75 * score + 0.25 * support
-
-             if debug:
-                 candidate_rows.append({
-                     "supplier": supplier,
-                     "phrase": phrase,
-                     "score": round(score, 4),
-                     "support": round(support, 4),
-                     "combined": round(combined, 4),
-                 })
-
-             enriched = {**row, "combined": combined}
-             passes = score >= threshold or combined >= 0.48
-             if passes and (supplier not in best_by_supplier or combined > float(best_by_supplier[supplier].get("combined", -1.0))):
-                 best_by_supplier[supplier] = enriched
-
-         if not best_by_supplier and results:
-             def support_for_phrase(phrase: str) -> float:
-                 cached_support = self.phrase_support_cache.get(phrase)
-                 if cached_support is None:
-                     cached_support = self.lexical_support(phrase)
-                     self.phrase_support_cache[phrase] = cached_support
-                 return cached_support
-
-             fallback = max(
-                 results,
-                 key=lambda item: 0.75 * float(item.get("score", -1.0)) + 0.25 * support_for_phrase(str(item.get("phrase") or "")),
-             )
-             fallback_score = float(fallback.get("score", -1.0))
-             fallback_phrase = str(fallback.get("phrase") or "")
-             fallback_support = support_for_phrase(fallback_phrase)
-             fallback_combined = 0.75 * fallback_score + 0.25 * fallback_support
-             if fallback_score >= 0.40 and fallback_support >= 0.43 and fallback_combined >= 0.43:
-                 best_by_supplier[fallback["supplier"]] = {**fallback, "combined": fallback_combined}
-
-         supplier_ranking = sorted(best_by_supplier.values(), key=lambda x: float(x.get("combined", x["score"])), reverse=True)
-         best = self.resolve_overlapping_suppliers(supplier_ranking)
-
-         payload = {
-             "supplier": best["supplier"],
-             "supplier_score": round(best["score"], 4) if best["score"] >= 0 else None,
-             "matched_supplier_phrase": best.get("phrase"),
-         }
-
-         if debug:
-             top_candidates = sorted(candidate_rows, key=lambda item: item["combined"], reverse=True)[:8]
-             payload["supplier_debug"] = {
-                 "tokens": tokens,
-                 "phrases_count": len(phrases),
-                 "top_candidates": top_candidates,
-             }
-
-         return payload
-
-
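The extractor above blends fuzzy match and lexicon support with fixed weights, combined = 0.75 * score + 0.25 * support, and accepts a candidate when score >= 0.50 or combined >= 0.48. A quick numeric check with hypothetical values:

    score, support = 0.46, 0.60
    combined = 0.75 * score + 0.25 * support  # 0.495
    # score misses the 0.50 bar, but combined >= 0.48, so the candidate is kept;
    # the no-candidate fallback is stricter: score >= 0.40, support >= 0.43, combined >= 0.43
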
- class ExpenseAmountExtractor:
-     def __init__(self, suppliers: list[str]) -> None:
-         self.model = get_amount_model()
-
-     @staticmethod
-     def to_float(value: str) -> Optional[float]:
-         cleaned = value.replace(" ", "").replace("\u00A0", "")
-         match = re.search(r"\d+(?:[,]\d{1,2})?", cleaned)
-         if not match:
-             return None
-         try:
-             return float(match.group(0).replace(",", "."))
-         except ValueError:
-             return None
-
-     @staticmethod
-     def phrase_span(text: str, phrase: Optional[str]) -> Optional[tuple[int, int]]:
-         if not phrase:
-             return None
-         idx = text.lower().find(phrase.lower())
-         if idx == -1:
-             return None
-         return idx, idx + len(phrase)
-
-     @staticmethod
-     def overlaps(span1: tuple[int, int], span2: Optional[tuple[int, int]]) -> bool:
-         if span2 is None:
-             return False
-         return span1[0] < span2[1] and span2[0] < span1[1]
-
-     @staticmethod
-     def expand_amount_text(text: str, start: int, end: int) -> tuple[str, tuple[int, int]]:
-         suffix = re.match(r",\d{1,2}", text[end:])
-         if suffix:
-             new_end = end + len(suffix.group(0))
-             return text[start:new_end].strip(), (start, new_end)
-
-         prefix = re.search(r"(\d{1,3}(?:\s*\d{3})*),", text[:start])
-         if prefix:
-             new_start = prefix.start(1)
-             return text[new_start:end].strip(), (new_start, end)
-
-         return text[start:end].strip(), (start, end)
-
-     def extract(
-         self,
-         text: str,
-         matched_date_phrase: Optional[str] = None,
-         matched_supplier_phrase: Optional[str] = None,
-     ) -> dict[str, Any]:
-         if self.model is None:
-             return {"amount": None, "amount_text": None}
-
-         date_span = self.phrase_span(text, matched_date_phrase)
-         supplier_span = self.phrase_span(text, matched_supplier_phrase)
-         entities = self.model.predict_entities(text, ["money"], threshold=0.3)
-
-         for ent in sorted(entities, key=lambda item: float(item.get("score", 0.0)), reverse=True):
-             raw_span = (int(ent.get("start", 0)), int(ent.get("end", 0)))
-             amount_text, span = self.expand_amount_text(text, raw_span[0], raw_span[1])
-             amount = self.to_float(amount_text)
-             overlaps_date = self.overlaps(span, date_span)
-             overlaps_supplier = self.overlaps(span, supplier_span)
-
-             if amount is None:
-                 continue
-             if overlaps_date or overlaps_supplier:
-                 continue
-             return {"amount": amount, "amount_text": amount_text}
-
-         return {"amount": None, "amount_text": None}
-
-
- >>>>>>> acab9140c760dd9ad71b1a76a9ae5c130efa1829
  class ExpenseTextExtractor:
      """
      Main extractor of expense data.