VladRet2026 committed on
Commit a07dfb6 · verified · 1 Parent(s): d9e6bfd

added MainPage

Files changed (1): app.py +1186 -1174
app.py CHANGED
@@ -1,1175 +1,1187 @@
- from __future__ import annotations
-
- import calendar
- import difflib
- import json
- import os
- import re
- import tempfile
- import unicodedata
- from dataclasses import dataclass
- from datetime import date, datetime, timedelta
- from pathlib import Path
- from typing import Any, Optional
-
- import iuliia
- import torch
- from dateparser.search import search_dates
- from flask import Flask, jsonify, request
- from gliner import GLiNER
- from pymorphy3 import MorphAnalyzer
- from rapidfuzz import fuzz
- from rapidfuzz.distance import Levenshtein
- from sentence_transformers import SentenceTransformer
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import cosine_similarity
-
- MORPH = MorphAnalyzer()
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
- _MODEL: Optional[SentenceTransformer] = None
- _AMOUNT_MODEL: Optional[Any] = None
- _WHISPER_MODEL: Optional[Any] = None
-
-
- app = Flask(__name__)
- app.config["MAX_CONTENT_LENGTH"] = 20 * 1024 * 1024
-
-
- def get_embedding_model() -> SentenceTransformer:
-     global _MODEL
-
-     if _MODEL is None:
-         _MODEL = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B", device=DEVICE)
-
-     return _MODEL
-
-
- def get_amount_model() -> Optional[Any]:
-     global _AMOUNT_MODEL
-
-     if _AMOUNT_MODEL is None and GLiNER is not None:
-         _AMOUNT_MODEL = GLiNER.from_pretrained("urchade/gliner_multi-v2.1")
-
-     return _AMOUNT_MODEL
-
-
- def get_whisper_model() -> Any:
-     global _WHISPER_MODEL
-
-     if _WHISPER_MODEL is None:
-         from faster_whisper import WhisperModel
-
-         model_name = os.getenv("WHISPER_MODEL", "large-v3")
-         compute_type = os.getenv("WHISPER_COMPUTE_TYPE", "float16" if torch.cuda.is_available() else "int8")
-         _WHISPER_MODEL = WhisperModel(model_name, device=DEVICE, compute_type=compute_type)
-
-     return _WHISPER_MODEL
-
-
- def normalize_text(text: str) -> str:
-     text = unicodedata.normalize("NFKD", text.lower())
-     text = "".join(ch for ch in text if not unicodedata.combining(ch))
-     return re.sub(r"[^\w\s]", "", text).strip()
-
-
- def tokenize_text(text: str) -> list[str]:
-     return normalize_text(text).split()
-
-
- def lemmatize_word(word: str) -> str:
-     return MORPH.parse(word)[0].normal_form if re.fullmatch(r"[а-я]+", word) else word
-
-
- def lemmatize_text(text: str) -> list[str]:
-     return [lemmatize_word(word) for word in tokenize_text(text)]
-
-
- def variants(text: str) -> list[str]:
-     base = normalize_text(text)
-     result = [base]
-
-     for schema in (iuliia.WIKIPEDIA, iuliia.MOSMETRO, iuliia.ALA_LC):
-         try:
-             v = normalize_text(schema.translate(base))
-             if v and v not in result:
-                 result.append(v)
-         except Exception:
-             pass
-
-     for v in list(result):
-         core = " ".join(w for w in v.split() if len(w) > 1 and any(ch.isalpha() for ch in w))
-         core = normalize_text(core)
-         if core and core not in result:
-             result.insert(0, core)
-
-     return result
-
-
- def token_alignment_score(phrase_variant: str, candidate_tokens: list[str]) -> float:
-     phrase_tokens = [t for t in phrase_variant.split() if len(t) > 2]
-     if not phrase_tokens or not candidate_tokens:
-         return 0.0
-     best_scores = []
-     for pt in phrase_tokens:
-         best = 0.0
-         for ct in candidate_tokens:
-             sim = Levenshtein.normalized_similarity(pt, ct)
-             if sim > best:
-                 best = sim
-         best_scores.append(best)
-     return sum(best_scores) / len(best_scores)
-
-
- def length_penalty(phrase_len: int, candidate_len: int) -> float:
-     if phrase_len == 0 or candidate_len == 0:
-         return 0.0
-     ratio = min(phrase_len, candidate_len) / max(phrase_len, candidate_len)
-     if ratio >= 0.80:
-         return 1.0
-     if ratio >= 0.60:
-         return 0.90
-     if ratio >= 0.40:
-         return 0.70
-     return 0.50
-
-
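- # Collapse common Latin transliteration digraphs to one canonical spelling so
- # that different romanizations of the same Russian name compare as equal, then
- # squash runs of repeated characters.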
- def canonicalize_for_similarity(text: str) -> str:
-     t = normalize_text(text).replace(" ", "")
-     replacements = (
-         ("sch", "sh"),
-         ("tch", "ch"),
-         ("dzh", "j"),
-         ("zh", "j"),
-         ("sh", "s"),
-         ("ch", "c"),
-         ("kh", "h"),
-         ("ph", "f"),
-         ("ck", "k"),
-         ("qu", "k"),
-         ("q", "k"),
-         ("w", "v"),
-         ("x", "ks"),
-         ("ts", "z"),
-         ("tz", "z"),
-     )
-     for src, dst in replacements:
-         t = t.replace(src, dst)
-     return re.sub(r"(.)\1+", r"\1", t)
-
-
- def phonetic_similarity(left: str, right: str) -> float:
-     l = canonicalize_for_similarity(left)
-     r = canonicalize_for_similarity(right)
-     if not l or not r:
-         return 0.0
-     char = fuzz.ratio(l, r) / 100.0
-     lev = Levenshtein.normalized_similarity(l, r)
-     return 0.50 * char + 0.50 * lev
-
-
- @dataclass(frozen=True)
- class ParsedDate:
-     date_iso: str
-     matched_expression: Optional[str]
-
-
- @dataclass(frozen=True)
- class Token:
-     original: str
-     normalized: str
-     raw_lemma: str
-     lemma: str
-     lemma_correction: Optional[str]
-     start: int
-     end: int
-     lemma_start: int
-     lemma_end: int
-
-
- WORD_RE = re.compile(r"[0-9]+(?:[./-][0-9]+)*|[а-яё]+", re.IGNORECASE)
-
-
- class UniversalDateParser:
-     MONTHS = {
-         "январь": 1, "февраль": 2, "март": 3, "апрель": 4, "май": 5, "июнь": 6,
-         "июль": 7, "август": 8, "сентябрь": 9, "октябрь": 10, "ноябрь": 11, "декабрь": 12,
-     }
-     WEEKDAYS = {
-         "понедельник": 0, "вторник": 1, "среда": 2, "четверг": 3,
-         "пятница": 4, "суббота": 5, "воскресенье": 6,
-     }
-     DIRECT_RELATIVE = {"послезавтра": 2, "позавчера": -2, "сегодня": 0, "вчера": -1, "завтра": 1}
-     ORDINAL_DAYS = {
-         "первый": 1, "второй": 2, "третий": 3, "четвертый": 4, "пятый": 5, "шестой": 6,
-         "седьмой": 7, "восьмой": 8, "девятый": 9, "десятый": 10, "одиннадцатый": 11,
-         "двенадцатый": 12, "тринадцатый": 13, "четырнадцатый": 14, "пятнадцатый": 15,
-         "шестнадцатый": 16, "семнадцатый": 17, "восемнадцатый": 18, "девятнадцатый": 19,
-         "двадцатый": 20, "двадцать первый": 21, "двадцать второй": 22, "двадцать третий": 23,
-         "двадцать четвертый": 24, "двадцать пятый": 25, "двадцать шестой": 26,
-         "двадцать седьмой": 27, "двадцать восьмой": 28, "двадцать девятый": 29,
-         "тридцатый": 30, "тридцать первый": 31,
-     }
-     NUMBER_WORDS = {
-         "ноль": 0, "один": 1, "два": 2, "три": 3, "четыре": 4, "пять": 5, "шесть": 6,
-         "семь": 7, "восемь": 8, "девять": 9, "десять": 10, "одиннадцать": 11,
-         "двенадцать": 12, "тринадцать": 13, "четырнадцать": 14, "пятнадцать": 15,
-         "шестнадцать": 16, "семнадцать": 17, "восемнадцать": 18, "девятнадцать": 19,
-         "двадцать": 20, "тридцать": 30,
-     }
-     FUTURE_HINTS = ("завтра", "послезавтра", "через", "быть", "заплатить", "следующий", "последующий")
-     PAST_HINTS = ("вчера", "позавчера", "назад", "прошлый", "предыдущий", "оплатить", "купить", "заказать")
-
-     DIRECT_RELATIVE_RE = re.compile(r"(?<!\S)(послезавтра|позавчера|сегодня|вчера|завтра)(?!\S)")
-     WEEK_RELATIVE_RE = re.compile(
-         r"(?<!\S)на (?P<which>следующий|последующий|прошлый|предыдущий|этот) неделя"
-         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)"
-     )
-     QUANTITY_RELATIVE_RE = re.compile(
-         r"(?<!\S)(?P<number>\d+|[а-яё]+(?: [а-яё]+)?) "
-         r"(?P<unit>месяц|неделя|день) "
-         r"(?P<ago>назад)"
-         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
-         re.IGNORECASE,
-     )
-     FORWARD_QUANTITY_RE = re.compile(
-         r"(?<!\S)(?P<through>через) "
-         r"(?P<number>\d+|[а-яё]+(?: [а-яё]+)?) "
-         r"(?P<unit>месяц|неделя|день)"
-         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
-         re.IGNORECASE,
-     )
-     FORWARD_SINGLE_UNIT_RE = re.compile(
-         r"(?<!\S)(?P<through>через) "
-         r"(?P<unit>месяц|неделя|день)"
-         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
-         re.IGNORECASE,
-     )
-     TEXTUAL_ABSOLUTE_RE = re.compile(
-         r"(?<!\S)(?P<day>\d{1,2}|[а-яё]+(?: [а-яё]+)?) "
-         r"(?P<month>январь|февраль|март|апрель|май|июнь|июль|август|сентябрь|октябрь|ноябрь|декабрь)"
-         r"(?: (?P<year>\d{4}))?(?!\S)",
-         re.IGNORECASE,
-     )
-     PERIOD_EDGE_RE = re.compile(
-         r"(?<!\S)(?:в )?(?P<edge>начало|конец) (?P<which>этот|следующий|последующий|прошлый|предыдущий) (?P<unit>неделя|месяц)(?!\S)",
-         re.IGNORECASE,
-     )
-
-     @classmethod
-     def temporal_vocabulary(cls) -> set[str]:
-         vocab: set[str] = set()
-         vocab.update(cls.MONTHS)
-         vocab.update(cls.WEEKDAYS)
-         vocab.update(cls.DIRECT_RELATIVE)
-         vocab.update(cls.ORDINAL_DAYS)
-         vocab.update(cls.NUMBER_WORDS)
-         vocab.update({
-             "неделя", "месяц", "день", "назад", "через", "начало", "конец", "на", "в", "во",
-             "этот", "прошлый", "предыдущий", "следующий", "последующий",
-         })
-         return vocab
-
-     @staticmethod
-     def similarity(left: str, right: str) -> float:
-         return difflib.SequenceMatcher(None, left, right).ratio()
-
-     @classmethod
-     def pick_temporal_correction(cls, normalized: str, raw_lemma: str) -> tuple[str, Optional[str]]:
-         vocab = cls.temporal_vocabulary()
-         if raw_lemma in vocab or not normalized.isalpha() or len(normalized) < 5:
-             return raw_lemma, None
-
-         candidates = list(difflib.get_close_matches(normalized, list(vocab), n=4, cutoff=0.74))
-         candidates.extend(difflib.get_close_matches(raw_lemma, list(vocab), n=4, cutoff=0.74))
-         candidates = list(dict.fromkeys(candidates))
-         if not candidates:
-             return raw_lemma, None
-
-         best = max(candidates, key=lambda item: max(cls.similarity(normalized, item), cls.similarity(raw_lemma, item)))
-         best_score = max(cls.similarity(normalized, best), cls.similarity(raw_lemma, best))
-         return (best, f"{raw_lemma}->{best}") if best_score >= 0.80 else (raw_lemma, None)
-
-     @staticmethod
-     def normalize_word(word: str) -> str:
-         return word.lower().replace("ё", "е")
-
-     @classmethod
-     def lemmatize(cls, word: str) -> str:
-         return MORPH.parse(word)[0].normal_form if word.isalpha() else word
-
-     @classmethod
-     def tokenize(cls, text: str) -> list[Token]:
-         tokens: list[Token] = []
-         lemma_cursor = 0
-
-         for match in WORD_RE.finditer(text):
-             original = match.group(0)
-             normalized = cls.normalize_word(original)
-             raw_lemma = cls.lemmatize(normalized)
-             lemma, correction = cls.pick_temporal_correction(normalized, raw_lemma)
-             lemma_start = lemma_cursor
-             lemma_end = lemma_start + len(lemma)
-             tokens.append(Token(original, normalized, raw_lemma, lemma, correction, match.start(), match.end(), lemma_start, lemma_end))
-             lemma_cursor = lemma_end + 1
-
-         return tokens
-
-     @staticmethod
-     def lemma_text(tokens: list[Token]) -> str:
-         return " ".join(token.lemma for token in tokens)
-
-     @staticmethod
-     def surface_text(text: str, tokens: list[Token], start_idx: int, end_idx: int) -> str:
-         return text[tokens[start_idx].start:tokens[end_idx].end].strip() if tokens else ""
-
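-     # Map a character span inside the space-joined lemma string back to the
-     # range of token indices it covers, so regex matches over lemmas can be
-     # traced back to the original surface text.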
324
-
325
- @staticmethod
326
- def lemma_span_to_token_range(tokens: list[Token], span: tuple[int, int]) -> Optional[tuple[int, int]]:
327
- start_char, end_char = span
328
- start_idx = end_idx = None
329
-
330
- for idx, token in enumerate(tokens):
331
- if start_idx is None and token.lemma_start <= start_char < token.lemma_end:
332
- start_idx = idx
333
- if token.lemma_start < end_char <= token.lemma_end:
334
- end_idx = idx
335
- break
336
-
337
- return (start_idx, end_idx) if start_idx is not None and end_idx is not None else None
338
-
339
- @classmethod
340
- def make_parsed_date(cls, text: str, tokens: list[Token], match, parsed_date: date) -> Optional[ParsedDate]:
341
- token_span = cls.lemma_span_to_token_range(tokens, match.span())
342
- if token_span is None:
343
- return None
344
- return ParsedDate(parsed_date.isoformat(), cls.surface_text(text, tokens, token_span[0], token_span[1]))
345
-
346
- @classmethod
347
- def parse_number_phrase(cls, phrase: str) -> Optional[int]:
348
- phrase = phrase.strip()
349
- if not phrase:
350
- return None
351
- if phrase.isdigit():
352
- return int(phrase)
353
-
354
- parts = phrase.split()
355
- if len(parts) == 1:
356
- return cls.NUMBER_WORDS.get(parts[0])
357
- if len(parts) == 2 and parts[0] in {"двадцать", "тридцать"}:
358
- base = cls.NUMBER_WORDS.get(parts[0])
359
- addon = cls.NUMBER_WORDS.get(parts[1])
360
- if base is not None and addon is not None and 1 <= addon <= 9:
361
- return base + addon
362
- return None
363
-
364
- @classmethod
365
- def parse_day_phrase(cls, phrase: str) -> Optional[int]:
366
- if phrase.isdigit():
367
- value = int(phrase)
368
- return value if 1 <= value <= 31 else None
369
- return cls.ORDINAL_DAYS.get(phrase.strip())
370
-
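-     # Add or subtract whole months, clamping the day to the target month's
-     # length (e.g. Jan 31 + 1 month -> Feb 28/29).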
-     @staticmethod
-     def shift_months(value: date, months: int) -> date:
-         month_index = value.month - 1 + months
-         year = value.year + month_index // 12
-         month = month_index % 12 + 1
-         day = min(value.day, calendar.monthrange(year, month)[1])
-         return date(year, month, day)
-
-     @staticmethod
-     def parse_numeric_absolute(tokens: list[Token]) -> Optional[ParsedDate]:
-         for token in tokens:
-             separator = "." if "." in token.original else "-" if "-" in token.original else "/" if "/" in token.original else None
-             if separator is None:
-                 continue
-
-             parts = token.original.split(separator)
-             if len(parts) != 3 or not all(part.isdigit() for part in parts):
-                 continue
-
-             try:
-                 if len(parts[0]) == 4:
-                     parsed = date(int(parts[0]), int(parts[1]), int(parts[2]))
-                 elif len(parts[2]) == 4:
-                     parsed = date(int(parts[2]), int(parts[1]), int(parts[0]))
-                 else:
-                     continue
-                 return ParsedDate(parsed.isoformat(), token.original)
-             except ValueError:
-                 continue
-
-         return None
-
-     @classmethod
-     def parse_textual_absolute(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
-         lemma_text = cls.lemma_text(tokens)
-         for match in cls.TEXTUAL_ABSOLUTE_RE.finditer(lemma_text):
-             day = cls.parse_day_phrase(match.group("day"))
-             month = cls.MONTHS.get(match.group("month"))
-             if day is None or month is None:
-                 continue
-
-             year = int(match.group("year")) if match.group("year") else reference_date.year
-             try:
-                 parsed = date(year, month, day)
-             except ValueError:
-                 continue
-
-             result = cls.make_parsed_date(text, tokens, match, parsed)
-             if result is not None:
-                 return result
-
-         return None
-
-     @classmethod
-     def parse_direct_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
-         lemma_text = cls.lemma_text(tokens)
-         match = cls.DIRECT_RELATIVE_RE.search(lemma_text)
-         if not match:
-             return None
-
-         parsed = reference_date + timedelta(days=cls.DIRECT_RELATIVE[match.group(1)])
-         return cls.make_parsed_date(text, tokens, match, parsed)
-
-     @staticmethod
-     def week_monday(value: date) -> date:
-         return value - timedelta(days=value.weekday())
-
-     @classmethod
-     def parse_week_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
-         lemma_text = cls.lemma_text(tokens)
-         match = cls.WEEK_RELATIVE_RE.search(lemma_text)
-         if not match:
-             return None
-
-         offsets = {"следующий": 7, "последующий": 7, "прошлый": -7, "предыдущий": -7, "этот": 0}
-         anchor = reference_date + timedelta(days=offsets[match.group("which")])
-
-         if match.group("weekday"):
-             anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])
-
-         return cls.make_parsed_date(text, tokens, match, anchor)
-
-     @classmethod
-     def parse_period_edge(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
-         lemma_text = cls.lemma_text(tokens)
-         match = cls.PERIOD_EDGE_RE.search(lemma_text)
-         if not match:
-             return None
-
-         edge, which, unit = match.group("edge"), match.group("which"), match.group("unit")
-
-         if unit == "неделя":
-             offsets = {"прошлый": -7, "предыдущий": -7, "этот": 0, "следующий": 7, "последующий": 7}
-             monday = cls.week_monday(reference_date + timedelta(days=offsets[which]))
-             parsed_date = monday if edge == "начало" else monday + timedelta(days=6)
-         else:
-             month_offset = {"прошлый": -1, "предыдущий": -1, "этот": 0, "следующий": 1, "последующий": 1}[which]
-             shifted = cls.shift_months(date(reference_date.year, reference_date.month, 1), month_offset)
-             parsed_date = shifted if edge == "начало" else date(shifted.year, shifted.month, calendar.monthrange(shifted.year, shifted.month)[1])
-
-         return cls.make_parsed_date(text, tokens, match, parsed_date)
-
-     @classmethod
-     def parse_quantity_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
-         lemma_text = cls.lemma_text(tokens)
-
-         for regex, direction in ((cls.QUANTITY_RELATIVE_RE, -1), (cls.FORWARD_QUANTITY_RE, 1)):
-             for match in regex.finditer(lemma_text):
-                 number = cls.parse_number_phrase(match.group("number"))
-                 if number is None:
-                     continue
-
-                 unit = match.group("unit")
-                 if unit == "месяц":
-                     anchor = cls.shift_months(reference_date, direction * number)
-                 else:
-                     days = number * 7 if unit == "неделя" else number
-                     anchor = reference_date + timedelta(days=direction * days)
-
-                 if match.group("weekday"):
-                     anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])
-
-                 result = cls.make_parsed_date(text, tokens, match, anchor)
-                 if result is not None:
-                     return result
-
-         for match in cls.FORWARD_SINGLE_UNIT_RE.finditer(lemma_text):
-             unit = match.group("unit")
-             if unit == "месяц":
-                 anchor = cls.shift_months(reference_date, 1)
-             else:
-                 days = 7 if unit == "неделя" else 1
-                 anchor = reference_date + timedelta(days=days)
-
-             if match.group("weekday"):
-                 anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])
-
-             result = cls.make_parsed_date(text, tokens, match, anchor)
-             if result is not None:
-                 return result
-
-         return None
-
-     @classmethod
-     def preference_for_text(cls, tokens: list[Token]) -> str:
-         lemmas = [token.lemma for token in tokens]
-         future = sum(1 for hint in cls.FUTURE_HINTS if hint in lemmas)
-         past = sum(1 for hint in cls.PAST_HINTS if hint in lemmas)
-         return "future" if future > past else "past"
-
-     @staticmethod
-     def choose_best(matches: list[tuple[str, datetime]]) -> tuple[str, datetime]:
-         return sorted(matches, key=lambda item: (len(item[0]), -item[1].timestamp()), reverse=True)[0]
-
-     def parse(self, text: str, reference_date: date) -> Optional[ParsedDate]:
-         tokens = self.tokenize(text)
-
-         for parser in (
-             lambda: self.parse_numeric_absolute(tokens),
-             lambda: self.parse_textual_absolute(text, tokens, reference_date),
-             lambda: self.parse_direct_relative(text, tokens, reference_date),
-             lambda: self.parse_week_relative(text, tokens, reference_date),
-             lambda: self.parse_period_edge(text, tokens, reference_date),
-             lambda: self.parse_quantity_relative(text, tokens, reference_date),
-         ):
-             parsed = parser()
-             if parsed is not None:
-                 return parsed
-
-         normalized = " ".join(token.normalized for token in tokens)
-         relative_base = datetime.combine(reference_date, datetime.min.time()).replace(hour=12)
-         result = search_dates(
-             normalized,
-             languages=["ru"],
-             settings={
-                 "RELATIVE_BASE": relative_base,
-                 "PREFER_DATES_FROM": self.preference_for_text(tokens),
-                 "STRICT_PARSING": False,
-                 "REQUIRE_PARTS": [],
-                 "NORMALIZE": True,
-                 "RETURN_AS_TIMEZONE_AWARE": False,
-                 "DATE_ORDER": "DMY",
-             },
-         )
-
-         filtered: list[tuple[str, datetime]] = []
-         for matched, value in result or []:
-             if isinstance(value, datetime) and not matched.strip().isdigit() and 2020 <= value.year <= 2100:
-                 filtered.append((matched.strip(), value))
-
-         if not filtered:
-             return None
-
-         matched_expression, value = self.choose_best(filtered)
-         return ParsedDate(date_iso=value.date().isoformat(), matched_expression=matched_expression)
-
-
- class ExpenseDateExtractor:
-     def __init__(self) -> None:
-         self.parser = UniversalDateParser()
-
-     def extract(self, text: str, reference_date: str | date | None = None) -> dict[str, Any]:
-         ref_date = self.to_date(reference_date or date.today().isoformat())
-         parsed = self.parser.parse(text=text, reference_date=ref_date)
-
-         return {
-             "date": datetime.strptime(parsed.date_iso, "%Y-%m-%d").strftime("%d.%m.%Y") if parsed else None,
-             "date_iso": parsed.date_iso if parsed else None,
-             "matched_date_phrase": parsed.matched_expression if parsed else None,
-         }
-
-     @staticmethod
-     def to_date(value: str | date) -> date:
-         return value if isinstance(value, date) else datetime.strptime(value, "%Y-%m-%d").date()
-
-
- class ExpenseUserExtractor:
-     def __init__(self, users: list[str], suppliers: list[str], model: SentenceTransformer, threshold: float = 0.6) -> None:
-         self.users = users
-         self.model = model
-         self.threshold = threshold
-         self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
-         self.user_terms = [normalize_text(user) for user in users]
-         self.user_embeddings = model.encode(
-             [f"passage: {user}" for user in self.user_terms],
-             convert_to_tensor=True,
-             normalize_embeddings=True,
-         )
-
-     def extract(self, text: str, supplier_phrase: str | None = None, date_phrase: str | None = None) -> dict[str, Any]:
-         excluded_tokens: set[str] = set()
-         if supplier_phrase:
-             excluded_tokens.update(normalize_text(supplier_phrase).split())
-         if date_phrase:
-             excluded_tokens.update(normalize_text(date_phrase).split())
-
-         best_user = None
-         best_score = -1.0
-         best_phrase = None
-
-         for word in lemmatize_text(text):
-             if len(word) < 3:
-                 continue
-             if word in excluded_tokens or word in self.supplier_terms:
-                 continue
-
-             query_emb = self.model.encode(
-                 f"query: {word}",
-                 convert_to_tensor=True,
-                 normalize_embeddings=True,
-             )
-             similarities = torch.cosine_similarity(query_emb.unsqueeze(0), self.user_embeddings, dim=1)
-             idx = int(torch.argmax(similarities))
-             score = similarities[idx].item()
-
-             if score > best_score:
-                 best_score = score
-                 best_user = self.users[idx]
-                 best_phrase = word
-
-         if best_score >= self.threshold:
-             return {
-                 "user": best_user,
-                 "user_score": round(best_score, 4),
-                 "matched_user_phrase": best_phrase,
-             }
-
-         if re.search(r"(?<!\S)я(?!\S)", normalize_text(text), re.IGNORECASE):
-             return {
-                 "user": "Я",
-                 "user_score": 1.0,
-                 "matched_user_phrase": "я",
-             }
-
-         return {
-             "user": None,
-             "user_score": None,
-             "matched_user_phrase": None,
-         }
-
-
- class ExpenseSupplierExtractor:
-     def __init__(self, suppliers: list[str]) -> None:
-         self.suppliers = suppliers
-         self.sup_norm = [normalize_text(s) for s in suppliers]
-         self.sup_tokens = [s.split() for s in self.sup_norm]
-         self.sup_num_sets = [self.numeric_tokens(s) for s in self.sup_norm]
-         self.sup_number_tokens = {token for supplier in self.sup_tokens for token in supplier if token.isdigit()}
-         self.supplier_lexicon = [
-             token
-             for token in sorted({tok for tokens in self.sup_tokens for tok in tokens})
-             if token and not token.isdigit()
-         ]
-         self.tfidf = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5))
-         self.sup_mat = self.tfidf.fit_transform(self.sup_norm)
-         self.max_words = max(len(s.split()) for s in self.sup_norm)
-         self.variant_cache: dict[str, list[str]] = {}
-         self.lexical_token_cache: dict[str, float] = {}
-         self.phrase_support_cache: dict[str, float] = {}
-         self.noise_terms = {
-             "за", "на", "из", "для", "под", "над", "при", "без", "и", "или",
-             "купил", "купила", "купили", "покупка", "заказал", "заказала", "заказали",
-             "оплатил", "оплатила", "оплатили", "заплатил", "заплатила", "заплатили",
-             "был", "была", "было", "были", "утром", "днем", "днём", "вечером", "ночью",
-             "товар", "товары", "продукт", "продукты", "десерт", "еда",
-             "лей", "лея", "леи", "целых", "сотых", "сом", "сомов", "руб", "рублей", "грн", "usd", "eur",
-         }
-         self.noise_terms.update(UniversalDateParser.temporal_vocabulary())
-
-     @staticmethod
-     def numeric_tokens(text: str) -> set[str]:
-         return set(re.findall(r"\d+", text))
-
-     def cached_variants(self, text: str) -> list[str]:
-         key = normalize_text(text)
-         cached = self.variant_cache.get(key)
-         if cached is None:
-             cached = variants(key)
-             self.variant_cache[key] = cached
-         return cached
-
-     @staticmethod
-     def split_words(text: str) -> list[str]:
-         return [w for w in normalize_text(text).split() if w]
-
-     @classmethod
-     def is_supplier_extension(cls, base_supplier: str, extended_supplier: str) -> bool:
-         base_tokens = cls.split_words(base_supplier)
-         extended_tokens = cls.split_words(extended_supplier)
-         return len(base_tokens) < len(extended_tokens) and extended_tokens[:len(base_tokens)] == base_tokens
-
-     @classmethod
-     def phrase_token_count(cls, phrase: str | None) -> int:
-         return len(cls.split_words(phrase or ""))
-
-     @classmethod
-     def resolve_overlapping_suppliers(cls, ranking: list[dict[str, Any]]) -> dict[str, Any]:
-         if not ranking:
-             return {"supplier": None, "score": -1.0, "phrase": None}
-
-         best = ranking[0]
-         best_combined = float(best.get("combined", best.get("score", -1.0)))
-         best_phrase_len = cls.phrase_token_count(best.get("phrase"))
-
-         for alt in ranking[1:]:
-             if not cls.is_supplier_extension(str(best.get("supplier") or ""), str(alt.get("supplier") or "")):
-                 continue
-
-             alt_combined = float(alt.get("combined", alt.get("score", -1.0)))
-             alt_phrase_len = cls.phrase_token_count(alt.get("phrase"))
-
-             if alt_phrase_len > best_phrase_len and alt_combined >= best_combined - 0.15:
-                 best = alt
-                 best_combined = alt_combined
-                 best_phrase_len = alt_phrase_len
-
-         return best
-
-     @staticmethod
-     def numeric_compatibility_multiplier(phrase_nums: set[str], candidate_nums: set[str]) -> float:
-         if not phrase_nums and not candidate_nums:
-             return 1.0
-         if phrase_nums == candidate_nums:
-             return 1.08
-         if phrase_nums and candidate_nums:
-             return 1.03 if phrase_nums & candidate_nums else 0.80
-         return 0.82
-
-     def lexical_support(self, phrase: str) -> float:
-         tokens = [token for token in normalize_text(phrase).split() if token and not token.isdigit()]
-         if not tokens or not self.supplier_lexicon:
-             return 0.0
-
-         support_scores: list[float] = []
-         for token in tokens:
-             cached = self.lexical_token_cache.get(token)
-             if cached is not None:
-                 support_scores.append(cached)
-                 continue
-
-             best = 0.0
-             for token_variant in self.cached_variants(token):
-                 for lex in self.supplier_lexicon:
-                     lev = Levenshtein.normalized_similarity(token_variant, lex)
-                     phon = phonetic_similarity(token_variant, lex)
-                     sim = max(lev, phon)
-                     if sim > best:
-                         best = sim
-
-             self.lexical_token_cache[token] = best
-             support_scores.append(best)
-
-         return sum(support_scores) / len(support_scores)
-
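-     # Blend character fuzz, token-set similarity, TF-IDF char-n-gram cosine,
-     # token alignment and phonetic similarity, scaled by a length penalty, and
-     # keep the best-scoring supplier for this phrase.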
-     def score_phrase(self, phrase: str) -> dict[str, Any]:
-         vs = self.cached_variants(phrase)
-         q = self.tfidf.transform(vs)
-         tf = cosine_similarity(q, self.sup_mat)
-
-         best: dict[str, Any] = {"supplier": None, "score": -1.0, "phrase": phrase, "variant": ""}
-         for i, cand in enumerate(self.sup_norm):
-             local = -1.0
-             local_variant = ""
-             candidate_nums = self.sup_num_sets[i]
-             for j, v in enumerate(vs):
-                 char = fuzz.ratio(v, cand) / 100.0
-                 tf_val = float(tf[j, i])
-                 penalty = length_penalty(len(v), len(cand))
-                 phon = phonetic_similarity(v, cand)
-                 phrase_nums = self.numeric_tokens(v)
-
-                 if len(v.split()) == 1 and len(cand.split()) == 1:
-                     lev = Levenshtein.normalized_similarity(v, cand)
-                     val = (0.45 * lev + 0.25 * char + 0.10 * tf_val + 0.20 * phon) * penalty
-                 else:
-                     align = token_alignment_score(v, self.sup_tokens[i])
-                     tok = fuzz.token_set_ratio(v, cand) / 100.0
-                     val = (0.30 * char + 0.20 * tok + 0.10 * tf_val + 0.20 * align + 0.20 * phon) * penalty
-
-                 compact_v = v.replace(" ", "")
-                 compact_cand = cand.replace(" ", "")
-                 compact_char = fuzz.ratio(compact_v, compact_cand) / 100.0
-                 compact_lev = Levenshtein.normalized_similarity(compact_v, compact_cand)
-                 compact_phon = phonetic_similarity(compact_v, compact_cand)
-                 compact = max(compact_char, compact_lev, compact_phon)
-                 if compact > 0.55:
-                     val = max(val, compact * penalty)
-
-                 val *= self.numeric_compatibility_multiplier(phrase_nums, candidate_nums)
-
-                 if val > local:
-                     local = val
-                     local_variant = v
-
-             if local > best["score"]:
-                 best = {"supplier": self.suppliers[i], "score": local, "phrase": phrase, "variant": local_variant}
-         return best
-
-     def extract(self, text: str, date_phrase: str | None = None, debug: bool = False) -> dict[str, Any]:
-         threshold = 0.50
-         excluded_tokens: set[str] = set()
-         if date_phrase:
-             excluded_tokens.update(normalize_text(date_phrase).split())
-         excluded_tokens.update(self.noise_terms)
-
-         raw_tokens = normalize_text(text).split()
-         tokens: list[str] = []
-         for token in raw_tokens:
-             if token in excluded_tokens:
-                 continue
-
-             if token.isdigit():
-                 if token in self.sup_number_tokens:
-                     tokens.append(token)
-
-                 if tokens and len(token) <= 3 and len(tokens[-1]) >= 4 and tokens[-1].isalpha():
-                     tokens.append(f"{tokens[-1]}{token}")
-                 continue
-
-             if len(token) > 1:
-                 tokens.append(token)
-
-         tokens = [t for t in tokens if len(t) > 1 and t not in excluded_tokens]
-
-         phrases: list[str] = []
-         seen: set[str] = set()
-         for i in range(len(tokens)):
-             for j in range(i + 1, min(i + 1 + self.max_words, len(tokens) + 1)):
-                 p = " ".join(tokens[i:j])
-                 if p not in seen:
-                     seen.add(p)
-                     phrases.append(p)
-
-         results = [self.score_phrase(p) for p in phrases]
-         candidate_rows: list[dict[str, Any]] = []
-         best_by_supplier: dict[str, dict[str, Any]] = {}
-         for row in results:
-             supplier = row["supplier"]
-             score = float(row.get("score", -1.0))
-             phrase = str(row.get("phrase") or "")
-             support = self.phrase_support_cache.get(phrase)
-             if support is None:
-                 support = self.lexical_support(phrase)
-                 self.phrase_support_cache[phrase] = support
-             combined = 0.75 * score + 0.25 * support
-
-             if debug:
-                 candidate_rows.append({
-                     "supplier": supplier,
-                     "phrase": phrase,
-                     "score": round(score, 4),
-                     "support": round(support, 4),
-                     "combined": round(combined, 4),
-                 })
-
-             enriched = {**row, "combined": combined}
-             passes = score >= threshold or combined >= 0.48
-             if passes and (supplier not in best_by_supplier or combined > float(best_by_supplier[supplier].get("combined", -1.0))):
-                 best_by_supplier[supplier] = enriched
-
-         if not best_by_supplier and results:
-             def support_for_phrase(phrase: str) -> float:
-                 cached_support = self.phrase_support_cache.get(phrase)
-                 if cached_support is None:
-                     cached_support = self.lexical_support(phrase)
-                     self.phrase_support_cache[phrase] = cached_support
-                 return cached_support
-
-             fallback = max(
-                 results,
-                 key=lambda item: 0.75 * float(item.get("score", -1.0)) + 0.25 * support_for_phrase(str(item.get("phrase") or "")),
-             )
-             fallback_score = float(fallback.get("score", -1.0))
-             fallback_phrase = str(fallback.get("phrase") or "")
-             fallback_support = support_for_phrase(fallback_phrase)
-             fallback_combined = 0.75 * fallback_score + 0.25 * fallback_support
-             if fallback_score >= 0.40 and fallback_support >= 0.43 and fallback_combined >= 0.43:
-                 best_by_supplier[fallback["supplier"]] = {**fallback, "combined": fallback_combined}
-
-         supplier_ranking = sorted(best_by_supplier.values(), key=lambda x: float(x.get("combined", x["score"])), reverse=True)
-         best = self.resolve_overlapping_suppliers(supplier_ranking)
-
-         payload = {
-             "supplier": best["supplier"],
-             "supplier_score": round(best["score"], 4) if best["score"] >= 0 else None,
-             "matched_supplier_phrase": best.get("phrase"),
-         }
-
-         if debug:
-             top_candidates = sorted(candidate_rows, key=lambda item: item["combined"], reverse=True)[:8]
-             payload["supplier_debug"] = {
-                 "tokens": tokens,
-                 "phrases_count": len(phrases),
-                 "top_candidates": top_candidates,
-             }
-
-         return payload
-
-
- class ExpenseAmountExtractor:
-     def __init__(self, suppliers: list[str]) -> None:
-         self.model = get_amount_model()
-
-     @staticmethod
-     def to_float(value: str) -> Optional[float]:
-         cleaned = value.replace(" ", "").replace("\u00A0", "")
-         match = re.search(r"\d+(?:[,]\d{1,2})?", cleaned)
-         if not match:
-             return None
-         try:
-             return float(match.group(0).replace(",", "."))
-         except ValueError:
-             return None
-
-     @staticmethod
-     def phrase_span(text: str, phrase: Optional[str]) -> Optional[tuple[int, int]]:
-         if not phrase:
-             return None
-         idx = text.lower().find(phrase.lower())
-         if idx == -1:
-             return None
-         return idx, idx + len(phrase)
-
-     @staticmethod
-     def overlaps(span1: tuple[int, int], span2: Optional[tuple[int, int]]) -> bool:
-         if span2 is None:
-             return False
-         return span1[0] < span2[1] and span2[0] < span1[1]
-
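-     # The NER span sometimes cuts a decimal amount at the comma; widen the
-     # span so values like "1 234,56" are captured whole.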
-     @staticmethod
-     def expand_amount_text(text: str, start: int, end: int) -> tuple[str, tuple[int, int]]:
-         suffix = re.match(r",\d{1,2}", text[end:])
-         if suffix:
-             new_end = end + len(suffix.group(0))
-             return text[start:new_end].strip(), (start, new_end)
-
-         prefix = re.search(r"(\d{1,3}(?:\s*\d{3})*),", text[:start])
-         if prefix:
-             new_start = prefix.start(1)
-             return text[new_start:end].strip(), (new_start, end)
-
-         return text[start:end].strip(), (start, end)
-
-     def extract(
-         self,
-         text: str,
-         matched_date_phrase: Optional[str] = None,
-         matched_supplier_phrase: Optional[str] = None,
-     ) -> dict[str, Any]:
-         if self.model is None:
-             return {"amount": None, "amount_text": None}
-
-         date_span = self.phrase_span(text, matched_date_phrase)
-         supplier_span = self.phrase_span(text, matched_supplier_phrase)
-         entities = self.model.predict_entities(text, ["money"], threshold=0.3)
-
-         for ent in sorted(entities, key=lambda item: float(item.get("score", 0.0)), reverse=True):
-             raw_span = (int(ent.get("start", 0)), int(ent.get("end", 0)))
-             amount_text, span = self.expand_amount_text(text, raw_span[0], raw_span[1])
-             amount = self.to_float(amount_text)
-             overlaps_date = self.overlaps(span, date_span)
-             overlaps_supplier = self.overlaps(span, supplier_span)
-
-             if amount is None:
-                 continue
-             if overlaps_date or overlaps_supplier:
-                 continue
-             return {"amount": amount, "amount_text": amount_text}
-
-         return {"amount": None, "amount_text": None}
-
-
- class ExpenseTextExtractor:
-     def __init__(self, suppliers: list[str], users: list[str]) -> None:
-         self.date_extractor = ExpenseDateExtractor()
-         self.supplier_extractor = ExpenseSupplierExtractor(suppliers=suppliers)
-         self.amount_extractor = ExpenseAmountExtractor(suppliers=suppliers)
-         self.user_extractor = ExpenseUserExtractor(users=users, suppliers=suppliers, model=get_embedding_model())
-
-     def extract(self, text: str, reference_date: str | date | None = None, debug_supplier: bool = False) -> dict[str, Any]:
-         date_info = self.date_extractor.extract(text, reference_date=reference_date)
-         supplier_info = self.supplier_extractor.extract(
-             text,
-             date_phrase=date_info.get("matched_date_phrase"),
-             debug=debug_supplier,
-         )
-         user_info = self.user_extractor.extract(
-             text,
-             supplier_phrase=supplier_info.get("matched_supplier_phrase"),
-             date_phrase=date_info.get("matched_date_phrase"),
-         )
-         amount_info = self.amount_extractor.extract(
-             text,
-             matched_date_phrase=date_info["matched_date_phrase"],
-             matched_supplier_phrase=supplier_info["matched_supplier_phrase"],
-         )
-
-         result = {
-             "text": text,
-             "user": user_info["user"],
-             "supplier": supplier_info["supplier"],
-             "amount": amount_info["amount"],
-             "date": date_info["date"],
-             "date_iso": date_info["date_iso"],
-         }
-         if debug_supplier and "supplier_debug" in supplier_info:
-             result["supplier_debug"] = supplier_info["supplier_debug"]
-         return result
-
-
- def build_default_pipeline(suppliers: list[str], users: list[str]) -> ExpenseTextExtractor:
-     return ExpenseTextExtractor(suppliers=suppliers, users=users)
-
-
- def extract_names(items: Any) -> list[str]:
-     if not isinstance(items, list):
-         return []
-
-     names: list[str] = []
-     for item in items:
-         if isinstance(item, dict):
-             name = item.get("name")
-             if isinstance(name, str) and name.strip():
-                 names.append(name.strip())
-             continue
-
-         if isinstance(item, str) and item.strip():
-             names.append(item.strip())
-
-     return names
-
-
- def polish_notes_text(text: str) -> str:
-     normalized = re.sub(r"\s+", " ", text).strip()
-     if not normalized:
-         return ""
-
-     normalized = normalized[0].upper() + normalized[1:]
-     if normalized[-1] not in ".!?":
-         normalized += "."
-
-     return normalized
-
-
- def transcribe_audio_text(audio_path: str) -> str:
-     mock_text = os.getenv("EXPENSE_VOICE_MOCK_TEXT")
-     if mock_text:
-         return mock_text.strip()
-
-     try:
-         whisper_model = get_whisper_model()
-         segments, _ = whisper_model.transcribe(audio_path, language="ru", vad_filter=True)
-         text = " ".join(segment.text.strip() for segment in segments if segment.text and segment.text.strip())
-         if text:
-             return text
-     except Exception:
-         pass
-
-     raise RuntimeError("Speech-to-text backend is unavailable. Install faster-whisper or set EXPENSE_VOICE_MOCK_TEXT.")
-
-
- def process_voice_request(audio_path: str, mode: str, payload: dict[str, Any]) -> dict[str, Any]:
-     context = payload.get("context", {}) if isinstance(payload, dict) else {}
-     supplier_names = extract_names(context.get("suppliers"))
-     user_names = extract_names(context.get("users"))
-
-     transcript = transcribe_audio_text(audio_path)
-
-     if mode == "notes":
-         notes = polish_notes_text(transcript)
-         return {
-             "status": "ok",
-             "text": transcript,
-             "notes": notes,
-             "supplier": None,
-             "user": None,
-             "date": None,
-             "sum": None,
-         }
-
-     if not supplier_names:
-         raise RuntimeError("No suppliers were provided by Laravel context.")
-
-     if not user_names:
-         raise RuntimeError("No users were provided by Laravel context.")
-
-     extractor = build_default_pipeline(suppliers=supplier_names, users=user_names)
-     extracted = extractor.extract(transcript, reference_date=date.today().isoformat())
-
-     return {
-         "status": "ok",
-         "text": transcript,
-         "notes": polish_notes_text(extracted.get("text") or transcript),
-         "supplier": extracted.get("supplier"),
-         "user": extracted.get("user"),
-         "date": extracted.get("date_iso") or extracted.get("date"),
-         "sum": extracted.get("amount"),
-     }
-
-
- def require_auth():
-     expected_token = os.getenv("PYTHON_API_TOKEN", os.getenv("EXPENSE_VOICE_FASTAPI_TOKEN", "")).strip()
-
-     if not expected_token:
-         return None
-
-     authorization = request.headers.get("Authorization", "")
-     if not authorization.startswith("Bearer "):
-         return jsonify({"status": "error", "message": "Missing bearer token."}), 401
-
-     provided = authorization.removeprefix("Bearer ").strip()
-     if provided != expected_token:
-         return jsonify({"status": "error", "message": "Invalid bearer token."}), 401
-
-     return None
-
-
- def parse_context(raw: str | None) -> dict[str, Any]:
-     if not raw:
-         return {}
-
-     try:
-         payload = json.loads(raw)
-         return payload if isinstance(payload, dict) else {}
-     except json.JSONDecodeError:
-         return {}
-
-
- @app.get("/health")
- def health():
-     return jsonify({"status": "ok"})
-
-
- @app.post("/process-audio")
- def process_audio():
-     auth_error = require_auth()
-     if auth_error:
-         return auth_error
-
-     audio = request.files.get("audio")
-     mode = (request.form.get("mode") or "expense").strip()
-     context = parse_context(request.form.get("context"))
-
-     if audio is None:
-         return jsonify({"status": "error", "message": "Audio file is required."}), 422
-
-     suffix = Path(audio.filename or "voice.webm").suffix or ".webm"
-     temp_path = None
-
-     try:
-         with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
-             temp_path = temp_file.name
-             audio.save(temp_file)
-
-         result = process_voice_request(audio_path=temp_path, mode=mode, payload={"context": context})
-         return jsonify(result)
-     except Exception as exception:
-         return jsonify({"status": "error", "message": str(exception)}), 422
-     finally:
-         if temp_path and os.path.exists(temp_path):
-             os.unlink(temp_path)
-
-
- if __name__ == "__main__":
      app.run(host="0.0.0.0", port=int(os.getenv("PORT", "7860")))

+ from __future__ import annotations
+
+ import calendar
+ import difflib
+ import json
+ import os
+ import re
+ import tempfile
+ import unicodedata
+ from dataclasses import dataclass
+ from datetime import date, datetime, timedelta
+ from pathlib import Path
+ from typing import Any, Optional
+
+ import iuliia
+ import torch
+ from dateparser.search import search_dates
+ from flask import Flask, jsonify, request
+ from gliner import GLiNER
+ from pymorphy3 import MorphAnalyzer
+ from rapidfuzz import fuzz
+ from rapidfuzz.distance import Levenshtein
+ from sentence_transformers import SentenceTransformer
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ MORPH = MorphAnalyzer()
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ _MODEL: Optional[SentenceTransformer] = None
+ _AMOUNT_MODEL: Optional[Any] = None
+ _WHISPER_MODEL: Optional[Any] = None
+
+
+ app = Flask(__name__)
+ app.config["MAX_CONTENT_LENGTH"] = 20 * 1024 * 1024
+
+
+ def get_embedding_model() -> SentenceTransformer:
+     global _MODEL
+
+     if _MODEL is None:
+         _MODEL = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B", device=DEVICE)
+
+     return _MODEL
+
+
+ def get_amount_model() -> Optional[Any]:
+     global _AMOUNT_MODEL
+
+     if _AMOUNT_MODEL is None and GLiNER is not None:
+         _AMOUNT_MODEL = GLiNER.from_pretrained("urchade/gliner_multi-v2.1")
+
+     return _AMOUNT_MODEL
+
+
+ def get_whisper_model() -> Any:
+     global _WHISPER_MODEL
+
+     if _WHISPER_MODEL is None:
+         from faster_whisper import WhisperModel
+
+         model_name = os.getenv("WHISPER_MODEL", "large-v3")
+         compute_type = os.getenv("WHISPER_COMPUTE_TYPE", "float16" if torch.cuda.is_available() else "int8")
+         _WHISPER_MODEL = WhisperModel(model_name, device=DEVICE, compute_type=compute_type)
+
+     return _WHISPER_MODEL
+
+
+ def normalize_text(text: str) -> str:
+     text = unicodedata.normalize("NFKD", text.lower())
+     text = "".join(ch for ch in text if not unicodedata.combining(ch))
+     return re.sub(r"[^\w\s]", "", text).strip()
+
+
+ def tokenize_text(text: str) -> list[str]:
+     return normalize_text(text).split()
+
+
+ def lemmatize_word(word: str) -> str:
+     return MORPH.parse(word)[0].normal_form if re.fullmatch(r"[а-я]+", word) else word
+
+
+ def lemmatize_text(text: str) -> list[str]:
+     return [lemmatize_word(word) for word in tokenize_text(text)]
+
+
+ def variants(text: str) -> list[str]:
+     base = normalize_text(text)
+     result = [base]
+
+     for schema in (iuliia.WIKIPEDIA, iuliia.MOSMETRO, iuliia.ALA_LC):
+         try:
+             v = normalize_text(schema.translate(base))
+             if v and v not in result:
+                 result.append(v)
+         except Exception:
+             pass
+
+     for v in list(result):
+         core = " ".join(w for w in v.split() if len(w) > 1 and any(ch.isalpha() for ch in w))
+         core = normalize_text(core)
+         if core and core not in result:
+             result.insert(0, core)
+
+     return result
+
+
+ def token_alignment_score(phrase_variant: str, candidate_tokens: list[str]) -> float:
+     phrase_tokens = [t for t in phrase_variant.split() if len(t) > 2]
+     if not phrase_tokens or not candidate_tokens:
+         return 0.0
+     best_scores = []
+     for pt in phrase_tokens:
+         best = 0.0
+         for ct in candidate_tokens:
+             sim = Levenshtein.normalized_similarity(pt, ct)
+             if sim > best:
+                 best = sim
+         best_scores.append(best)
+     return sum(best_scores) / len(best_scores)
+
+
+ def length_penalty(phrase_len: int, candidate_len: int) -> float:
+     if phrase_len == 0 or candidate_len == 0:
+         return 0.0
+     ratio = min(phrase_len, candidate_len) / max(phrase_len, candidate_len)
+     if ratio >= 0.80:
+         return 1.0
+     if ratio >= 0.60:
+         return 0.90
+     if ratio >= 0.40:
+         return 0.70
+     return 0.50
+
+
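+ # Collapse common Latin transliteration digraphs to one canonical spelling so
+ # that different romanizations of the same Russian name compare as equal, then
+ # squash runs of repeated characters.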
+ def canonicalize_for_similarity(text: str) -> str:
+     t = normalize_text(text).replace(" ", "")
+     replacements = (
+         ("sch", "sh"),
+         ("tch", "ch"),
+         ("dzh", "j"),
+         ("zh", "j"),
+         ("sh", "s"),
+         ("ch", "c"),
+         ("kh", "h"),
+         ("ph", "f"),
+         ("ck", "k"),
+         ("qu", "k"),
+         ("q", "k"),
+         ("w", "v"),
+         ("x", "ks"),
+         ("ts", "z"),
+         ("tz", "z"),
+     )
+     for src, dst in replacements:
+         t = t.replace(src, dst)
+     return re.sub(r"(.)\1+", r"\1", t)
+
+
+ def phonetic_similarity(left: str, right: str) -> float:
+     l = canonicalize_for_similarity(left)
+     r = canonicalize_for_similarity(right)
+     if not l or not r:
+         return 0.0
+     char = fuzz.ratio(l, r) / 100.0
+     lev = Levenshtein.normalized_similarity(l, r)
+     return 0.50 * char + 0.50 * lev
+
+
+ @dataclass(frozen=True)
+ class ParsedDate:
+     date_iso: str
+     matched_expression: Optional[str]
+
+
+ @dataclass(frozen=True)
+ class Token:
+     original: str
+     normalized: str
+     raw_lemma: str
+     lemma: str
+     lemma_correction: Optional[str]
+     start: int
+     end: int
+     lemma_start: int
+     lemma_end: int
+
+
+ WORD_RE = re.compile(r"[0-9]+(?:[./-][0-9]+)*|[а-яё]+", re.IGNORECASE)
+
+
+ class UniversalDateParser:
+     MONTHS = {
+         "январь": 1, "февраль": 2, "март": 3, "апрель": 4, "май": 5, "июнь": 6,
+         "июль": 7, "август": 8, "сентябрь": 9, "октябрь": 10, "ноябрь": 11, "декабрь": 12,
+     }
+     WEEKDAYS = {
+         "понедельник": 0, "вторник": 1, "среда": 2, "четверг": 3,
+         "пятница": 4, "суббота": 5, "воскресенье": 6,
+     }
+     DIRECT_RELATIVE = {"послезавтра": 2, "позавчера": -2, "сегодня": 0, "вчера": -1, "завтра": 1}
+     ORDINAL_DAYS = {
+         "первый": 1, "второй": 2, "третий": 3, "четвертый": 4, "пятый": 5, "шестой": 6,
+         "седьмой": 7, "восьмой": 8, "девятый": 9, "десятый": 10, "одиннадцатый": 11,
+         "двенадцатый": 12, "тринадцатый": 13, "четырнадцатый": 14, "пятнадцатый": 15,
+         "шестнадцатый": 16, "семнадцатый": 17, "восемнадцатый": 18, "девятнадцатый": 19,
+         "двадцатый": 20, "двадцать первый": 21, "двадцать второй": 22, "двадцать третий": 23,
+         "двадцать четвертый": 24, "двадцать пятый": 25, "двадцать шестой": 26,
+         "двадцать седьмой": 27, "двадцать восьмой": 28, "двадцать девятый": 29,
+         "тридцатый": 30, "тридцать первый": 31,
+     }
+     NUMBER_WORDS = {
+         "ноль": 0, "один": 1, "два": 2, "три": 3, "четыре": 4, "пять": 5, "шесть": 6,
+         "семь": 7, "восемь": 8, "девять": 9, "десять": 10, "одиннадцать": 11,
+         "двенадцать": 12, "тринадцать": 13, "четырнадцать": 14, "пятнадцать": 15,
+         "шестнадцать": 16, "семнадцать": 17, "восемнадцать": 18, "девятнадцать": 19,
+         "двадцать": 20, "тридцать": 30,
+     }
+     FUTURE_HINTS = ("завтра", "послезавтра", "через", "быть", "заплатить", "следующий", "последующий")
+     PAST_HINTS = ("вчера", "позавчера", "назад", "прошлый", "предыдущий", "оплатить", "купить", "заказать")
+
+     DIRECT_RELATIVE_RE = re.compile(r"(?<!\S)(послезавтра|позавчера|сегодня|вчера|завтра)(?!\S)")
+     WEEK_RELATIVE_RE = re.compile(
+         r"(?<!\S)на (?P<which>следующий|последующий|прошлый|предыдущий|этот) неделя"
+         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)"
+     )
+     QUANTITY_RELATIVE_RE = re.compile(
+         r"(?<!\S)(?P<number>\d+|[а-яё]+(?: [а-яё]+)?) "
+         r"(?P<unit>месяц|неделя|день) "
+         r"(?P<ago>назад)"
+         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
+         re.IGNORECASE,
+     )
+     FORWARD_QUANTITY_RE = re.compile(
+         r"(?<!\S)(?P<through>через) "
+         r"(?P<number>\d+|[а-яё]+(?: [а-яё]+)?) "
+         r"(?P<unit>месяц|неделя|день)"
+         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
+         re.IGNORECASE,
+     )
+     FORWARD_SINGLE_UNIT_RE = re.compile(
+         r"(?<!\S)(?P<through>через) "
+         r"(?P<unit>месяц|неделя|день)"
+         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
+         re.IGNORECASE,
+     )
+     TEXTUAL_ABSOLUTE_RE = re.compile(
+         r"(?<!\S)(?P<day>\d{1,2}|[а-яё]+(?: [а-яё]+)?) "
+         r"(?P<month>январь|февраль|март|апрель|май|июнь|июль|август|сентябрь|октябрь|ноябрь|декабрь)"
+         r"(?: (?P<year>\d{4}))?(?!\S)",
+         re.IGNORECASE,
+     )
+     PERIOD_EDGE_RE = re.compile(
+         r"(?<!\S)(?:в )?(?P<edge>начало|конец) (?P<which>этот|следующий|последующий|прошлый|предыдущий) (?P<unit>неделя|месяц)(?!\S)",
+         re.IGNORECASE,
+     )
+
+     @classmethod
+     def temporal_vocabulary(cls) -> set[str]:
+         vocab: set[str] = set()
+         vocab.update(cls.MONTHS)
+         vocab.update(cls.WEEKDAYS)
+         vocab.update(cls.DIRECT_RELATIVE)
+         vocab.update(cls.ORDINAL_DAYS)
+         vocab.update(cls.NUMBER_WORDS)
+         vocab.update({
+             "неделя", "месяц", "день", "назад", "через", "начало", "конец", "на", "в", "во",
+             "этот", "прошлый", "предыдущий", "следующий", "последующий",
+         })
+         return vocab
+
+     @staticmethod
+     def similarity(left: str, right: str) -> float:
+         return difflib.SequenceMatcher(None, left, right).ratio()
+
+     @classmethod
+     def pick_temporal_correction(cls, normalized: str, raw_lemma: str) -> tuple[str, Optional[str]]:
+         vocab = cls.temporal_vocabulary()
+         if raw_lemma in vocab or not normalized.isalpha() or len(normalized) < 5:
+             return raw_lemma, None
+
+         candidates = list(difflib.get_close_matches(normalized, list(vocab), n=4, cutoff=0.74))
+         candidates.extend(difflib.get_close_matches(raw_lemma, list(vocab), n=4, cutoff=0.74))
+         candidates = list(dict.fromkeys(candidates))
+         if not candidates:
+             return raw_lemma, None
+
+         best = max(candidates, key=lambda item: max(cls.similarity(normalized, item), cls.similarity(raw_lemma, item)))
+         best_score = max(cls.similarity(normalized, best), cls.similarity(raw_lemma, best))
+         return (best, f"{raw_lemma}->{best}") if best_score >= 0.80 else (raw_lemma, None)
+
+     @staticmethod
+     def normalize_word(word: str) -> str:
+         return word.lower().replace("ё", "е")
+
+     @classmethod
+     def lemmatize(cls, word: str) -> str:
+         return MORPH.parse(word)[0].normal_form if word.isalpha() else word
+
+     @classmethod
+     def tokenize(cls, text: str) -> list[Token]:
+         tokens: list[Token] = []
+         lemma_cursor = 0
+
+         for match in WORD_RE.finditer(text):
+             original = match.group(0)
+             normalized = cls.normalize_word(original)
+             raw_lemma = cls.lemmatize(normalized)
+             lemma, correction = cls.pick_temporal_correction(normalized, raw_lemma)
+             lemma_start = lemma_cursor
+             lemma_end = lemma_start + len(lemma)
+             tokens.append(Token(original, normalized, raw_lemma, lemma, correction, match.start(), match.end(), lemma_start, lemma_end))
+             lemma_cursor = lemma_end + 1
+
+         return tokens
+
+     @staticmethod
+     def lemma_text(tokens: list[Token]) -> str:
+         return " ".join(token.lemma for token in tokens)
+
+     @staticmethod
+     def surface_text(text: str, tokens: list[Token], start_idx: int, end_idx: int) -> str:
+         return text[tokens[start_idx].start:tokens[end_idx].end].strip() if tokens else ""
+
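+     # Map a character span inside the space-joined lemma string back to the
+     # range of token indices it covers, so regex matches over lemmas can be
+     # traced back to the original surface text.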
325
+ @staticmethod
326
+ def lemma_span_to_token_range(tokens: list[Token], span: tuple[int, int]) -> Optional[tuple[int, int]]:
327
+ start_char, end_char = span
328
+ start_idx = end_idx = None
329
+
330
+ for idx, token in enumerate(tokens):
331
+ if start_idx is None and token.lemma_start <= start_char < token.lemma_end:
332
+ start_idx = idx
333
+ if token.lemma_start < end_char <= token.lemma_end:
334
+ end_idx = idx
335
+ break
336
+
337
+ return (start_idx, end_idx) if start_idx is not None and end_idx is not None else None
338
+
339
+ @classmethod
340
+ def make_parsed_date(cls, text: str, tokens: list[Token], match, parsed_date: date) -> Optional[ParsedDate]:
341
+ token_span = cls.lemma_span_to_token_range(tokens, match.span())
342
+ if token_span is None:
343
+ return None
344
+ return ParsedDate(parsed_date.isoformat(), cls.surface_text(text, tokens, token_span[0], token_span[1]))

    @classmethod
    def parse_number_phrase(cls, phrase: str) -> Optional[int]:
        phrase = phrase.strip()
        if not phrase:
            return None
        if phrase.isdigit():
            return int(phrase)

        parts = phrase.split()
        if len(parts) == 1:
            return cls.NUMBER_WORDS.get(parts[0])
        if len(parts) == 2 and parts[0] in {"двадцать", "тридцать"}:
            base = cls.NUMBER_WORDS.get(parts[0])
            addon = cls.NUMBER_WORDS.get(parts[1])
            if base is not None and addon is not None and 1 <= addon <= 9:
                return base + addon
        return None

    @classmethod
    def parse_day_phrase(cls, phrase: str) -> Optional[int]:
        if phrase.isdigit():
            value = int(phrase)
            return value if 1 <= value <= 31 else None
        return cls.ORDINAL_DAYS.get(phrase.strip())
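    # Example of day clamping: shift_months(date(2025, 1, 31), 1)
    # returns date(2025, 2, 28).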
    @staticmethod
    def shift_months(value: date, months: int) -> date:
        month_index = value.month - 1 + months
        year = value.year + month_index // 12
        month = month_index % 12 + 1
        day = min(value.day, calendar.monthrange(year, month)[1])
        return date(year, month, day)
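    # Illustrative (assuming WORD_RE keeps ".", "-" and "/" inside a token):
    # a token "12.03.2025" yields ParsedDate("2025-03-12", "12.03.2025").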
    @staticmethod
    def parse_numeric_absolute(tokens: list[Token]) -> Optional[ParsedDate]:
        for token in tokens:
            separator = "." if "." in token.original else "-" if "-" in token.original else "/" if "/" in token.original else None
            if separator is None:
                continue

            parts = token.original.split(separator)
            if len(parts) != 3 or not all(part.isdigit() for part in parts):
                continue

            try:
                if len(parts[0]) == 4:
                    parsed = date(int(parts[0]), int(parts[1]), int(parts[2]))
                elif len(parts[2]) == 4:
                    parsed = date(int(parts[2]), int(parts[1]), int(parts[0]))
                else:
                    continue
                return ParsedDate(parsed.isoformat(), token.original)
            except ValueError:
                continue

        return None

    @classmethod
    def parse_textual_absolute(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
        lemma_text = cls.lemma_text(tokens)
        for match in cls.TEXTUAL_ABSOLUTE_RE.finditer(lemma_text):
            day = cls.parse_day_phrase(match.group("day"))
            month = cls.MONTHS.get(match.group("month"))
            if day is None or month is None:
                continue

            year = int(match.group("year")) if match.group("year") else reference_date.year
            try:
                parsed = date(year, month, day)
            except ValueError:
                continue

            result = cls.make_parsed_date(text, tokens, match, parsed)
            if result is not None:
                return result

        return None
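    # A minimal sketch (assuming DIRECT_RELATIVE maps "вчера" to -1):
    # "вчера" with reference 2025-01-10 resolves to 2025-01-09.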
    @classmethod
    def parse_direct_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
        lemma_text = cls.lemma_text(tokens)
        match = cls.DIRECT_RELATIVE_RE.search(lemma_text)
        if not match:
            return None

        parsed = reference_date + timedelta(days=cls.DIRECT_RELATIVE[match.group(1)])
        return cls.make_parsed_date(text, tokens, match, parsed)
433
+
434
+ @staticmethod
435
+ def week_monday(value: date) -> date:
436
+ return value - timedelta(days=value.weekday())
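    # Example (assuming WEEKDAYS maps "понедельник" to 0): with reference
    # Wednesday 2025-01-15, "следующий понедельник" anchors a week ahead
    # (2025-01-22) and snaps to that week's Monday, 2025-01-20.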
    @classmethod
    def parse_week_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
        lemma_text = cls.lemma_text(tokens)
        match = cls.WEEK_RELATIVE_RE.search(lemma_text)
        if not match:
            return None

        offsets = {"следующий": 7, "последующий": 7, "прошлый": -7, "предыдущий": -7, "этот": 0}
        anchor = reference_date + timedelta(days=offsets[match.group("which")])

        if match.group("weekday"):
            anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])

        return cls.make_parsed_date(text, tokens, match, anchor)
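    # Example: "конец следующего месяца" lemmatizes to edge="конец",
    # which="следующий", unit="месяц"; with reference 2025-01-15 this
    # yields 2025-02-28, the last day of the following month.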
    @classmethod
    def parse_period_edge(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
        lemma_text = cls.lemma_text(tokens)
        match = cls.PERIOD_EDGE_RE.search(lemma_text)
        if not match:
            return None

        edge, which, unit = match.group("edge"), match.group("which"), match.group("unit")

        if unit == "неделя":
            offsets = {"прошлый": -7, "предыдущий": -7, "этот": 0, "следующий": 7, "последующий": 7}
            monday = cls.week_monday(reference_date + timedelta(days=offsets[which]))
            parsed_date = monday if edge == "начало" else monday + timedelta(days=6)
        else:
            month_offset = {"прошлый": -1, "предыдущий": -1, "этот": 0, "следующий": 1, "последующий": 1}[which]
            shifted = cls.shift_months(date(reference_date.year, reference_date.month, 1), month_offset)
            parsed_date = shifted if edge == "начало" else date(shifted.year, shifted.month, calendar.monthrange(shifted.year, shifted.month)[1])

        return cls.make_parsed_date(text, tokens, match, parsed_date)
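    # Example (assuming NUMBER_WORDS maps "два" to 2): "две недели назад"
    # lemmatizes to "два неделя назад", so number=2, unit="неделя", and the
    # anchor becomes reference_date - 14 days.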
    @classmethod
    def parse_quantity_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
        lemma_text = cls.lemma_text(tokens)

        for regex, direction in ((cls.QUANTITY_RELATIVE_RE, -1), (cls.FORWARD_QUANTITY_RE, 1)):
            for match in regex.finditer(lemma_text):
                number = cls.parse_number_phrase(match.group("number"))
                if number is None:
                    continue

                unit = match.group("unit")
                if unit == "месяц":
                    anchor = cls.shift_months(reference_date, direction * number)
                else:
                    days = number * 7 if unit == "неделя" else number
                    anchor = reference_date + timedelta(days=direction * days)

                if match.group("weekday"):
                    anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])

                result = cls.make_parsed_date(text, tokens, match, anchor)
                if result is not None:
                    return result

        for match in cls.FORWARD_SINGLE_UNIT_RE.finditer(lemma_text):
            unit = match.group("unit")
            if unit == "месяц":
                anchor = cls.shift_months(reference_date, 1)
            else:
                days = 7 if unit == "неделя" else 1
                anchor = reference_date + timedelta(days=days)

            if match.group("weekday"):
                anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])

            result = cls.make_parsed_date(text, tokens, match, anchor)
            if result is not None:
                return result

        return None

    @classmethod
    def preference_for_text(cls, tokens: list[Token]) -> str:
        lemmas = [token.lemma for token in tokens]
        future = sum(1 for hint in cls.FUTURE_HINTS if hint in lemmas)
        past = sum(1 for hint in cls.PAST_HINTS if hint in lemmas)
        return "future" if future > past else "past"

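    # Prefers the longest matched expression; ties break toward the earlier
    # datetime (reverse sort on (length, -timestamp)).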
    @staticmethod
    def choose_best(matches: list[tuple[str, datetime]]) -> tuple[str, datetime]:
        return sorted(matches, key=lambda item: (len(item[0]), -item[1].timestamp()), reverse=True)[0]
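    # Deterministic rules run first (numeric/textual absolutes, then direct,
    # week, period-edge and quantity relatives); dateparser.search_dates is
    # only a fallback over the normalized token stream.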
    def parse(self, text: str, reference_date: date) -> Optional[ParsedDate]:
        tokens = self.tokenize(text)

        for parser in (
            lambda: self.parse_numeric_absolute(tokens),
            lambda: self.parse_textual_absolute(text, tokens, reference_date),
            lambda: self.parse_direct_relative(text, tokens, reference_date),
            lambda: self.parse_week_relative(text, tokens, reference_date),
            lambda: self.parse_period_edge(text, tokens, reference_date),
            lambda: self.parse_quantity_relative(text, tokens, reference_date),
        ):
            parsed = parser()
            if parsed is not None:
                return parsed

        normalized = " ".join(token.normalized for token in tokens)
        relative_base = datetime.combine(reference_date, datetime.min.time()).replace(hour=12)
        result = search_dates(
            normalized,
            languages=["ru"],
            settings={
                "RELATIVE_BASE": relative_base,
                "PREFER_DATES_FROM": self.preference_for_text(tokens),
                "STRICT_PARSING": False,
                "REQUIRE_PARTS": [],
                "NORMALIZE": True,
                "RETURN_AS_TIMEZONE_AWARE": False,
                "DATE_ORDER": "DMY",
            },
        )

        filtered: list[tuple[str, datetime]] = []
        for matched, value in result or []:
            if isinstance(value, datetime) and not matched.strip().isdigit() and 2020 <= value.year <= 2100:
                filtered.append((matched.strip(), value))

        if not filtered:
            return None

        matched_expression, value = self.choose_best(filtered)
        return ParsedDate(date_iso=value.date().isoformat(), matched_expression=matched_expression)


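# Illustrative usage (assuming "позавчера" is a DIRECT_RELATIVE entry for -2):
#   ExpenseDateExtractor().extract("купил хлеб позавчера", "2025-01-10")
#   would return {"date": "08.01.2025", "date_iso": "2025-01-08",
#                 "matched_date_phrase": "позавчера"}.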
class ExpenseDateExtractor:
    def __init__(self) -> None:
        self.parser = UniversalDateParser()

    def extract(self, text: str, reference_date: str | date | None = None) -> dict[str, Any]:
        ref_date = self.to_date(reference_date or date.today().isoformat())
        parsed = self.parser.parse(text=text, reference_date=ref_date)

        return {
            "date": datetime.strptime(parsed.date_iso, "%Y-%m-%d").strftime("%d.%m.%Y") if parsed else None,
            "date_iso": parsed.date_iso if parsed else None,
            "matched_date_phrase": parsed.matched_expression if parsed else None,
        }

    @staticmethod
    def to_date(value: str | date) -> date:
        return value if isinstance(value, date) else datetime.strptime(value, "%Y-%m-%d").date()


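# Scores each candidate word against the user list with Qwen3 embeddings
# (using "query:"/"passage:" prefixes); if nothing clears the threshold,
# a bare pronoun "я" is accepted as a literal self-reference.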
class ExpenseUserExtractor:
    def __init__(self, users: list[str], suppliers: list[str], model: SentenceTransformer, threshold: float = 0.6) -> None:
        self.users = users
        self.model = model
        self.threshold = threshold
        self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
        self.user_terms = [normalize_text(user) for user in users]
        self.user_embeddings = model.encode(
            [f"passage: {user}" for user in self.user_terms],
            convert_to_tensor=True,
            normalize_embeddings=True,
        )

    def extract(self, text: str, supplier_phrase: str | None = None, date_phrase: str | None = None) -> dict[str, Any]:
        excluded_tokens: set[str] = set()
        if supplier_phrase:
            excluded_tokens.update(normalize_text(supplier_phrase).split())
        if date_phrase:
            excluded_tokens.update(normalize_text(date_phrase).split())

        best_user = None
        best_score = -1.0
        best_phrase = None

        for word in lemmatize_text(text):
            if len(word) < 3:
                continue
            if word in excluded_tokens or word in self.supplier_terms:
                continue

            query_emb = self.model.encode(
                f"query: {word}",
                convert_to_tensor=True,
                normalize_embeddings=True,
            )
            similarities = torch.cosine_similarity(query_emb.unsqueeze(0), self.user_embeddings, dim=1)
            idx = int(torch.argmax(similarities))
            score = similarities[idx].item()

            if score > best_score:
                best_score = score
                best_user = self.users[idx]
                best_phrase = word

        if best_score >= self.threshold:
            return {
                "user": best_user,
                "user_score": round(best_score, 4),
                "matched_user_phrase": best_phrase,
            }

        if re.search(r"(?<!\S)я(?!\S)", normalize_text(text), re.IGNORECASE):
            return {
                "user": "Я",
                "user_score": 1.0,
                "matched_user_phrase": "я",
            }

        return {
            "user": None,
            "user_score": None,
            "matched_user_phrase": None,
        }


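# Fuzzy supplier matching: candidate phrases from the utterance are scored
# against the catalog with char-level TF-IDF, RapidFuzz ratios, Levenshtein,
# phonetic similarity and a numeric-token compatibility multiplier.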
class ExpenseSupplierExtractor:
    def __init__(self, suppliers: list[str]) -> None:
        self.suppliers = suppliers
        self.sup_norm = [normalize_text(s) for s in suppliers]
        self.sup_tokens = [s.split() for s in self.sup_norm]
        self.sup_num_sets = [self.numeric_tokens(s) for s in self.sup_norm]
        self.sup_number_tokens = {token for supplier in self.sup_tokens for token in supplier if token.isdigit()}
        self.supplier_lexicon = [
            token
            for token in sorted({tok for tokens in self.sup_tokens for tok in tokens})
            if token and not token.isdigit()
        ]
        self.tfidf = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5))
        self.sup_mat = self.tfidf.fit_transform(self.sup_norm)
        self.max_words = max(len(s.split()) for s in self.sup_norm)
        self.variant_cache: dict[str, list[str]] = {}
        self.lexical_token_cache: dict[str, float] = {}
        self.phrase_support_cache: dict[str, float] = {}
        self.noise_terms = {
            "за", "на", "из", "для", "под", "над", "при", "без", "и", "или",
            "купил", "купила", "купили", "покупка", "заказал", "заказала", "заказали",
            "оплатил", "оплатила", "оплатили", "заплатил", "заплатила", "заплатили",
            "был", "была", "было", "были", "утром", "днем", "днём", "вечером", "ночью",
            "товар", "товары", "продукт", "продукты", "десерт", "еда",
            "лей", "лея", "леи", "целых", "сотых", "сом", "сомов", "руб", "рублей", "грн", "usd", "eur",
        }
        self.noise_terms.update(UniversalDateParser.temporal_vocabulary())

    @staticmethod
    def numeric_tokens(text: str) -> set[str]:
        return set(re.findall(r"\d+", text))

    def cached_variants(self, text: str) -> list[str]:
        key = normalize_text(text)
        cached = self.variant_cache.get(key)
        if cached is None:
            cached = variants(key)
            self.variant_cache[key] = cached
        return cached

    @staticmethod
    def split_words(text: str) -> list[str]:
        return [w for w in normalize_text(text).split() if w]

    @classmethod
    def is_supplier_extension(cls, base_supplier: str, extended_supplier: str) -> bool:
        base_tokens = cls.split_words(base_supplier)
        extended_tokens = cls.split_words(extended_supplier)
        return len(base_tokens) < len(extended_tokens) and extended_tokens[:len(base_tokens)] == base_tokens

    @classmethod
    def phrase_token_count(cls, phrase: str | None) -> int:
        return len(cls.split_words(phrase or ""))

    @classmethod
    def resolve_overlapping_suppliers(cls, ranking: list[dict[str, Any]]) -> dict[str, Any]:
        if not ranking:
            return {"supplier": None, "score": -1.0, "phrase": None}

        best = ranking[0]
        best_combined = float(best.get("combined", best.get("score", -1.0)))
        best_phrase_len = cls.phrase_token_count(best.get("phrase"))

        for alt in ranking[1:]:
            if not cls.is_supplier_extension(str(best.get("supplier") or ""), str(alt.get("supplier") or "")):
                continue

            alt_combined = float(alt.get("combined", alt.get("score", -1.0)))
            alt_phrase_len = cls.phrase_token_count(alt.get("phrase"))

            if alt_phrase_len > best_phrase_len and alt_combined >= best_combined - 0.15:
                best = alt
                best_combined = alt_combined
                best_phrase_len = alt_phrase_len

        return best
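    # Example: identical digit sets ("24" vs "24") earn a 1.08 boost,
    # overlapping-but-unequal sets 1.03, disjoint sets a 0.80 penalty,
    # and digits on only one side 0.82.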
    @staticmethod
    def numeric_compatibility_multiplier(phrase_nums: set[str], candidate_nums: set[str]) -> float:
        if not phrase_nums and not candidate_nums:
            return 1.0
        if phrase_nums == candidate_nums:
            return 1.08
        if phrase_nums and candidate_nums:
            return 1.03 if phrase_nums & candidate_nums else 0.80
        return 0.82

    def lexical_support(self, phrase: str) -> float:
        tokens = [token for token in normalize_text(phrase).split() if token and not token.isdigit()]
        if not tokens or not self.supplier_lexicon:
            return 0.0

        support_scores: list[float] = []
        for token in tokens:
            cached = self.lexical_token_cache.get(token)
            if cached is not None:
                support_scores.append(cached)
                continue

            best = 0.0
            for token_variant in self.cached_variants(token):
                for lex in self.supplier_lexicon:
                    lev = Levenshtein.normalized_similarity(token_variant, lex)
                    phon = phonetic_similarity(token_variant, lex)
                    sim = max(lev, phon)
                    if sim > best:
                        best = sim

            self.lexical_token_cache[token] = best
            support_scores.append(best)

        return sum(support_scores) / len(support_scores)
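    # Single-word pairs lean on Levenshtein (0.45 weight); multi-word pairs
    # blend char ratio, token-set ratio, TF-IDF, alignment and phonetics.
    # A space-free "compact" comparison may override either once it
    # exceeds 0.55.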
    def score_phrase(self, phrase: str) -> dict[str, Any]:
        vs = self.cached_variants(phrase)
        q = self.tfidf.transform(vs)
        tf = cosine_similarity(q, self.sup_mat)

        best: dict[str, Any] = {"supplier": None, "score": -1.0, "phrase": phrase, "variant": ""}
        for i, cand in enumerate(self.sup_norm):
            local = -1.0
            local_variant = ""
            candidate_nums = self.sup_num_sets[i]
            for j, v in enumerate(vs):
                char = fuzz.ratio(v, cand) / 100.0
                tf_val = float(tf[j, i])
                penalty = length_penalty(len(v), len(cand))
                phon = phonetic_similarity(v, cand)
                phrase_nums = self.numeric_tokens(v)

                if len(v.split()) == 1 and len(cand.split()) == 1:
                    lev = Levenshtein.normalized_similarity(v, cand)
                    val = (0.45 * lev + 0.25 * char + 0.10 * tf_val + 0.20 * phon) * penalty
                else:
                    align = token_alignment_score(v, self.sup_tokens[i])
                    tok = fuzz.token_set_ratio(v, cand) / 100.0
                    val = (0.30 * char + 0.20 * tok + 0.10 * tf_val + 0.20 * align + 0.20 * phon) * penalty

                compact_v = v.replace(" ", "")
                compact_cand = cand.replace(" ", "")
                compact_char = fuzz.ratio(compact_v, compact_cand) / 100.0
                compact_lev = Levenshtein.normalized_similarity(compact_v, compact_cand)
                compact_phon = phonetic_similarity(compact_v, compact_cand)
                compact = max(compact_char, compact_lev, compact_phon)
                if compact > 0.55:
                    val = max(val, compact * penalty)

                val *= self.numeric_compatibility_multiplier(phrase_nums, candidate_nums)

                if val > local:
                    local = val
                    local_variant = v

            if local > best["score"]:
                best = {"supplier": self.suppliers[i], "score": local, "phrase": phrase, "variant": local_variant}
        return best
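    # Candidate phrases are all n-grams (up to the longest catalog name) built
    # from tokens that survive noise-word and date-phrase filtering; digits are
    # kept only when they occur in a catalog name or glue onto a preceding word.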
    def extract(self, text: str, date_phrase: str | None = None, debug: bool = False) -> dict[str, Any]:
        threshold = 0.50
        excluded_tokens: set[str] = set()
        if date_phrase:
            excluded_tokens.update(normalize_text(date_phrase).split())
        excluded_tokens.update(self.noise_terms)

        raw_tokens = normalize_text(text).split()
        tokens: list[str] = []
        for token in raw_tokens:
            if token in excluded_tokens:
                continue

            if token.isdigit():
                if token in self.sup_number_tokens:
                    tokens.append(token)

                if tokens and len(token) <= 3 and len(tokens[-1]) >= 4 and tokens[-1].isalpha():
                    tokens.append(f"{tokens[-1]}{token}")
                continue

            if len(token) > 1:
                tokens.append(token)

        tokens = [t for t in tokens if len(t) > 1 and t not in excluded_tokens]

        phrases: list[str] = []
        seen: set[str] = set()
        for i in range(len(tokens)):
            for j in range(i + 1, min(i + 1 + self.max_words, len(tokens) + 1)):
                p = " ".join(tokens[i:j])
                if p not in seen:
                    seen.add(p)
                    phrases.append(p)

        results = [self.score_phrase(p) for p in phrases]
        candidate_rows: list[dict[str, Any]] = []
        best_by_supplier: dict[str, dict[str, Any]] = {}
        for row in results:
            supplier = row["supplier"]
            score = float(row.get("score", -1.0))
            phrase = str(row.get("phrase") or "")
            support = self.phrase_support_cache.get(phrase)
            if support is None:
                support = self.lexical_support(phrase)
                self.phrase_support_cache[phrase] = support
            combined = 0.75 * score + 0.25 * support

            if debug:
                candidate_rows.append({
                    "supplier": supplier,
                    "phrase": phrase,
                    "score": round(score, 4),
                    "support": round(support, 4),
                    "combined": round(combined, 4),
                })

            enriched = {**row, "combined": combined}
            passes = score >= threshold or combined >= 0.48
            if passes and (supplier not in best_by_supplier or combined > float(best_by_supplier[supplier].get("combined", -1.0))):
                best_by_supplier[supplier] = enriched

        if not best_by_supplier and results:
            def support_for_phrase(phrase: str) -> float:
                cached_support = self.phrase_support_cache.get(phrase)
                if cached_support is None:
                    cached_support = self.lexical_support(phrase)
                    self.phrase_support_cache[phrase] = cached_support
                return cached_support

            fallback = max(
                results,
                key=lambda item: 0.75 * float(item.get("score", -1.0)) + 0.25 * support_for_phrase(str(item.get("phrase") or "")),
            )
            fallback_score = float(fallback.get("score", -1.0))
            fallback_phrase = str(fallback.get("phrase") or "")
            fallback_support = support_for_phrase(fallback_phrase)
            fallback_combined = 0.75 * fallback_score + 0.25 * fallback_support
            if fallback_score >= 0.40 and fallback_support >= 0.43 and fallback_combined >= 0.43:
                best_by_supplier[fallback["supplier"]] = {**fallback, "combined": fallback_combined}

        supplier_ranking = sorted(best_by_supplier.values(), key=lambda x: float(x.get("combined", x["score"])), reverse=True)
        best = self.resolve_overlapping_suppliers(supplier_ranking)

        payload = {
            "supplier": best["supplier"],
            "supplier_score": round(best["score"], 4) if best["score"] >= 0 else None,
            "matched_supplier_phrase": best.get("phrase"),
        }

        if debug:
            top_candidates = sorted(candidate_rows, key=lambda item: item["combined"], reverse=True)[:8]
            payload["supplier_debug"] = {
                "tokens": tokens,
                "phrases_count": len(phrases),
                "top_candidates": top_candidates,
            }

        return payload


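# GLiNER zero-shot NER proposes "money" spans; spans overlapping the already
# matched date or supplier phrase are discarded before parsing the amount.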
class ExpenseAmountExtractor:
    def __init__(self, suppliers: list[str]) -> None:
        # `suppliers` is currently unused; kept so the constructor mirrors
        # the other extractors.
        self.model = get_amount_model()

    @staticmethod
    def to_float(value: str) -> Optional[float]:
        # Accept both comma and dot decimal separators ("150,50" / "150.50");
        # the original pattern matched only commas and silently dropped
        # dot-separated decimals.
        cleaned = value.replace(" ", "").replace("\u00A0", "")
        match = re.search(r"\d+(?:[.,]\d{1,2})?", cleaned)
        if not match:
            return None
        try:
            return float(match.group(0).replace(",", "."))
        except ValueError:
            return None

    @staticmethod
    def phrase_span(text: str, phrase: Optional[str]) -> Optional[tuple[int, int]]:
        if not phrase:
            return None
        idx = text.lower().find(phrase.lower())
        if idx == -1:
            return None
        return idx, idx + len(phrase)

    @staticmethod
    def overlaps(span1: tuple[int, int], span2: Optional[tuple[int, int]]) -> bool:
        if span2 is None:
            return False
        return span1[0] < span2[1] and span2[0] < span1[1]
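    # Example: if the NER span covers "1 200" inside "1 200,50", the suffix
    # branch extends it right to "1 200,50"; a span covering only the decimal
    # part is widened left by the prefix branch instead.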
    @staticmethod
    def expand_amount_text(text: str, start: int, end: int) -> tuple[str, tuple[int, int]]:
        suffix = re.match(r",\d{1,2}", text[end:])
        if suffix:
            new_end = end + len(suffix.group(0))
            return text[start:new_end].strip(), (start, new_end)

        prefix = re.search(r"(\d{1,3}(?:\s*\d{3})*),", text[:start])
        if prefix:
            new_start = prefix.start(1)
            return text[new_start:end].strip(), (new_start, end)

        return text[start:end].strip(), (start, end)

    def extract(
        self,
        text: str,
        matched_date_phrase: Optional[str] = None,
        matched_supplier_phrase: Optional[str] = None,
    ) -> dict[str, Any]:
        if self.model is None:
            return {"amount": None, "amount_text": None}

        date_span = self.phrase_span(text, matched_date_phrase)
        supplier_span = self.phrase_span(text, matched_supplier_phrase)
        entities = self.model.predict_entities(text, ["money"], threshold=0.3)

        for ent in sorted(entities, key=lambda item: float(item.get("score", 0.0)), reverse=True):
            raw_span = (int(ent.get("start", 0)), int(ent.get("end", 0)))
            amount_text, span = self.expand_amount_text(text, raw_span[0], raw_span[1])
            amount = self.to_float(amount_text)
            overlaps_date = self.overlaps(span, date_span)
            overlaps_supplier = self.overlaps(span, supplier_span)

            if amount is None:
                continue
            if overlaps_date or overlaps_supplier:
                continue
            return {"amount": amount, "amount_text": amount_text}

        return {"amount": None, "amount_text": None}


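# Orchestration order matters: the date phrase is resolved first so its tokens
# can be excluded from supplier matching, and both phrases are then excluded
# from user and amount extraction.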
class ExpenseTextExtractor:
    def __init__(self, suppliers: list[str], users: list[str]) -> None:
        self.date_extractor = ExpenseDateExtractor()
        self.supplier_extractor = ExpenseSupplierExtractor(suppliers=suppliers)
        self.amount_extractor = ExpenseAmountExtractor(suppliers=suppliers)
        self.user_extractor = ExpenseUserExtractor(users=users, suppliers=suppliers, model=get_embedding_model())

    def extract(self, text: str, reference_date: str | date | None = None, debug_supplier: bool = False) -> dict[str, Any]:
        date_info = self.date_extractor.extract(text, reference_date=reference_date)
        supplier_info = self.supplier_extractor.extract(
            text,
            date_phrase=date_info.get("matched_date_phrase"),
            debug=debug_supplier,
        )
        user_info = self.user_extractor.extract(
            text,
            supplier_phrase=supplier_info.get("matched_supplier_phrase"),
            date_phrase=date_info.get("matched_date_phrase"),
        )
        amount_info = self.amount_extractor.extract(
            text,
            matched_date_phrase=date_info["matched_date_phrase"],
            matched_supplier_phrase=supplier_info["matched_supplier_phrase"],
        )

        result = {
            "text": text,
            "user": user_info["user"],
            "supplier": supplier_info["supplier"],
            "amount": amount_info["amount"],
            "date": date_info["date"],
            "date_iso": date_info["date_iso"],
        }
        if debug_supplier and "supplier_debug" in supplier_info:
            result["supplier_debug"] = supplier_info["supplier_debug"]
        return result


def build_default_pipeline(suppliers: list[str], users: list[str]) -> ExpenseTextExtractor:
    return ExpenseTextExtractor(suppliers=suppliers, users=users)


def extract_names(items: Any) -> list[str]:
    if not isinstance(items, list):
        return []

    names: list[str] = []
    for item in items:
        if isinstance(item, dict):
            name = item.get("name")
            if isinstance(name, str) and name.strip():
                names.append(name.strip())
            continue

        if isinstance(item, str) and item.strip():
            names.append(item.strip())

    return names


def polish_notes_text(text: str) -> str:
    normalized = re.sub(r"\s+", " ", text).strip()
    if not normalized:
        return ""

    normalized = normalized[0].upper() + normalized[1:]
    if normalized[-1] not in ".!?":
        normalized += "."

    return normalized


def transcribe_audio_text(audio_path: str) -> str:
    mock_text = os.getenv("EXPENSE_VOICE_MOCK_TEXT")
    if mock_text:
        return mock_text.strip()

    try:
        whisper_model = get_whisper_model()
        segments, _ = whisper_model.transcribe(audio_path, language="ru", vad_filter=True)
        text = " ".join(segment.text.strip() for segment in segments if segment.text and segment.text.strip())
        if text:
            return text
    except Exception:
        pass

    raise RuntimeError("Speech-to-text backend is unavailable. Install faster-whisper or set EXPENSE_VOICE_MOCK_TEXT.")


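# mode="notes" only polishes the transcript; every other mode runs the full
# expense pipeline, which requires suppliers and users in the request context.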
def process_voice_request(audio_path: str, mode: str, payload: dict[str, Any]) -> dict[str, Any]:
    context = payload.get("context", {}) if isinstance(payload, dict) else {}
    supplier_names = extract_names(context.get("suppliers"))
    user_names = extract_names(context.get("users"))

    transcript = transcribe_audio_text(audio_path)

    if mode == "notes":
        notes = polish_notes_text(transcript)
        return {
            "status": "ok",
            "text": transcript,
            "notes": notes,
            "supplier": None,
            "user": None,
            "date": None,
            "sum": None,
        }

    if not supplier_names:
        raise RuntimeError("No suppliers were provided by Laravel context.")

    if not user_names:
        raise RuntimeError("No users were provided by Laravel context.")

    extractor = build_default_pipeline(suppliers=supplier_names, users=user_names)
    extracted = extractor.extract(transcript, reference_date=date.today().isoformat())

    return {
        "status": "ok",
        "text": transcript,
        "notes": polish_notes_text(extracted.get("text") or transcript),
        "supplier": extracted.get("supplier"),
        "user": extracted.get("user"),
        "date": extracted.get("date_iso") or extracted.get("date"),
        "sum": extracted.get("amount"),
    }


def require_auth():
    expected_token = os.getenv("PYTHON_API_TOKEN", os.getenv("EXPENSE_VOICE_FASTAPI_TOKEN", "")).strip()

    if not expected_token:
        return None

    authorization = request.headers.get("Authorization", "")
    if not authorization.startswith("Bearer "):
        return jsonify({"status": "error", "message": "Missing bearer token."}), 401

    provided = authorization.removeprefix("Bearer ").strip()
    if provided != expected_token:
        return jsonify({"status": "error", "message": "Invalid bearer token."}), 401

    return None


def parse_context(raw: str | None) -> dict[str, Any]:
    if not raw:
        return {}

    try:
        payload = json.loads(raw)
        return payload if isinstance(payload, dict) else {}
    except json.JSONDecodeError:
        return {}


@app.get("/")
def index():
    return jsonify({
        "status": "ok",
        "message": "Voice processing API is running",
        "endpoints": {
            "POST /process-audio": "Process audio file",
            "GET /health": "Health check",
        },
    })


@app.get("/health")
def health():
    return jsonify({"status": "ok"})


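# Illustrative request (field names match request.files/request.form below;
# the supplier and user values are hypothetical):
#   curl -X POST http://localhost:7860/process-audio \
#        -H "Authorization: Bearer $PYTHON_API_TOKEN" \
#        -F "audio=@voice.webm" -F "mode=expense" \
#        -F 'context={"suppliers": [{"name": "Linella"}], "users": ["Ion"]}'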
@app.post("/process-audio")
def process_audio():
    auth_error = require_auth()
    if auth_error:
        return auth_error

    audio = request.files.get("audio")
    mode = (request.form.get("mode") or "expense").strip()
    context = parse_context(request.form.get("context"))

    if audio is None:
        return jsonify({"status": "error", "message": "Audio file is required."}), 422

    suffix = Path(audio.filename or "voice.webm").suffix or ".webm"
    temp_path = None

    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_path = temp_file.name
            audio.save(temp_file)

        result = process_voice_request(audio_path=temp_path, mode=mode, payload={"context": context})
        return jsonify(result)
    except Exception as exception:
        return jsonify({"status": "error", "message": str(exception)}), 422
    finally:
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=int(os.getenv("PORT", "7860")))