VladGeekPro committed
Commit 95d5bed · 0 Parent(s)

Add Duckling integration

Files changed (5):
  1. Dockerfile +43 -0
  2. app.py +1287 -0
  3. duckling_client.py +141 -0
  4. requirements.txt +12 -0
  5. supervisord.conf +26 -0
Dockerfile ADDED
@@ -0,0 +1,43 @@
+ # Stage 1: copy Duckling from the official image
+ FROM rasa/duckling:latest AS duckling
+
+ # Stage 2: main image
+ FROM python:3.11-slim
+
+ ENV PYTHONUNBUFFERED=1 \
+     PIP_NO_CACHE_DIR=1 \
+     HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     PORT=7860 \
+     WHISPER_MODEL=large-v3 \
+     WHISPER_COMPUTE_TYPE=int8 \
+     DUCKLING_URL=http://localhost:8000/parse
+
+ # Install dependencies: ffmpeg, supervisor, and runtime libraries for Duckling
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     ffmpeg \
+     supervisor \
+     libgmp10 \
+     libpcre3 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy the Duckling binary and make it executable
+ COPY --from=duckling /usr/local/bin/duckling-example-exe /usr/local/bin/duckling-example-exe
+ RUN chmod +x /usr/local/bin/duckling-example-exe
+
+ RUN useradd -m -u 1000 user
+
+ USER user
+ WORKDIR $HOME/app
+
+ COPY --chown=user requirements.txt ./requirements.txt
+ RUN pip install --upgrade pip && pip install -r requirements.txt
+
+ COPY --chown=user app.py ./app.py
+ COPY --chown=user duckling_client.py ./duckling_client.py
+ COPY --chown=user supervisord.conf ./supervisord.conf
+
+ EXPOSE 7860
+
+ # Start via supervisor (Duckling + Gunicorn)
+ CMD ["supervisord", "-c", "supervisord.conf"]
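
Note: supervisord (configured in supervisord.conf below) starts Duckling on port 8000 inside the container, which is what the DUCKLING_URL default above points at. A minimal smoke-test sketch, assuming the container is running and the sidecar is up; the form fields mirror what duckling_client.py sends:

# Minimal smoke test for the Duckling sidecar (run inside the container).
import requests

resp = requests.post(
    "http://localhost:8000/parse",
    data={
        "locale": "ru_RU",
        "text": "завтра",
        "dims": '["time"]',
    },
    timeout=5,
)
resp.raise_for_status()
print(resp.json())  # a non-empty list of matches means Duckling is healthy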
app.py ADDED
@@ -0,0 +1,1287 @@
+ from __future__ import annotations
+
+ import calendar
+ import difflib
+ import json
+ import os
+ import re
+ import tempfile
+ import unicodedata
+ from dataclasses import dataclass
+ from datetime import date, datetime, timedelta
+ from pathlib import Path
+ from typing import Any, Optional
+
+ import iuliia
+ import torch
+ from dateparser.search import search_dates
+ from flask import Flask, jsonify, request
+ from gliner import GLiNER
+ from pymorphy3 import MorphAnalyzer
+ from rapidfuzz import fuzz
+ from rapidfuzz.distance import Levenshtein
+ from sentence_transformers import SentenceTransformer
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # Duckling client for date extraction
+ from duckling_client import parse_date_with_duckling, parse_all_dates_with_duckling
+
+ # HuggingFace token (if required for models)
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+ MORPH = MorphAnalyzer()
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ _MODEL: Optional[SentenceTransformer] = None
+ _AMOUNT_MODEL: Optional[Any] = None
+ _WHISPER_MODEL: Optional[Any] = None
+
+
+ app = Flask(__name__)
+ app.config["MAX_CONTENT_LENGTH"] = 20 * 1024 * 1024
+
+
+ def get_embedding_model() -> SentenceTransformer:
+     global _MODEL
+
+     if _MODEL is None:
+         _MODEL = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B", device=DEVICE)
+
+     return _MODEL
+
+
+ def get_amount_model() -> Optional[Any]:
+     global _AMOUNT_MODEL
+
+     if _AMOUNT_MODEL is None and GLiNER is not None:
+         _AMOUNT_MODEL = GLiNER.from_pretrained("urchade/gliner_multi-v2.1")
+
+     return _AMOUNT_MODEL
+
+
+ def get_whisper_model() -> Any:
+     global _WHISPER_MODEL
+
+     if _WHISPER_MODEL is None:
+         from faster_whisper import WhisperModel
+
+         model_name = os.getenv("WHISPER_MODEL", "large-v3")
+         compute_type = os.getenv("WHISPER_COMPUTE_TYPE", "float16" if torch.cuda.is_available() else "int8")
+         _WHISPER_MODEL = WhisperModel(model_name, device=DEVICE, compute_type=compute_type)
+
+     return _WHISPER_MODEL
+
+
+ def normalize_text(text: str) -> str:
+     text = unicodedata.normalize("NFKD", text.lower())
+     text = "".join(ch for ch in text if not unicodedata.combining(ch))
+     return re.sub(r"[^\w\s]", "", text).strip()
+
+
+ def tokenize_text(text: str) -> list[str]:
+     return normalize_text(text).split()
+
+
+ def lemmatize_word(word: str) -> str:
+     return MORPH.parse(word)[0].normal_form if re.fullmatch(r"[а-я]+", word) else word
+
+
+ def lemmatize_text(text: str) -> list[str]:
+     return [lemmatize_word(word) for word in tokenize_text(text)]
+
+
+ def variants(text: str) -> list[str]:
+     base = normalize_text(text)
+     result = [base]
+
+     for schema in (iuliia.WIKIPEDIA, iuliia.MOSMETRO, iuliia.ALA_LC):
+         try:
+             v = normalize_text(schema.translate(base))
+             if v and v not in result:
+                 result.append(v)
+         except Exception:
+             pass
+
+     for v in list(result):
+         core = " ".join(w for w in v.split() if len(w) > 1 and any(ch.isalpha() for ch in w))
+         core = normalize_text(core)
+         if core and core not in result:
+             result.insert(0, core)
+
+     return result
+
+
+ def token_alignment_score(phrase_variant: str, candidate_tokens: list[str]) -> float:
+     phrase_tokens = [t for t in phrase_variant.split() if len(t) > 2]
+     if not phrase_tokens or not candidate_tokens:
+         return 0.0
+     best_scores = []
+     for pt in phrase_tokens:
+         best = 0.0
+         for ct in candidate_tokens:
+             sim = Levenshtein.normalized_similarity(pt, ct)
+             if sim > best:
+                 best = sim
+         best_scores.append(best)
+     return sum(best_scores) / len(best_scores)
+
+
+ def length_penalty(phrase_len: int, candidate_len: int) -> float:
+     if phrase_len == 0 or candidate_len == 0:
+         return 0.0
+     ratio = min(phrase_len, candidate_len) / max(phrase_len, candidate_len)
+     if ratio >= 0.80:
+         return 1.0
+     if ratio >= 0.60:
+         return 0.90
+     if ratio >= 0.40:
+         return 0.70
+     return 0.50
+
+
+ def canonicalize_for_similarity(text: str) -> str:
+     t = normalize_text(text).replace(" ", "")
+     replacements = (
+         ("sch", "sh"),
+         ("tch", "ch"),
+         ("dzh", "j"),
+         ("zh", "j"),
+         ("sh", "s"),
+         ("ch", "c"),
+         ("kh", "h"),
+         ("ph", "f"),
+         ("ck", "k"),
+         ("qu", "k"),
+         ("q", "k"),
+         ("w", "v"),
+         ("x", "ks"),
+         ("ts", "z"),
+         ("tz", "z"),
+     )
+     for src, dst in replacements:
+         t = t.replace(src, dst)
+     return re.sub(r"(.)\1+", r"\1", t)
+
+
+ def phonetic_similarity(left: str, right: str) -> float:
+     l = canonicalize_for_similarity(left)
+     r = canonicalize_for_similarity(right)
+     if not l or not r:
+         return 0.0
+     char = fuzz.ratio(l, r) / 100.0
+     lev = Levenshtein.normalized_similarity(l, r)
+     return 0.50 * char + 0.50 * lev
+
+
+ @dataclass(frozen=True)
+ class ParsedDate:
+     date_iso: str
+     matched_expression: Optional[str]
+
+
+ @dataclass(frozen=True)
+ class Token:
+     original: str
+     normalized: str
+     raw_lemma: str
+     lemma: str
+     lemma_correction: Optional[str]
+     start: int
+     end: int
+     lemma_start: int
+     lemma_end: int
+
+
+ WORD_RE = re.compile(r"[0-9]+(?:[./-][0-9]+)*|[а-яё]+", re.IGNORECASE)
+
+
+ class UniversalDateParser:
+     MONTHS = {
+         "январь": 1, "февраль": 2, "март": 3, "апрель": 4, "май": 5, "июнь": 6,
+         "июль": 7, "август": 8, "сентябрь": 9, "октябрь": 10, "ноябрь": 11, "декабрь": 12,
+     }
+     WEEKDAYS = {
+         "понедельник": 0, "вторник": 1, "среда": 2, "четверг": 3,
+         "пятница": 4, "суббота": 5, "воскресенье": 6,
+     }
+     DIRECT_RELATIVE = {"послезавтра": 2, "позавчера": -2, "сегодня": 0, "вчера": -1, "завтра": 1}
+     ORDINAL_DAYS = {
+         "первый": 1, "второй": 2, "третий": 3, "четвертый": 4, "пятый": 5, "шестой": 6,
+         "седьмой": 7, "восьмой": 8, "девятый": 9, "десятый": 10, "одиннадцатый": 11,
+         "двенадцатый": 12, "тринадцатый": 13, "четырнадцатый": 14, "пятнадцатый": 15,
+         "шестнадцатый": 16, "семнадцатый": 17, "восемнадцатый": 18, "девятнадцатый": 19,
+         "двадцатый": 20, "двадцать первый": 21, "двадцать второй": 22, "двадцать третий": 23,
+         "двадцать четвертый": 24, "двадцать пятый": 25, "двадцать шестой": 26,
+         "двадцать седьмой": 27, "двадцать восьмой": 28, "двадцать девятый": 29,
+         "тридцатый": 30, "тридцать первый": 31,
+     }
+     NUMBER_WORDS = {
+         "ноль": 0, "один": 1, "два": 2, "три": 3, "четыре": 4, "пять": 5, "шесть": 6,
+         "семь": 7, "восемь": 8, "девять": 9, "десять": 10, "одиннадцать": 11,
+         "двенадцать": 12, "тринадцать": 13, "четырнадцать": 14, "пятнадцать": 15,
+         "шестнадцать": 16, "семнадцать": 17, "восемнадцать": 18, "девятнадцать": 19,
+         "двадцать": 20, "тридцать": 30,
+     }
+     FUTURE_HINTS = ("завтра", "послезавтра", "через", "быть", "заплатить", "следующий", "последующий")
+     PAST_HINTS = ("вчера", "позавчера", "назад", "прошлый", "предыдущий", "оплатить", "купить", "заказать")
+
+     DIRECT_RELATIVE_RE = re.compile(r"(?<!\S)(послезавтра|позавчера|сегодня|вчера|завтра)(?!\S)")
+     WEEK_RELATIVE_RE = re.compile(
+         r"(?<!\S)на (?P<which>следующий|последующий|прошлый|предыдущий|этот) неделя"
+         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)"
+     )
+     QUANTITY_RELATIVE_RE = re.compile(
+         r"(?<!\S)(?P<number>\d+|[а-яё]+(?: [а-яё]+)?) "
+         r"(?P<unit>месяц|неделя|день) "
+         r"(?P<ago>назад)"
+         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
+         re.IGNORECASE,
+     )
+     FORWARD_QUANTITY_RE = re.compile(
+         r"(?<!\S)(?P<through>через) "
+         r"(?P<number>\d+|[а-яё]+(?: [а-яё]+)?) "
+         r"(?P<unit>месяц|неделя|день)"
+         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
+         re.IGNORECASE,
+     )
+     FORWARD_SINGLE_UNIT_RE = re.compile(
+         r"(?<!\S)(?P<through>через) "
+         r"(?P<unit>месяц|неделя|день)"
+         r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
+         re.IGNORECASE,
+     )
+     TEXTUAL_ABSOLUTE_RE = re.compile(
+         r"(?<!\S)(?P<day>\d{1,2}|[а-яё]+(?: [а-яё]+)?) "
+         r"(?P<month>январь|февраль|март|апрель|май|июнь|июль|август|сентябрь|октябрь|ноябрь|декабрь)"
+         r"(?: (?P<year>\d{4}))?(?!\S)",
+         re.IGNORECASE,
+     )
+     PERIOD_EDGE_RE = re.compile(
+         r"(?<!\S)(?:в )?(?P<edge>начало|конец) (?P<which>этот|следующий|последующий|прошлый|предыдущий) (?P<unit>неделя|месяц)(?!\S)",
+         re.IGNORECASE,
+     )
+
+     @classmethod
+     def temporal_vocabulary(cls) -> set[str]:
+         vocab: set[str] = set()
+         vocab.update(cls.MONTHS)
+         vocab.update(cls.WEEKDAYS)
+         vocab.update(cls.DIRECT_RELATIVE)
+         vocab.update(cls.ORDINAL_DAYS)
+         vocab.update(cls.NUMBER_WORDS)
+         vocab.update({
+             "неделя", "месяц", "день", "назад", "через", "начало", "конец", "на", "в", "во",
+             "этот", "прошлый", "предыдущий", "следующий", "последующий",
+         })
+         return vocab
+
+     @staticmethod
+     def similarity(left: str, right: str) -> float:
+         return difflib.SequenceMatcher(None, left, right).ratio()
+
+     @classmethod
+     def pick_temporal_correction(cls, normalized: str, raw_lemma: str) -> tuple[str, Optional[str]]:
+         vocab = cls.temporal_vocabulary()
+         if raw_lemma in vocab or not normalized.isalpha() or len(normalized) < 5:
+             return raw_lemma, None
+
+         candidates = list(difflib.get_close_matches(normalized, list(vocab), n=4, cutoff=0.74))
+         candidates.extend(difflib.get_close_matches(raw_lemma, list(vocab), n=4, cutoff=0.74))
+         candidates = list(dict.fromkeys(candidates))
+         if not candidates:
+             return raw_lemma, None
+
+         best = max(candidates, key=lambda item: max(cls.similarity(normalized, item), cls.similarity(raw_lemma, item)))
+         best_score = max(cls.similarity(normalized, best), cls.similarity(raw_lemma, best))
+         return (best, f"{raw_lemma}->{best}") if best_score >= 0.80 else (raw_lemma, None)
+
+     @staticmethod
+     def normalize_word(word: str) -> str:
+         return word.lower().replace("ё", "е")
+
+     @classmethod
+     def lemmatize(cls, word: str) -> str:
+         return MORPH.parse(word)[0].normal_form if word.isalpha() else word
+
+     @classmethod
+     def tokenize(cls, text: str) -> list[Token]:
+         tokens: list[Token] = []
+         lemma_cursor = 0
+
+         for match in WORD_RE.finditer(text):
+             original = match.group(0)
+             normalized = cls.normalize_word(original)
+             raw_lemma = cls.lemmatize(normalized)
+             lemma, correction = cls.pick_temporal_correction(normalized, raw_lemma)
+             lemma_start = lemma_cursor
+             lemma_end = lemma_start + len(lemma)
+             tokens.append(Token(original, normalized, raw_lemma, lemma, correction, match.start(), match.end(), lemma_start, lemma_end))
+             lemma_cursor = lemma_end + 1
+
+         return tokens
+
+     @staticmethod
+     def lemma_text(tokens: list[Token]) -> str:
+         return " ".join(token.lemma for token in tokens)
+
+     @staticmethod
+     def surface_text(text: str, tokens: list[Token], start_idx: int, end_idx: int) -> str:
+         return text[tokens[start_idx].start:tokens[end_idx].end].strip() if tokens else ""
+
+     @staticmethod
+     def lemma_span_to_token_range(tokens: list[Token], span: tuple[int, int]) -> Optional[tuple[int, int]]:
+         start_char, end_char = span
+         start_idx = end_idx = None
+
+         for idx, token in enumerate(tokens):
+             if start_idx is None and token.lemma_start <= start_char < token.lemma_end:
+                 start_idx = idx
+             if token.lemma_start < end_char <= token.lemma_end:
+                 end_idx = idx
+                 break
+
+         return (start_idx, end_idx) if start_idx is not None and end_idx is not None else None
+
+     @classmethod
+     def make_parsed_date(cls, text: str, tokens: list[Token], match, parsed_date: date) -> Optional[ParsedDate]:
+         token_span = cls.lemma_span_to_token_range(tokens, match.span())
+         if token_span is None:
+             return None
+         return ParsedDate(parsed_date.isoformat(), cls.surface_text(text, tokens, token_span[0], token_span[1]))
+
+     @classmethod
+     def parse_number_phrase(cls, phrase: str) -> Optional[int]:
+         phrase = phrase.strip()
+         if not phrase:
+             return None
+         if phrase.isdigit():
+             return int(phrase)
+
+         parts = phrase.split()
+         if len(parts) == 1:
+             return cls.NUMBER_WORDS.get(parts[0])
+         if len(parts) == 2 and parts[0] in {"двадцать", "тридцать"}:
+             base = cls.NUMBER_WORDS.get(parts[0])
+             addon = cls.NUMBER_WORDS.get(parts[1])
+             if base is not None and addon is not None and 1 <= addon <= 9:
+                 return base + addon
+         return None
+
+     @classmethod
+     def parse_day_phrase(cls, phrase: str) -> Optional[int]:
+         if phrase.isdigit():
+             value = int(phrase)
+             return value if 1 <= value <= 31 else None
+         return cls.ORDINAL_DAYS.get(phrase.strip())
+
+     @staticmethod
+     def shift_months(value: date, months: int) -> date:
+         month_index = value.month - 1 + months
+         year = value.year + month_index // 12
+         month = month_index % 12 + 1
+         day = min(value.day, calendar.monthrange(year, month)[1])
+         return date(year, month, day)
+
+     @staticmethod
+     def parse_numeric_absolute(tokens: list[Token]) -> Optional[ParsedDate]:
+         for token in tokens:
+             separator = "." if "." in token.original else "-" if "-" in token.original else "/" if "/" in token.original else None
+             if separator is None:
+                 continue
+
+             parts = token.original.split(separator)
+             if len(parts) != 3 or not all(part.isdigit() for part in parts):
+                 continue
+
+             try:
+                 if len(parts[0]) == 4:
+                     parsed = date(int(parts[0]), int(parts[1]), int(parts[2]))
+                 elif len(parts[2]) == 4:
+                     parsed = date(int(parts[2]), int(parts[1]), int(parts[0]))
+                 else:
+                     continue
+                 return ParsedDate(parsed.isoformat(), token.original)
+             except ValueError:
+                 continue
+
+         return None
+
+     @classmethod
+     def parse_textual_absolute(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
+         lemma_text = cls.lemma_text(tokens)
+         for match in cls.TEXTUAL_ABSOLUTE_RE.finditer(lemma_text):
+             day = cls.parse_day_phrase(match.group("day"))
+             month = cls.MONTHS.get(match.group("month"))
+             if day is None or month is None:
+                 continue
+
+             year = int(match.group("year")) if match.group("year") else reference_date.year
+             try:
+                 parsed = date(year, month, day)
+             except ValueError:
+                 continue
+
+             result = cls.make_parsed_date(text, tokens, match, parsed)
+             if result is not None:
+                 return result
+
+         return None
+
+     @classmethod
+     def parse_direct_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
+         lemma_text = cls.lemma_text(tokens)
+         match = cls.DIRECT_RELATIVE_RE.search(lemma_text)
+         if not match:
+             return None
+
+         parsed = reference_date + timedelta(days=cls.DIRECT_RELATIVE[match.group(1)])
+         return cls.make_parsed_date(text, tokens, match, parsed)
+
+     @staticmethod
+     def week_monday(value: date) -> date:
+         return value - timedelta(days=value.weekday())
+
+     @classmethod
+     def parse_week_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
+         lemma_text = cls.lemma_text(tokens)
+         match = cls.WEEK_RELATIVE_RE.search(lemma_text)
+         if not match:
+             return None
+
+         offsets = {"следующий": 7, "последующий": 7, "прошлый": -7, "предыдущий": -7, "этот": 0}
+         anchor = reference_date + timedelta(days=offsets[match.group("which")])
+
+         if match.group("weekday"):
+             anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])
+
+         return cls.make_parsed_date(text, tokens, match, anchor)
+
+     @classmethod
+     def parse_period_edge(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
+         lemma_text = cls.lemma_text(tokens)
+         match = cls.PERIOD_EDGE_RE.search(lemma_text)
+         if not match:
+             return None
+
+         edge, which, unit = match.group("edge"), match.group("which"), match.group("unit")
+
+         if unit == "неделя":
+             offsets = {"прошлый": -7, "предыдущий": -7, "этот": 0, "следующий": 7, "последующий": 7}
+             monday = cls.week_monday(reference_date + timedelta(days=offsets[which]))
+             parsed_date = monday if edge == "начало" else monday + timedelta(days=6)
+         else:
+             month_offset = {"прошлый": -1, "предыдущий": -1, "этот": 0, "следующий": 1, "последующий": 1}[which]
+             shifted = cls.shift_months(date(reference_date.year, reference_date.month, 1), month_offset)
+             parsed_date = shifted if edge == "начало" else date(shifted.year, shifted.month, calendar.monthrange(shifted.year, shifted.month)[1])
+
+         return cls.make_parsed_date(text, tokens, match, parsed_date)
+
+     @classmethod
+     def parse_quantity_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
+         lemma_text = cls.lemma_text(tokens)
+
+         for regex, direction in ((cls.QUANTITY_RELATIVE_RE, -1), (cls.FORWARD_QUANTITY_RE, 1)):
+             for match in regex.finditer(lemma_text):
+                 number = cls.parse_number_phrase(match.group("number"))
+                 if number is None:
+                     continue
+
+                 unit = match.group("unit")
+                 if unit == "месяц":
+                     anchor = cls.shift_months(reference_date, direction * number)
+                 else:
+                     days = number * 7 if unit == "неделя" else number
+                     anchor = reference_date + timedelta(days=direction * days)
+
+                 if match.group("weekday"):
+                     anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])
+
+                 result = cls.make_parsed_date(text, tokens, match, anchor)
+                 if result is not None:
+                     return result
+
+         for match in cls.FORWARD_SINGLE_UNIT_RE.finditer(lemma_text):
+             unit = match.group("unit")
+             if unit == "месяц":
+                 anchor = cls.shift_months(reference_date, 1)
+             else:
+                 days = 7 if unit == "неделя" else 1
+                 anchor = reference_date + timedelta(days=days)
+
+             if match.group("weekday"):
+                 anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])
+
+             result = cls.make_parsed_date(text, tokens, match, anchor)
+             if result is not None:
+                 return result
+
+         return None
+
+     @classmethod
+     def preference_for_text(cls, tokens: list[Token]) -> str:
+         lemmas = [token.lemma for token in tokens]
+         future = sum(1 for hint in cls.FUTURE_HINTS if hint in lemmas)
+         past = sum(1 for hint in cls.PAST_HINTS if hint in lemmas)
+         return "future" if future > past else "past"
+
+     @staticmethod
+     def choose_best(matches: list[tuple[str, datetime]]) -> tuple[str, datetime]:
+         return sorted(matches, key=lambda item: (len(item[0]), -item[1].timestamp()), reverse=True)[0]
+
+     def parse(self, text: str, reference_date: date) -> Optional[ParsedDate]:
+         tokens = self.tokenize(text)
+
+         for parser in (
+             lambda: self.parse_numeric_absolute(tokens),
+             lambda: self.parse_textual_absolute(text, tokens, reference_date),
+             lambda: self.parse_direct_relative(text, tokens, reference_date),
+             lambda: self.parse_week_relative(text, tokens, reference_date),
+             lambda: self.parse_period_edge(text, tokens, reference_date),
+             lambda: self.parse_quantity_relative(text, tokens, reference_date),
+         ):
+             parsed = parser()
+             if parsed is not None:
+                 return parsed
+
+         normalized = " ".join(token.normalized for token in tokens)
+         relative_base = datetime.combine(reference_date, datetime.min.time()).replace(hour=12)
+         result = search_dates(
+             normalized,
+             languages=["ru"],
+             settings={
+                 "RELATIVE_BASE": relative_base,
+                 "PREFER_DATES_FROM": self.preference_for_text(tokens),
+                 "STRICT_PARSING": False,
+                 "REQUIRE_PARTS": [],
+                 "NORMALIZE": True,
+                 "RETURN_AS_TIMEZONE_AWARE": False,
+                 "DATE_ORDER": "DMY",
+             },
+         )
+
+         filtered: list[tuple[str, datetime]] = []
+         for matched, value in result or []:
+             if isinstance(value, datetime) and not matched.strip().isdigit() and 2020 <= value.year <= 2100:
+                 filtered.append((matched.strip(), value))
+
+         if not filtered:
+             return None
+
+         matched_expression, value = self.choose_best(filtered)
+         return ParsedDate(date_iso=value.date().isoformat(), matched_expression=matched_expression)
+
+
+ class ExpenseDateExtractor:
+     def __init__(self) -> None:
+         self.parser = UniversalDateParser()
+
+     def extract(self, text: str, reference_date: str | date | None = None) -> dict[str, Any]:
+         ref_date = self.to_date(reference_date or date.today().isoformat())
+         parsed = self.parser.parse(text=text, reference_date=ref_date)
+
+         return {
+             "date": datetime.strptime(parsed.date_iso, "%Y-%m-%d").strftime("%d.%m.%Y") if parsed else None,
+             "date_iso": parsed.date_iso if parsed else None,
+             "matched_date_phrase": parsed.matched_expression if parsed else None,
+         }
+
+     @staticmethod
+     def to_date(value: str | date) -> date:
+         return value if isinstance(value, date) else datetime.strptime(value, "%Y-%m-%d").date()
+
+
+ class DucklingDateExtractor:
+     """
+     Date extraction via Duckling.
+     Handles both absolute dates ("15 января 2025") and relative ones ("завтра", "через 2 дня").
+     """
+
+     def extract(self, text: str, reference_date: str | date | None = None) -> dict[str, Any]:
+         """
+         Extract a date from the text using Duckling.
+
+         Args:
+             text: Text to analyze
+             reference_date: Reference date for relative expressions (defaults to today)
+
+         Returns:
+             {
+                 "date": "19.04.2026",            # DD.MM.YYYY format
+                 "date_iso": "2026-04-19",        # ISO format
+                 "matched_date_phrase": "завтра"  # matched expression
+             }
+         """
+         ref_datetime = None
+         if reference_date:
+             ref_date = self.to_date(reference_date)
+             ref_datetime = datetime.combine(ref_date, datetime.min.time()).replace(hour=12)
+
+         # Collect all dates found in the text
+         dates = parse_all_dates_with_duckling(text, locale="ru_RU", reference_time=ref_datetime)
+
+         if not dates:
+             return {
+                 "date": None,
+                 "date_iso": None,
+                 "matched_date_phrase": None,
+             }
+
+         # Take the first date found
+         first = dates[0]
+         date_iso = first["date"]
+
+         try:
+             parsed_date = datetime.strptime(date_iso, "%Y-%m-%d")
+             date_formatted = parsed_date.strftime("%d.%m.%Y")
+         except ValueError:
+             date_formatted = None
+
+         return {
+             "date": date_formatted,
+             "date_iso": date_iso,
+             "matched_date_phrase": first["text"],
+         }
+
+     def extract_all(self, text: str, reference_date: str | date | None = None) -> list[dict[str, Any]]:
+         """
+         Extract every date found in the text.
+         """
+         ref_datetime = None
+         if reference_date:
+             ref_date = self.to_date(reference_date)
+             ref_datetime = datetime.combine(ref_date, datetime.min.time()).replace(hour=12)
+
+         dates = parse_all_dates_with_duckling(text, locale="ru_RU", reference_time=ref_datetime)
+
+         results = []
+         for d in dates:
+             try:
+                 parsed_date = datetime.strptime(d["date"], "%Y-%m-%d")
+                 date_formatted = parsed_date.strftime("%d.%m.%Y")
+             except ValueError:
+                 date_formatted = None
+
+             results.append({
+                 "date": date_formatted,
+                 "date_iso": d["date"],
+                 "matched_date_phrase": d["text"],
+             })
+
+         return results
+
+     @staticmethod
+     def to_date(value: str | date) -> date:
+         return value if isinstance(value, date) else datetime.strptime(value, "%Y-%m-%d").date()
+
+
+ # Date parser selection: "duckling" or "legacy" (the pre-Duckling code path)
+ DATE_PARSER_MODE = os.getenv("DATE_PARSER_MODE", "duckling")
+
+
+ def get_date_extractor():
+     """Return the date extractor selected by DATE_PARSER_MODE."""
+     if DATE_PARSER_MODE == "duckling":
+         return DucklingDateExtractor()
+     return ExpenseDateExtractor()
+
+
+ class ExpenseUserExtractor:
+     def __init__(self, users: list[str], suppliers: list[str], model: SentenceTransformer, threshold: float = 0.6) -> None:
+         self.users = users
+         self.model = model
+         self.threshold = threshold
+         self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
+         self.user_terms = [normalize_text(user) for user in users]
+         self.user_embeddings = model.encode(
+             [f"passage: {user}" for user in self.user_terms],
+             convert_to_tensor=True,
+             normalize_embeddings=True,
+         )
+
+     def extract(self, text: str, supplier_phrase: str | None = None, date_phrase: str | None = None) -> dict[str, Any]:
+         excluded_tokens: set[str] = set()
+         if supplier_phrase:
+             excluded_tokens.update(normalize_text(supplier_phrase).split())
+         if date_phrase:
+             excluded_tokens.update(normalize_text(date_phrase).split())
+
+         best_user = None
+         best_score = -1.0
+         best_phrase = None
+
+         for word in lemmatize_text(text):
+             if len(word) < 3:
+                 continue
+             if word in excluded_tokens or word in self.supplier_terms:
+                 continue
+
+             query_emb = self.model.encode(
+                 f"query: {word}",
+                 convert_to_tensor=True,
+                 normalize_embeddings=True,
+             )
+             similarities = torch.cosine_similarity(query_emb.unsqueeze(0), self.user_embeddings, dim=1)
+             idx = int(torch.argmax(similarities))
+             score = similarities[idx].item()
+
+             if score > best_score:
+                 best_score = score
+                 best_user = self.users[idx]
+                 best_phrase = word
+
+         if best_score >= self.threshold:
+             return {
+                 "user": best_user,
+                 "user_score": round(best_score, 4),
+                 "matched_user_phrase": best_phrase,
+             }
+
+         if re.search(r"(?<!\S)я(?!\S)", normalize_text(text), re.IGNORECASE):
+             return {
+                 "user": "Я",
+                 "user_score": 1.0,
+                 "matched_user_phrase": "я",
+             }
+
+         return {
+             "user": None,
+             "user_score": None,
+             "matched_user_phrase": None,
+         }
+
+
+ class ExpenseSupplierExtractor:
+     def __init__(self, suppliers: list[str]) -> None:
+         self.suppliers = suppliers
+         self.sup_norm = [normalize_text(s) for s in suppliers]
+         self.sup_tokens = [s.split() for s in self.sup_norm]
+         self.sup_num_sets = [self.numeric_tokens(s) for s in self.sup_norm]
+         self.sup_number_tokens = {token for supplier in self.sup_tokens for token in supplier if token.isdigit()}
+         self.supplier_lexicon = [
+             token
+             for token in sorted({tok for tokens in self.sup_tokens for tok in tokens})
+             if token and not token.isdigit()
+         ]
+         self.tfidf = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5))
+         self.sup_mat = self.tfidf.fit_transform(self.sup_norm)
+         self.max_words = max(len(s.split()) for s in self.sup_norm)
+         self.variant_cache: dict[str, list[str]] = {}
+         self.lexical_token_cache: dict[str, float] = {}
+         self.phrase_support_cache: dict[str, float] = {}
+         self.noise_terms = {
+             "за", "на", "из", "для", "под", "над", "при", "без", "и", "или",
+             "купил", "купила", "купили", "покупка", "заказал", "заказала", "заказали",
+             "оплатил", "оплатила", "оплатили", "заплатил", "заплатила", "заплатили",
+             "был", "была", "было", "были", "утром", "днем", "днём", "вечером", "ночью",
+             "товар", "товары", "продукт", "продукты", "десерт", "еда",
+             "лей", "лея", "леи", "целых", "сотых", "сом", "сомов", "руб", "рублей", "грн", "usd", "eur",
+         }
+         self.noise_terms.update(UniversalDateParser.temporal_vocabulary())
+
+     @staticmethod
+     def numeric_tokens(text: str) -> set[str]:
+         return set(re.findall(r"\d+", text))
+
+     def cached_variants(self, text: str) -> list[str]:
+         key = normalize_text(text)
+         cached = self.variant_cache.get(key)
+         if cached is None:
+             cached = variants(key)
+             self.variant_cache[key] = cached
+         return cached
+
+     @staticmethod
+     def split_words(text: str) -> list[str]:
+         return [w for w in normalize_text(text).split() if w]
+
+     @classmethod
+     def is_supplier_extension(cls, base_supplier: str, extended_supplier: str) -> bool:
+         base_tokens = cls.split_words(base_supplier)
+         extended_tokens = cls.split_words(extended_supplier)
+         return len(base_tokens) < len(extended_tokens) and extended_tokens[:len(base_tokens)] == base_tokens
+
+     @classmethod
+     def phrase_token_count(cls, phrase: str | None) -> int:
+         return len(cls.split_words(phrase or ""))
+
+     @classmethod
+     def resolve_overlapping_suppliers(cls, ranking: list[dict[str, Any]]) -> dict[str, Any]:
+         if not ranking:
+             return {"supplier": None, "score": -1.0, "phrase": None}
+
+         best = ranking[0]
+         best_combined = float(best.get("combined", best.get("score", -1.0)))
+         best_phrase_len = cls.phrase_token_count(best.get("phrase"))
+
+         for alt in ranking[1:]:
+             if not cls.is_supplier_extension(str(best.get("supplier") or ""), str(alt.get("supplier") or "")):
+                 continue
+
+             alt_combined = float(alt.get("combined", alt.get("score", -1.0)))
+             alt_phrase_len = cls.phrase_token_count(alt.get("phrase"))
+
+             if alt_phrase_len > best_phrase_len and alt_combined >= best_combined - 0.15:
+                 best = alt
+                 best_combined = alt_combined
+                 best_phrase_len = alt_phrase_len
+
+         return best
+
+     @staticmethod
+     def numeric_compatibility_multiplier(phrase_nums: set[str], candidate_nums: set[str]) -> float:
+         if not phrase_nums and not candidate_nums:
+             return 1.0
+         if phrase_nums == candidate_nums:
+             return 1.08
+         if phrase_nums and candidate_nums:
+             return 1.03 if phrase_nums & candidate_nums else 0.80
+         return 0.82
+
+     def lexical_support(self, phrase: str) -> float:
+         tokens = [token for token in normalize_text(phrase).split() if token and not token.isdigit()]
+         if not tokens or not self.supplier_lexicon:
+             return 0.0
+
+         support_scores: list[float] = []
+         for token in tokens:
+             cached = self.lexical_token_cache.get(token)
+             if cached is not None:
+                 support_scores.append(cached)
+                 continue
+
+             best = 0.0
+             for token_variant in self.cached_variants(token):
+                 for lex in self.supplier_lexicon:
+                     lev = Levenshtein.normalized_similarity(token_variant, lex)
+                     phon = phonetic_similarity(token_variant, lex)
+                     sim = max(lev, phon)
+                     if sim > best:
+                         best = sim
+
+             self.lexical_token_cache[token] = best
+             support_scores.append(best)
+
+         return sum(support_scores) / len(support_scores)
+
+     def score_phrase(self, phrase: str) -> dict[str, Any]:
+         vs = self.cached_variants(phrase)
+         q = self.tfidf.transform(vs)
+         tf = cosine_similarity(q, self.sup_mat)
+
+         best: dict[str, Any] = {"supplier": None, "score": -1.0, "phrase": phrase, "variant": ""}
+         for i, cand in enumerate(self.sup_norm):
+             local = -1.0
+             local_variant = ""
+             candidate_nums = self.sup_num_sets[i]
+             for j, v in enumerate(vs):
+                 char = fuzz.ratio(v, cand) / 100.0
+                 tf_val = float(tf[j, i])
+                 penalty = length_penalty(len(v), len(cand))
+                 phon = phonetic_similarity(v, cand)
+                 phrase_nums = self.numeric_tokens(v)
+
+                 if len(v.split()) == 1 and len(cand.split()) == 1:
+                     lev = Levenshtein.normalized_similarity(v, cand)
+                     val = (0.45 * lev + 0.25 * char + 0.10 * tf_val + 0.20 * phon) * penalty
+                 else:
+                     align = token_alignment_score(v, self.sup_tokens[i])
+                     tok = fuzz.token_set_ratio(v, cand) / 100.0
+                     val = (0.30 * char + 0.20 * tok + 0.10 * tf_val + 0.20 * align + 0.20 * phon) * penalty
+
+                 compact_v = v.replace(" ", "")
+                 compact_cand = cand.replace(" ", "")
+                 compact_char = fuzz.ratio(compact_v, compact_cand) / 100.0
+                 compact_lev = Levenshtein.normalized_similarity(compact_v, compact_cand)
+                 compact_phon = phonetic_similarity(compact_v, compact_cand)
+                 compact = max(compact_char, compact_lev, compact_phon)
+                 if compact > 0.55:
+                     val = max(val, compact * penalty)
+
+                 val *= self.numeric_compatibility_multiplier(phrase_nums, candidate_nums)
+
+                 if val > local:
+                     local = val
+                     local_variant = v
+
+             if local > best["score"]:
+                 best = {"supplier": self.suppliers[i], "score": local, "phrase": phrase, "variant": local_variant}
+         return best
+
+     def extract(self, text: str, date_phrase: str | None = None, debug: bool = False) -> dict[str, Any]:
+         threshold = 0.50
+         excluded_tokens: set[str] = set()
+         if date_phrase:
+             excluded_tokens.update(normalize_text(date_phrase).split())
+         excluded_tokens.update(self.noise_terms)
+
+         raw_tokens = normalize_text(text).split()
+         tokens: list[str] = []
+         for token in raw_tokens:
+             if token in excluded_tokens:
+                 continue
+
+             if token.isdigit():
+                 if token in self.sup_number_tokens:
+                     tokens.append(token)
+
+                 if tokens and len(token) <= 3 and len(tokens[-1]) >= 4 and tokens[-1].isalpha():
+                     tokens.append(f"{tokens[-1]}{token}")
+                 continue
+
+             if len(token) > 1:
+                 tokens.append(token)
+
+         tokens = [t for t in tokens if len(t) > 1 and t not in excluded_tokens]
+
+         phrases: list[str] = []
+         seen: set[str] = set()
+         for i in range(len(tokens)):
+             for j in range(i + 1, min(i + 1 + self.max_words, len(tokens) + 1)):
+                 p = " ".join(tokens[i:j])
+                 if p not in seen:
+                     seen.add(p)
+                     phrases.append(p)
+
+         results = [self.score_phrase(p) for p in phrases]
+         candidate_rows: list[dict[str, Any]] = []
+         best_by_supplier: dict[str, dict[str, Any]] = {}
+         for row in results:
+             supplier = row["supplier"]
+             score = float(row.get("score", -1.0))
+             phrase = str(row.get("phrase") or "")
+             support = self.phrase_support_cache.get(phrase)
+             if support is None:
+                 support = self.lexical_support(phrase)
+                 self.phrase_support_cache[phrase] = support
+             combined = 0.75 * score + 0.25 * support
+
+             if debug:
+                 candidate_rows.append({
+                     "supplier": supplier,
+                     "phrase": phrase,
+                     "score": round(score, 4),
+                     "support": round(support, 4),
+                     "combined": round(combined, 4),
+                 })
+
+             enriched = {**row, "combined": combined}
+             passes = score >= threshold or combined >= 0.48
+             if passes and (supplier not in best_by_supplier or combined > float(best_by_supplier[supplier].get("combined", -1.0))):
+                 best_by_supplier[supplier] = enriched
+
+         if not best_by_supplier and results:
+             def support_for_phrase(phrase: str) -> float:
+                 cached_support = self.phrase_support_cache.get(phrase)
+                 if cached_support is None:
+                     cached_support = self.lexical_support(phrase)
+                     self.phrase_support_cache[phrase] = cached_support
+                 return cached_support
+
+             fallback = max(
+                 results,
+                 key=lambda item: 0.75 * float(item.get("score", -1.0)) + 0.25 * support_for_phrase(str(item.get("phrase") or "")),
+             )
+             fallback_score = float(fallback.get("score", -1.0))
+             fallback_phrase = str(fallback.get("phrase") or "")
+             fallback_support = support_for_phrase(fallback_phrase)
+             fallback_combined = 0.75 * fallback_score + 0.25 * fallback_support
+             if fallback_score >= 0.40 and fallback_support >= 0.43 and fallback_combined >= 0.43:
+                 best_by_supplier[fallback["supplier"]] = {**fallback, "combined": fallback_combined}
+
+         supplier_ranking = sorted(best_by_supplier.values(), key=lambda x: float(x.get("combined", x["score"])), reverse=True)
+         best = self.resolve_overlapping_suppliers(supplier_ranking)
+
+         payload = {
+             "supplier": best["supplier"],
+             "supplier_score": round(best["score"], 4) if best["score"] >= 0 else None,
+             "matched_supplier_phrase": best.get("phrase"),
+         }
+
+         if debug:
+             top_candidates = sorted(candidate_rows, key=lambda item: item["combined"], reverse=True)[:8]
+             payload["supplier_debug"] = {
+                 "tokens": tokens,
+                 "phrases_count": len(phrases),
+                 "top_candidates": top_candidates,
+             }
+
+         return payload
+
+
+ class ExpenseAmountExtractor:
+     def __init__(self, suppliers: list[str]) -> None:
+         self.model = get_amount_model()
+
+     @staticmethod
+     def to_float(value: str) -> Optional[float]:
+         cleaned = value.replace(" ", "").replace("\u00A0", "")
+         match = re.search(r"\d+(?:[,]\d{1,2})?", cleaned)
+         if not match:
+             return None
+         try:
+             return float(match.group(0).replace(",", "."))
+         except ValueError:
+             return None
+
+     @staticmethod
+     def phrase_span(text: str, phrase: Optional[str]) -> Optional[tuple[int, int]]:
+         if not phrase:
+             return None
+         idx = text.lower().find(phrase.lower())
+         if idx == -1:
+             return None
+         return idx, idx + len(phrase)
+
+     @staticmethod
+     def overlaps(span1: tuple[int, int], span2: Optional[tuple[int, int]]) -> bool:
+         if span2 is None:
+             return False
+         return span1[0] < span2[1] and span2[0] < span1[1]
+
+     @staticmethod
+     def expand_amount_text(text: str, start: int, end: int) -> tuple[str, tuple[int, int]]:
+         suffix = re.match(r",\d{1,2}", text[end:])
+         if suffix:
+             new_end = end + len(suffix.group(0))
+             return text[start:new_end].strip(), (start, new_end)
+
+         prefix = re.search(r"(\d{1,3}(?:\s*\d{3})*),", text[:start])
+         if prefix:
+             new_start = prefix.start(1)
+             return text[new_start:end].strip(), (new_start, end)
+
+         return text[start:end].strip(), (start, end)
+
+     def extract(
+         self,
+         text: str,
+         matched_date_phrase: Optional[str] = None,
+         matched_supplier_phrase: Optional[str] = None,
+     ) -> dict[str, Any]:
+         if self.model is None:
+             return {"amount": None, "amount_text": None}
+
+         date_span = self.phrase_span(text, matched_date_phrase)
+         supplier_span = self.phrase_span(text, matched_supplier_phrase)
+         entities = self.model.predict_entities(text, ["money"], threshold=0.3)
+
+         for ent in sorted(entities, key=lambda item: float(item.get("score", 0.0)), reverse=True):
+             raw_span = (int(ent.get("start", 0)), int(ent.get("end", 0)))
+             amount_text, span = self.expand_amount_text(text, raw_span[0], raw_span[1])
+             amount = self.to_float(amount_text)
+             overlaps_date = self.overlaps(span, date_span)
+             overlaps_supplier = self.overlaps(span, supplier_span)
+
+             if amount is None:
+                 continue
+             if overlaps_date or overlaps_supplier:
+                 continue
+             return {"amount": amount, "amount_text": amount_text}
+
+         return {"amount": None, "amount_text": None}
+
+
+ class ExpenseTextExtractor:
+     def __init__(self, suppliers: list[str], users: list[str]) -> None:
+         # Use the extractor selected by DATE_PARSER_MODE ("duckling" by default),
+         # so the Duckling integration is actually exercised by the pipeline.
+         self.date_extractor = get_date_extractor()
+         self.supplier_extractor = ExpenseSupplierExtractor(suppliers=suppliers)
+         self.amount_extractor = ExpenseAmountExtractor(suppliers=suppliers)
+         self.user_extractor = ExpenseUserExtractor(users=users, suppliers=suppliers, model=get_embedding_model())
+
+     def extract(self, text: str, reference_date: str | date | None = None, debug_supplier: bool = False) -> dict[str, Any]:
+         date_info = self.date_extractor.extract(text, reference_date=reference_date)
+         supplier_info = self.supplier_extractor.extract(
+             text,
+             date_phrase=date_info.get("matched_date_phrase"),
+             debug=debug_supplier,
+         )
+         user_info = self.user_extractor.extract(
+             text,
+             supplier_phrase=supplier_info.get("matched_supplier_phrase"),
+             date_phrase=date_info.get("matched_date_phrase"),
+         )
+         amount_info = self.amount_extractor.extract(
+             text,
+             matched_date_phrase=date_info["matched_date_phrase"],
+             matched_supplier_phrase=supplier_info["matched_supplier_phrase"],
+         )
+
+         result = {
+             "text": text,
+             "user": user_info["user"],
+             "supplier": supplier_info["supplier"],
+             "amount": amount_info["amount"],
+             "date": date_info["date"],
+             "date_iso": date_info["date_iso"],
+         }
+         if debug_supplier and "supplier_debug" in supplier_info:
+             result["supplier_debug"] = supplier_info["supplier_debug"]
+         return result
+
+
+ def build_default_pipeline(suppliers: list[str], users: list[str]) -> ExpenseTextExtractor:
+     return ExpenseTextExtractor(suppliers=suppliers, users=users)
+
+
+ def extract_names(items: Any) -> list[str]:
+     if not isinstance(items, list):
+         return []
+
+     names: list[str] = []
+     for item in items:
+         if isinstance(item, dict):
+             name = item.get("name")
+             if isinstance(name, str) and name.strip():
+                 names.append(name.strip())
+             continue
+
+         if isinstance(item, str) and item.strip():
+             names.append(item.strip())
+
+     return names
+
+
+ def polish_notes_text(text: str) -> str:
+     normalized = re.sub(r"\s+", " ", text).strip()
+     if not normalized:
+         return ""
+
+     normalized = normalized[0].upper() + normalized[1:]
+     if normalized[-1] not in ".!?":
+         normalized += "."
+
+     return normalized
+
+
+ def transcribe_audio_text(audio_path: str) -> str:
+     mock_text = os.getenv("EXPENSE_VOICE_MOCK_TEXT")
+     if mock_text:
+         return mock_text.strip()
+
+     try:
+         whisper_model = get_whisper_model()
+         segments, _ = whisper_model.transcribe(audio_path, language="ru", vad_filter=True)
+         text = " ".join(segment.text.strip() for segment in segments if segment.text and segment.text.strip())
+         if text:
+             return text
+     except Exception:
+         pass
+
+     raise RuntimeError("Speech-to-text backend is unavailable. Install faster-whisper or set EXPENSE_VOICE_MOCK_TEXT.")
+
+
+ def process_voice_request(audio_path: str, mode: str, payload: dict[str, Any]) -> dict[str, Any]:
+     context = payload.get("context", {}) if isinstance(payload, dict) else {}
+     supplier_names = extract_names(context.get("suppliers"))
+     user_names = extract_names(context.get("users"))
+
+     transcript = transcribe_audio_text(audio_path)
+
+     if mode == "notes":
+         notes = polish_notes_text(transcript)
+         return {
+             "status": "ok",
+             "text": transcript,
+             "notes": notes,
+             "supplier": None,
+             "user": None,
+             "date": None,
+             "sum": None,
+         }
+
+     if not supplier_names:
+         raise RuntimeError("No suppliers were provided by Laravel context.")
+
+     if not user_names:
+         raise RuntimeError("No users were provided by Laravel context.")
+
+     extractor = build_default_pipeline(suppliers=supplier_names, users=user_names)
+     extracted = extractor.extract(transcript, reference_date=date.today().isoformat())
+
+     return {
+         "status": "ok",
+         "text": transcript,
+         "notes": polish_notes_text(extracted.get("text") or transcript),
+         "supplier": extracted.get("supplier"),
+         "user": extracted.get("user"),
+         "date": extracted.get("date_iso") or extracted.get("date"),
+         "sum": extracted.get("amount"),
+     }
+
+
+ def require_auth():
+     expected_token = os.getenv("PYTHON_API_TOKEN", os.getenv("EXPENSE_VOICE_FASTAPI_TOKEN", "")).strip()
+
+     if not expected_token:
+         return None
+
+     authorization = request.headers.get("Authorization", "")
+     if not authorization.startswith("Bearer "):
+         return jsonify({"status": "error", "message": "Missing bearer token."}), 401
+
+     provided = authorization.removeprefix("Bearer ").strip()
+     if provided != expected_token:
+         return jsonify({"status": "error", "message": "Invalid bearer token."}), 401
+
+     return None
+
+
+ def parse_context(raw: str | None) -> dict[str, Any]:
+     if not raw:
+         return {}
+
+     try:
+         payload = json.loads(raw)
+         return payload if isinstance(payload, dict) else {}
+     except json.JSONDecodeError:
+         return {}
+
+
+ @app.get("/")
+ def index():
+     return jsonify({
+         "status": "ok",
+         "message": "Voice processing API is running",
+         "endpoints": {
+             "POST /process-audio": "Process audio file",
+             "GET /health": "Health check"
+         }
+     })
+
+
+ @app.get("/health")
+ def health():
+     return jsonify({"status": "ok"})
+
+
+ @app.post("/process-audio")
+ def process_audio():
+     auth_error = require_auth()
+     if auth_error:
+         return auth_error
+
+     audio = request.files.get("audio")
+     mode = (request.form.get("mode") or "expense").strip()
+     context = parse_context(request.form.get("context"))
+
+     if audio is None:
+         return jsonify({"status": "error", "message": "Audio file is required."}), 422
+
+     suffix = Path(audio.filename or "voice.webm").suffix or ".webm"
+     temp_path = None
+
+     try:
+         with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
+             temp_path = temp_file.name
+             audio.save(temp_file)
+
+         result = process_voice_request(audio_path=temp_path, mode=mode, payload={"context": context})
+         return jsonify(result)
+     except Exception as exception:
+         return jsonify({"status": "error", "message": str(exception)}), 422
+     finally:
+         if temp_path and os.path.exists(temp_path):
+             os.unlink(temp_path)
+
+
+ if __name__ == "__main__":
+     app.run(host="0.0.0.0", port=int(os.getenv("PORT", "7860")))
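
For reference, a minimal client sketch against the /process-audio handler above. The host, file name, and token value are placeholders; only the field names (audio, mode, context), the bearer scheme, and the context shape accepted by extract_names come from the code:

# Hypothetical client call; voice.webm and the token are placeholders.
import json
import requests

context = {"suppliers": [{"name": "Example Market"}], "users": ["Ivan"]}
with open("voice.webm", "rb") as f:
    resp = requests.post(
        "http://localhost:7860/process-audio",
        headers={"Authorization": "Bearer <PYTHON_API_TOKEN>"},
        files={"audio": ("voice.webm", f, "audio/webm")},
        data={"mode": "expense", "context": json.dumps(context)},
        timeout=120,
    )
print(resp.json())  # e.g. {"status": "ok", "supplier": ..., "sum": ..., "date": ...}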
duckling_client.py ADDED
@@ -0,0 +1,141 @@
+ import os
+ import requests
+ from datetime import datetime
+ from typing import Optional
+
+ DUCKLING_URL = os.getenv("DUCKLING_URL", "http://localhost:8000/parse")
+
+
+ def parse_date_with_duckling(
+     text: str,
+     locale: str = "ru_RU",
+     reference_time: Optional[datetime] = None
+ ) -> Optional[str]:
+     """
+     Extract a date from text using Duckling.
+
+     Args:
+         text: Text to parse (e.g. "завтра", "через 2 дня", "15 января 2025")
+         locale: Parsing locale (ru_RU for Russian)
+         reference_time: Reference time for relative dates (defaults to now)
+
+     Returns:
+         An ISO date (YYYY-MM-DD), or None if no date was found
+     """
+     if reference_time is None:
+         reference_time = datetime.now()
+
+     # Timestamp format Duckling expects, e.g. "2026-04-19T12:00:00+00:00"
+     ref_time_str = reference_time.strftime("%Y-%m-%dT%H:%M:%S+00:00")
+
+     try:
+         response = requests.post(
+             DUCKLING_URL,
+             data={
+                 "locale": locale,
+                 "text": text,
+                 "dims": '["time"]',  # extract only dates/times
+                 "reftime": ref_time_str
+             },
+             timeout=5
+         )
+         response.raise_for_status()
+         results = response.json()
+
+         if results and len(results) > 0:
+             # Take the first match found
+             value = results[0].get("value", {})
+
+             # Duckling returns values in several shapes
+             if "value" in value:
+                 # Shape: {"value": "2026-04-20T00:00:00.000+00:00", "grain": "day"}
+                 date_str = value["value"]
+                 # Keep only the date part
+                 return date_str[:10]  # "2026-04-20"
+             elif "from" in value:
+                 # Interval: {"from": {...}, "to": {...}}
+                 from_value = value["from"].get("value", "")
+                 return from_value[:10] if from_value else None
+
+         return None
+
+     except requests.RequestException as e:
+         print(f"Duckling error: {e}")
+         return None
+     except (KeyError, IndexError, ValueError) as e:
+         print(f"Duckling parse error: {e}")
+         return None
+
+
+ def parse_all_dates_with_duckling(
+     text: str,
+     locale: str = "ru_RU",
+     reference_time: Optional[datetime] = None
+ ) -> list[dict]:
+     """
+     Extract all dates from the text.
+
+     Returns:
+         A list of dicts: [{"text": "завтра", "date": "2026-04-20", "start": 0, "end": 6}, ...]
+     """
+     if reference_time is None:
+         reference_time = datetime.now()
+
+     ref_time_str = reference_time.strftime("%Y-%m-%dT%H:%M:%S+00:00")
+
+     try:
+         response = requests.post(
+             DUCKLING_URL,
+             data={
+                 "locale": locale,
+                 "text": text,
+                 "dims": '["time"]',
+                 "reftime": ref_time_str
+             },
+             timeout=5
+         )
+         response.raise_for_status()
+         results = response.json()
+
+         dates = []
+         for result in results:
+             value = result.get("value", {})
+             date_str = None
+
+             if "value" in value:
+                 date_str = value["value"][:10]
+             elif "from" in value:
+                 from_val = value["from"].get("value", "")
+                 date_str = from_val[:10] if from_val else None
+
+             if date_str:
+                 dates.append({
+                     "text": result.get("body", ""),
+                     "date": date_str,
+                     "start": result.get("start", 0),
+                     "end": result.get("end", 0)
+                 })
+
+         return dates
+
+     except Exception as e:
+         print(f"Duckling error: {e}")
+         return []
+
+
+ # Usage example
+ if __name__ == "__main__":
+     test_phrases = [
+         "завтра",
+         "через 2 дня",
+         "на следующей неделе",
+         "15 января 2025",
+         "позавчера",
+         "в прошлый понедельник",
+         "оплата за март"
+     ]
+
+     print("Testing Duckling:\n")
+     for phrase in test_phrases:
+         date = parse_date_with_duckling(phrase)
+         print(f"  '{phrase}' -> {date}")
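
For orientation, a sketch of one raw Duckling /parse match as this client consumes it. Only the keys read above (body, start, end, value) are relied upon; the concrete values are illustrative, and interval matches carry {"from": ..., "to": ...} inside "value" instead:

# Approximate shape of a single Duckling /parse match (illustrative values).
match = {
    "body": "завтра",   # the matched substring of the input text
    "start": 0,          # character offsets into the input
    "end": 6,
    "dim": "time",
    "value": {
        "value": "2026-04-20T00:00:00.000+00:00",
        "grain": "day",
    },
}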
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ flask==3.1.0
+ gunicorn==23.0.0
+ faster-whisper
+ pymorphy3
+ rapidfuzz
+ dateparser
+ iuliia
+ torch
+ sentence-transformers
+ scikit-learn
+ gliner
+ requests
supervisord.conf ADDED
@@ -0,0 +1,26 @@
+ [supervisord]
+ nodaemon=true
+ logfile=/tmp/supervisord.log
+ pidfile=/tmp/supervisord.pid
+ childlogdir=/tmp
+
+ [program:duckling]
+ command=/usr/local/bin/duckling-example-exe -p 8000
+ autostart=true
+ autorestart=true
+ stdout_logfile=/dev/stdout
+ stdout_logfile_maxbytes=0
+ stderr_logfile=/dev/stderr
+ stderr_logfile_maxbytes=0
+ startsecs=3
+
+ [program:gunicorn]
+ command=gunicorn --bind 0.0.0.0:7860 --workers 1 --threads 8 --timeout 120 app:app
+ directory=/home/user/app
+ autostart=true
+ autorestart=true
+ stdout_logfile=/dev/stdout
+ stdout_logfile_maxbytes=0
+ stderr_logfile=/dev/stderr
+ stderr_logfile_maxbytes=0
+ startsecs=5