Spaces: Running

VladGeekPro committed · Commit 218085c · Parent(s): e693b61

NewStructureForProject

Files changed:
- Dockerfile +3 -3
- app.py +62 -961
- extractors/__init__.py +21 -0
- extractors/amount_extractor.py +124 -0
- extractors/date_extractor.py +518 -0
- extractors/supplier_extractor.py +402 -0
- extractors/user_extractor.py +142 -0
- natasha_dates.py +0 -589
- requirements.txt +0 -1
Dockerfile CHANGED

@@ -2,8 +2,7 @@ FROM python:3.11-slim
 
 ENV PYTHONUNBUFFERED=1 PIP_NO_CACHE_DIR=1 HOME=/home/user \
     PATH=/home/user/.local/bin:$PATH PORT=7860 \
-    WHISPER_MODEL=large-v3 WHISPER_COMPUTE_TYPE=int8 \
-    DATE_PARSER_MODE=natasha
+    WHISPER_MODEL=large-v3 WHISPER_COMPUTE_TYPE=int8
 
 RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \
     && rm -rf /var/lib/apt/lists/* \

@@ -15,7 +14,8 @@ WORKDIR /home/user/app
 COPY --chown=user requirements.txt .
 RUN pip install --upgrade pip && pip install -r requirements.txt
 
-COPY --chown=user app.py
+COPY --chown=user app.py ./
+COPY --chown=user extractors/ ./extractors/
 
 EXPOSE 7860
 CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "1", "--threads", "4", "--timeout", "120", "app:app"]
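The Dockerfile keeps the Whisper configuration in ENV only; the diff never shows where WHISPER_MODEL and WHISPER_COMPUTE_TYPE are read. A minimal sketch of how such values are typically consumed, assuming the faster-whisper backend that the int8 compute type suggests (load_whisper_from_env is a hypothetical helper, not app.py's actual get_whisper_model):

    import os
    from faster_whisper import WhisperModel

    def load_whisper_from_env() -> WhisperModel:
        # Values come from the ENV block above; defaults mirror the Dockerfile.
        name = os.getenv("WHISPER_MODEL", "large-v3")
        compute_type = os.getenv("WHISPER_COMPUTE_TYPE", "int8")
        return WhisperModel(name, device="cpu", compute_type=compute_type)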
app.py CHANGED

@@ -1,39 +1,36 @@
+"""
+Voice Processing API for audio processing and expense-data extraction.
+
+Main Flask application file.
+"""
+
 from __future__ import annotations
 
-import calendar
-import difflib
 import json
 import os
-import re
 import tempfile
-import unicodedata
-from dataclasses import dataclass
-from datetime import date, datetime, timedelta
+from datetime import date
 from pathlib import Path
 from typing import Any, Optional
 
-import iuliia
 import torch
-from dateparser.search import search_dates
 from flask import Flask, jsonify, request
-from gliner import GLiNER
-from pymorphy3 import MorphAnalyzer
-from rapidfuzz import fuzz
-from rapidfuzz.distance import Levenshtein
 from sentence_transformers import SentenceTransformer
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
 
-# Natasha date parser
-from natasha_dates import NatashaDateExtractor
+# Import the extractors
+from extractors import (
+    ExpenseDateExtractor,
+    ExpenseSupplierExtractor,
+    ExpenseUserExtractor,
+    ExpenseAmountExtractor,
+)
+
 
 # HuggingFace token (if needed for the models)
 HF_TOKEN = os.getenv("HF_TOKEN")
 
-MORPH = MorphAnalyzer()
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 _MODEL: Optional[SentenceTransformer] = None
-_AMOUNT_MODEL: Optional[Any] = None
 _WHISPER_MODEL: Optional[Any] = None
 
 
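app.py now imports the four classes from the new extractors package. The package's __init__.py is not shown in this diff; a plausible sketch of it, assuming each class lives in the module named after it in the commit's file list:

    # extractors/__init__.py (sketch, not the committed file)
    from extractors.date_extractor import ExpenseDateExtractor
    from extractors.supplier_extractor import ExpenseSupplierExtractor
    from extractors.user_extractor import ExpenseUserExtractor
    from extractors.amount_extractor import ExpenseAmountExtractor

    __all__ = [
        "ExpenseDateExtractor",
        "ExpenseSupplierExtractor",
        "ExpenseUserExtractor",
        "ExpenseAmountExtractor",
    ]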
@@ -42,6 +39,7 @@ app.config["MAX_CONTENT_LENGTH"] = 20 * 1024 * 1024
 
 
 def get_embedding_model() -> SentenceTransformer:
+    """Return the embedding model (lazy-loaded)."""
     global _MODEL
 
     if _MODEL is None:
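get_embedding_model() caches one SentenceTransformer for the whole process. The "query: …"/"passage: …" prefixes used by the user extractor further down this diff are the E5 embedding convention, so the checkpoint is presumably E5-family; a sketch with an assumed model name:

    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("intfloat/multilingual-e5-base")  # assumed checkpoint
    embeddings = model.encode(
        ["query: мария", "passage: Мария"],  # E5-style role prefixes
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    print(embeddings.shape)  # (2, embedding_dim)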
@@ -50,16 +48,8 @@ def get_embedding_model() -> SentenceTransformer:
     return _MODEL
 
 
-def get_amount_model() -> Optional[Any]:
-    global _AMOUNT_MODEL
-
-    if _AMOUNT_MODEL is None and GLiNER is not None:
-        _AMOUNT_MODEL = GLiNER.from_pretrained("urchade/gliner_multi-v2.1")
-
-    return _AMOUNT_MODEL
-
-
 def get_whisper_model() -> Any:
+    """Return the Whisper model (lazy-loaded)."""
     global _WHISPER_MODEL
 
     if _WHISPER_MODEL is None:
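The removed get_amount_model() lazily cached a GLiNER zero-shot NER model; ExpenseAmountExtractor (deleted below, now extractors/amount_extractor.py) queried it with the single label "money". A self-contained sketch of that call pattern (the example sentence is invented):

    from gliner import GLiNER

    model = GLiNER.from_pretrained("urchade/gliner_multi-v2.1")
    # predict_entities returns dicts with "start", "end", "text", "label", "score"
    entities = model.predict_entities("заплатил за кофе 45,50 лей", ["money"], threshold=0.3)
    for ent in entities:
        print(ent["text"], round(float(ent["score"]), 3))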
@@ -72,942 +62,40 @@ def get_whisper_model() -> Any:
     return _WHISPER_MODEL
 
 
-def normalize_text(text: str) -> str:
-    text = unicodedata.normalize("NFKD", text.lower())
-    text = "".join(ch for ch in text if not unicodedata.combining(ch))
-    return re.sub(r"[^\w\s]", "", text).strip()
-
-
-def tokenize_text(text: str) -> list[str]:
-    return normalize_text(text).split()
-
-
-def lemmatize_word(word: str) -> str:
-    return MORPH.parse(word)[0].normal_form if re.fullmatch(r"[а-я]+", word) else word
-
-
-def lemmatize_text(text: str) -> list[str]:
-    return [lemmatize_word(word) for word in tokenize_text(text)]
-
-
-def variants(text: str) -> list[str]:
-    base = normalize_text(text)
-    result = [base]
-
-    for schema in (iuliia.WIKIPEDIA, iuliia.MOSMETRO, iuliia.ALA_LC):
-        try:
-            v = normalize_text(schema.translate(base))
-            if v and v not in result:
-                result.append(v)
-        except Exception:
-            pass
-
-    for v in list(result):
-        core = " ".join(w for w in v.split() if len(w) > 1 and any(ch.isalpha() for ch in w))
-        core = normalize_text(core)
-        if core and core not in result:
-            result.insert(0, core)
-
-    return result
-
-
-def token_alignment_score(phrase_variant: str, candidate_tokens: list[str]) -> float:
-    phrase_tokens = [t for t in phrase_variant.split() if len(t) > 2]
-    if not phrase_tokens or not candidate_tokens:
-        return 0.0
-    best_scores = []
-    for pt in phrase_tokens:
-        best = 0.0
-        for ct in candidate_tokens:
-            sim = Levenshtein.normalized_similarity(pt, ct)
-            if sim > best:
-                best = sim
-        best_scores.append(best)
-    return sum(best_scores) / len(best_scores)
-
-
-def length_penalty(phrase_len: int, candidate_len: int) -> float:
-    if phrase_len == 0 or candidate_len == 0:
-        return 0.0
-    ratio = min(phrase_len, candidate_len) / max(phrase_len, candidate_len)
-    if ratio >= 0.80:
-        return 1.0
-    if ratio >= 0.60:
-        return 0.90
-    if ratio >= 0.40:
-        return 0.70
-    return 0.50
-
-
-def canonicalize_for_similarity(text: str) -> str:
-    t = normalize_text(text).replace(" ", "")
-    replacements = (
-        ("sch", "sh"),
-        ("tch", "ch"),
-        ("dzh", "j"),
-        ("zh", "j"),
-        ("sh", "s"),
-        ("ch", "c"),
-        ("kh", "h"),
-        ("ph", "f"),
-        ("ck", "k"),
-        ("qu", "k"),
-        ("q", "k"),
-        ("w", "v"),
-        ("x", "ks"),
-        ("ts", "z"),
-        ("tz", "z"),
-    )
-    for src, dst in replacements:
-        t = t.replace(src, dst)
-    return re.sub(r"(.)\1+", r"\1", t)
-
-
-def phonetic_similarity(left: str, right: str) -> float:
-    l = canonicalize_for_similarity(left)
-    r = canonicalize_for_similarity(right)
-    if not l or not r:
-        return 0.0
-    char = fuzz.ratio(l, r) / 100.0
-    lev = Levenshtein.normalized_similarity(l, r)
-    return 0.50 * char + 0.50 * lev
-
-
-@dataclass(frozen=True)
-class ParsedDate:
-    date_iso: str
-    matched_expression: Optional[str]
-
-
-@dataclass(frozen=True)
-class Token:
-    original: str
-    normalized: str
-    raw_lemma: str
-    lemma: str
-    lemma_correction: Optional[str]
-    start: int
-    end: int
-    lemma_start: int
-    lemma_end: int
-
-
-WORD_RE = re.compile(r"[0-9]+(?:[./-][0-9]+)*|[а-яё]+", re.IGNORECASE)
-
-
-class UniversalDateParser:
-    MONTHS = {
-        "январь": 1, "февраль": 2, "март": 3, "апрель": 4, "май": 5, "июнь": 6,
-        "июль": 7, "август": 8, "сентябрь": 9, "октябрь": 10, "ноябрь": 11, "декабрь": 12,
-    }
-    WEEKDAYS = {
-        "понедельник": 0, "вторник": 1, "среда": 2, "четверг": 3,
-        "пятница": 4, "суббота": 5, "воскресенье": 6,
-    }
-    DIRECT_RELATIVE = {"послезавтра": 2, "позавчера": -2, "сегодня": 0, "вчера": -1, "завтра": 1}
-    ORDINAL_DAYS = {
-        "первый": 1, "второй": 2, "третий": 3, "четвертый": 4, "пятый": 5, "шестой": 6,
-        "седьмой": 7, "восьмой": 8, "девятый": 9, "десятый": 10, "одиннадцатый": 11,
-        "двенадцатый": 12, "тринадцатый": 13, "четырнадцатый": 14, "пятнадцатый": 15,
-        "шестнадцатый": 16, "семнадцатый": 17, "восемнадцатый": 18, "девятнадцатый": 19,
-        "двадцатый": 20, "двадцать первый": 21, "двадцать второй": 22, "двадцать третий": 23,
-        "двадцать четвертый": 24, "двадцать пятый": 25, "двадцать шестой": 26,
-        "двадцать седьмой": 27, "двадцать восьмой": 28, "двадцать девятый": 29,
-        "тридцатый": 30, "тридцать первый": 31,
-    }
-    NUMBER_WORDS = {
-        "ноль": 0, "один": 1, "два": 2, "три": 3, "четыре": 4, "пять": 5, "шесть": 6,
-        "семь": 7, "восемь": 8, "девять": 9, "десять": 10, "одиннадцать": 11,
-        "двенадцать": 12, "тринадцать": 13, "четырнадцать": 14, "пятнадцать": 15,
-        "шестнадцать": 16, "семнадцать": 17, "восемнадцать": 18, "девятнадцать": 19,
-        "двадцать": 20, "тридцать": 30,
-    }
-    FUTURE_HINTS = ("завтра", "послезавтра", "через", "быть", "заплатить", "следующий", "последующий")
-    PAST_HINTS = ("вчера", "позавчера", "назад", "прошлый", "предыдущий", "оплатить", "купить", "заказать")
-
-    DIRECT_RELATIVE_RE = re.compile(r"(?<!\S)(послезавтра|позавчера|сегодня|вчера|завтра)(?!\S)")
-    WEEK_RELATIVE_RE = re.compile(
-        r"(?<!\S)на (?P<which>следующий|последующий|прошлый|предыдущий|этот) неделя"
-        r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)"
-    )
-    QUANTITY_RELATIVE_RE = re.compile(
-        r"(?<!\S)(?P<number>\d+|[а-яё]+(?: [а-яё]+)?) "
-        r"(?P<unit>месяц|неделя|день) "
-        r"(?P<ago>назад)"
-        r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
-        re.IGNORECASE,
-    )
-    FORWARD_QUANTITY_RE = re.compile(
-        r"(?<!\S)(?P<through>через) "
-        r"(?P<number>\d+|[а-яё]+(?: [а-яё]+)?) "
-        r"(?P<unit>месяц|неделя|день)"
-        r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
-        re.IGNORECASE,
-    )
-    FORWARD_SINGLE_UNIT_RE = re.compile(
-        r"(?<!\S)(?P<through>через) "
-        r"(?P<unit>месяц|неделя|день)"
-        r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
-        re.IGNORECASE,
-    )
-    TEXTUAL_ABSOLUTE_RE = re.compile(
-        r"(?<!\S)(?P<day>\d{1,2}|[а-яё]+(?: [а-яё]+)?) "
-        r"(?P<month>январь|февраль|март|апрель|май|июнь|июль|август|сентябрь|октябрь|ноябрь|декабрь)"
-        r"(?: (?P<year>\d{4}))?(?!\S)",
-        re.IGNORECASE,
-    )
-    PERIOD_EDGE_RE = re.compile(
-        r"(?<!\S)(?:в )?(?P<edge>начало|конец) (?P<which>этот|следующий|последующий|прошлый|предыдущий) (?P<unit>неделя|месяц)(?!\S)",
-        re.IGNORECASE,
-    )
-
-    @classmethod
-    def temporal_vocabulary(cls) -> set[str]:
-        vocab: set[str] = set()
-        vocab.update(cls.MONTHS)
-        vocab.update(cls.WEEKDAYS)
-        vocab.update(cls.DIRECT_RELATIVE)
-        vocab.update(cls.ORDINAL_DAYS)
-        vocab.update(cls.NUMBER_WORDS)
-        vocab.update({
-            "неделя", "месяц", "день", "назад", "через", "начало", "конец", "на", "в", "во",
-            "этот", "прошлый", "предыдущий", "следующий", "последующий",
-        })
-        return vocab
-
-    @staticmethod
-    def similarity(left: str, right: str) -> float:
-        return difflib.SequenceMatcher(None, left, right).ratio()
-
-    @classmethod
-    def pick_temporal_correction(cls, normalized: str, raw_lemma: str) -> tuple[str, Optional[str]]:
-        vocab = cls.temporal_vocabulary()
-        if raw_lemma in vocab or not normalized.isalpha() or len(normalized) < 5:
-            return raw_lemma, None
-
-        candidates = list(difflib.get_close_matches(normalized, list(vocab), n=4, cutoff=0.74))
-        candidates.extend(difflib.get_close_matches(raw_lemma, list(vocab), n=4, cutoff=0.74))
-        candidates = list(dict.fromkeys(candidates))
-        if not candidates:
-            return raw_lemma, None
-
-        best = max(candidates, key=lambda item: max(cls.similarity(normalized, item), cls.similarity(raw_lemma, item)))
-        best_score = max(cls.similarity(normalized, best), cls.similarity(raw_lemma, best))
-        return (best, f"{raw_lemma}->{best}") if best_score >= 0.80 else (raw_lemma, None)
-
-    @staticmethod
-    def normalize_word(word: str) -> str:
-        return word.lower().replace("ё", "е")
-
-    @classmethod
-    def lemmatize(cls, word: str) -> str:
-        return MORPH.parse(word)[0].normal_form if word.isalpha() else word
-
-    @classmethod
-    def tokenize(cls, text: str) -> list[Token]:
-        tokens: list[Token] = []
-        lemma_cursor = 0
-
-        for match in WORD_RE.finditer(text):
-            original = match.group(0)
-            normalized = cls.normalize_word(original)
-            raw_lemma = cls.lemmatize(normalized)
-            lemma, correction = cls.pick_temporal_correction(normalized, raw_lemma)
-            lemma_start = lemma_cursor
-            lemma_end = lemma_start + len(lemma)
-            tokens.append(Token(original, normalized, raw_lemma, lemma, correction, match.start(), match.end(), lemma_start, lemma_end))
-            lemma_cursor = lemma_end + 1
-
-        return tokens
-
-    @staticmethod
-    def lemma_text(tokens: list[Token]) -> str:
-        return " ".join(token.lemma for token in tokens)
-
-    @staticmethod
-    def surface_text(text: str, tokens: list[Token], start_idx: int, end_idx: int) -> str:
-        return text[tokens[start_idx].start:tokens[end_idx].end].strip() if tokens else ""
-
-    @staticmethod
-    def lemma_span_to_token_range(tokens: list[Token], span: tuple[int, int]) -> Optional[tuple[int, int]]:
-        start_char, end_char = span
-        start_idx = end_idx = None
-
-        for idx, token in enumerate(tokens):
-            if start_idx is None and token.lemma_start <= start_char < token.lemma_end:
-                start_idx = idx
-            if token.lemma_start < end_char <= token.lemma_end:
-                end_idx = idx
-                break
-
-        return (start_idx, end_idx) if start_idx is not None and end_idx is not None else None
-
-    @classmethod
-    def make_parsed_date(cls, text: str, tokens: list[Token], match, parsed_date: date) -> Optional[ParsedDate]:
-        token_span = cls.lemma_span_to_token_range(tokens, match.span())
-        if token_span is None:
-            return None
-        return ParsedDate(parsed_date.isoformat(), cls.surface_text(text, tokens, token_span[0], token_span[1]))
-
-    @classmethod
-    def parse_number_phrase(cls, phrase: str) -> Optional[int]:
-        phrase = phrase.strip()
-        if not phrase:
-            return None
-        if phrase.isdigit():
-            return int(phrase)
-
-        parts = phrase.split()
-        if len(parts) == 1:
-            return cls.NUMBER_WORDS.get(parts[0])
-        if len(parts) == 2 and parts[0] in {"двадцать", "тридцать"}:
-            base = cls.NUMBER_WORDS.get(parts[0])
-            addon = cls.NUMBER_WORDS.get(parts[1])
-            if base is not None and addon is not None and 1 <= addon <= 9:
-                return base + addon
-        return None
-
-    @classmethod
-    def parse_day_phrase(cls, phrase: str) -> Optional[int]:
-        if phrase.isdigit():
-            value = int(phrase)
-            return value if 1 <= value <= 31 else None
-        return cls.ORDINAL_DAYS.get(phrase.strip())
-
-    @staticmethod
-    def shift_months(value: date, months: int) -> date:
-        month_index = value.month - 1 + months
-        year = value.year + month_index // 12
-        month = month_index % 12 + 1
-        day = min(value.day, calendar.monthrange(year, month)[1])
-        return date(year, month, day)
-
-    @staticmethod
-    def parse_numeric_absolute(tokens: list[Token]) -> Optional[ParsedDate]:
-        for token in tokens:
-            separator = "." if "." in token.original else "-" if "-" in token.original else "/" if "/" in token.original else None
-            if separator is None:
-                continue
-
-            parts = token.original.split(separator)
-            if len(parts) != 3 or not all(part.isdigit() for part in parts):
-                continue
-
-            try:
-                if len(parts[0]) == 4:
-                    parsed = date(int(parts[0]), int(parts[1]), int(parts[2]))
-                elif len(parts[2]) == 4:
-                    parsed = date(int(parts[2]), int(parts[1]), int(parts[0]))
-                else:
-                    continue
-                return ParsedDate(parsed.isoformat(), token.original)
-            except ValueError:
-                continue
-
-        return None
-
-    @classmethod
-    def parse_textual_absolute(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
-        lemma_text = cls.lemma_text(tokens)
-        for match in cls.TEXTUAL_ABSOLUTE_RE.finditer(lemma_text):
-            day = cls.parse_day_phrase(match.group("day"))
-            month = cls.MONTHS.get(match.group("month"))
-            if day is None or month is None:
-                continue
-
-            year = int(match.group("year")) if match.group("year") else reference_date.year
-            try:
-                parsed = date(year, month, day)
-            except ValueError:
-                continue
-
-            result = cls.make_parsed_date(text, tokens, match, parsed)
-            if result is not None:
-                return result
-
-        return None
-
-    @classmethod
-    def parse_direct_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
-        lemma_text = cls.lemma_text(tokens)
-        match = cls.DIRECT_RELATIVE_RE.search(lemma_text)
-        if not match:
-            return None
-
-        parsed = reference_date + timedelta(days=cls.DIRECT_RELATIVE[match.group(1)])
-        return cls.make_parsed_date(text, tokens, match, parsed)
-
-    @staticmethod
-    def week_monday(value: date) -> date:
-        return value - timedelta(days=value.weekday())
-
-    @classmethod
-    def parse_week_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
-        lemma_text = cls.lemma_text(tokens)
-        match = cls.WEEK_RELATIVE_RE.search(lemma_text)
-        if not match:
-            return None
-
-        offsets = {"следующий": 7, "последующий": 7, "прошлый": -7, "предыдущий": -7, "этот": 0}
-        anchor = reference_date + timedelta(days=offsets[match.group("which")])
-
-        if match.group("weekday"):
-            anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])
-
-        return cls.make_parsed_date(text, tokens, match, anchor)
-
-    @classmethod
-    def parse_period_edge(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
-        lemma_text = cls.lemma_text(tokens)
-        match = cls.PERIOD_EDGE_RE.search(lemma_text)
-        if not match:
-            return None
-
-        edge, which, unit = match.group("edge"), match.group("which"), match.group("unit")
-
-        if unit == "неделя":
-            offsets = {"прошлый": -7, "предыдущий": -7, "этот": 0, "следующий": 7, "последующий": 7}
-            monday = cls.week_monday(reference_date + timedelta(days=offsets[which]))
-            parsed_date = monday if edge == "начало" else monday + timedelta(days=6)
-        else:
-            month_offset = {"прошлый": -1, "предыдущий": -1, "этот": 0, "следующий": 1, "последующий": 1}[which]
-            shifted = cls.shift_months(date(reference_date.year, reference_date.month, 1), month_offset)
-            parsed_date = shifted if edge == "начало" else date(shifted.year, shifted.month, calendar.monthrange(shifted.year, shifted.month)[1])
-
-        return cls.make_parsed_date(text, tokens, match, parsed_date)
-
-    @classmethod
-    def parse_quantity_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
-        lemma_text = cls.lemma_text(tokens)
-
-        for regex, direction in ((cls.QUANTITY_RELATIVE_RE, -1), (cls.FORWARD_QUANTITY_RE, 1)):
-            for match in regex.finditer(lemma_text):
-                number = cls.parse_number_phrase(match.group("number"))
-                if number is None:
-                    continue
-
-                unit = match.group("unit")
-                if unit == "месяц":
-                    anchor = cls.shift_months(reference_date, direction * number)
-                else:
-                    days = number * 7 if unit == "неделя" else number
-                    anchor = reference_date + timedelta(days=direction * days)
-
-                if match.group("weekday"):
-                    anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])
-
-                result = cls.make_parsed_date(text, tokens, match, anchor)
-                if result is not None:
-                    return result
-
-        for match in cls.FORWARD_SINGLE_UNIT_RE.finditer(lemma_text):
-            unit = match.group("unit")
-            if unit == "месяц":
-                anchor = cls.shift_months(reference_date, 1)
-            else:
-                days = 7 if unit == "неделя" else 1
-                anchor = reference_date + timedelta(days=days)
-
-            if match.group("weekday"):
-                anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])
-
-            result = cls.make_parsed_date(text, tokens, match, anchor)
-            if result is not None:
-                return result
-
-        return None
-
-    @classmethod
-    def preference_for_text(cls, tokens: list[Token]) -> str:
-        lemmas = [token.lemma for token in tokens]
-        future = sum(1 for hint in cls.FUTURE_HINTS if hint in lemmas)
-        past = sum(1 for hint in cls.PAST_HINTS if hint in lemmas)
-        return "future" if future > past else "past"
-
-    @staticmethod
-    def choose_best(matches: list[tuple[str, datetime]]) -> tuple[str, datetime]:
-        return sorted(matches, key=lambda item: (len(item[0]), -item[1].timestamp()), reverse=True)[0]
-
-    def parse(self, text: str, reference_date: date) -> Optional[ParsedDate]:
-        tokens = self.tokenize(text)
-
-        for parser in (
-            lambda: self.parse_numeric_absolute(tokens),
-            lambda: self.parse_textual_absolute(text, tokens, reference_date),
-            lambda: self.parse_direct_relative(text, tokens, reference_date),
-            lambda: self.parse_week_relative(text, tokens, reference_date),
-            lambda: self.parse_period_edge(text, tokens, reference_date),
-            lambda: self.parse_quantity_relative(text, tokens, reference_date),
-        ):
-            parsed = parser()
-            if parsed is not None:
-                return parsed
-
-        normalized = " ".join(token.normalized for token in tokens)
-        relative_base = datetime.combine(reference_date, datetime.min.time()).replace(hour=12)
-        result = search_dates(
-            normalized,
-            languages=["ru"],
-            settings={
-                "RELATIVE_BASE": relative_base,
-                "PREFER_DATES_FROM": self.preference_for_text(tokens),
-                "STRICT_PARSING": False,
-                "REQUIRE_PARTS": [],
-                "NORMALIZE": True,
-                "RETURN_AS_TIMEZONE_AWARE": False,
-                "DATE_ORDER": "DMY",
-            },
-        )
-
-        filtered: list[tuple[str, datetime]] = []
-        for matched, value in result or []:
-            if isinstance(value, datetime) and not matched.strip().isdigit() and 2020 <= value.year <= 2100:
-                filtered.append((matched.strip(), value))
-
-        if not filtered:
-            return None
-
-        matched_expression, value = self.choose_best(filtered)
-        return ParsedDate(date_iso=value.date().isoformat(), matched_expression=matched_expression)
-
-
-class ExpenseDateExtractor:
-    def __init__(self) -> None:
-        self.parser = UniversalDateParser()
-
-    def extract(self, text: str, reference_date: str | date | None = None) -> dict[str, Any]:
-        ref_date = self.to_date(reference_date or date.today().isoformat())
-        parsed = self.parser.parse(text=text, reference_date=ref_date)
-
-        return {
-            "date": datetime.strptime(parsed.date_iso, "%Y-%m-%d").strftime("%d.%m.%Y") if parsed else None,
-            "date_iso": parsed.date_iso if parsed else None,
-            "matched_date_phrase": parsed.matched_expression if parsed else None,
-        }
-
-    @staticmethod
-    def to_date(value: str | date) -> date:
-        return value if isinstance(value, date) else datetime.strptime(value, "%Y-%m-%d").date()
-
-
-# Date parser: "natasha" (recommended) or "legacy"
-DATE_PARSER_MODE = os.getenv("DATE_PARSER_MODE", "legacy")
-
-def get_date_extractor():
+class ExpenseTextExtractor:
     """
-
-
-
+    Main expense-data extractor.
+
+    Combines all the extractors: dates, suppliers, users, amounts.
     """
-
-    return NatashaDateExtractor()
-    return ExpenseDateExtractor()
-
-
-class ExpenseUserExtractor:
-    def __init__(self, users: list[str], suppliers: list[str], model: SentenceTransformer, threshold: float = 0.6) -> None:
-        self.users = users
-        self.model = model
-        self.threshold = threshold
-        self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
-        self.user_terms = [normalize_text(user) for user in users]
-        self.user_embeddings = model.encode(
-            [f"passage: {user}" for user in self.user_terms],
-            convert_to_tensor=True,
-            normalize_embeddings=True,
-        )
-
-    def extract(self, text: str, supplier_phrase: str | None = None, date_phrase: str | None = None) -> dict[str, Any]:
-        excluded_tokens: set[str] = set()
-        if supplier_phrase:
-            excluded_tokens.update(normalize_text(supplier_phrase).split())
-        if date_phrase:
-            excluded_tokens.update(normalize_text(date_phrase).split())
-
-        best_user = None
-        best_score = -1.0
-        best_phrase = None
-
-        for word in lemmatize_text(text):
-            if len(word) < 3:
-                continue
-            if word in excluded_tokens or word in self.supplier_terms:
-                continue
-
-            query_emb = self.model.encode(
-                f"query: {word}",
-                convert_to_tensor=True,
-                normalize_embeddings=True,
-            )
-            similarities = torch.cosine_similarity(query_emb.unsqueeze(0), self.user_embeddings, dim=1)
-            idx = int(torch.argmax(similarities))
-            score = similarities[idx].item()
-
-            if score > best_score:
-                best_score = score
-                best_user = self.users[idx]
-                best_phrase = word
-
-        if best_score >= self.threshold:
-            return {
-                "user": best_user,
-                "user_score": round(best_score, 4),
-                "matched_user_phrase": best_phrase,
-            }
-
-        if re.search(r"(?<!\S)я(?!\S)", normalize_text(text), re.IGNORECASE):
-            return {
-                "user": "Я",
-                "user_score": 1.0,
-                "matched_user_phrase": "я",
-            }
-
-        return {
-            "user": None,
-            "user_score": None,
-            "matched_user_phrase": None,
-        }
-
-
-class ExpenseSupplierExtractor:
-    def __init__(self, suppliers: list[str]) -> None:
-        self.suppliers = suppliers
-        self.sup_norm = [normalize_text(s) for s in suppliers]
-        self.sup_tokens = [s.split() for s in self.sup_norm]
-        self.sup_num_sets = [self.numeric_tokens(s) for s in self.sup_norm]
-        self.sup_number_tokens = {token for supplier in self.sup_tokens for token in supplier if token.isdigit()}
-        self.supplier_lexicon = [
-            token
-            for token in sorted({tok for tokens in self.sup_tokens for tok in tokens})
-            if token and not token.isdigit()
-        ]
-        self.tfidf = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5))
-        self.sup_mat = self.tfidf.fit_transform(self.sup_norm)
-        self.max_words = max(len(s.split()) for s in self.sup_norm)
-        self.variant_cache: dict[str, list[str]] = {}
-        self.lexical_token_cache: dict[str, float] = {}
-        self.phrase_support_cache: dict[str, float] = {}
-        self.noise_terms = {
-            "за", "на", "из", "для", "под", "над", "при", "без", "и", "или",
-            "купил", "купила", "купили", "покупка", "заказал", "заказала", "заказали",
-            "оплатил", "оплатила", "оплатили", "заплатил", "заплатила", "заплатили",
-            "был", "была", "было", "были", "утром", "днем", "днём", "вечером", "ночью",
-            "товар", "товары", "продукт", "продукты", "десерт", "еда",
-            "лей", "лея", "леи", "целых", "сотых", "сом", "сомов", "руб", "рублей", "грн", "usd", "eur",
-        }
-        self.noise_terms.update(UniversalDateParser.temporal_vocabulary())
-
-    @staticmethod
-    def numeric_tokens(text: str) -> set[str]:
-        return set(re.findall(r"\d+", text))
-
-    def cached_variants(self, text: str) -> list[str]:
-        key = normalize_text(text)
-        cached = self.variant_cache.get(key)
-        if cached is None:
-            cached = variants(key)
-            self.variant_cache[key] = cached
-        return cached
-
-    @staticmethod
-    def split_words(text: str) -> list[str]:
-        return [w for w in normalize_text(text).split() if w]
-
-    @classmethod
-    def is_supplier_extension(cls, base_supplier: str, extended_supplier: str) -> bool:
-        base_tokens = cls.split_words(base_supplier)
-        extended_tokens = cls.split_words(extended_supplier)
-        return len(base_tokens) < len(extended_tokens) and extended_tokens[:len(base_tokens)] == base_tokens
-
-    @classmethod
-    def phrase_token_count(cls, phrase: str | None) -> int:
-        return len(cls.split_words(phrase or ""))
-
-    @classmethod
-    def resolve_overlapping_suppliers(cls, ranking: list[dict[str, Any]]) -> dict[str, Any]:
-        if not ranking:
-            return {"supplier": None, "score": -1.0, "phrase": None}
-
-        best = ranking[0]
-        best_combined = float(best.get("combined", best.get("score", -1.0)))
-        best_phrase_len = cls.phrase_token_count(best.get("phrase"))
-
-        for alt in ranking[1:]:
-            if not cls.is_supplier_extension(str(best.get("supplier") or ""), str(alt.get("supplier") or "")):
-                continue
-
-            alt_combined = float(alt.get("combined", alt.get("score", -1.0)))
-            alt_phrase_len = cls.phrase_token_count(alt.get("phrase"))
-
-            if alt_phrase_len > best_phrase_len and alt_combined >= best_combined - 0.15:
-                best = alt
-                best_combined = alt_combined
-                best_phrase_len = alt_phrase_len
-
-        return best
-
-    @staticmethod
-    def numeric_compatibility_multiplier(phrase_nums: set[str], candidate_nums: set[str]) -> float:
-        if not phrase_nums and not candidate_nums:
-            return 1.0
-        if phrase_nums == candidate_nums:
-            return 1.08
-        if phrase_nums and candidate_nums:
-            return 1.03 if phrase_nums & candidate_nums else 0.80
-        return 0.82
-
-    def lexical_support(self, phrase: str) -> float:
-        tokens = [token for token in normalize_text(phrase).split() if token and not token.isdigit()]
-        if not tokens or not self.supplier_lexicon:
-            return 0.0
-
-        support_scores: list[float] = []
-        for token in tokens:
-            cached = self.lexical_token_cache.get(token)
-            if cached is not None:
-                support_scores.append(cached)
-                continue
-
-            best = 0.0
-            for token_variant in self.cached_variants(token):
-                for lex in self.supplier_lexicon:
-                    lev = Levenshtein.normalized_similarity(token_variant, lex)
-                    phon = phonetic_similarity(token_variant, lex)
-                    sim = max(lev, phon)
-                    if sim > best:
-                        best = sim
-
-            self.lexical_token_cache[token] = best
-            support_scores.append(best)
-
-        return sum(support_scores) / len(support_scores)
-
-    def score_phrase(self, phrase: str) -> dict[str, Any]:
-        vs = self.cached_variants(phrase)
-        q = self.tfidf.transform(vs)
-        tf = cosine_similarity(q, self.sup_mat)
-
-        best: dict[str, Any] = {"supplier": None, "score": -1.0, "phrase": phrase, "variant": ""}
-        for i, cand in enumerate(self.sup_norm):
-            local = -1.0
-            local_variant = ""
-            candidate_nums = self.sup_num_sets[i]
-            for j, v in enumerate(vs):
-                char = fuzz.ratio(v, cand) / 100.0
-                tf_val = float(tf[j, i])
-                penalty = length_penalty(len(v), len(cand))
-                phon = phonetic_similarity(v, cand)
-                phrase_nums = self.numeric_tokens(v)
-
-                if len(v.split()) == 1 and len(cand.split()) == 1:
-                    lev = Levenshtein.normalized_similarity(v, cand)
-                    val = (0.45 * lev + 0.25 * char + 0.10 * tf_val + 0.20 * phon) * penalty
-                else:
-                    align = token_alignment_score(v, self.sup_tokens[i])
-                    tok = fuzz.token_set_ratio(v, cand) / 100.0
-                    val = (0.30 * char + 0.20 * tok + 0.10 * tf_val + 0.20 * align + 0.20 * phon) * penalty
-
-                compact_v = v.replace(" ", "")
-                compact_cand = cand.replace(" ", "")
-                compact_char = fuzz.ratio(compact_v, compact_cand) / 100.0
-                compact_lev = Levenshtein.normalized_similarity(compact_v, compact_cand)
-                compact_phon = phonetic_similarity(compact_v, compact_cand)
-                compact = max(compact_char, compact_lev, compact_phon)
-                if compact > 0.55:
-                    val = max(val, compact * penalty)
-
-                val *= self.numeric_compatibility_multiplier(phrase_nums, candidate_nums)
-
-                if val > local:
-                    local = val
-                    local_variant = v
-
-            if local > best["score"]:
-                best = {"supplier": self.suppliers[i], "score": local, "phrase": phrase, "variant": local_variant}
-        return best
-
-    def extract(self, text: str, date_phrase: str | None = None, debug: bool = False) -> dict[str, Any]:
-        threshold = 0.50
-        excluded_tokens: set[str] = set()
-        if date_phrase:
-            excluded_tokens.update(normalize_text(date_phrase).split())
-        excluded_tokens.update(self.noise_terms)
-
-        raw_tokens = normalize_text(text).split()
-        tokens: list[str] = []
-        for token in raw_tokens:
-            if token in excluded_tokens:
-                continue
-
-            if token.isdigit():
-                if token in self.sup_number_tokens:
-                    tokens.append(token)
-
-                if tokens and len(token) <= 3 and len(tokens[-1]) >= 4 and tokens[-1].isalpha():
-                    tokens.append(f"{tokens[-1]}{token}")
-                continue
-
-            if len(token) > 1:
-                tokens.append(token)
-
-        tokens = [t for t in tokens if len(t) > 1 and t not in excluded_tokens]
-
-        phrases: list[str] = []
-        seen: set[str] = set()
-        for i in range(len(tokens)):
-            for j in range(i + 1, min(i + 1 + self.max_words, len(tokens) + 1)):
-                p = " ".join(tokens[i:j])
-                if p not in seen:
-                    seen.add(p)
-                    phrases.append(p)
-
-        results = [self.score_phrase(p) for p in phrases]
-        candidate_rows: list[dict[str, Any]] = []
-        best_by_supplier: dict[str, dict[str, Any]] = {}
-        for row in results:
-            supplier = row["supplier"]
-            score = float(row.get("score", -1.0))
-            phrase = str(row.get("phrase") or "")
-            support = self.phrase_support_cache.get(phrase)
-            if support is None:
-                support = self.lexical_support(phrase)
-                self.phrase_support_cache[phrase] = support
-            combined = 0.75 * score + 0.25 * support
-
-            if debug:
-                candidate_rows.append({
-                    "supplier": supplier,
-                    "phrase": phrase,
-                    "score": round(score, 4),
-                    "support": round(support, 4),
-                    "combined": round(combined, 4),
-                })
-
-            enriched = {**row, "combined": combined}
-            passes = score >= threshold or combined >= 0.48
-            if passes and (supplier not in best_by_supplier or combined > float(best_by_supplier[supplier].get("combined", -1.0))):
-                best_by_supplier[supplier] = enriched
-
-        if not best_by_supplier and results:
-            def support_for_phrase(phrase: str) -> float:
-                cached_support = self.phrase_support_cache.get(phrase)
-                if cached_support is None:
-                    cached_support = self.lexical_support(phrase)
-                    self.phrase_support_cache[phrase] = cached_support
-                return cached_support
-
-            fallback = max(
-                results,
-                key=lambda item: 0.75 * float(item.get("score", -1.0)) + 0.25 * support_for_phrase(str(item.get("phrase") or "")),
-            )
-            fallback_score = float(fallback.get("score", -1.0))
-            fallback_phrase = str(fallback.get("phrase") or "")
-            fallback_support = support_for_phrase(fallback_phrase)
-            fallback_combined = 0.75 * fallback_score + 0.25 * fallback_support
-            if fallback_score >= 0.40 and fallback_support >= 0.43 and fallback_combined >= 0.43:
-                best_by_supplier[fallback["supplier"]] = {**fallback, "combined": fallback_combined}
-
-        supplier_ranking = sorted(best_by_supplier.values(), key=lambda x: float(x.get("combined", x["score"])), reverse=True)
-        best = self.resolve_overlapping_suppliers(supplier_ranking)
-
-        payload = {
-            "supplier": best["supplier"],
-            "supplier_score": round(best["score"], 4) if best["score"] >= 0 else None,
-            "matched_supplier_phrase": best.get("phrase"),
-        }
-
-        if debug:
-            top_candidates = sorted(candidate_rows, key=lambda item: item["combined"], reverse=True)[:8]
-            payload["supplier_debug"] = {
-                "tokens": tokens,
-                "phrases_count": len(phrases),
-                "top_candidates": top_candidates,
-            }
-
-        return payload
-
-
-class ExpenseAmountExtractor:
-    def __init__(self, suppliers: list[str]) -> None:
-        self.model = get_amount_model()
-
-    @staticmethod
-    def to_float(value: str) -> Optional[float]:
-        cleaned = value.replace(" ", "").replace("\u00A0", "")
-        match = re.search(r"\d+(?:[,]\d{1,2})?", cleaned)
-        if not match:
-            return None
-        try:
-            return float(match.group(0).replace(",", "."))
-        except ValueError:
-            return None
-
-    @staticmethod
-    def phrase_span(text: str, phrase: Optional[str]) -> Optional[tuple[int, int]]:
-        if not phrase:
-            return None
-        idx = text.lower().find(phrase.lower())
-        if idx == -1:
-            return None
-        return idx, idx + len(phrase)
-
-    @staticmethod
-    def overlaps(span1: tuple[int, int], span2: Optional[tuple[int, int]]) -> bool:
-        if span2 is None:
-            return False
-        return span1[0] < span2[1] and span2[0] < span1[1]
-
-    @staticmethod
-    def expand_amount_text(text: str, start: int, end: int) -> tuple[str, tuple[int, int]]:
-        suffix = re.match(r",\d{1,2}", text[end:])
-        if suffix:
-            new_end = end + len(suffix.group(0))
-            return text[start:new_end].strip(), (start, new_end)
-
-        prefix = re.search(r"(\d{1,3}(?:\s*\d{3})*),", text[:start])
-        if prefix:
-            new_start = prefix.start(1)
-            return text[new_start:end].strip(), (new_start, end)
-
-        return text[start:end].strip(), (start, end)
-
-    def extract(
-        self,
-        text: str,
-        matched_date_phrase: Optional[str] = None,
-        matched_supplier_phrase: Optional[str] = None,
-    ) -> dict[str, Any]:
-        if self.model is None:
-            return {"amount": None, "amount_text": None}
-
-        date_span = self.phrase_span(text, matched_date_phrase)
-        supplier_span = self.phrase_span(text, matched_supplier_phrase)
-        entities = self.model.predict_entities(text, ["money"], threshold=0.3)
-
-        for ent in sorted(entities, key=lambda item: float(item.get("score", 0.0)), reverse=True):
-            raw_span = (int(ent.get("start", 0)), int(ent.get("end", 0)))
-            amount_text, span = self.expand_amount_text(text, raw_span[0], raw_span[1])
-            amount = self.to_float(amount_text)
-            overlaps_date = self.overlaps(span, date_span)
-            overlaps_supplier = self.overlaps(span, supplier_span)
-
-            if amount is None:
-                continue
-            if overlaps_date or overlaps_supplier:
-                continue
-            return {"amount": amount, "amount_text": amount_text}
-
-        return {"amount": None, "amount_text": None}
-
-
-class ExpenseTextExtractor:
+
     def __init__(self, suppliers: list[str], users: list[str]) -> None:
         self.date_extractor = ExpenseDateExtractor()
         self.supplier_extractor = ExpenseSupplierExtractor(suppliers=suppliers)
         self.amount_extractor = ExpenseAmountExtractor(suppliers=suppliers)
-        self.user_extractor = ExpenseUserExtractor(
+        self.user_extractor = ExpenseUserExtractor(
+            users=users,
+            suppliers=suppliers,
+            model=get_embedding_model()
+        )
 
-    def extract(
+    def extract(
+        self,
+        text: str,
+        reference_date: str | date | None = None,
+        debug_supplier: bool = False
+    ) -> dict[str, Any]:
+        """
+        Extract all the data from the text.
+
+        Args:
+            text: Text to analyse
+            reference_date: Reference date
+            debug_supplier: Enable supplier debugging
+
+        Returns:
+            Dictionary with all the extracted data
+        """
         date_info = self.date_extractor.extract(text, reference_date=reference_date)
         supplier_info = self.supplier_extractor.extract(
             text,
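One detail worth keeping in mind from the removed UniversalDateParser: shift_months clamps the day so that month offsets from a long month stay valid. A standalone check of that arithmetic:

    import calendar
    from datetime import date

    def shift_months(value: date, months: int) -> date:
        # Same logic as the removed method above.
        month_index = value.month - 1 + months
        year = value.year + month_index // 12
        month = month_index % 12 + 1
        day = min(value.day, calendar.monthrange(year, month)[1])
        return date(year, month, day)

    print(shift_months(date(2025, 1, 31), 1))   # 2025-02-28, day clamped
    print(shift_months(date(2025, 1, 31), -2))  # 2024-11-30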
@@ -1039,10 +127,12 @@ class ExpenseTextExtractor:
 
 
 def build_default_pipeline(suppliers: list[str], users: list[str]) -> ExpenseTextExtractor:
+    """Create the data-extraction pipeline."""
     return ExpenseTextExtractor(suppliers=suppliers, users=users)
 
 
 def extract_names(items: Any) -> list[str]:
+    """Extract names from a list of objects or strings."""
     if not isinstance(items, list):
         return []
 

@@ -1061,6 +151,8 @@ def extract_names(items: Any) -> list[str]:
 
 
 def polish_notes_text(text: str) -> str:
+    """Format the note text."""
+    import re
     normalized = re.sub(r"\s+", " ", text).strip()
     if not normalized:
         return ""

@@ -1073,6 +165,7 @@ def polish_notes_text(text: str) -> str:
 
 
 def transcribe_audio_text(audio_path: str) -> str:
+    """Transcribe audio to text."""
     mock_text = os.getenv("EXPENSE_VOICE_MOCK_TEXT")
     if mock_text:
         return mock_text.strip()

@@ -1090,6 +183,7 @@ def transcribe_audio_text(audio_path: str) -> str:
 
 
 def process_voice_request(audio_path: str, mode: str, payload: dict[str, Any]) -> dict[str, Any]:
+    """Process a voice request."""
     context = payload.get("context", {}) if isinstance(payload, dict) else {}
     supplier_names = extract_names(context.get("suppliers"))
     user_names = extract_names(context.get("users"))

@@ -1129,6 +223,7 @@ def process_voice_request(audio_path: str, mode: str, payload: dict[str, Any]) -> dict[str, Any]:
 
 
 def require_auth():
+    """Check request authorization."""
     expected_token = os.getenv("PYTHON_API_TOKEN", os.getenv("EXPENSE_VOICE_FASTAPI_TOKEN", "")).strip()
 
     if not expected_token:

@@ -1146,6 +241,7 @@ def require_auth():
 
 
 def parse_context(raw: str | None) -> dict[str, Any]:
+    """Parse the JSON context."""
     if not raw:
         return {}
 

@@ -1156,12 +252,16 @@ def parse_context(raw: str | None) -> dict[str, Any]:
     return {}
 
 
+# ============================================================================
+# ENDPOINTS
+# ============================================================================
+
 @app.get("/")
 def index():
+    """API index page."""
     return jsonify({
         "status": "ok",
         "message": "Voice processing API is running",
-        "date_parser": DATE_PARSER_MODE,
         "endpoints": {
             "POST /process-audio": "Process audio file",
             "GET /health": "Health check",

@@ -1172,12 +272,13 @@ def index():
 
 @app.get("/health")
 def health():
+    """Service health check."""
     return jsonify({"status": "ok"})
 
 
 @app.get("/date-test")
 def date_test():
-    """Date-parser test
     test_phrases = [
         "завтра",
         "через 2 дня",

@@ -1191,7 +292,7 @@ def date_test():
         "в конце месяца"
     ]
 
-    extractor =
     results = []
     for phrase in test_phrases:
         result = extractor.extract(phrase)

@@ -1203,7 +304,6 @@ def date_test():
 
     return jsonify({
         "status": "ok",
-        "parser": DATE_PARSER_MODE,
        "reference_date": date.today().isoformat(),
         "results": results
     })

@@ -1211,6 +311,7 @@ def date_test():
 
 @app.post("/process-audio")
 def process_audio():
     auth_error = require_auth()
     if auth_error:
         return auth_error

@@ -1240,4 +341,4 @@ def process_audio():
 
 
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=int(os.getenv("PORT", "7860")))
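After this refactor, app.py is a thin Flask shell around the extractors package. A hedged usage sketch of the resulting pipeline (the supplier and user lists below are invented; the function and field names come from this diff):

    from app import build_default_pipeline

    pipeline = build_default_pipeline(
        suppliers=["Linella 24", "Kaufland"],  # hypothetical data
        users=["Мария", "Иван"],
    )
    result = pipeline.extract("вчера купил кофе в кауфланд за 45 лей")
    # result merges the per-extractor payloads seen in the removed code:
    # date/date_iso/matched_date_phrase, supplier/supplier_score,
    # user/user_score, amount/amount_text.
    print(result)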
| 279 |
@app.get("/date-test")
|
| 280 |
def date_test():
|
| 281 |
+
"""Тестирование парсера дат."""
|
| 282 |
test_phrases = [
|
| 283 |
"завтра",
|
| 284 |
"через 2 дня",
|
|
|
|
| 292 |
"в конце месяца"
|
| 293 |
]
|
| 294 |
|
| 295 |
+
extractor = ExpenseDateExtractor()
|
| 296 |
results = []
|
| 297 |
for phrase in test_phrases:
|
| 298 |
result = extractor.extract(phrase)
|
|
|
|
| 304 |
|
| 305 |
return jsonify({
|
| 306 |
"status": "ok",
|
|
|
|
| 307 |
"reference_date": date.today().isoformat(),
|
| 308 |
"results": results
|
| 309 |
})
|
|
|
|
| 311 |
|
| 312 |
@app.post("/process-audio")
|
| 313 |
def process_audio():
|
| 314 |
+
"""Обработка аудио файла."""
|
| 315 |
auth_error = require_auth()
|
| 316 |
if auth_error:
|
| 317 |
return auth_error
|
|
|
|
| 341 |
|
| 342 |
|
| 343 |
if __name__ == "__main__":
|
| 344 |
+
app.run(host="0.0.0.0", port=int(os.getenv("PORT", "7860")))
|
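A minimal usage sketch of the pipeline wired above, assuming the module is importable as `app`; the supplier/user lists and the sample phrase are hypothetical stand-ins for what normally arrives in the request context:

    # Hypothetical data; constructing the pipeline loads the GLiNER and embedding models.
    from app import build_default_pipeline

    pipeline = build_default_pipeline(
        suppliers=["Kaufland", "Linella"],  # hypothetical supplier list
        users=["Ion", "Maria"],             # hypothetical user list
    )
    result = pipeline.extract("вчера купил в Kaufland за 250 лей", reference_date="2025-01-15")
    print(result)  # merged output of the date, supplier, amount and user extractors
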
extractors/__init__.py
ADDED
@@ -0,0 +1,21 @@
"""
Text data extractor modules.

- DateExtractor: date extraction
- SupplierExtractor: supplier extraction
- UserExtractor: user extraction
- AmountExtractor: amount extraction
"""

from extractors.date_extractor import ExpenseDateExtractor, ParsedDate
from extractors.supplier_extractor import ExpenseSupplierExtractor
from extractors.user_extractor import ExpenseUserExtractor
from extractors.amount_extractor import ExpenseAmountExtractor

__all__ = [
    "ExpenseDateExtractor",
    "ExpenseSupplierExtractor",
    "ExpenseUserExtractor",
    "ExpenseAmountExtractor",
    "ParsedDate",
]
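With this facade, call sites need only a single import; a minimal sketch:

    from extractors import ExpenseDateExtractor

    extractor = ExpenseDateExtractor()
    print(extractor.extract("завтра"))  # resolved against today's date by default
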
extractors/amount_extractor.py
ADDED
@@ -0,0 +1,124 @@
"""
Extractor of amounts from text.

Uses GLiNER to extract monetary amounts.
"""

from __future__ import annotations

import re
from typing import Any, Optional

from gliner import GLiNER


# Global model for amount extraction
_AMOUNT_MODEL: Optional[GLiNER] = None


def get_amount_model() -> Optional[GLiNER]:
    """Returns the amount extraction model (lazy loading)."""
    global _AMOUNT_MODEL

    if _AMOUNT_MODEL is None:
        _AMOUNT_MODEL = GLiNER.from_pretrained("urchade/gliner_multi-v2.1")

    return _AMOUNT_MODEL


class ExpenseAmountExtractor:
    """
    Extractor of monetary amounts from text.

    Uses GLiNER to find mentions of money.
    """

    def __init__(self, suppliers: list[str] | None = None) -> None:
        """
        Args:
            suppliers: Supplier list (unused, kept for compatibility)
        """
        self.model = get_amount_model()

    @staticmethod
    def to_float(value: str) -> Optional[float]:
        """Converts a string to a number."""
        cleaned = value.replace(" ", "").replace("\u00A0", "")
        match = re.search(r"\d+(?:[,]\d{1,2})?", cleaned)
        if not match:
            return None
        try:
            return float(match.group(0).replace(",", "."))
        except ValueError:
            return None

    @staticmethod
    def phrase_span(text: str, phrase: Optional[str]) -> Optional[tuple[int, int]]:
        """Returns the position of a phrase in the text."""
        if not phrase:
            return None
        idx = text.lower().find(phrase.lower())
        if idx == -1:
            return None
        return idx, idx + len(phrase)

    @staticmethod
    def overlaps(span1: tuple[int, int], span2: Optional[tuple[int, int]]) -> bool:
        """Checks whether two spans overlap."""
        if span2 is None:
            return False
        return span1[0] < span2[1] and span2[0] < span1[1]

    @staticmethod
    def expand_amount_text(text: str, start: int, end: int) -> tuple[str, tuple[int, int]]:
        """Expands the amount text (for decimal numbers)."""
        suffix = re.match(r",\d{1,2}", text[end:])
        if suffix:
            new_end = end + len(suffix.group(0))
            return text[start:new_end].strip(), (start, new_end)

        prefix = re.search(r"(\d{1,3}(?:\s*\d{3})*),", text[:start])
        if prefix:
            new_start = prefix.start(1)
            return text[new_start:end].strip(), (new_start, end)

        return text[start:end].strip(), (start, end)

    def extract(
        self,
        text: str,
        matched_date_phrase: Optional[str] = None,
        matched_supplier_phrase: Optional[str] = None,
    ) -> dict[str, Any]:
        """
        Extracts an amount from the text.

        Args:
            text: Text to analyze
            matched_date_phrase: Date phrase to exclude
            matched_supplier_phrase: Supplier phrase to exclude

        Returns:
            Dictionary with amount and amount_text
        """
        if self.model is None:
            return {"amount": None, "amount_text": None}

        date_span = self.phrase_span(text, matched_date_phrase)
        supplier_span = self.phrase_span(text, matched_supplier_phrase)
        entities = self.model.predict_entities(text, ["money"], threshold=0.3)

        for ent in sorted(entities, key=lambda item: float(item.get("score", 0.0)), reverse=True):
            raw_span = (int(ent.get("start", 0)), int(ent.get("end", 0)))
            amount_text, span = self.expand_amount_text(text, raw_span[0], raw_span[1])
            amount = self.to_float(amount_text)
            overlaps_date = self.overlaps(span, date_span)
            overlaps_supplier = self.overlaps(span, supplier_span)

            if amount is None:
                continue
            if overlaps_date or overlaps_supplier:
                continue
            return {"amount": amount, "amount_text": amount_text}

        return {"amount": None, "amount_text": None}
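The exclusion logic above keeps the amount from colliding with spans already claimed by the date or supplier extractor. A sketch, assuming the GLiNER model tags "250 лей" as money (actual output is model-dependent):

    extractor = ExpenseAmountExtractor()
    result = extractor.extract(
        "заплатил 250 лей 15 января",
        matched_date_phrase="15 января",  # "15" falls inside this span, so it is skipped
    )
    print(result)  # expected shape: {"amount": 250.0, "amount_text": "250 лей"}
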
extractors/date_extractor.py
ADDED
@@ -0,0 +1,518 @@
"""
Extractor of dates from Russian text.

Classes:
- UniversalDateParser: date parser supporting relative and absolute dates
- ExpenseDateExtractor: wrapper for extracting dates from text
- ParsedDate: parsing result
- Token: text token
"""

from __future__ import annotations

import calendar
import difflib
import re
from dataclasses import dataclass
from datetime import date, datetime, timedelta
from typing import Any, Optional

from dateparser.search import search_dates
from pymorphy3 import MorphAnalyzer


MORPH = MorphAnalyzer()
WORD_RE = re.compile(r"[0-9]+(?:[./-][0-9]+)*|[а-яё]+", re.IGNORECASE)


@dataclass(frozen=True)
class ParsedDate:
    """Date parsing result."""
    date_iso: str
    matched_expression: Optional[str]


@dataclass(frozen=True)
class Token:
    """Text token with morphological information."""
    original: str
    normalized: str
    raw_lemma: str
    lemma: str
    lemma_correction: Optional[str]
    start: int
    end: int
    lemma_start: int
    lemma_end: int


class UniversalDateParser:
    """
    Universal date parser for Russian.

    Supports:
    - Direct relative dates: вчера, завтра, позавчера, послезавтра
    - Weeks: на следующей неделе, на прошлой неделе
    - Periods: через 2 дня, 3 недели назад, через месяц
    - Textual dates: 5 марта, 15 января 2025
    - Numeric dates: 15.01.2025, 2025-01-15
    - Period edges: в конце месяца, в начале недели
    """

    MONTHS = {
        "январь": 1, "февраль": 2, "март": 3, "апрель": 4, "май": 5, "июнь": 6,
        "июль": 7, "август": 8, "сентябрь": 9, "октябрь": 10, "ноябрь": 11, "декабрь": 12,
    }

    WEEKDAYS = {
        "понедельник": 0, "вторник": 1, "среда": 2, "четверг": 3,
        "пятница": 4, "суббота": 5, "воскресенье": 6,
    }

    DIRECT_RELATIVE = {
        "послезавтра": 2, "позавчера": -2, "сегодня": 0, "вчера": -1, "завтра": 1
    }

    ORDINAL_DAYS = {
        "первый": 1, "второй": 2, "третий": 3, "четвертый": 4, "пятый": 5, "шестой": 6,
        "седьмой": 7, "восьмой": 8, "девятый": 9, "десятый": 10, "одиннадцатый": 11,
        "двенадцатый": 12, "тринадцатый": 13, "четырнадцатый": 14, "пятнадцатый": 15,
        "шестнадцатый": 16, "семнадцатый": 17, "восемнадцатый": 18, "девятнадцатый": 19,
        "двадцатый": 20, "двадцать первый": 21, "двадцать второй": 22, "двадцать третий": 23,
        "двадцать четвертый": 24, "двадцать пятый": 25, "двадцать шестой": 26,
        "двадцать седьмой": 27, "двадцать восьмой": 28, "двадцать девятый": 29,
        "тридцатый": 30, "тридцать первый": 31,
    }

    NUMBER_WORDS = {
        "ноль": 0, "один": 1, "два": 2, "три": 3, "четыре": 4, "пять": 5, "шесть": 6,
        "семь": 7, "восемь": 8, "девять": 9, "десять": 10, "одиннадцать": 11,
        "двенадцать": 12, "тринадцать": 13, "четырнадцать": 14, "пятнадцать": 15,
        "шестнадцать": 16, "семнадцать": 17, "восемнадцать": 18, "девятнадцать": 19,
        "двадцать": 20, "тридцать": 30,
    }

    FUTURE_HINTS = ("завтра", "послезавтра", "через", "быть", "заплатить", "следующий", "последующий")
    PAST_HINTS = ("вчера", "позавчера", "назад", "прошлый", "предыдущий", "оплатить", "купить", "заказать")

    # Regular expressions for parsing
    DIRECT_RELATIVE_RE = re.compile(r"(?<!\S)(послезавтра|позавчера|сегодня|вчера|завтра)(?!\S)")

    WEEK_RELATIVE_RE = re.compile(
        r"(?<!\S)на (?P<which>следующий|последующий|прошлый|предыдущий|этот) неделя"
        r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)"
    )

    QUANTITY_RELATIVE_RE = re.compile(
        r"(?<!\S)(?P<number>\d+|[а-яё]+(?: [а-яё]+)?) "
        r"(?P<unit>месяц|неделя|день) "
        r"(?P<ago>назад)"
        r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
        re.IGNORECASE,
    )

    FORWARD_QUANTITY_RE = re.compile(
        r"(?<!\S)(?P<through>через) "
        r"(?P<number>\d+|[а-яё]+(?: [а-яё]+)?) "
        r"(?P<unit>месяц|неделя|день)"
        r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
        re.IGNORECASE,
    )

    FORWARD_SINGLE_UNIT_RE = re.compile(
        r"(?<!\S)(?P<through>через) "
        r"(?P<unit>месяц|неделя|день)"
        r"(?: (?P<prep>в|во|на) (?P<weekday>понедельник|вторник|среда|четверг|пятница|суббота|воскресенье))?(?!\S)",
        re.IGNORECASE,
    )

    TEXTUAL_ABSOLUTE_RE = re.compile(
        r"(?<!\S)(?P<day>\d{1,2}|[а-яё]+(?: [а-яё]+)?) "
        r"(?P<month>январь|февраль|март|апрель|май|июнь|июль|август|сентябрь|октябрь|ноябрь|декабрь)"
        r"(?: (?P<year>\d{4}))?(?!\S)",
        re.IGNORECASE,
    )

    PERIOD_EDGE_RE = re.compile(
        r"(?<!\S)(?:в )?(?P<edge>начало|конец) (?P<which>этот|следующий|последующий|прошлый|предыдущий) (?P<unit>неделя|месяц)(?!\S)",
        re.IGNORECASE,
    )

    @classmethod
    def temporal_vocabulary(cls) -> set[str]:
        """Returns the vocabulary of temporal terms."""
        vocab: set[str] = set()
        vocab.update(cls.MONTHS)
        vocab.update(cls.WEEKDAYS)
        vocab.update(cls.DIRECT_RELATIVE)
        vocab.update(cls.ORDINAL_DAYS)
        vocab.update(cls.NUMBER_WORDS)
        vocab.update({
            "неделя", "месяц", "день", "назад", "через", "начало", "конец", "на", "в", "во",
            "этот", "прошлый", "предыдущий", "следующий", "последующий",
        })
        return vocab

    @staticmethod
    def similarity(left: str, right: str) -> float:
        """Computes the similarity of two strings."""
        return difflib.SequenceMatcher(None, left, right).ratio()

    @classmethod
    def pick_temporal_correction(cls, normalized: str, raw_lemma: str) -> tuple[str, Optional[str]]:
        """Picks a correction for a temporal term."""
        vocab = cls.temporal_vocabulary()
        if raw_lemma in vocab or not normalized.isalpha() or len(normalized) < 5:
            return raw_lemma, None

        candidates = list(difflib.get_close_matches(normalized, list(vocab), n=4, cutoff=0.74))
        candidates.extend(difflib.get_close_matches(raw_lemma, list(vocab), n=4, cutoff=0.74))
        candidates = list(dict.fromkeys(candidates))
        if not candidates:
            return raw_lemma, None

        best = max(candidates, key=lambda item: max(cls.similarity(normalized, item), cls.similarity(raw_lemma, item)))
        best_score = max(cls.similarity(normalized, best), cls.similarity(raw_lemma, best))
        return (best, f"{raw_lemma}->{best}") if best_score >= 0.80 else (raw_lemma, None)

    @staticmethod
    def normalize_word(word: str) -> str:
        """Normalizes a word."""
        return word.lower().replace("ё", "е")

    @classmethod
    def lemmatize(cls, word: str) -> str:
        """Returns the lemma of a word."""
        return MORPH.parse(word)[0].normal_form if word.isalpha() else word

    @classmethod
    def tokenize(cls, text: str) -> list[Token]:
        """Tokenizes the text."""
        tokens: list[Token] = []
        lemma_cursor = 0

        for match in WORD_RE.finditer(text):
            original = match.group(0)
            normalized = cls.normalize_word(original)
            raw_lemma = cls.lemmatize(normalized)
            lemma, correction = cls.pick_temporal_correction(normalized, raw_lemma)
            lemma_start = lemma_cursor
            lemma_end = lemma_start + len(lemma)
            tokens.append(Token(original, normalized, raw_lemma, lemma, correction, match.start(), match.end(), lemma_start, lemma_end))
            lemma_cursor = lemma_end + 1

        return tokens

    @staticmethod
    def lemma_text(tokens: list[Token]) -> str:
        """Returns the text built from token lemmas."""
        return " ".join(token.lemma for token in tokens)

    @staticmethod
    def surface_text(text: str, tokens: list[Token], start_idx: int, end_idx: int) -> str:
        """Returns the source text for a token index range."""
        return text[tokens[start_idx].start:tokens[end_idx].end].strip() if tokens else ""

    @staticmethod
    def lemma_span_to_token_range(tokens: list[Token], span: tuple[int, int]) -> Optional[tuple[int, int]]:
        """Maps positions in the lemma text to token indices."""
        start_char, end_char = span
        start_idx = end_idx = None

        for idx, token in enumerate(tokens):
            if start_idx is None and token.lemma_start <= start_char < token.lemma_end:
                start_idx = idx
            if token.lemma_start < end_char <= token.lemma_end:
                end_idx = idx
                break

        return (start_idx, end_idx) if start_idx is not None and end_idx is not None else None

    @classmethod
    def make_parsed_date(cls, text: str, tokens: list[Token], match, parsed_date: date) -> Optional[ParsedDate]:
        """Builds a ParsedDate from a regex match."""
        token_span = cls.lemma_span_to_token_range(tokens, match.span())
        if token_span is None:
            return None
        return ParsedDate(parsed_date.isoformat(), cls.surface_text(text, tokens, token_span[0], token_span[1]))

    @classmethod
    def parse_number_phrase(cls, phrase: str) -> Optional[int]:
        """Parses a number phrase (digits or words)."""
        phrase = phrase.strip()
        if not phrase:
            return None
        if phrase.isdigit():
            return int(phrase)

        parts = phrase.split()
        if len(parts) == 1:
            return cls.NUMBER_WORDS.get(parts[0])
        if len(parts) == 2 and parts[0] in {"двадцать", "тридцать"}:
            base = cls.NUMBER_WORDS.get(parts[0])
            addon = cls.NUMBER_WORDS.get(parts[1])
            if base is not None and addon is not None and 1 <= addon <= 9:
                return base + addon
        return None

    @classmethod
    def parse_day_phrase(cls, phrase: str) -> Optional[int]:
        """Parses a day (number or ordinal word)."""
        if phrase.isdigit():
            value = int(phrase)
            return value if 1 <= value <= 31 else None
        return cls.ORDINAL_DAYS.get(phrase.strip())

    @staticmethod
    def shift_months(value: date, months: int) -> date:
        """Shifts a date by the given number of months."""
        month_index = value.month - 1 + months
        year = value.year + month_index // 12
        month = month_index % 12 + 1
        day = min(value.day, calendar.monthrange(year, month)[1])
        return date(year, month, day)

    @staticmethod
    def parse_numeric_absolute(tokens: list[Token]) -> Optional[ParsedDate]:
        """Parses numeric dates: 15.01.2025, 2025-01-15."""
        for token in tokens:
            separator = "." if "." in token.original else "-" if "-" in token.original else "/" if "/" in token.original else None
            if separator is None:
                continue

            parts = token.original.split(separator)
            if len(parts) != 3 or not all(part.isdigit() for part in parts):
                continue

            try:
                if len(parts[0]) == 4:
                    parsed = date(int(parts[0]), int(parts[1]), int(parts[2]))
                elif len(parts[2]) == 4:
                    parsed = date(int(parts[2]), int(parts[1]), int(parts[0]))
                else:
                    continue
                return ParsedDate(parsed.isoformat(), token.original)
            except ValueError:
                continue

        return None

    @classmethod
    def parse_textual_absolute(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
        """Parses textual dates: 5 марта, 15 января 2025."""
        lemma_text = cls.lemma_text(tokens)
        for match in cls.TEXTUAL_ABSOLUTE_RE.finditer(lemma_text):
            day = cls.parse_day_phrase(match.group("day"))
            month = cls.MONTHS.get(match.group("month"))
            if day is None or month is None:
                continue

            year = int(match.group("year")) if match.group("year") else reference_date.year
            try:
                parsed = date(year, month, day)
            except ValueError:
                continue

            result = cls.make_parsed_date(text, tokens, match, parsed)
            if result is not None:
                return result

        return None

    @classmethod
    def parse_direct_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
        """Parses direct relative dates: вчера, завтра, позавчера, послезавтра."""
        lemma_text = cls.lemma_text(tokens)
        match = cls.DIRECT_RELATIVE_RE.search(lemma_text)
        if not match:
            return None

        parsed = reference_date + timedelta(days=cls.DIRECT_RELATIVE[match.group(1)])
        return cls.make_parsed_date(text, tokens, match, parsed)

    @staticmethod
    def week_monday(value: date) -> date:
        """Returns the Monday of the week containing the given date."""
        return value - timedelta(days=value.weekday())

    @classmethod
    def parse_week_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
        """Parses week-relative dates: на следующей неделе, на прошлой неделе."""
        lemma_text = cls.lemma_text(tokens)
        match = cls.WEEK_RELATIVE_RE.search(lemma_text)
        if not match:
            return None

        offsets = {"следующий": 7, "последующий": 7, "прошлый": -7, "предыдущий": -7, "этот": 0}
        anchor = reference_date + timedelta(days=offsets[match.group("which")])

        if match.group("weekday"):
            anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])

        return cls.make_parsed_date(text, tokens, match, anchor)

    @classmethod
    def parse_period_edge(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
        """Parses period edges: в конце месяца, в начале недели."""
        lemma_text = cls.lemma_text(tokens)
        match = cls.PERIOD_EDGE_RE.search(lemma_text)
        if not match:
            return None

        edge, which, unit = match.group("edge"), match.group("which"), match.group("unit")

        if unit == "неделя":
            offsets = {"прошлый": -7, "предыдущий": -7, "этот": 0, "следующий": 7, "последующий": 7}
            monday = cls.week_monday(reference_date + timedelta(days=offsets[which]))
            parsed_date = monday if edge == "начало" else monday + timedelta(days=6)
        else:
            month_offset = {"прошлый": -1, "предыдущий": -1, "этот": 0, "следующий": 1, "последующий": 1}[which]
            shifted = cls.shift_months(date(reference_date.year, reference_date.month, 1), month_offset)
            parsed_date = shifted if edge == "начало" else date(shifted.year, shifted.month, calendar.monthrange(shifted.year, shifted.month)[1])

        return cls.make_parsed_date(text, tokens, match, parsed_date)

    @classmethod
    def parse_quantity_relative(cls, text: str, tokens: list[Token], reference_date: date) -> Optional[ParsedDate]:
        """Parses quantity-relative dates: через 2 дня, 3 недели назад."""
        lemma_text = cls.lemma_text(tokens)

        for regex, direction in ((cls.QUANTITY_RELATIVE_RE, -1), (cls.FORWARD_QUANTITY_RE, 1)):
            for match in regex.finditer(lemma_text):
                number = cls.parse_number_phrase(match.group("number"))
                if number is None:
                    continue

                unit = match.group("unit")
                if unit == "месяц":
                    anchor = cls.shift_months(reference_date, direction * number)
                else:
                    days = number * 7 if unit == "неделя" else number
                    anchor = reference_date + timedelta(days=direction * days)

                if match.group("weekday"):
                    anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])

                result = cls.make_parsed_date(text, tokens, match, anchor)
                if result is not None:
                    return result

        for match in cls.FORWARD_SINGLE_UNIT_RE.finditer(lemma_text):
            unit = match.group("unit")
            if unit == "месяц":
                anchor = cls.shift_months(reference_date, 1)
            else:
                days = 7 if unit == "неделя" else 1
                anchor = reference_date + timedelta(days=days)

            if match.group("weekday"):
                anchor = cls.week_monday(anchor) + timedelta(days=cls.WEEKDAYS[match.group("weekday")])

            result = cls.make_parsed_date(text, tokens, match, anchor)
            if result is not None:
                return result

        return None

    @classmethod
    def preference_for_text(cls, tokens: list[Token]) -> str:
        """Determines the preference: past or future."""
        lemmas = [token.lemma for token in tokens]
        future = sum(1 for hint in cls.FUTURE_HINTS if hint in lemmas)
        past = sum(1 for hint in cls.PAST_HINTS if hint in lemmas)
        return "future" if future > past else "past"

    @staticmethod
    def choose_best(matches: list[tuple[str, datetime]]) -> tuple[str, datetime]:
        """Chooses the best result from the list."""
        return sorted(matches, key=lambda item: (len(item[0]), -item[1].timestamp()), reverse=True)[0]

    def parse(self, text: str, reference_date: date) -> Optional[ParsedDate]:
        """
        Main date parsing method.

        Args:
            text: Text to parse
            reference_date: Base date for relative calculations

        Returns:
            ParsedDate with the result, or None
        """
        tokens = self.tokenize(text)

        # Try each parser in turn
        for parser in (
            lambda: self.parse_numeric_absolute(tokens),
            lambda: self.parse_textual_absolute(text, tokens, reference_date),
            lambda: self.parse_direct_relative(text, tokens, reference_date),
            lambda: self.parse_week_relative(text, tokens, reference_date),
            lambda: self.parse_period_edge(text, tokens, reference_date),
            lambda: self.parse_quantity_relative(text, tokens, reference_date),
        ):
            parsed = parser()
            if parsed is not None:
                return parsed

        # Fallback: dateparser
        normalized = " ".join(token.normalized for token in tokens)
        relative_base = datetime.combine(reference_date, datetime.min.time()).replace(hour=12)
        result = search_dates(
            normalized,
            languages=["ru"],
            settings={
                "RELATIVE_BASE": relative_base,
                "PREFER_DATES_FROM": self.preference_for_text(tokens),
                "STRICT_PARSING": False,
                "REQUIRE_PARTS": [],
                "NORMALIZE": True,
                "RETURN_AS_TIMEZONE_AWARE": False,
                "DATE_ORDER": "DMY",
            },
        )

        filtered: list[tuple[str, datetime]] = []
        for matched, value in result or []:
            if isinstance(value, datetime) and not matched.strip().isdigit() and 2020 <= value.year <= 2100:
                filtered.append((matched.strip(), value))

        if not filtered:
            return None

        matched_expression, value = self.choose_best(filtered)
        return ParsedDate(date_iso=value.date().isoformat(), matched_expression=matched_expression)


class ExpenseDateExtractor:
    """
    Date extractor for expense text.

    A wrapper over UniversalDateParser with a convenient interface.
    """

    def __init__(self) -> None:
        self.parser = UniversalDateParser()

    def extract(self, text: str, reference_date: str | date | None = None) -> dict[str, Any]:
        """
        Extracts a date from the text.

        Args:
            text: Text to analyze
            reference_date: Base date (defaults to today)

        Returns:
            Dictionary with date, date_iso, matched_date_phrase
        """
        ref_date = self.to_date(reference_date or date.today().isoformat())
        parsed = self.parser.parse(text=text, reference_date=ref_date)

        return {
            "date": datetime.strptime(parsed.date_iso, "%Y-%m-%d").strftime("%d.%m.%Y") if parsed else None,
            "date_iso": parsed.date_iso if parsed else None,
            "matched_date_phrase": parsed.matched_expression if parsed else None,
        }

    @staticmethod
    def to_date(value: str | date) -> date:
        """Converts a string or date to a date."""
        return value if isinstance(value, date) else datetime.strptime(value, "%Y-%m-%d").date()
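A minimal sketch of the extractor in isolation; the first result follows from FORWARD_QUANTITY_RE plus the stated reference date, the second from parse_numeric_absolute, which runs before any lemma-based rule:

    extractor = ExpenseDateExtractor()

    print(extractor.extract("через 2 дня", reference_date="2025-01-15"))
    # -> {"date": "17.01.2025", "date_iso": "2025-01-17", "matched_date_phrase": "через 2 дня"}

    print(extractor.extract("оплатил 15.01.2025"))
    # -> {"date": "15.01.2025", "date_iso": "2025-01-15", "matched_date_phrase": "15.01.2025"}
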
extractors/supplier_extractor.py
ADDED
@@ -0,0 +1,402 @@
"""
Extractor of suppliers from text.

Uses a combination of methods:
- TF-IDF over character n-grams
- Phonetic comparison
- Token alignment
- Levenshtein distance
"""

from __future__ import annotations

import re
import unicodedata
from typing import Any

import iuliia
from rapidfuzz import fuzz
from rapidfuzz.distance import Levenshtein
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from extractors.date_extractor import UniversalDateParser


def normalize_text(text: str) -> str:
    """Normalizes text: lowercase, removal of diacritics and punctuation."""
    text = unicodedata.normalize("NFKD", text.lower())
    text = "".join(ch for ch in text if not unicodedata.combining(ch))
    return re.sub(r"[^\w\s]", "", text).strip()


def variants(text: str) -> list[str]:
    """Generates text variants (transliteration)."""
    base = normalize_text(text)
    result = [base]

    for schema in (iuliia.WIKIPEDIA, iuliia.MOSMETRO, iuliia.ALA_LC):
        try:
            v = normalize_text(schema.translate(base))
            if v and v not in result:
                result.append(v)
        except Exception:
            pass

    for v in list(result):
        core = " ".join(w for w in v.split() if len(w) > 1 and any(ch.isalpha() for ch in w))
        core = normalize_text(core)
        if core and core not in result:
            result.insert(0, core)

    return result


def token_alignment_score(phrase_variant: str, candidate_tokens: list[str]) -> float:
    """Computes a token alignment score."""
    phrase_tokens = [t for t in phrase_variant.split() if len(t) > 2]
    if not phrase_tokens or not candidate_tokens:
        return 0.0
    best_scores = []
    for pt in phrase_tokens:
        best = 0.0
        for ct in candidate_tokens:
            sim = Levenshtein.normalized_similarity(pt, ct)
            if sim > best:
                best = sim
        best_scores.append(best)
    return sum(best_scores) / len(best_scores)


def length_penalty(phrase_len: int, candidate_len: int) -> float:
    """Penalty for length difference."""
    if phrase_len == 0 or candidate_len == 0:
        return 0.0
    ratio = min(phrase_len, candidate_len) / max(phrase_len, candidate_len)
    if ratio >= 0.80:
        return 1.0
    if ratio >= 0.60:
        return 0.90
    if ratio >= 0.40:
        return 0.70
    return 0.50


def canonicalize_for_similarity(text: str) -> str:
    """Canonicalizes text for phonetic comparison."""
    t = normalize_text(text).replace(" ", "")
    replacements = (
        ("sch", "sh"),
        ("tch", "ch"),
        ("dzh", "j"),
        ("zh", "j"),
        ("sh", "s"),
        ("ch", "c"),
        ("kh", "h"),
        ("ph", "f"),
        ("ck", "k"),
        ("qu", "k"),
        ("q", "k"),
        ("w", "v"),
        ("x", "ks"),
        ("ts", "z"),
        ("tz", "z"),
    )
    for src, dst in replacements:
        t = t.replace(src, dst)
    return re.sub(r"(.)\1+", r"\1", t)


def phonetic_similarity(left: str, right: str) -> float:
    """Computes phonetic similarity."""
    l = canonicalize_for_similarity(left)
    r = canonicalize_for_similarity(right)
    if not l or not r:
        return 0.0
    char = fuzz.ratio(l, r) / 100.0
    lev = Levenshtein.normalized_similarity(l, r)
    return 0.50 * char + 0.50 * lev


class ExpenseSupplierExtractor:
    """
    Extractor of suppliers from text.

    Finds the most similar supplier from the list of known ones.
    """

    def __init__(self, suppliers: list[str]) -> None:
        self.suppliers = suppliers
        self.sup_norm = [normalize_text(s) for s in suppliers]
        self.sup_tokens = [s.split() for s in self.sup_norm]
        self.sup_num_sets = [self.numeric_tokens(s) for s in self.sup_norm]
        self.sup_number_tokens = {token for supplier in self.sup_tokens for token in supplier if token.isdigit()}
        self.supplier_lexicon = [
            token
            for token in sorted({tok for tokens in self.sup_tokens for tok in tokens})
            if token and not token.isdigit()
        ]
        self.tfidf = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5))
        self.sup_mat = self.tfidf.fit_transform(self.sup_norm)
        self.max_words = max(len(s.split()) for s in self.sup_norm)
        self.variant_cache: dict[str, list[str]] = {}
        self.lexical_token_cache: dict[str, float] = {}
        self.phrase_support_cache: dict[str, float] = {}
        self.noise_terms = {
            "за", "на", "из", "для", "под", "над", "при", "без", "и", "или",
            "купил", "купила", "купили", "покупка", "заказал", "заказала", "заказали",
            "оплатил", "оплатила", "оплатили", "заплатил", "заплатила", "заплатили",
            "был", "была", "было", "были", "утром", "днем", "днём", "вечером", "ночью",
            "товар", "товары", "продукт", "продукты", "десерт", "еда",
            "лей", "лея", "леи", "целых", "сотых", "сом", "сомов", "руб", "рублей", "грн", "usd", "eur",
        }
        self.noise_terms.update(UniversalDateParser.temporal_vocabulary())

    @staticmethod
    def numeric_tokens(text: str) -> set[str]:
        """Extracts numeric tokens."""
        return set(re.findall(r"\d+", text))

    def cached_variants(self, text: str) -> list[str]:
        """Caches text variants."""
        key = normalize_text(text)
        cached = self.variant_cache.get(key)
        if cached is None:
            cached = variants(key)
            self.variant_cache[key] = cached
        return cached

    @staticmethod
    def split_words(text: str) -> list[str]:
        """Splits text into words."""
        return [w for w in normalize_text(text).split() if w]

    @classmethod
    def is_supplier_extension(cls, base_supplier: str, extended_supplier: str) -> bool:
        """Checks whether one supplier is an extension of another."""
        base_tokens = cls.split_words(base_supplier)
        extended_tokens = cls.split_words(extended_supplier)
        return len(base_tokens) < len(extended_tokens) and extended_tokens[:len(base_tokens)] == base_tokens

    @classmethod
    def phrase_token_count(cls, phrase: str | None) -> int:
        """Counts the tokens in a phrase."""
        return len(cls.split_words(phrase or ""))

    @classmethod
    def resolve_overlapping_suppliers(cls, ranking: list[dict[str, Any]]) -> dict[str, Any]:
        """Resolves conflicts between similar suppliers."""
        if not ranking:
            return {"supplier": None, "score": -1.0, "phrase": None}

        best = ranking[0]
        best_combined = float(best.get("combined", best.get("score", -1.0)))
        best_phrase_len = cls.phrase_token_count(best.get("phrase"))

        for alt in ranking[1:]:
            if not cls.is_supplier_extension(str(best.get("supplier") or ""), str(alt.get("supplier") or "")):
                continue

            alt_combined = float(alt.get("combined", alt.get("score", -1.0)))
            alt_phrase_len = cls.phrase_token_count(alt.get("phrase"))

            if alt_phrase_len > best_phrase_len and alt_combined >= best_combined - 0.15:
                best = alt
                best_combined = alt_combined
                best_phrase_len = alt_phrase_len

        return best

    @staticmethod
    def numeric_compatibility_multiplier(phrase_nums: set[str], candidate_nums: set[str]) -> float:
        """Compatibility multiplier for numeric tokens."""
        if not phrase_nums and not candidate_nums:
            return 1.0
        if phrase_nums == candidate_nums:
            return 1.08
        if phrase_nums and candidate_nums:
            return 1.03 if phrase_nums & candidate_nums else 0.80
        return 0.82

    def lexical_support(self, phrase: str) -> float:
        """Computes lexical support for a phrase."""
        tokens = [token for token in normalize_text(phrase).split() if token and not token.isdigit()]
        if not tokens or not self.supplier_lexicon:
            return 0.0

        support_scores: list[float] = []
        for token in tokens:
            cached = self.lexical_token_cache.get(token)
            if cached is not None:
                support_scores.append(cached)
                continue

            best = 0.0
            for token_variant in self.cached_variants(token):
                for lex in self.supplier_lexicon:
                    lev = Levenshtein.normalized_similarity(token_variant, lex)
                    phon = phonetic_similarity(token_variant, lex)
                    sim = max(lev, phon)
                    if sim > best:
                        best = sim

            self.lexical_token_cache[token] = best
            support_scores.append(best)

        return sum(support_scores) / len(support_scores)

    def score_phrase(self, phrase: str) -> dict[str, Any]:
        """Scores a phrase against the suppliers."""
        vs = self.cached_variants(phrase)
        q = self.tfidf.transform(vs)
        tf = cosine_similarity(q, self.sup_mat)

        best: dict[str, Any] = {"supplier": None, "score": -1.0, "phrase": phrase, "variant": ""}
        for i, cand in enumerate(self.sup_norm):
            local = -1.0
            local_variant = ""
            candidate_nums = self.sup_num_sets[i]
            for j, v in enumerate(vs):
                char = fuzz.ratio(v, cand) / 100.0
                tf_val = float(tf[j, i])
                penalty = length_penalty(len(v), len(cand))
                phon = phonetic_similarity(v, cand)
                phrase_nums = self.numeric_tokens(v)

                if len(v.split()) == 1 and len(cand.split()) == 1:
                    lev = Levenshtein.normalized_similarity(v, cand)
                    val = (0.45 * lev + 0.25 * char + 0.10 * tf_val + 0.20 * phon) * penalty
                else:
                    align = token_alignment_score(v, self.sup_tokens[i])
                    tok = fuzz.token_set_ratio(v, cand) / 100.0
                    val = (0.30 * char + 0.20 * tok + 0.10 * tf_val + 0.20 * align + 0.20 * phon) * penalty

                compact_v = v.replace(" ", "")
                compact_cand = cand.replace(" ", "")
                compact_char = fuzz.ratio(compact_v, compact_cand) / 100.0
                compact_lev = Levenshtein.normalized_similarity(compact_v, compact_cand)
                compact_phon = phonetic_similarity(compact_v, compact_cand)
                compact = max(compact_char, compact_lev, compact_phon)
                if compact > 0.55:
                    val = max(val, compact * penalty)

                val *= self.numeric_compatibility_multiplier(phrase_nums, candidate_nums)

                if val > local:
                    local = val
                    local_variant = v

            if local > best["score"]:
                best = {"supplier": self.suppliers[i], "score": local, "phrase": phrase, "variant": local_variant}
        return best

    def extract(self, text: str, date_phrase: str | None = None, debug: bool = False) -> dict[str, Any]:
        """
        Extracts a supplier from the text.

        Args:
            text: Text to analyze
            date_phrase: Date phrase to exclude
            debug: Enable debug information

        Returns:
            Dictionary with supplier, supplier_score, matched_supplier_phrase
        """
        threshold = 0.50
        excluded_tokens: set[str] = set()
        if date_phrase:
            excluded_tokens.update(normalize_text(date_phrase).split())
        excluded_tokens.update(self.noise_terms)

        raw_tokens = normalize_text(text).split()
        tokens: list[str] = []
        for token in raw_tokens:
            if token in excluded_tokens:
                continue

            if token.isdigit():
                if token in self.sup_number_tokens:
                    tokens.append(token)

                if tokens and len(token) <= 3 and len(tokens[-1]) >= 4 and tokens[-1].isalpha():
                    tokens.append(f"{tokens[-1]}{token}")
                continue

            if len(token) > 1:
                tokens.append(token)

        tokens = [t for t in tokens if len(t) > 1 and t not in excluded_tokens]

        phrases: list[str] = []
        seen: set[str] = set()
        for i in range(len(tokens)):
            for j in range(i + 1, min(i + 1 + self.max_words, len(tokens) + 1)):
                p = " ".join(tokens[i:j])
                if p not in seen:
                    seen.add(p)
                    phrases.append(p)

        results = [self.score_phrase(p) for p in phrases]
        candidate_rows: list[dict[str, Any]] = []
        best_by_supplier: dict[str, dict[str, Any]] = {}
        for row in results:
            supplier = row["supplier"]
            score = float(row.get("score", -1.0))
            phrase = str(row.get("phrase") or "")
            support = self.phrase_support_cache.get(phrase)
            if support is None:
                support = self.lexical_support(phrase)
                self.phrase_support_cache[phrase] = support
            combined = 0.75 * score + 0.25 * support

            if debug:
                candidate_rows.append({
                    "supplier": supplier,
                    "phrase": phrase,
                    "score": round(score, 4),
                    "support": round(support, 4),
                    "combined": round(combined, 4),
                })

            enriched = {**row, "combined": combined}
            passes = score >= threshold or combined >= 0.48
            if passes and (supplier not in best_by_supplier or combined > float(best_by_supplier[supplier].get("combined", -1.0))):
                best_by_supplier[supplier] = enriched

        if not best_by_supplier and results:
            def support_for_phrase(phrase: str) -> float:
                cached_support = self.phrase_support_cache.get(phrase)
                if cached_support is None:
                    cached_support = self.lexical_support(phrase)
                    self.phrase_support_cache[phrase] = cached_support
                return cached_support

            fallback = max(
                results,
                key=lambda item: 0.75 * float(item.get("score", -1.0)) + 0.25 * support_for_phrase(str(item.get("phrase") or "")),
            )
            fallback_score = float(fallback.get("score", -1.0))
            fallback_phrase = str(fallback.get("phrase") or "")
            fallback_support = support_for_phrase(fallback_phrase)
            fallback_combined = 0.75 * fallback_score + 0.25 * fallback_support
            if fallback_score >= 0.40 and fallback_support >= 0.43 and fallback_combined >= 0.43:
                best_by_supplier[fallback["supplier"]] = {**fallback, "combined": fallback_combined}

        supplier_ranking = sorted(best_by_supplier.values(), key=lambda x: float(x.get("combined", x["score"])), reverse=True)
        best = self.resolve_overlapping_suppliers(supplier_ranking)

        payload = {
            "supplier": best["supplier"],
            "supplier_score": round(best["score"], 4) if best["score"] >= 0 else None,
            "matched_supplier_phrase": best.get("phrase"),
        }

        if debug:
            top_candidates = sorted(candidate_rows, key=lambda item: item["combined"], reverse=True)[:8]
            payload["supplier_debug"] = {
                "tokens": tokens,
                "phrases_count": len(phrases),
                "top_candidates": top_candidates,
            }

        return payload
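A sketch of the fuzzy matching against a hypothetical supplier list; whether a candidate survives depends on the 0.50 score / 0.48 combined thresholds above, and Cyrillic spellings are bridged by the iuliia transliteration variants:

    extractor = ExpenseSupplierExtractor(suppliers=["Kaufland", "Linella", "Metro"])

    # "купил" and "в" are noise terms, so only "молоко" and "кауфланде" form candidate phrases
    print(extractor.extract("купил молоко в кауфланде", debug=True))
    # expected shape: {"supplier": "Kaufland", "supplier_score": ..., "matched_supplier_phrase": "кауфланде", "supplier_debug": {...}}
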
extractors/user_extractor.py
ADDED
@@ -0,0 +1,142 @@
"""
Extracts users from text.

Uses semantic embeddings to find users.
"""

from __future__ import annotations

import re
import unicodedata
from typing import Any

import torch
from pymorphy3 import MorphAnalyzer
from sentence_transformers import SentenceTransformer


MORPH = MorphAnalyzer()


def normalize_text(text: str) -> str:
    """Normalizes text: lowercase, strips diacritics and punctuation."""
    text = unicodedata.normalize("NFKD", text.lower())
    text = "".join(ch for ch in text if not unicodedata.combining(ch))
    return re.sub(r"[^\w\s]", "", text).strip()


def tokenize_text(text: str) -> list[str]:
    """Tokenizes text."""
    return normalize_text(text).split()


def lemmatize_word(word: str) -> str:
    """Returns the lemma of a word."""
    return MORPH.parse(word)[0].normal_form if re.fullmatch(r"[а-я]+", word) else word


def lemmatize_text(text: str) -> list[str]:
    """Lemmatizes text."""
    return [lemmatize_word(word) for word in tokenize_text(text)]


class ExpenseUserExtractor:
    """
    Extracts users from text.

    Uses semantic embeddings to match words from the text
    against known users.
    """

    def __init__(
        self,
        users: list[str],
        suppliers: list[str],
        model: SentenceTransformer,
        threshold: float = 0.6
    ) -> None:
        """
        Args:
            users: List of known users
            suppliers: List of suppliers (to exclude)
            model: Model used to build embeddings
            threshold: Similarity threshold
        """
        self.users = users
        self.model = model
        self.threshold = threshold
        self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
        self.user_terms = [normalize_text(user) for user in users]
        self.user_embeddings = model.encode(
            [f"passage: {user}" for user in self.user_terms],
            convert_to_tensor=True,
            normalize_embeddings=True,
        )

    def extract(
        self,
        text: str,
        supplier_phrase: str | None = None,
        date_phrase: str | None = None
    ) -> dict[str, Any]:
        """
        Extracts a user from text.

        Args:
            text: Text to analyze
            supplier_phrase: Supplier phrase to exclude
            date_phrase: Date phrase to exclude

        Returns:
            Dict with user, user_score, matched_user_phrase
        """
        excluded_tokens: set[str] = set()
        if supplier_phrase:
            excluded_tokens.update(normalize_text(supplier_phrase).split())
        if date_phrase:
            excluded_tokens.update(normalize_text(date_phrase).split())

        best_user = None
        best_score = -1.0
        best_phrase = None

        for word in lemmatize_text(text):
            if len(word) < 3:
                continue
            if word in excluded_tokens or word in self.supplier_terms:
                continue

            query_emb = self.model.encode(
                f"query: {word}",
                convert_to_tensor=True,
                normalize_embeddings=True,
            )
            similarities = torch.cosine_similarity(query_emb.unsqueeze(0), self.user_embeddings, dim=1)
            idx = int(torch.argmax(similarities))
            score = similarities[idx].item()

            if score > best_score:
                best_score = score
                best_user = self.users[idx]
                best_phrase = word

        if best_score >= self.threshold:
            return {
                "user": best_user,
                "user_score": round(best_score, 4),
                "matched_user_phrase": best_phrase,
            }

        # Check for the pronoun "я" ("I")
        if re.search(r"(?<!\S)я(?!\S)", normalize_text(text), re.IGNORECASE):
            return {
                "user": "Я",
                "user_score": 1.0,
                "matched_user_phrase": "я",
            }

        return {
            "user": None,
            "user_score": None,
            "matched_user_phrase": None,
        }
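For reference, a minimal usage sketch. The "query:"/"passage:" prefixes in the class match the E5 family's convention, so an E5 checkpoint is assumed here; the exact model name and the sample names are illustrative, not pinned by this commit:

from sentence_transformers import SentenceTransformer

from extractors.user_extractor import ExpenseUserExtractor

# Assumption: an E5-family model, consistent with the "query:"/"passage:" prefixes above.
model = SentenceTransformer("intfloat/multilingual-e5-small")

extractor = ExpenseUserExtractor(
    users=["Иванов", "Петров"],   # illustrative user names
    suppliers=["Ромашка"],        # supplier terms are excluded from user matching
    model=model,
)

result = extractor.extract("Петров оплатил ромашке 500 рублей")
# Expected shape: {"user": "Петров", "user_score": ..., "matched_user_phrase": "петров"}
print(result)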
natasha_dates.py
DELETED
@@ -1,589 +0,0 @@
"""
Universal date parser for Russian.
Uses custom rules + optionally Natasha as a fallback.
Supports: exact dates, relative dates, ordinal numerals, numbers spelled out as words.
"""

import re
from datetime import date, datetime, timedelta
from typing import Any, Optional, Callable
from dateutil.relativedelta import relativedelta

# Optional Natasha import
try:
    from natasha import DatesExtractor, MorphVocab
    NATASHA_AVAILABLE = True
except ImportError:
    NATASHA_AVAILABLE = False
    DatesExtractor = None
    MorphVocab = None

# Natasha initialization (lazy)
_MORPH_VOCAB = None
_DATES_EXTRACTOR = None


def _get_extractor():
    """Lazily initializes the Natasha extractor."""
    global _MORPH_VOCAB, _DATES_EXTRACTOR
    if not NATASHA_AVAILABLE:
        return None
    if _DATES_EXTRACTOR is None:
        _MORPH_VOCAB = MorphVocab()
        _DATES_EXTRACTOR = DatesExtractor(_MORPH_VOCAB)
    return _DATES_EXTRACTOR


# ============== DICTIONARIES ==============

MONTHS = {
    "январь": 1, "января": 1, "январе": 1,
    "февраль": 2, "февраля": 2, "феврале": 2,
    "март": 3, "марта": 3, "марте": 3,
    "апрель": 4, "апреля": 4, "апреле": 4,
    "май": 5, "мая": 5, "мае": 5,
    "июнь": 6, "июня": 6, "июне": 6,
    "июль": 7, "июля": 7, "июле": 7,
    "август": 8, "августа": 8, "августе": 8,
    "сентябрь": 9, "сентября": 9, "сентябре": 9,
    "октябрь": 10, "октября": 10, "октябре": 10,
    "ноябрь": 11, "ноября": 11, "ноябре": 11,
    "декабрь": 12, "декабря": 12, "декабре": 12,
}

WEEKDAYS = {
    "понедельник": 0, "вторник": 1, "среда": 2, "среду": 2,
    "четверг": 3, "пятница": 4, "пятницу": 4,
    "суббота": 5, "субботу": 5, "воскресенье": 6,
}

# Numbers spelled out as words (cardinal)
NUMBER_WORDS = {
    "ноль": 0, "один": 1, "одну": 1, "одного": 1,
    "два": 2, "две": 2, "двух": 2,
    "три": 3, "трёх": 3, "трех": 3,
    "четыре": 4, "четырёх": 4, "четырех": 4,
    "пять": 5, "пяти": 5,
    "шесть": 6, "шести": 6,
    "семь": 7, "семи": 7,
    "восемь": 8, "восьми": 8,
    "девять": 9, "девяти": 9,
    "десять": 10, "десяти": 10,
    "одиннадцать": 11, "двенадцать": 12, "тринадцать": 13,
    "четырнадцать": 14, "пятнадцать": 15, "шестнадцать": 16,
    "семнадцать": 17, "восемнадцать": 18, "девятнадцать": 19,
    "двадцать": 20, "тридцать": 30,
}

# Ordinal numerals for days
ORDINAL_DAYS = {
    "первое": 1, "первого": 1, "первом": 1,
    "второе": 2, "второго": 2, "втором": 2,
    "третье": 3, "третьего": 3, "третьем": 3,
    "четвёртое": 4, "четвертое": 4, "четвёртого": 4, "четвертого": 4,
    "пятое": 5, "пятого": 5,
    "шестое": 6, "шестого": 6,
    "седьмое": 7, "седьмого": 7,
    "восьмое": 8, "восьмого": 8,
    "девятое": 9, "девятого": 9,
    "десятое": 10, "десятого": 10,
    "одиннадцатое": 11, "одиннадцатого": 11,
    "двенадцатое": 12, "двенадцатого": 12,
    "тринадцатое": 13, "тринадцатого": 13,
    "четырнадцатое": 14, "четырнадцатого": 14,
    "пятнадцатое": 15, "пятнадцатого": 15,
    "шестнадцатое": 16, "шестнадцатого": 16,
    "семнадцатое": 17, "семнадцатого": 17,
    "восемнадцатое": 18, "восемнадцатого": 18,
    "девятнадцатое": 19, "девятнадцатого": 19,
    "двадцатое": 20, "двадцатого": 20,
    "двадцать первое": 21, "двадцать первого": 21,
    "двадцать второе": 22, "двадцать второго": 22,
    "двадцать третье": 23, "двадцать третьего": 23,
    "двадцать четвёртое": 24, "двадцать четвертое": 24, "двадцать четвёртого": 24, "двадцать четвертого": 24,
    "двадцать пятое": 25, "двадцать пятого": 25,
    "двадцать шестое": 26, "двадцать шестого": 26,
    "двадцать седьмое": 27, "двадцать седьмого": 27,
    "двадцать восьмое": 28, "двадцать восьмого": 28,
    "двадцать девятое": 29, "двадцать девятого": 29,
    "тридцатое": 30, "тридцатого": 30,
    "тридцать первое": 31, "тридцать первого": 31,
}

# Past/future context
PAST_INDICATORS = re.compile(
    r'\b(оплата|оплатил[аи]?|заплатил[аи]?|купил[аи]?|заказал[аи]?|'
    r'потратил[аи]?|был[аио]?|получил[аи]?|сделал[аи]?|прошл[аоыйую]|'
    r'предыдущ[аоыйую]|назад)\b',
    re.IGNORECASE
)

FUTURE_INDICATORS = re.compile(
    r'\b(завтра|послезавтра|через|следующ[аоыйую]|будущ[аоыйую]|'
    r'заплатить|купить|заказать)\b',
    re.IGNORECASE
)


# ============== HELPER FUNCTIONS ==============

def _parse_number(text: str) -> Optional[int]:
    """Parses a number from text (digits or words)."""
    text = text.strip().lower().replace('ё', 'е')

    # Digits
    if text.isdigit():
        return int(text)

    # Single word
    if text in NUMBER_WORDS:
        return NUMBER_WORDS[text]

    # Two words ("двадцать один")
    parts = text.split()
    if len(parts) == 2:
        tens = NUMBER_WORDS.get(parts[0])
        units = NUMBER_WORDS.get(parts[1])
        if tens in (20, 30) and units and 1 <= units <= 9:
            return tens + units

    return None


def _parse_day(text: str) -> Optional[int]:
    """Parses a day of the month (digits or ordinal numerals)."""
    text = text.strip().lower().replace('ё', 'е')

    if text.isdigit():
        val = int(text)
        return val if 1 <= val <= 31 else None

    # Ordinal numerals
    if text in ORDINAL_DAYS:
        return ORDINAL_DAYS[text]

    # Compound ordinals ("двадцать первого")
    for phrase, day in ORDINAL_DAYS.items():
        if ' ' in phrase and phrase in text:
            return day

    return None


def _week_start(ref: date) -> date:
    """Monday of the current week."""
    return ref - timedelta(days=ref.weekday())


def _get_weekday_date(weekday: int, ref: date, direction: str) -> date:
    """Finds the date of a weekday relative to ref."""
    days_diff = weekday - ref.weekday()

    if direction == 'past':
        if days_diff >= 0:
            days_diff -= 7
    elif direction == 'next':
        if days_diff <= 0:
            days_diff += 7
    # 'this' - the nearest occurrence

    return ref + timedelta(days=days_diff)


def _adjust_year_by_context(parsed_date: date, text: str, ref: date) -> date:
    """Adjusts the year based on context (past/future)."""
    has_past = bool(PAST_INDICATORS.search(text))
    has_future = bool(FUTURE_INDICATORS.search(text))

    # Explicitly past, but the date lands in the future
    if has_past and not has_future and parsed_date > ref:
        return parsed_date - relativedelta(years=1)

    # Explicitly future, but the date lands in the past
    if has_future and not has_past and parsed_date < ref:
        return parsed_date + relativedelta(years=1)

    return parsed_date


# ============== PARSERS ==============

def _parse_direct_relative(text: str, ref: date) -> Optional[tuple[date, str]]:
    """Direct relative dates: сегодня, завтра, вчера..."""
    patterns = [
        (r'\bпослезавтра\b', 2),
        (r'\bпозавчера\b', -2),
        (r'\bсегодня\b', 0),
        (r'\bзавтра\b', 1),
        (r'\bвчера\b', -1),
    ]
    text_lower = text.lower()
    for pattern, delta in patterns:
        match = re.search(pattern, text_lower)
        if match:
            return ref + timedelta(days=delta), match.group(0)
    return None


def _parse_quantity_relative(text: str, ref: date) -> Optional[tuple[date, str]]:
    """Quantified relative dates: через 2 дня, 3 недели назад..."""
    text_lower = text.lower()

    # "через X" days/weeks/months
    patterns_forward = [
        (r'\bчерез\s+(\d+|[а-яё]+(?:\s+[а-яё]+)?)\s+(день|дня|дней)\b', 'days'),
        (r'\bчерез\s+(\d+|[а-яё]+(?:\s+[а-яё]+)?)\s+(неделю|недели|недель)\b', 'weeks'),
        (r'\bчерез\s+(\d+|[а-яё]+(?:\s+[а-яё]+)?)\s+(месяц|месяца|месяцев)\b', 'months'),
    ]

    for pattern, unit in patterns_forward:
        match = re.search(pattern, text_lower)
        if match:
            num = _parse_number(match.group(1))
            if num:
                if unit == 'days':
                    return ref + timedelta(days=num), match.group(0)
                elif unit == 'weeks':
                    return ref + timedelta(weeks=num), match.group(0)
                elif unit == 'months':
                    return ref + relativedelta(months=num), match.group(0)

    # X days/weeks/months "назад"
    patterns_back = [
        (r'\b(\d+|[а-яё]+(?:\s+[а-яё]+)?)\s+(день|дня|дней)\s+назад\b', 'days'),
        (r'\b(\d+|[а-яё]+(?:\s+[а-яё]+)?)\s+(неделю|недели|недель)\s+назад\b', 'weeks'),
        (r'\b(\d+|[а-яё]+(?:\s+[а-яё]+)?)\s+(месяц|месяца|месяцев)\s+назад\b', 'months'),
    ]

    for pattern, unit in patterns_back:
        match = re.search(pattern, text_lower)
        if match:
            num = _parse_number(match.group(1))
            if num:
                if unit == 'days':
                    return ref - timedelta(days=num), match.group(0)
                elif unit == 'weeks':
                    return ref - timedelta(weeks=num), match.group(0)
                elif unit == 'months':
                    return ref - relativedelta(months=num), match.group(0)

    return None


def _parse_week_relative(text: str, ref: date) -> Optional[tuple[date, str]]:
    """Week-level: на следующей неделе, на прошлой неделе..."""
    text_lower = text.lower()

    # next/last/this week
    week_patterns = [
        (r'\b(?:на\s+)?следующ(?:ей|ую)\s+недел[юеи]\b', 7),
        (r'\b(?:на\s+)?прошл(?:ой|ую)\s+недел[юеи]\b', -7),
        (r'\b(?:на\s+)?предыдущ(?:ей|ую)\s+недел[юеи]\b', -7),
        (r'\b(?:на\s+)?этой\s+неделе\b', 0),
        (r'\b(?:на\s+)?текущ(?:ей|ую)\s+недел[юеи]\b', 0),
        (r'\bчерез\s+неделю\b', 7),
        (r'\bнеделю\s+назад\b', -7),
    ]

    for pattern, delta in week_patterns:
        match = re.search(pattern, text_lower)
        if match:
            # Monday of the target week
            target_monday = _week_start(ref) + timedelta(days=delta)
            return target_monday, match.group(0)

    return None


def _parse_weekday(text: str, ref: date) -> Optional[tuple[date, str]]:
    """Weekdays: в прошлый понедельник, в следующую пятницу..."""
    text_lower = text.lower()

    # Last <weekday>
    match = re.search(
        r'\b(?:в\s+)?прошл(?:ый|ую)\s+(понедельник|вторник|сред[ау]|четверг|пятниц[ау]|суббот[ау]|воскресенье)\b',
        text_lower
    )
    if match:
        weekday_text = match.group(1).replace('у', 'а') if match.group(1).endswith('у') else match.group(1)
        weekday = WEEKDAYS.get(weekday_text) or WEEKDAYS.get(match.group(1))
        if weekday is not None:
            return _get_weekday_date(weekday, ref, 'past'), match.group(0)

    # Next <weekday>
    match = re.search(
        r'\b(?:в\s+)?следующ(?:ий|ую)\s+(понедельник|вторник|сред[ау]|четверг|пятниц[ау]|суббот[ау]|воскресенье)\b',
        text_lower
    )
    if match:
        weekday_text = match.group(1)
        weekday = WEEKDAYS.get(weekday_text)
        if weekday is not None:
            return _get_weekday_date(weekday, ref, 'next'), match.group(0)

    # This <weekday>
    match = re.search(
        r'\b(?:в\s+)?(?:этот|эту)\s+(понедельник|вторник|сред[ау]|четверг|пятниц[ау]|суббот[ау]|воскресенье)\b',
        text_lower
    )
    if match:
        weekday = WEEKDAYS.get(match.group(1))
        if weekday is not None:
            return _get_weekday_date(weekday, ref, 'this'), match.group(0)

    return None


def _parse_period_edge(text: str, ref: date) -> Optional[tuple[date, str]]:
    """Period boundaries: в начале месяца, в конце недели..."""
    text_lower = text.lower()

    # Start/end of month
    match = re.search(r'\b(?:в\s+)?начал[еоа]\s+месяца\b', text_lower)
    if match:
        return ref.replace(day=1), match.group(0)

    match = re.search(r'\b(?:в\s+)?конц[еа]\s+месяца\b', text_lower)
    if match:
        last_day = (ref.replace(day=1) + relativedelta(months=1) - timedelta(days=1)).day
        return ref.replace(day=last_day), match.group(0)

    # Start/end of week
    match = re.search(r'\b(?:в\s+)?начал[еоа]\s+недели\b', text_lower)
    if match:
        return _week_start(ref), match.group(0)

    match = re.search(r'\b(?:в\s+)?конц[еа]\s+недели\b', text_lower)
    if match:
        return _week_start(ref) + timedelta(days=6), match.group(0)

    # Start/end of next month
    match = re.search(r'\b(?:в\s+)?начал[еоа]\s+следующего\s+месяца\b', text_lower)
    if match:
        return (ref.replace(day=1) + relativedelta(months=1)), match.group(0)

    match = re.search(r'\b(?:в\s+)?конц[еа]\s+следующего\s+месяца\b', text_lower)
    if match:
        next_month = ref.replace(day=1) + relativedelta(months=2) - timedelta(days=1)
        return next_month, match.group(0)

    return None


def _parse_textual_date(text: str, ref: date) -> Optional[tuple[date, str]]:
    """Textual dates: 15 января, пятого марта 2025..."""
    text_lower = text.lower().replace('ё', 'е')

    # Ordinal + month: пятого марта, двадцать первого января
    for ordinal, day in sorted(ORDINAL_DAYS.items(), key=lambda x: -len(x[0])):
        for month_name, month_num in MONTHS.items():
            pattern = rf'\b{re.escape(ordinal)}\s+{re.escape(month_name)}(?:\s+(\d{{4}}))?\b'
            match = re.search(pattern, text_lower)
            if match:
                year = int(match.group(1)) if match.group(1) else ref.year
                try:
                    parsed = date(year, month_num, day)
                    parsed = _adjust_year_by_context(parsed, text, ref)
                    return parsed, match.group(0)
                except ValueError:
                    continue

    # Digit + month: 15 января 2025
    for month_name, month_num in MONTHS.items():
        pattern = rf'\b(\d{{1,2}})\s+{re.escape(month_name)}(?:\s+(\d{{4}}))?\b'
        match = re.search(pattern, text_lower)
        if match:
            day = int(match.group(1))
            year = int(match.group(2)) if match.group(2) else ref.year
            if 1 <= day <= 31:
                try:
                    parsed = date(year, month_num, day)
                    parsed = _adjust_year_by_context(parsed, text, ref)
                    return parsed, match.group(0)
                except ValueError:
                    continue

    return None


def _parse_month_only(text: str, ref: date) -> Optional[tuple[date, str]]:
    """Month only: за март, в апреле..."""
    text_lower = text.lower()

    for month_name, month_num in MONTHS.items():
        pattern = rf'\b(?:за|в|на)\s+{re.escape(month_name)}\b'
        match = re.search(pattern, text_lower)
        if match:
            year = ref.year
            # Context determines the year
            if PAST_INDICATORS.search(text) and month_num > ref.month:
                year -= 1
            elif not PAST_INDICATORS.search(text) and month_num < ref.month:
                # If the month has already passed and there are no indicators - next year?
                # No, keep the current year by default
                pass

            return date(year, month_num, 1), match.group(0)

    return None


def _parse_numeric_date(text: str, ref: date) -> Optional[tuple[date, str]]:
    """Numeric dates: 15.01.2025, 2025-01-15..."""
    # DD.MM.YYYY or DD/MM/YYYY or DD-MM-YYYY
    match = re.search(r'\b(\d{1,2})[./\-](\d{1,2})[./\-](\d{4})\b', text)
    if match:
        day, month, year = int(match.group(1)), int(match.group(2)), int(match.group(3))
        try:
            return date(year, month, day), match.group(0)
        except ValueError:
            pass

    # YYYY-MM-DD
    match = re.search(r'\b(\d{4})[./\-](\d{1,2})[./\-](\d{1,2})\b', text)
    if match:
        year, month, day = int(match.group(1)), int(match.group(2)), int(match.group(3))
        try:
            return date(year, month, day), match.group(0)
        except ValueError:
            pass

    return None


def _parse_with_natasha(text: str, ref: date) -> Optional[tuple[date, str]]:
    """Natasha as a fallback for hard cases."""
    if not NATASHA_AVAILABLE:
        return None

    try:
        extractor = _get_extractor()
        if extractor is None:
            return None

        matches = list(extractor(text))

        if matches:
            match = matches[0]
            fact = match.fact

            year = getattr(fact, 'year', None) or ref.year
            month = getattr(fact, 'month', None)
            day = getattr(fact, 'day', None) or 1

            if month:
                try:
                    parsed = date(year, month, day)
                    parsed = _adjust_year_by_context(parsed, text, ref)
                    return parsed, text[match.start:match.stop]
                except ValueError:
                    pass
    except Exception:
        pass

    return None


# ============== MAIN FUNCTION ==============

def parse_date_natasha(
    text: str,
    reference_date: Optional[date] = None
) -> dict[str, Any]:
    """
    Universal date parser for Russian.

    Supports:
    - Direct relative: сегодня, завтра, вчера, послезавтра, позавчера
    - Quantified: через 2 дня, 3 недели назад, через два месяца
    - Week-level: на следующей неделе, на прошлой неделе
    - Weekdays: в прошлый понедельник, в следующую пятницу
    - Period boundaries: в начале месяца, в конце недели
    - Textual: 15 января 2025, пятого марта
    - Months: за март, в апреле
    - Numeric: 15.01.2025, 2025-01-15

    Args:
        text: Text to analyze
        reference_date: Reference date (defaults to today)

    Returns:
        {"date": "19.04.2026", "date_iso": "2026-04-19", "matched_date_phrase": "..."}
    """
    if reference_date is None:
        reference_date = date.today()

    result = {
        "date": None,
        "date_iso": None,
        "matched_date_phrase": None,
    }

    # Parser order: from simple to complex
    parsers: list[Callable[[str, date], Optional[tuple[date, str]]]] = [
        _parse_numeric_date,       # 15.01.2025
        _parse_direct_relative,    # завтра, вчера
        _parse_quantity_relative,  # через 2 дня
        _parse_week_relative,      # на следующей неделе
        _parse_weekday,            # в прошлый понедельник
        _parse_period_edge,        # в конце месяца
        _parse_textual_date,       # 15 января, пятого марта
        _parse_month_only,         # за март
        _parse_with_natasha,       # Natasha fallback
    ]

    for parser in parsers:
        parsed = parser(text, reference_date)
        if parsed:
            parsed_date, matched = parsed
            result["date"] = parsed_date.strftime("%d.%m.%Y")
            result["date_iso"] = parsed_date.isoformat()
            result["matched_date_phrase"] = matched
            return result

    return result


# ============== WRAPPER CLASS ==============

class NatashaDateExtractor:
    """Date extractor kept API-compatible with ExpenseDateExtractor."""

    def extract(self, text: str, reference_date: Optional[date] = None) -> dict[str, Any]:
        ref = reference_date or date.today()
        if isinstance(ref, str):
            ref = datetime.strptime(ref, "%Y-%m-%d").date()
        return parse_date_natasha(text, ref)

    def extract_all(self, text: str, reference_date: Optional[date] = None) -> list[dict[str, Any]]:
        # For simplicity, return only the first result
        result = self.extract(text, reference_date)
        return [result] if result["date"] else []


if __name__ == "__main__":
    # Testing
    test_phrases = [
        "завтра",
        "через 2 дня",
        "через два дня",
        "на следующей неделе",
        "15 января 2025",
        "позавчера",
        "в прошлый понедельник",
        "оплата за март",
        "5 марта",
        "купил вчера",
        "в конце месяца",
        "пятого марта",
        "двадцать первого января",
        "3 недели назад",
        "через месяц",
    ]

    ref = date(2026, 4, 19)
    print(f"Reference: {ref}\n")

    for phrase in test_phrases:
        result = parse_date_natasha(phrase, ref)
        print(f"  '{phrase}' -> {result['date_iso']} ({result['matched_date_phrase']})")
requirements.txt
CHANGED
@@ -10,4 +10,3 @@ torch
sentence-transformers
scikit-learn
gliner
-natasha