Spaces:
Running
Running
| """Простой regex-экстрактор суммы из текста.""" | |
| from __future__ import annotations | |
| import re | |
| from typing import Any, Optional | |
# Matches a run of digits optionally followed by a comma and one or two
# more digits (e.g. "300", "12,5", "12,50"). A dot is NOT treated as a
# decimal separator by this pattern.
AMOUNT_PATTERN = re.compile(r"\d+(?:,\d{1,2})?", re.IGNORECASE)


class ExpenseAmountExtractor:
    """Extract an amount written as an integer or a comma-decimal number.

    The first numeric candidate in the text that does not overlap the
    already-recognised date or supplier phrase is returned as the amount.
    """

    def __init__(self, suppliers: list[str] | None = None) -> None:
        """Store the optional supplier list (an empty list when omitted)."""
        self.suppliers = suppliers or []

    @staticmethod
    def to_float(value: str) -> Optional[float]:
        """Convert a comma-decimal string to ``float``.

        Args:
            value: Text such as ``"12,50"`` or ``"300"``.

        Returns:
            The parsed number, or ``None`` when the text is not a valid float
            after replacing commas with dots.
        """
        try:
            return float(value.replace(",", "."))
        except ValueError:
            return None

    @staticmethod
    def phrase_span(text: str, phrase: Optional[str]) -> Optional[tuple[int, int]]:
        """Locate the first case-insensitive occurrence of *phrase* in *text*.

        Args:
            text: Text to search in.
            phrase: Phrase to look for; ``None`` or an empty string means
                "nothing to locate".

        Returns:
            A ``(start, end)`` half-open span, or ``None`` when *phrase* is
            falsy or not found.
        """
        if not phrase:
            return None
        # NOTE(review): the index is computed on the lower-cased text; this
        # assumes lower-casing does not change string length — confirm for
        # the alphabets actually used.
        idx = text.lower().find(phrase.lower())
        if idx == -1:
            return None
        return idx, idx + len(phrase)

    @staticmethod
    def overlaps(span1: tuple[int, int], span2: Optional[tuple[int, int]]) -> bool:
        """Return whether two half-open spans share at least one position.

        Args:
            span1: ``(start, end)`` span of the candidate.
            span2: ``(start, end)`` span to compare against, or ``None``.

        Returns:
            ``False`` when *span2* is ``None`` or the spans merely touch;
            ``True`` when they intersect.
        """
        if span2 is None:
            return False
        return span1[0] < span2[1] and span2[0] < span1[1]

    def extract(
        self,
        text: str,
        matched_date_phrase: Optional[str] = None,
        matched_supplier_phrase: Optional[str] = None,
        debug: bool = False,
    ) -> dict[str, Any]:
        """Find the first amount in *text* outside the date/supplier phrases.

        Args:
            text: Text to scan for numeric candidates.
            matched_date_phrase: Phrase previously recognised as a date;
                numbers overlapping it are ignored.
            matched_supplier_phrase: Phrase previously recognised as a
                supplier; numbers overlapping it are ignored.
            debug: When true, add an ``"amount_debug"`` entry describing the
                phrases, their spans, the candidates examined and the
                selected text.

        Returns:
            A dict with ``"amount"`` (``float`` or ``None``) and
            ``"amount_text"`` (the matched text or ``None``), plus
            ``"amount_debug"`` when *debug* is true.
        """
        date_span = self.phrase_span(text, matched_date_phrase)
        supplier_span = self.phrase_span(text, matched_supplier_phrase)

        candidates: list[dict[str, Any]] = []
        selected_amount: Optional[float] = None
        selected_text: Optional[str] = None

        for match in AMOUNT_PATTERN.finditer(text):
            span = match.span()
            overlaps_date = self.overlaps(span, date_span)
            overlaps_supplier = self.overlaps(span, supplier_span)
            amount_text = match.group(0)

            if debug:
                candidates.append({
                    "value": amount_text,
                    "span": [span[0], span[1]],
                    "overlaps_date": overlaps_date,
                    "overlaps_supplier": overlaps_supplier,
                })

            # Digits that belong to the date or supplier phrase are not amounts.
            if overlaps_date or overlaps_supplier:
                continue

            amount = self.to_float(amount_text)
            if amount is not None:
                # The first acceptable candidate wins; later ones are not
                # examined (and therefore not listed in the debug output).
                selected_amount = amount
                selected_text = amount_text
                break

        payload: dict[str, Any] = {
            "amount": selected_amount,
            "amount_text": selected_text,
        }
        if debug:
            payload["amount_debug"] = {
                "matched_date_phrase": matched_date_phrase,
                "matched_supplier_phrase": matched_supplier_phrase,
                "date_span": list(date_span) if date_span else None,
                "supplier_span": list(supplier_span) if supplier_span else None,
                "candidates": candidates,
                "selected": selected_text,
            }
        return payload