# ConvertAudioToJSON/extractors/user_extractor.py
"""User extractor built on the same fuzzy-matching logic as the supplier search."""
from __future__ import annotations
import re
import importlib
from typing import Any
from extractors.supplier_extractor import ExpenseSupplierExtractor, normalize_text
class ExpenseUserExtractor:
    """Find the user mentioned in a phrase using the supplier fuzzy matcher.

    The list of known user names is fed to ``ExpenseSupplierExtractor`` so
    users are matched with exactly the same scoring as suppliers, after
    supplier/date/noise tokens have been stripped from the candidate text.
    """

    # Minimum lexical support for a token to be kept as a user-name candidate.
    MIN_LEXICAL_SUPPORT = 0.40
    # Relaxed bar applied when morphology tags the token as a person name.
    MIN_LEXICAL_WITH_PERSON = 0.30

    def __init__(
        self,
        users: list[str],
        suppliers: list[str],
        threshold: float = 0.25,
    ) -> None:
        """Build the matcher.

        Args:
            users: Known user names to match against.
            suppliers: Known supplier names; their tokens are excluded from
                user candidates so supplier words are not mistaken for users.
            threshold: Minimal score (raw or combined) to accept a match.
        """
        self.threshold = threshold
        self.supplier_terms = {normalize_text(supplier) for supplier in suppliers}
        self.user_matcher = ExpenseSupplierExtractor(suppliers=users)
        # pymorphy3 is optional: without it person-grammeme detection is off
        # and only the strict lexical threshold applies.
        self.morph: Any = None
        try:
            pymorphy3_module = importlib.import_module("pymorphy3")
            self.morph = pymorphy3_module.MorphAnalyzer()
        except Exception:
            self.morph = None

    def _looks_like_person_token(self, token: str) -> tuple[bool, float, bool]:
        """Return ``(accepted, lexical_support, has_person_grammeme)`` for *token*."""
        lexical = self.user_matcher.lexical_support(token)
        has_person_grammeme = False
        if self.morph is not None:
            parses = self.morph.parse(token)
            if parses:
                # Name / Surn / Patr = first-name / surname / patronymic
                # grammemes on the most probable parse.
                has_person_grammeme = bool(
                    {"Name", "Surn", "Patr"}.intersection(set(parses[0].tag.grammemes))
                )
        # Keep a low threshold for person names, but do not let ordinary
        # common nouns through.
        accepted = lexical >= self.MIN_LEXICAL_SUPPORT or (
            has_person_grammeme and lexical >= self.MIN_LEXICAL_WITH_PERSON
        )
        return accepted, lexical, has_person_grammeme

    def _build_user_candidate_text(
        self,
        normalized_text: str,
        supplier_phrase: str | None,
        date_phrase: str | None,
        include_debug: bool = False,
    ) -> tuple[str, list[str], list[dict[str, Any]] | None]:
        """Filter *normalized_text* down to tokens that may name a user.

        Returns the joined candidate text, the candidate token list, and a
        per-token debug trace (``None`` when ``include_debug`` is false).
        """
        excluded_tokens: set[str] = set(self.user_matcher.noise_terms)
        if supplier_phrase:
            excluded_tokens.update(normalize_text(supplier_phrase).split())
        if date_phrase:
            excluded_tokens.update(normalize_text(date_phrase).split())
        # FIX: supplier_terms stores whole normalized supplier strings, so a
        # multi-word supplier could never equal a single token and its words
        # leaked into the user candidates. Exclude each term's individual
        # tokens instead (identical behavior for single-word suppliers).
        for supplier_term in self.supplier_terms:
            excluded_tokens.update(supplier_term.split())
        candidate_tokens: list[str] = []
        candidate_debug: list[dict[str, Any]] | None = [] if include_debug else None
        for token in normalized_text.split():
            # Skip excluded, purely numeric, and one-character tokens.
            if token in excluded_tokens or token.isdigit() or len(token) <= 1:
                continue
            accepted, lexical, has_person_grammeme = self._looks_like_person_token(token)
            if candidate_debug is not None:
                candidate_debug.append({
                    "token": token,
                    "lexical_support": round(lexical, 4),
                    "has_person_grammeme": has_person_grammeme,
                    "accepted": accepted,
                })
            if accepted:
                candidate_tokens.append(token)
        return " ".join(candidate_tokens), candidate_tokens, candidate_debug

    def _match_user_from_candidates(
        self,
        candidate_tokens: list[str],
        include_debug: bool = False,
    ) -> tuple[dict[str, Any], dict[str, Any] | None]:
        """Score every contiguous phrase of candidate tokens against known users.

        Returns the match payload (``user`` / ``user_score`` /
        ``matched_user_phrase``) and an optional debug dict.
        """
        # Enumerate unique contiguous phrases up to the matcher's word limit.
        phrases: list[str] = []
        seen: set[str] = set()
        max_words = self.user_matcher.max_words
        for i in range(len(candidate_tokens)):
            for j in range(i + 1, min(i + 1 + max_words, len(candidate_tokens) + 1)):
                phrase = " ".join(candidate_tokens[i:j])
                if phrase not in seen:
                    seen.add(phrase)
                    phrases.append(phrase)
        best_row: dict[str, Any] | None = None
        debug_rows: list[dict[str, Any]] = []
        for phrase in phrases:
            row = self.user_matcher.score_phrase(phrase)
            score = float(row.get("score", -1.0))
            support = self.user_matcher.lexical_support(phrase)
            # Blend matcher score with lexical support; score dominates.
            combined = 0.75 * score + 0.25 * support
            if include_debug:
                debug_rows.append({
                    "phrase": phrase,
                    "supplier": row.get("supplier"),
                    "score": round(score, 4),
                    "support": round(support, 4),
                    "combined": round(combined, 4),
                })
            if score >= self.threshold or combined >= self.threshold:
                enriched = {
                    "user": row.get("supplier"),
                    "user_score": round(score, 4) if score >= 0 else None,
                    "matched_user_phrase": phrase,
                    "combined": combined,
                }
                if best_row is None or combined > float(best_row.get("combined", -1.0)):
                    best_row = enriched
        if best_row is None:
            match_payload = {
                "user": None,
                "user_score": None,
                "matched_user_phrase": None,
            }
        else:
            match_payload = {
                "user": best_row.get("user"),
                "user_score": best_row.get("user_score"),
                "matched_user_phrase": best_row.get("matched_user_phrase"),
            }
        match_debug = None
        if include_debug:
            match_debug = {
                "phrases_count": len(phrases),
                "score_threshold": self.threshold,
                "combined_threshold": self.threshold,
                "top_candidates": sorted(debug_rows, key=lambda item: item["combined"], reverse=True)[:8],
            }
        return match_payload, match_debug

    def extract(
        self,
        text: str,
        supplier_phrase: str | None = None,
        date_phrase: str | None = None,
        debug: bool = False,
    ) -> dict[str, Any]:
        """Extract the user mentioned in *text*.

        Args:
            text: Raw phrase to analyze.
            supplier_phrase: Already-matched supplier phrase to exclude.
            date_phrase: Already-matched date phrase to exclude.
            debug: When true, attach a ``user_debug`` trace to the payload.

        Returns:
            Dict with ``user``, ``user_score``, ``matched_user_phrase`` keys
            (plus ``user_debug`` when requested).
        """
        normalized_text = normalize_text(text)
        # Standalone pronoun "я" ("I") short-circuits to the speaker.
        if re.search(r"(?<!\S)я(?!\S)", normalized_text, re.IGNORECASE):
            payload = {
                "user": "Я",
                "user_score": 1.0,
                "matched_user_phrase": "я",
            }
            if debug:
                payload["user_debug"] = {
                    "mode": "direct-pronoun",
                    "normalized_text": normalized_text,
                }
            return payload
        candidate_text, candidate_tokens, candidate_debug = self._build_user_candidate_text(
            normalized_text=normalized_text,
            supplier_phrase=supplier_phrase,
            date_phrase=date_phrase,
            include_debug=debug,
        )
        if not candidate_text:
            payload = {
                "user": None,
                "user_score": None,
                "matched_user_phrase": None,
            }
            match_debug = None
        else:
            payload, match_debug = self._match_user_from_candidates(candidate_tokens, include_debug=debug)
        if debug:
            payload["user_debug"] = {
                "mode": "user-matcher",
                "threshold": self.threshold,
                "rules": {
                    "min_lexical_support": self.MIN_LEXICAL_SUPPORT,
                    "min_lexical_with_person_grammeme": self.MIN_LEXICAL_WITH_PERSON,
                    "morph_enabled": self.morph is not None,
                },
                "excluded_supplier_phrase": supplier_phrase,
                "normalized_text": normalized_text,
                "candidate_text": candidate_text,
                "candidate_tokens": candidate_tokens,
                "candidate_token_debug": candidate_debug or [],
                "matcher_debug": match_debug,
            }
        return payload