Spaces:

lucasgagneten
/

layoutlmv3-facturas-extractor

Sleeping

layoutlmv3-facturas-extractor / validator.py

Lucas Gagneten

mayor precisión en el cálculo de IVA y neto

91e4791 14 days ago

14.7 kB

	# validator.py
	"""
	Validación y corrección de etiquetas extraídas de facturas
	"""

	import re
	from datetime import datetime
	from typing import Dict, List, Tuple, Optional


	class InvoiceValidator:
	    """Validate and correct the fields extracted from an invoice.

	    The entry point is :meth:`validate_and_correct`, which receives the NER
	    output (and optionally the raw OCR words) and returns one
	    ``[label, value]`` row per entry of ``REQUIRED_LABELS`` plus a dict with
	    the labels whose value had to be guessed, defaulted or could not be
	    parsed.

	    Every ``_validate_*`` method shares the same signature
	    ``(value, all_values, ocr_text)`` and returns ``(corrected_value, is_valid)``.
	    """

	    # Labels that must appear in the output table, in this exact order.
	    REQUIRED_LABELS = [
	        'PROVEEDOR_RAZON_SOCIAL',
	        'PROVEEDOR_CUIT',
	        'COMPROBANTE_NUMERO',
	        'FECHA',
	        'JURISDICCION_GASTO',
	        'TIPO',
	        'CONCEPTO_GASTO',
	        'ALICUOTA',
	        'IVA',
	        'NETO',
	        'TOTAL'
	    ]

	    # Letters accepted as invoice type, in the priority order used by the
	    # substring fallback of ``_validate_tipo``.
	    _VALID_TYPES = ('A', 'B', 'C', 'M', 'E', 'T')

	    # Formatted tax id (##-########-#) and its dash-less 11-digit variant.
	    _CUIT_PATTERN = r'\d{2}-\d{8}-\d{1}'
	    _CUIT_NO_DASH_PATTERN = r'\b\d{11}\b'

	    # (compiled pattern, (day_group, month_group, year_group)).
	    # The digit look-arounds stop a pattern from matching in the middle of a
	    # longer digit run (e.g. reading "24-03-15" out of "2024-03-15").
	    _DATE_PATTERNS = (
	        # dd/mm/yyyy or dd-mm-yyyy
	        (re.compile(r'(?<!\d)(\d{1,2})[/-](\d{1,2})[/-](\d{4})(?!\d)'), (0, 1, 2)),
	        # yyyy/mm/dd or yyyy-mm-dd
	        (re.compile(r'(?<!\d)(\d{4})[/-](\d{1,2})[/-](\d{1,2})(?!\d)'), (2, 1, 0)),
	        # dd/mm/yy or dd-mm-yy
	        (re.compile(r'(?<!\d)(\d{1,2})[/-](\d{1,2})[/-](\d{2})(?!\d)'), (0, 1, 2)),
	    )

	    def __init__(self):
	        """Create a validator with an empty error registry."""
	        self.validation_errors = {}

	    def validate_and_correct(self, ner_results: List[Dict], ocr_text: Optional[List[str]] = None) -> Tuple[List[List], Dict]:
	        """Validate and correct the NER results.

	        Args:
	            ner_results: List of dicts carrying the keys 'etiqueta' and 'valor'.
	                When a label is repeated, the last occurrence wins.
	            ocr_text: Optional list of words extracted by OCR, used as a
	                fallback source for some fields.

	        Returns:
	            tuple: (corrected_table, validation_errors)
	                - corrected_table: list of [label, value] rows, one per
	                  entry of ``REQUIRED_LABELS`` and in that order.
	                - validation_errors: dict mapping every label flagged as
	                  invalid to the value that was put in the table. The same
	                  dict is stored in ``self.validation_errors``.
	        """
	        ner_dict = {item['etiqueta']: item['valor'] for item in ner_results}

	        print("\n=== VALIDACIÓN ===")
	        print(f"Etiquetas detectadas: {list(ner_dict.keys())}")
	        print(f"Total palabras OCR: {len(ocr_text) if ocr_text else 0}")

	        # Start from a clean registry on every run.
	        self.validation_errors = {}
	        corrected_table = []

	        for label in self.REQUIRED_LABELS:
	            value = ner_dict.get(label, '')
	            corrected_value, is_valid = self._validate_label(label, value, ner_dict, ocr_text)

	            # Rows only carry [label, value]; validity is reported separately.
	            corrected_table.append([label, corrected_value])

	            print(f"{label}: '{value}' -> '{corrected_value}' (válido: {is_valid})")

	            if not is_valid:
	                self.validation_errors[label] = corrected_value

	        print(f"Total campos inválidos: {len(self.validation_errors)}")
	        print("==================\n")

	        return corrected_table, self.validation_errors

	    def _validate_label(self, label: str, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
	        """Dispatch ``value`` to the validator registered for ``label``.

	        Args:
	            label: Label name.
	            value: Raw value to validate.
	            all_values: Dict with every raw NER value.
	            ocr_text: List of OCR words (may be None).

	        Returns:
	            tuple: (corrected_value, is_valid). Labels without a registered
	            validator are returned unchanged and flagged as valid.
	        """
	        validators = {
	            'ALICUOTA': self._validate_alicuota,
	            'COMPROBANTE_NUMERO': self._validate_comprobante,
	            'CONCEPTO_GASTO': self._validate_concepto_gasto,
	            'FECHA': self._validate_fecha,
	            'IVA': self._validate_iva,
	            'JURISDICCION_GASTO': self._validate_jurisdiccion,
	            'NETO': self._validate_neto,
	            'PROVEEDOR_CUIT': self._validate_cuit,
	            'PROVEEDOR_RAZON_SOCIAL': self._validate_razon_social,
	            'TIPO': self._validate_tipo,
	            'TOTAL': self._validate_total
	        }

	        validator = validators.get(label)
	        if validator is None:
	            return value, True
	        return validator(value, all_values, ocr_text)

	    def _validate_alicuota(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
	        """Validate ALICUOTA: the result is always '21.00' or '10.5'.

	        Returns '21.00' flagged as invalid when neither rate is recognised.
	        """
	        # Normalising the decimal comma first makes "10,5" and "10.5" equal.
	        value_clean = (value or '').strip().replace(',', '.')

	        if '21' in value_clean:
	            return '21.00', True
	        if '10.5' in value_clean:
	            return '10.5', True
	        # Default to 21 %.
	        return '21.00', False

	    def _validate_comprobante(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
	        """Validate COMPROBANTE_NUMERO: expected format #####-########.

	        If the pattern is not present but at least two digit groups are, the
	        first two groups are zero-padded / truncated to 5 and 8 digits and
	        returned as valid. Otherwise a zero placeholder is returned as invalid.
	        """
	        value = value or ''

	        match = re.search(r'\d{4,5}-\d{8}', value)
	        if match:
	            return match.group(0), True

	        numbers = re.findall(r'\d+', value)
	        if len(numbers) >= 2:
	            # After padding and truncating, the result always has the
	            # expected shape, so no further pattern check is needed.
	            point_of_sale = numbers[0].zfill(5)[:5]
	            sequence = numbers[1].zfill(8)[:8]
	            return f"{point_of_sale}-{sequence}", True

	        return '00000-00000000', False

	    def _validate_concepto_gasto(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
	        """Validate CONCEPTO_GASTO: any text is valid; it is only stripped."""
	        return value.strip() if value else '', True

	    def _validate_fecha(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
	        """Validate FECHA and return it in day/month/year order.

	        Accepts dd/mm/yyyy, yyyy/mm/dd and dd/mm/yy (with '/' or '-' as
	        separator). Two-digit years are prefixed with '20'. Day and month keep
	        the digits found in the input (no zero padding is added).

	        Returns today's date flagged as invalid when no real calendar date
	        can be read from ``value``.
	        """
	        if value:
	            for pattern, (day_idx, month_idx, year_idx) in self._DATE_PATTERNS:
	                match = pattern.search(value)
	                if not match:
	                    continue

	                groups = match.groups()
	                year = groups[year_idx]
	                if len(year) == 2:
	                    year = '20' + year
	                date_str = f"{groups[day_idx]}/{groups[month_idx]}/{year}"

	                try:
	                    # Rejects impossible dates such as 31/02.
	                    datetime.strptime(date_str, '%d/%m/%Y')
	                except ValueError:
	                    continue
	                return date_str, True

	        # Nothing parseable: fall back to the current date.
	        return datetime.now().strftime('%d/%m/%Y'), False

	    def _validate_total(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
	        """Validate TOTAL: must be a number; currency symbols are removed.

	        When ``value`` is empty, the largest decimal number found in the OCR
	        words is used (flagged as invalid). Values that cannot be parsed as a
	        number yield '0.00' flagged as invalid.
	        """
	        if not value:
	            if ocr_text:
	                max_number = self._find_max_decimal_in_ocr(ocr_text)
	                if max_number:
	                    return max_number, False
	            return '0.00', False

	        amount = self._parse_amount(value)
	        if amount is None:
	            return '0.00', False
	        return f"{amount:.2f}", True

	    def _validate_iva(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
	        """Validate IVA: must be a number.

	        When ``value`` is empty the tax is derived from the total as
	        ``TOTAL * rate / (1 + rate)``, where ``rate`` comes from ALICUOTA
	        (21 % when it is missing or unrecognised). Derived and unparseable
	        values are flagged as invalid.
	        """
	        if not value:
	            total = self._resolved_total(all_values, ocr_text)
	            rate = self._vat_rate(all_values)
	            iva_calculated = round(total * rate / (1 + rate), 2)
	            return f"{iva_calculated:.2f}", False

	        amount = self._parse_amount(value)
	        if amount is None:
	            return '0.00', False
	        return f"{amount:.2f}", True

	    def _validate_neto(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
	        """Validate NETO: must be a number.

	        When ``value`` is empty the net amount is derived from the total as
	        ``TOTAL / (1 + rate)``, where ``rate`` comes from ALICUOTA (21 % when
	        it is missing or unrecognised). Derived and unparseable values are
	        flagged as invalid.
	        """
	        if not value:
	            total = self._resolved_total(all_values, ocr_text)
	            rate = self._vat_rate(all_values)
	            neto_calculated = round(total / (1 + rate), 2)
	            return f"{neto_calculated:.2f}", False

	        amount = self._parse_amount(value)
	        if amount is None:
	            return '0.00', False
	        return f"{amount:.2f}", True

	    def _validate_jurisdiccion(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
	        """Validate JURISDICCION_GASTO: any text is valid; it is only stripped."""
	        return value.strip() if value else '', True

	    def _validate_cuit(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
	        """Validate PROVEEDOR_CUIT: expected format ##-########-#.

	        A value containing the formatted pattern, or at least 11 digits (the
	        first 11 are used), is returned as valid. Otherwise the OCR words are
	        searched; anything recovered from OCR, and the zero placeholder used
	        when nothing is found, are flagged as invalid.
	        """
	        placeholder = '00-00000000-0'

	        if not value:
	            return self._find_cuit_in_ocr(ocr_text) or placeholder, False

	        # Keep only digits and dashes.
	        clean_value = re.sub(r'[^\d\-]', '', value)

	        match = re.search(self._CUIT_PATTERN, clean_value)
	        if match:
	            return match.group(0), True

	        # No dashes (or misplaced ones): rebuild the format from raw digits.
	        digits = re.sub(r'\D', '', clean_value)
	        if len(digits) >= 11:
	            return f"{digits[:2]}-{digits[2:10]}-{digits[10]}", True

	        return self._find_cuit_in_ocr(ocr_text) or placeholder, False

	    def _validate_razon_social(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
	        """Validate PROVEEDOR_RAZON_SOCIAL: any text is valid; it is only stripped."""
	        return value.strip() if value else '', True

	    def _validate_tipo(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
	        """Validate TIPO: must be A, B, C, M, E or T.

	        The word "factura" is removed first. A type letter standing on its own
	        is preferred, so letters that merely occur inside other words (the "A"
	        in "ORIGINAL") do not win over the real type. If there is no standalone
	        letter, the first allowed letter contained anywhere in the text is
	        used. Returns 'A' flagged as invalid when no allowed letter is present.
	        """
	        clean_value = re.sub(r'factura\s*', '', value or '', flags=re.IGNORECASE).strip().upper()

	        standalone = re.search(r'\b([ABCMET])\b', clean_value)
	        if standalone:
	            return standalone.group(1), True

	        for tipo in self._VALID_TYPES:
	            if tipo in clean_value:
	                return tipo, True

	        # Default to type A.
	        return 'A', False

	    def _clean_currency(self, value: str) -> str:
	        """Normalise a monetary string to a plain 'NNN.NN' number string.

	        Returns '0.00' when ``value`` is empty or cannot be parsed.
	        """
	        amount = self._parse_amount(value)
	        return '0.00' if amount is None else f"{amount:.2f}"

	    def _parse_amount(self, value) -> Optional[float]:
	        """Parse a monetary string; return None when it is not a finite number.

	        '$' and whitespace are dropped. When both '.' and ',' are present the
	        right-most one is taken as the decimal separator and the other is
	        discarded; a lone ',' is taken as the decimal separator.
	        """
	        if not value:
	            return None

	        text = re.sub(r'[$\s]', '', str(value))

	        if '.' in text and ',' in text:
	            if text.rindex('.') > text.rindex(','):
	                # Dot is the decimal separator, commas group thousands.
	                text = text.replace(',', '')
	            else:
	                # Comma is the decimal separator, dots group thousands.
	                text = text.replace('.', '').replace(',', '.')
	        elif ',' in text:
	            text = text.replace(',', '.')

	        try:
	            amount = float(text)
	        except ValueError:
	            return None

	        # float() also accepts "nan" / "inf", which are not amounts.
	        if amount != amount or amount in (float('inf'), float('-inf')):
	            return None
	        return amount

	    def _resolved_total(self, all_values: Dict, ocr_text: Optional[List[str]]) -> float:
	        """Return the corrected TOTAL as a float (0.0 when it is unavailable).

	        Goes through ``_validate_total`` so that a total recovered from OCR is
	        also used when deriving IVA and NETO.
	        """
	        raw_total = (all_values or {}).get('TOTAL', '')
	        corrected, _ = self._validate_total(raw_total, all_values, ocr_text)
	        return float(corrected)

	    def _vat_rate(self, all_values: Dict) -> float:
	        """Return the VAT rate as a fraction (0.21 or 0.105) from ALICUOTA."""
	        raw_aliquot = (all_values or {}).get('ALICUOTA') or ''
	        aliquot, _ = self._validate_alicuota(raw_aliquot, all_values, None)
	        return float(aliquot) / 100

	    def _find_cuit_in_ocr(self, ocr_text: Optional[List[str]]) -> Optional[str]:
	        """Search the OCR words for a tax id; return it formatted, or None."""
	        if not ocr_text:
	            return None

	        ocr_combined = ' '.join(ocr_text)

	        match = re.search(self._CUIT_PATTERN, ocr_combined)
	        if match:
	            return match.group(0)

	        # Same number written without dashes: 11 consecutive digits.
	        match = re.search(self._CUIT_NO_DASH_PATTERN, ocr_combined)
	        if match:
	            cuit = match.group(0)
	            return f"{cuit[:2]}-{cuit[2:10]}-{cuit[10]}"

	        return None

	    def _find_max_decimal_in_ocr(self, ocr_text: List[str]) -> Optional[str]:
	        """Return the largest positive decimal number among the OCR words.

	        Only words containing '.' or ',' are considered. Returns None when no
	        such word parses to a value greater than zero.
	        """
	        amounts = (
	            float(self._clean_currency(word))
	            for word in ocr_text
	            if '.' in word or ',' in word
	        )
	        max_value = max(amounts, default=0.0)
	        return f"{max_value:.2f}" if max_value > 0.0 else None