|
|
|
|
|
""" |
|
|
Validación y corrección de etiquetas extraídas de facturas |
|
|
""" |
|
|
|
|
|
import re |
|
|
from datetime import datetime |
|
|
from typing import Dict, List, Tuple, Optional |
|
|
|
|
|
|
|
|
class InvoiceValidator:
    """Validate and correct labelled values extracted from invoices.

    Each required label has a dedicated validator that returns a
    ``(corrected_value, is_valid)`` pair.  When a value is missing or cannot
    be parsed, the validator returns a best-effort replacement (a default, a
    value recovered from the OCR text, or an estimate derived from other
    fields) together with ``is_valid=False`` so the caller can flag it.
    """

    # Labels emitted in the corrected table, in output order.
    REQUIRED_LABELS = [
        'PROVEEDOR_RAZON_SOCIAL',
        'PROVEEDOR_CUIT',
        'COMPROBANTE_NUMERO',
        'FECHA',
        'JURISDICCION_GASTO',
        'TIPO',
        'CONCEPTO_GASTO',
        'ALICUOTA',
        'IVA',
        'NETO',
        'TOTAL'
    ]

    # Placeholder values returned when nothing usable could be found.
    _DEFAULT_CUIT = '00-00000000-0'
    _DEFAULT_COMPROBANTE = '00000-00000000'
    _DEFAULT_AMOUNT = '0.00'

    # Digit boundaries keep the patterns from matching inside longer numbers.
    _COMPROBANTE_RE = re.compile(r'(?<!\d)\d{4,5}-\d{8}(?!\d)')
    _CUIT_OCR_RE = re.compile(r'(?<!\d)\d{2}-\d{8}-\d(?!\d)')
    _CUIT_PLAIN_RE = re.compile(r'\b\d{11}\b')
    # Applied to a string that already contains only digits and dashes.
    _CUIT_LOOSE_RE = re.compile(r'\d{2}-\d{8}-\d{1}')
    # A document-type letter only counts when it is a token of its own.
    _TIPO_RE = re.compile(r'(?<!\w)[ABCMET](?!\w)')
    _NUMBER_RE = re.compile(r'\d+(?:\.\d+)?')

    # (pattern, (day, month, year) group indexes).
    _DATE_PATTERNS = (
        (re.compile(r'(?<!\d)(\d{4})[/-](\d{1,2})[/-](\d{1,2})(?!\d)'), (2, 1, 0)),
        (re.compile(r'(?<!\d)(\d{1,2})[/-](\d{1,2})[/-](\d{4})(?!\d)'), (0, 1, 2)),
        (re.compile(r'(?<!\d)(\d{1,2})[/-](\d{1,2})[/-](\d{2})(?!\d)'), (0, 1, 2)),
    )

    def __init__(self):
        """Create a validator with an empty error map."""
        self.validation_errors = {}

    def validate_and_correct(self, ner_results: List[Dict], ocr_text: Optional[List[str]] = None) -> Tuple[List[List], Dict]:
        """Validate and correct the NER results for one invoice.

        Args:
            ner_results: List of dicts with an 'etiqueta' (label) key and a
                'valor' (value) key.  A missing or ``None`` value is treated
                as an empty string.  If a label appears more than once, the
                last occurrence wins.
            ocr_text: Optional list of words produced by OCR, used as a
                fallback source for some fields.

        Returns:
            tuple: ``(corrected_table, validation_errors)``
                - corrected_table: list of ``[label, value]`` rows, one per
                  entry of ``REQUIRED_LABELS`` and in that order.
                - validation_errors: dict mapping each label that failed
                  validation to the replacement value that was used.
        """
        ner_dict = {}
        for item in ner_results or []:
            raw_value = item.get('valor')
            # Validators expect strings; None would crash .strip() / re calls.
            ner_dict[item['etiqueta']] = '' if raw_value is None else str(raw_value)

        print("\n=== VALIDACIÓN ===")
        print(f"Etiquetas detectadas: {list(ner_dict.keys())}")
        print(f"Total palabras OCR: {len(ocr_text) if ocr_text else 0}")

        # Errors are per call: drop whatever a previous run left behind.
        self.validation_errors = {}
        corrected_table = []

        for label in self.REQUIRED_LABELS:
            value = ner_dict.get(label, '')
            corrected_value, is_valid = self._validate_label(label, value, ner_dict, ocr_text)
            corrected_table.append([label, corrected_value])

            print(f"{label}: '{value}' -> '{corrected_value}' (válido: {is_valid})")

            if not is_valid:
                self.validation_errors[label] = corrected_value

        print(f"Total campos inválidos: {len(self.validation_errors)}")
        print("==================\n")

        return corrected_table, self.validation_errors

    def _validate_label(self, label: str, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
        """Dispatch a value to the validator registered for its label.

        Args:
            label: Label name.
            value: Value to validate.
            all_values: Dict with every raw NER value, keyed by label.
            ocr_text: List of OCR words, or None.

        Returns:
            tuple: ``(corrected_value, is_valid)``.  Labels without a
            registered validator are returned unchanged and valid.
        """
        validators = {
            'ALICUOTA': self._validate_alicuota,
            'COMPROBANTE_NUMERO': self._validate_comprobante,
            'CONCEPTO_GASTO': self._validate_concepto_gasto,
            'FECHA': self._validate_fecha,
            'IVA': self._validate_iva,
            'JURISDICCION_GASTO': self._validate_jurisdiccion,
            'NETO': self._validate_neto,
            'PROVEEDOR_CUIT': self._validate_cuit,
            'PROVEEDOR_RAZON_SOCIAL': self._validate_razon_social,
            'TIPO': self._validate_tipo,
            'TOTAL': self._validate_total
        }

        validator = validators.get(label)
        if validator is None:
            return value, True
        return validator('' if value is None else value, all_values, ocr_text)

    def _validate_alicuota(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
        """Validate ALICUOTA: the rate must be 21 or 10.5.

        The numbers found in the text are compared numerically, so "21%",
        "21,00" and "0.21" are accepted while "1210.00" or "2021" are not.
        When both rates appear, 21 takes precedence.

        Returns:
            ``('21.00', True)``, ``('10.5', True)``, or the default
            ``('21.00', False)`` when no known rate is present.
        """
        normalized = (value or '').strip().replace(',', '.')
        found = {float(token) for token in self._NUMBER_RE.findall(normalized)}

        if found & {21.0, 0.21}:
            return '21.00', True
        if found & {10.5, 0.105}:
            return '10.5', True
        return '21.00', False

    def _validate_comprobante(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
        """Validate COMPROBANTE_NUMERO: 4-5 digits, a dash, then 8 digits.

        A well-formed number found in the text is returned as is.  Otherwise
        the first two digit runs are zero-padded to 5 and 8 digits; that
        rebuilt number is valid only if no digits had to be cut off.
        """
        text = value or ''

        match = self._COMPROBANTE_RE.search(text)
        if match:
            return match.group(0), True

        digit_runs = re.findall(r'\d+', text)
        if len(digit_runs) < 2:
            return self._DEFAULT_COMPROBANTE, False

        point_of_sale, number = digit_runs[0], digit_runs[1]
        rebuilt = f"{point_of_sale.zfill(5)[:5]}-{number.zfill(8)[:8]}"
        # Truncation means the source held something other than a voucher number.
        lossless = len(point_of_sale) <= 5 and len(number) <= 8
        return rebuilt, lossless

    def _validate_concepto_gasto(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
        """Validate CONCEPTO_GASTO: any text is valid; it is only stripped."""
        return value.strip() if value else '', True

    def _validate_fecha(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
        """Validate FECHA and return it as day/month/year.

        Accepted inputs use '/' or '-' as separator: ``yyyy-mm-dd``,
        ``dd/mm/yyyy`` and ``dd/mm/yy`` (two-digit years are prefixed with
        '20').  Day and month keep the digits found in the text.

        Returns:
            ``(date, True)`` for a real calendar date, otherwise today's date
            with ``False``.
        """
        if value:
            for regex, (day_idx, month_idx, year_idx) in self._DATE_PATTERNS:
                match = regex.search(value)
                if not match:
                    continue

                parts = match.groups()
                year = parts[year_idx]
                if len(year) == 2:
                    year = '20' + year
                candidate = f"{parts[day_idx]}/{parts[month_idx]}/{year}"

                try:
                    # Rejects impossible dates such as 31/02.
                    datetime.strptime(candidate, '%d/%m/%Y')
                except ValueError:
                    continue
                return candidate, True

        return datetime.now().strftime('%d/%m/%Y'), False

    def _validate_total(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
        """Validate TOTAL: must be a number; currency symbols are removed.

        When the value is missing, the largest decimal number in the OCR text
        is used as a guess and flagged as invalid.
        """
        if value:
            return self._validate_amount(value)

        if ocr_text:
            largest = self._find_max_decimal_in_ocr(ocr_text)
            if largest:
                return largest, False
        return self._DEFAULT_AMOUNT, False

    def _validate_iva(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
        """Validate IVA: must be a number.

        When missing, it is estimated as ``TOTAL * rate / (100 + rate)``
        using the detected ALICUOTA (21 when unknown) and flagged as invalid.
        """
        if value:
            return self._validate_amount(value)

        total, rate = self._total_and_rate(all_values)
        estimate = round(total * rate / (100 + rate), 2)
        return f"{estimate:.2f}", False

    def _validate_neto(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
        """Validate NETO: must be a number.

        When missing, it is estimated as ``TOTAL * 100 / (100 + rate)``
        using the detected ALICUOTA (21 when unknown) and flagged as invalid.
        """
        if value:
            return self._validate_amount(value)

        total, rate = self._total_and_rate(all_values)
        estimate = round(total * 100 / (100 + rate), 2)
        return f"{estimate:.2f}", False

    def _validate_jurisdiccion(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
        """Validate JURISDICCION_GASTO: any text is valid; it is only stripped."""
        return value.strip() if value else '', True

    def _validate_cuit(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
        """Validate PROVEEDOR_CUIT: format ##-########-#.

        The value is accepted if it contains a dashed CUIT or at least 11
        digits.  Otherwise a CUIT is searched for in the OCR text and, if
        found, returned flagged as invalid.
        """
        if value:
            digits_and_dashes = re.sub(r'[^\d\-]', '', value)

            match = self._CUIT_LOOSE_RE.search(digits_and_dashes)
            if match:
                return match.group(0), True

            digits = digits_and_dashes.replace('-', '')
            if len(digits) >= 11:
                # NOTE(review): digits past the 11th are discarded and the
                # result is still reported as valid - confirm this is wanted.
                return self._format_cuit(digits), True

        recovered = self._find_cuit_in_ocr(ocr_text)
        if recovered:
            return recovered, False
        return self._DEFAULT_CUIT, False

    def _validate_razon_social(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
        """Validate PROVEEDOR_RAZON_SOCIAL: any text is valid; it is only stripped."""
        return value.strip() if value else '', True

    def _validate_tipo(self, value: str, all_values: Dict, ocr_text: Optional[List[str]]) -> Tuple[str, bool]:
        """Validate TIPO: must be one of A, B, C, M, E or T.

        The word "factura" is removed and the first standalone type letter is
        returned, so letters inside other words (the "A" in "COMPROBANTE")
        are ignored.  Defaults to ``('A', False)``.
        """
        cleaned = re.sub(r'factura\s*', '', value or '', flags=re.IGNORECASE).strip().upper()

        match = self._TIPO_RE.search(cleaned)
        if match:
            return match.group(0), True
        return 'A', False

    def _validate_amount(self, value: str) -> Tuple[str, bool]:
        """Return the amount formatted with two decimals, or ('0.00', False)."""
        amount = self._parse_amount(value)
        if amount is None:
            return self._DEFAULT_AMOUNT, False
        return f"{amount:.2f}", True

    def _total_and_rate(self, all_values: Dict) -> Tuple[float, float]:
        """Read TOTAL and ALICUOTA from the raw values for IVA/NETO estimates.

        An unparseable total counts as 0.0 and an unknown rate as 21.
        """
        total = self._parse_amount(all_values.get('TOTAL'))
        rate_text, _ = self._validate_alicuota(all_values.get('ALICUOTA') or '', all_values, None)
        return (0.0 if total is None else total), float(rate_text)

    def _format_cuit(self, digits: str) -> str:
        """Insert the dashes into the first 11 characters of a digit string."""
        return f"{digits[:2]}-{digits[2:10]}-{digits[10]}"

    def _find_cuit_in_ocr(self, ocr_text: Optional[List[str]]) -> Optional[str]:
        """Return the first CUIT found in the OCR words, dashed, or None."""
        if not ocr_text:
            return None

        joined = ' '.join(ocr_text)

        match = self._CUIT_OCR_RE.search(joined)
        if match:
            return match.group(0)

        match = self._CUIT_PLAIN_RE.search(joined)
        if match:
            return self._format_cuit(match.group(0))
        return None

    def _parse_amount(self, value: Optional[str]) -> Optional[float]:
        """Parse a monetary string into a float; None when it is not a number.

        '$' and whitespace are removed.  When both '.' and ',' are present,
        the right-most one is the decimal separator; a lone ',' is treated as
        the decimal separator.
        """
        if not value:
            return None

        text = re.sub(r'[$\s]', '', str(value))

        if '.' in text and ',' in text:
            if text.rindex('.') > text.rindex(','):
                text = text.replace(',', '')
            else:
                text = text.replace('.', '').replace(',', '.')
        elif ',' in text:
            text = text.replace(',', '.')

        try:
            amount = float(text)
        except ValueError:
            return None

        # float() also accepts "nan" and "inf", which are not amounts.
        if amount != amount or amount in (float('inf'), float('-inf')):
            return None
        return amount

    def _clean_currency(self, value: str) -> str:
        """Clean a monetary value: drop '$', normalise decimals.

        Returns:
            The amount with two decimals, or '0.00' when it cannot be parsed.
        """
        amount = self._parse_amount(value)
        return self._DEFAULT_AMOUNT if amount is None else f"{amount:.2f}"

    def _find_max_decimal_in_ocr(self, ocr_text: List[str]) -> Optional[str]:
        """Find the largest positive number written with '.' or ',' in the OCR words.

        Returns:
            The number formatted with two decimals, or None if there is none.
        """
        largest = None

        for word in ocr_text or ():
            if '.' not in word and ',' not in word:
                continue
            amount = self._parse_amount(word)
            if amount is None or amount <= 0:
                continue
            if largest is None or amount > largest:
                largest = amount

        return None if largest is None else f"{largest:.2f}"