StructuredExtractort

Sleeping

App Files Files Community

Danielfonseca1212 committed on Mar 11

Commit

334c181

verified ·

1 Parent(s): 5a9f09e

Create extractor.py

Browse files

Files changed (1) hide show

extractor.py +297 -0

extractor.py ADDED Viewed

	@@ -0,0 +1,297 @@

+# extractor.py — Structured Output Engine
+# OpenAI Function Calling + Pydantic v2 + Dynamic JSON Schema
+"""
+Demonstra domínio de produção de:
+- OpenAI function calling (tool_choice forçando a chamada da função de extração)
+- Pydantic v2 (opcional): apenas a disponibilidade é detectada; a validação de schema ainda não é aplicada
+- JSON Schema gerado dinamicamente pelo usuário
+- Retry automático com error feedback ao LLM
+- Extração de múltiplos tipos: contrato, notícia, currículo, invoice, custom
+"""
+import json
+import re
+from typing import Any
+from openai import OpenAI
# ── PRESET SCHEMAS ────────────────────────────────────────────
# The builders below always return NEW dicts, so no two schema nodes share
# state; keyword order is preserved, which fixes the key order of each node.
def _typed(kind, **extras):
    """Return a leaf node ``{"type": kind, **extras}``."""
    return {"type": kind, **extras}


def _string(**extras):
    """Return a ``string`` node, optionally with description/enum/default."""
    return _typed("string", **extras)


def _number(**extras):
    """Return a ``number`` node, optionally with extra keywords."""
    return _typed("number", **extras)


def _array(items, **extras):
    """Return an ``array`` node whose elements follow the *items* node."""
    return {"type": "array", "items": items, **extras}


def _strings(**extras):
    """Return an array-of-strings node."""
    return _array(_string(), **extras)


def _object(properties, required=None):
    """Return an ``object`` node; ``required`` is emitted only when given."""
    node = {"type": "object", "properties": properties}
    if required is not None:
        node["required"] = required
    return node


def _string_record(*names, required=None):
    """Return an object node whose properties are all plain strings."""
    return _object({name: _string() for name in names}, required)


def _preset(description, properties, required):
    """Return one preset entry: a description plus its root object schema."""
    return {"description": description, "schema": _object(properties, required)}


# Maps the preset's display name to its description and JSON Schema.
PRESET_SCHEMAS = {
    "Contrato Legal": _preset(
        "Extrai partes, objeto, valor, prazo e obrigações de contratos.",
        {
            "partes": _array(_object(
                {
                    "nome": _string(),
                    "papel": _string(enum=["contratante", "contratado", "fiador", "outro"]),
                },
                required=["nome", "papel"],
            )),
            "objeto": _string(description="O que é contratado"),
            "valor_total": _number(description="Valor em reais"),
            "moeda": _string(default="BRL"),
            "data_inicio": _string(description="YYYY-MM-DD ou descrição"),
            "data_fim": _string(description="YYYY-MM-DD ou descrição"),
            "obrigacoes_principais": _strings(),
            "clausulas_especiais": _strings(),
            "jurisdicao": _string(),
            "assinado": _typed("boolean"),
        },
        required=["partes", "objeto"],
    ),
    "Notícia / Artigo": _preset(
        "Extrai entidades, fatos e metadados de textos jornalísticos.",
        {
            "titulo": _string(),
            "data": _string(),
            "autor": _string(),
            "resumo": _string(description="1-2 frases"),
            "pessoas": _strings(),
            "organizacoes": _strings(),
            "locais": _strings(),
            "fatos_chave": _strings(),
            "sentimento": _string(enum=["positivo", "negativo", "neutro", "misto"]),
            "categorias": _array(_string(
                enum=["política", "economia", "tecnologia", "saúde", "esporte", "cultura", "outro"],
            )),
            "dados_numericos": _strings(
                description="Números, percentuais, valores mencionados",
            ),
        },
        required=["titulo", "resumo", "fatos_chave"],
    ),
    "Currículo / CV": _preset(
        "Extrai perfil profissional, experiências e habilidades.",
        {
            "nome": _string(),
            "email": _string(),
            "telefone": _string(),
            "cargo_atual": _string(),
            "resumo_profissional": _string(),
            "experiencias": _array(_string_record(
                "empresa", "cargo", "periodo", "descricao",
                required=["empresa", "cargo"],
            )),
            "formacao": _array(_string_record("instituicao", "curso", "ano")),
            "habilidades_tecnicas": _strings(),
            "idiomas": _strings(),
            "anos_experiencia": _typed("integer"),
        },
        required=["nome", "experiencias"],
    ),
    "Invoice / Nota Fiscal": _preset(
        "Extrai dados financeiros e itens de notas fiscais e invoices.",
        {
            "numero_documento": _string(),
            "data_emissao": _string(),
            "data_vencimento": _string(),
            "emitente": _string_record("nome", "cnpj", "endereco"),
            "destinatario": _string_record("nome", "cnpj", "endereco"),
            "itens": _array(_object(
                {
                    "descricao": _string(),
                    "quantidade": _number(),
                    "valor_unit": _number(),
                    "valor_total": _number(),
                },
                required=["descricao", "valor_total"],
            )),
            "subtotal": _number(),
            "impostos": _number(),
            "total": _number(),
            "moeda": _string(default="BRL"),
            "forma_pagamento": _string(),
            "observacoes": _string(),
        },
        required=["itens", "total"],
    ),
    "Artigo Científico": _preset(
        "Extrai metadados, metodologia e resultados de papers.",
        {
            "titulo": _string(),
            "autores": _strings(),
            "venue": _string(description="Conferência ou journal"),
            "ano": _typed("integer"),
            "abstract": _string(),
            "problema": _string(description="Problema que o paper resolve"),
            "metodologia": _string(),
            "modelo_proposto": _string(),
            "datasets": _strings(),
            "metricas": _array(_string_record("nome", "valor", "dataset")),
            "contribuicoes": _strings(),
            "limitacoes": _strings(),
            "palavras_chave": _strings(),
        },
        required=["titulo", "autores", "problema"],
    ),
}
# ── SYSTEM PROMPT ─────────────────────────────────────────────
# Sent as the system message of every extraction request. The text is
# runtime data for the model and is deliberately left untranslated.
SYSTEM = (
    "Você é um extrator especialista de informações estruturadas.\n"
    "Sua tarefa: extrair TODAS as informações relevantes do texto fornecido,\n"
    "preenchendo o schema JSON com máxima precisão e completude.\n"
    "Regras:\n"
    "- Extraia apenas o que está explicitamente no texto\n"
    "- Use null para campos ausentes (não invente dados)\n"
    "- Para listas, extraia todos os itens encontrados\n"
    "- Preserve valores numéricos exatamente como aparecem\n"
    "- Datas: converta para YYYY-MM-DD quando possível\n"
    "- Se o campo for ambíguo, escolha a interpretação mais óbvia"
)
# ── ENGINE ────────────────────────────────────────────────────
class StructuredExtractor:
    """Extract structured data from free text via OpenAI function calling.

    The caller's JSON Schema is exposed to the model as the parameters of a
    single tool, and the request forces the model to call that tool. The
    tool-call arguments are parsed as JSON and returned.

    Attributes:
        client: OpenAI client used for every request.
        model: Chat model name sent with each request.
        max_tokens: Completion token limit sent with each request.
    """

    def __init__(self, openai_api_key: str, model: str = "gpt-4o-mini",
                 max_tokens: int = 1500):
        """Create the API client.

        Args:
            openai_api_key: Key passed to the OpenAI client.
            model: Chat model to use; defaults to the previously hard-coded one.
            max_tokens: Completion limit; defaults to the previously
                hard-coded value. Raise it for schemas with large outputs,
                since a truncated completion yields invalid JSON.
        """
        self.client = OpenAI(api_key=openai_api_key)
        self.model = model
        self.max_tokens = max_tokens

    @staticmethod
    def _tool_name(schema_name: str) -> str:
        """Turn a human-readable schema name into a valid function name.

        Accents are folded to ASCII, the name is lower-cased, every run of
        characters outside ``[a-z0-9_-]`` collapses into one underscore and
        the result is capped at 64 characters (the OpenAI function-name
        rule). Falls back to ``"extracted_data"`` when nothing usable is left.
        """
        import unicodedata  # function-scope import; the file header is untouched

        ascii_name = (
            unicodedata.normalize("NFKD", schema_name)
            .encode("ascii", "ignore")
            .decode("ascii")
        )
        slug = re.sub(r"[^a-z0-9_-]+", "_", ascii_name.lower()).strip("_")
        return slug[:64].rstrip("_") or "extracted_data"

    @staticmethod
    def _parse_tool_call(resp: Any) -> tuple:
        """Return ``(data, raw_json)`` from the first tool call of *resp*.

        Raises:
            ValueError: If the response carries no tool call, or its
                arguments are not a JSON object.
        """
        choices = resp.choices
        tool_calls = choices[0].message.tool_calls if choices else None
        if not tool_calls:
            raise ValueError("Resposta do modelo sem tool call")
        raw_json = tool_calls[0].function.arguments
        try:
            data = json.loads(raw_json)
        except (json.JSONDecodeError, TypeError) as e:
            raise ValueError(f"JSON inválido: {e}") from e
        if not isinstance(data, dict):
            raise ValueError("JSON inválido: os argumentos não são um objeto")
        return data, raw_json

    @staticmethod
    def _validation_note() -> str:
        """Report whether pydantic can be imported.

        NOTE: this is only an availability probe. The extracted data is NOT
        validated against the schema; the two return values are kept as they
        were so existing callers that display them keep working.
        """
        try:
            from pydantic import create_model, ValidationError  # noqa: F401
        except ImportError:
            return "pydantic_unavailable"
        return "pydantic_ok"

    def extract(self, text: str, schema: dict,
                schema_name: str = "extracted_data",
                max_retries: int = 2) -> dict:
        """Extract data from *text* following *schema*.

        Args:
            text: Source text to extract from.
            schema: JSON Schema used as the tool's ``parameters``.
            schema_name: Human-readable name; it appears in the tool
                description and, sanitized, becomes the function name.
            max_retries: Extra attempts after the first one. Negative values
                are treated as 0, so at least one attempt is always made.

        Returns:
            A dict with the keys ``data`` (parsed arguments), ``tokens``
            (total tokens of the successful call, 0 if usage is missing),
            ``attempts`` (1-based attempt that succeeded), ``method``
            (always ``"function_calling"``), ``validation`` (pydantic
            availability note) and ``raw_json`` (unparsed arguments string).

        Raises:
            RuntimeError: If every attempt produced unusable model output
                (no tool call or invalid JSON).
            Exception: Whatever the API call raised on the final attempt.
        """
        tool_name = self._tool_name(schema_name)
        tool = {
            "type": "function",
            "function": {
                "name":        tool_name,
                "description": f"Extrai {schema_name} do texto fornecido.",
                "parameters":  schema,
            },
        }
        messages = [
            {"role": "system", "content": SYSTEM},
            {"role": "user",   "content": f"Texto para extração:\n\n{text}"},
        ]
        total_attempts = max(0, max_retries) + 1
        last_error = None
        for attempt in range(1, total_attempts + 1):
            if last_error:
                # Feed the previous failure back so the model can correct it.
                messages.append({
                    "role": "user",
                    "content": f"Erro na tentativa anterior: {last_error}. "
                               f"Corrija e tente novamente respeitando o schema.",
                })
            try:
                resp = self.client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    tools=[tool],
                    # Force the model to call our single extraction tool.
                    tool_choice={"type": "function",
                                 "function": {"name": tool_name}},
                    temperature=0.0,
                    max_tokens=self.max_tokens,
                )
            except Exception as e:
                # Any API failure is retried; the last one reaches the caller
                # unchanged so its type and traceback are preserved.
                if attempt == total_attempts:
                    raise
                last_error = str(e)
                continue
            try:
                data, raw_json = self._parse_tool_call(resp)
            except ValueError as e:
                # Unusable model output: retry, then fall through to the
                # RuntimeError below once attempts are exhausted.
                last_error = str(e)
                continue
            usage = resp.usage
            return {
                "data":       data,
                "tokens":     usage.total_tokens if usage is not None else 0,
                "attempts":   attempt,
                "method":     "function_calling",
                "validation": self._validation_note(),
                "raw_json":   raw_json,
            }
        raise RuntimeError(f"Falha após {total_attempts} tentativas: {last_error}")

    def extract_with_custom_schema(self, text: str, schema_json_str: str) -> dict:
        """Parse a user-supplied JSON Schema string and run the extraction.

        Args:
            text: Source text to extract from.
            schema_json_str: JSON text that must decode to an object.

        Returns:
            The same result dict as ``extract``.

        Raises:
            ValueError: If the string is not valid JSON or does not decode
                to a JSON object.
        """
        try:
            schema = json.loads(schema_json_str)
        except json.JSONDecodeError as e:
            raise ValueError(f"Schema JSON inválido: {e}") from e
        if not isinstance(schema, dict):
            # Reject lists/scalars here instead of sending them to the API.
            raise ValueError("Schema JSON inválido: o schema deve ser um objeto JSON")
        return self.extract(text, schema, schema_name="custom_extraction")