Spaces:

NextGenTech
/

AutomatedSemanticDiscovery

Sleeping

App Files Community

GaetanoParente committed on 7 days ago

Commit

b70d82f

1 Parent(s): 5ca1355

avvio refactoring

Browse files

Files changed (11) hide show

api.py +88 -40
app.py +105 -97
{data/ontologie_raw/ARCO → ontology}/ArCo.owl +0 -0
{data/ontologie_raw/ARCO → ontology}/arco.owl +0 -0
{data/ontologie_raw/ARCO → ontology}/context-description.owl +0 -0
{data/ontologie_raw/ARCO → ontology}/core.owl +0 -0
{data/ontologie_raw/ARCO → ontology}/location.owl +0 -0
requirements.txt +5 -3
src/extraction/extractor.py +187 -201
src/utils/build_schema.py +172 -131
src/validation/validator.py +60 -44

api.py CHANGED Viewed

@@ -11,6 +11,7 @@ from src.extraction.extractor import NeuroSymbolicExtractor
 from src.validation.validator import SemanticValidator
 from src.graph.graph_loader import KnowledgeGraphPersister
 from src.graph.entity_resolver import EntityResolver
 # --- GESTORE DEGLI STATI GLOBALI ---
 # Usiamo un dizionario globale per tenere in RAM i pesi dei modelli.
@@ -24,8 +25,7 @@ async def lifespan(app: FastAPI):
     ml_models["splitter"] = ActivaSemanticSplitter(model_name="all-MiniLM-L6-v2")
-    schema_path = os.path.join("data", "schemas", "ARCO_schema.json")
-    ml_models["extractor"] = NeuroSymbolicExtractor(model_name="llama3", schema_path=schema_path)
     ml_models["persister"] = KnowledgeGraphPersister()
     ml_models["resolver"] = EntityResolver(neo4j_driver=ml_models["persister"].driver, similarity_threshold=0.85)
@@ -33,6 +33,24 @@ async def lifespan(app: FastAPI):
     print("✅ Modelli caricati e pronti a ricevere richieste!")
     yield # Qui l'API inizia ad ascoltare le chiamate in ingresso
     # Chiusura pulita delle connessioni. Evita query appese su Neo4j quando killiamo il container.
@@ -51,7 +69,26 @@ app = FastAPI(
 class DiscoveryRequest(BaseModel):
     documentText: str
-@app.post("/api/discover")
 def run_discovery(payload: DiscoveryRequest):
     start_time = time.time()
     raw_text = payload.documentText
@@ -102,56 +139,67 @@ def run_discovery(payload: DiscoveryRequest):
     # --- FASE 2.2: VALIDATION ---
     # Prima di salvare nel DB, verifico con SHACL
     # se l'LLM ha generato allucinazioni o violato i vincoli dell'ontologia.
-    is_valid, report, _ = validator.validate_batch(entities_to_save, all_triples)
-    if not is_valid:
-        print("\n❌ [SHACL VALIDATION FAILED] Rilevate entità o relazioni non conformi all'ontologia:")
-        print(report)
-        print("-" * 60)
-    else:
-        print("\n✅ [SHACL VALIDATION SUCCESS] Tutte le triple ed entità rispettano i vincoli.")
     # --- FASE 3: PERSISTENCE (Neo4j) ---
     try:
-        persister.save_entities_and_triples(entities_to_save, all_triples)
     except Exception as e:
         print(f"⚠️ Errore salvataggio Neo4j: {e}")
     # Preparazione payload di risposta
     graph_data = []
-    for t in all_triples:
-        subj = getattr(t, 'subject', t[0] if isinstance(t, tuple) else str(t))
-        pred = getattr(t, 'predicate', t[1] if isinstance(t, tuple) else '')
-        obj = getattr(t, 'object', t[2] if isinstance(t, tuple) else '')
-        if isinstance(t, tuple) and len(t) > 3:
-            conf = t[3]
-        else:
-            conf = getattr(t, 'confidence', 1.0)
-        subj_str = str(subj)
-        pred_str = str(pred)
-        obj_str = str(obj)
         # Genero un ID stabile per facilitare il rendering dei nodi lato client
         node_id = hashlib.md5(subj_str.encode('utf-8')).hexdigest()
-        graph_data.append({
-            "start_node_id": node_id,
-            "start_node_label": subj_str,
-            "relationship_type": pred_str,
-            "end_node_label": obj_str,
-            "confidence": float(conf)
-        })
-    return {
-        "status": "success",
-        "message": "Estrazione semantica completata",
-        "execution_time_seconds": round(time.time() - start_time, 2),
-        "chunks_processed": len(chunks),
-        "triples_extracted": len(graph_data),
-        "shacl_valid": is_valid,
-        "graph_data": graph_data
-    }
 if __name__ == "__main__":
     uvicorn.run("api:app", host="0.0.0.0", port=5000, reload=True)

 from src.validation.validator import SemanticValidator
 from src.graph.graph_loader import KnowledgeGraphPersister
 from src.graph.entity_resolver import EntityResolver
+from pymongo import MongoClient
 # --- GESTORE DEGLI STATI GLOBALI ---
 # Usiamo un dizionario globale per tenere in RAM i pesi dei modelli.
     ml_models["splitter"] = ActivaSemanticSplitter(model_name="all-MiniLM-L6-v2")
+    ml_models["extractor"] = NeuroSymbolicExtractor(index_path="ontology/domain_index.json")
     ml_models["persister"] = KnowledgeGraphPersister()
     ml_models["resolver"] = EntityResolver(neo4j_driver=ml_models["persister"].driver, similarity_threshold=0.85)
     print("✅ Modelli caricati e pronti a ricevere richieste!")
+    # Setup connessione MongoDB per i log degli scarti
+    mongo_ur = os.getenv("MONGO_URI")
+    mongo_user = os.getenv("MONGO_USER")
+    mongo_pass = os.getenv("MONGO_PASS")
+    if mongo_ur and mongo_user and mongo_pass:
+        try:
+            client = MongoClient(mongo_ur, username=mongo_user, password=mongo_pass)
+            # Creo il database "semantic_discovery" e la collection "rejected_triples"
+            ml_models["mongo_db"] = client["semantic_discovery"]["rejected_triples"]
+            print("✅ Connesso a MongoDB per lo storage delle allucinazioni LLM.")
+        except Exception as e:
+            print(f"⚠️ Errore connessione MongoDB: {e}")
+            ml_models["mongo_db"] = None
+    else:
+        print("⚠️ Credenziali MongoDB mancanti. Gli scarti non verranno tracciati.")
+        ml_models["mongo_db"] = None
     yield # Qui l'API inizia ad ascoltare le chiamate in ingresso
     # Chiusura pulita delle connessioni. Evita query appese su Neo4j quando killiamo il container.
 class DiscoveryRequest(BaseModel):
     documentText: str
+class GraphEdge(BaseModel):
+    start_node_id: str
+    start_node_label: str
+    start_node_type: str
+    relationship_type: str
+    end_node_label: str
+    end_node_type: str
+    evidence: str
+    reasoning: str
+class DiscoveryResponse(BaseModel):
+    status: str
+    message: str
+    execution_time_seconds: float
+    chunks_processed: int
+    triples_extracted: int
+    shacl_valid: bool
+    graph_data: list[GraphEdge]
+@app.post("/api/discover", response_model=DiscoveryResponse)
 def run_discovery(payload: DiscoveryRequest):
     start_time = time.time()
     raw_text = payload.documentText
     # --- FASE 2.2: VALIDATION ---
     # Prima di salvare nel DB, verifico con SHACL
     # se l'LLM ha generato allucinazioni o violato i vincoli dell'ontologia.
+    valid_triples, invalid_triples, report = validator.filter_valid_triples(entities_to_save, all_triples)
+    if invalid_triples:
+        print(f"\n❌ [SHACL FAILED] Scartate {len(invalid_triples)} triple per violazione di Domain/Range.")
+        # Salvataggio sincrono degli scarti su MongoDB (DLQ)
+        if ml_models.get("mongo_db") is not None:
+            try:
+                # Aggiungo un timestamp per rintracciabilità
+                for doc in invalid_triples:
+                    doc["timestamp"] = time.time()
+                ml_models["mongo_db"].insert_many(invalid_triples)
+                print("💾 Triple invalide archiviate su MongoDB.")
+            except Exception as e:
+                print(f"⚠️ Errore scrittura su Mongo: {e}")
+    if len(valid_triples) == len(all_triples) and all_triples:
+        print("\n✅ [SHACL SUCCESS] Tutte le triple rispettano rigorosamente l'ontologia.")
     # --- FASE 3: PERSISTENCE (Neo4j) ---
     try:
+        # Cruciale: passiamo SOLO le valid_triples al database a grafo
+        persister.save_entities_and_triples(entities_to_save, valid_triples)
     except Exception as e:
         print(f"⚠️ Errore salvataggio Neo4j: {e}")
     # Preparazione payload di risposta
     graph_data = []
+    for t in valid_triples:
+        # Pydantic ci garantisce che i campi esistano
+        subj_str = str(t.subject)
+        obj_str = str(t.object)
+        # Formattazione della relazione (es. "a-loc:isLocatedIn" -> "A_LOC_ISLOCATEDIN")
+        # in coerenza con la convenzione Neo4j gestita dal loader
+        pred_str = str(t.predicate).replace(":", "_").replace("-", "_").upper()
         # Genero un ID stabile per facilitare il rendering dei nodi lato client
         node_id = hashlib.md5(subj_str.encode('utf-8')).hexdigest()
+        graph_data.append(GraphEdge(
+            start_node_id=node_id,
+            start_node_label=subj_str,
+            start_node_type=str(t.subject_type),
+            relationship_type=pred_str,
+            end_node_label=obj_str,
+            end_node_type=str(t.object_type),
+            evidence=str(t.evidence),
+            reasoning=str(t.reasoning)
+        ))
+    return DiscoveryResponse(
+        status="success",
+        message="Estrazione semantica completata",
+        execution_time_seconds=round(time.time() - start_time, 2),
+        chunks_processed=len(chunks),
+        triples_extracted=len(graph_data),
+        shacl_valid=len(invalid_triples) == 0, # True se nessuna tripla è stata scartata
+        graph_data=graph_data
+    )
 if __name__ == "__main__":
     uvicorn.run("api:app", host="0.0.0.0", port=5000, reload=True)

app.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import streamlit as st
 import os
 import tempfile
 import pandas as pd
 from neo4j import GraphDatabase
 from pyvis.network import Network
 import streamlit.components.v1 as components
@@ -24,12 +26,13 @@ st.set_page_config(
 )
 def local_css(file_name):
-    with open(file_name, "r") as f:
-        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
 local_css("assets/style.css")
-# --- SESSION STATE MANAGEMENT (In-Memory per HF Spaces) ---
 if 'pipeline_stage' not in st.session_state:
     st.session_state.pipeline_stage = 0
 if 'document_text' not in st.session_state:
@@ -54,8 +57,7 @@ def get_splitter():
 @st.cache_resource
 def get_extractor():
-    schema_path = os.path.join("data", "schemas", "ARCO_schema.json")
-    return NeuroSymbolicExtractor(model_name="llama3", schema_path=schema_path)
 @st.cache_resource(show_spinner="🧩 Inizializzazione Entity Resolver...")
 def get_resolver():
@@ -63,9 +65,12 @@ def get_resolver():
 @st.cache_resource
 def get_validator():
-    return SemanticValidator()
-#carico subito i vari oggetti così da evitare rallentamenti nelle varie fasi della pipeline
 _ = get_splitter()
 _ = get_extractor()
 _ = get_validator()
@@ -91,23 +96,21 @@ st.sidebar.title("⚙️ Configurazione")
 env_uri = os.getenv("NEO4J_URI", "")
 env_user = os.getenv("NEO4J_USER", "neo4j")
 env_password = os.getenv("NEO4J_PASSWORD", "")
-env_hf_token = os.getenv("HF_TOKEN", "")
-st.sidebar.subheader("Backend AI")
-if env_hf_token:
-    st.sidebar.success("✅ HF Token: Configurato da Secrets")
-    hf_token_input = st.sidebar.text_input("Sovrascrivi Token (Opzionale)", type="password", key="hf_token_override")
-    if hf_token_input: os.environ["HF_TOKEN"] = hf_token_input
 else:
-    hf_token_input = st.sidebar.text_input("Inserisci HF Token", type="password")
-    if hf_token_input: os.environ["HF_TOKEN"] = hf_token_input
 st.sidebar.subheader("Knowledge Graph")
-uri = st.sidebar.text_input("URI", value=env_uri)
-user = st.sidebar.text_input("User", value=env_user)
-pwd_placeholder = "✅ Configurato (Lascia vuoto)" if env_password else "Inserisci Password"
-password_input = st.sidebar.text_input("Password", type="password", placeholder=pwd_placeholder)
 password = password_input if password_input else env_password
 driver = None
@@ -119,7 +122,7 @@ if uri and password:
         os.environ["NEO4J_USER"] = user
         os.environ["NEO4J_PASSWORD"] = password
     else:
-        st.sidebar.error("🔴 Errore connessione")
 st.sidebar.divider()
 if st.sidebar.button("🔄 Reset Pipeline", on_click=reset_pipeline):
@@ -127,11 +130,11 @@ if st.sidebar.button("🔄 Reset Pipeline", on_click=reset_pipeline):
 # --- MAIN HEADER ---
 st.title("🧠 Automated Semantic Discovery Prototype")
-st.markdown("**Endpoint per l'ingestion testuale e l'estrazione neuro-simbolica**")
 tab_gen, tab_val, tab_vis = st.tabs([
     "⚙️ 1. Pipeline Generativa",
-    "🔍 2. Validazione (HITL)",
     "🕸️ 3. Esplorazione Grafo"
 ])
@@ -163,27 +166,17 @@ with tab_gen:
         st.markdown(f"### {'✅' if st.session_state.pipeline_stage >= 1 else '1️⃣'} Fase 1: Semantic Chunking")
         with st.expander("ℹ️ Cosa fa questa fase?"):
-            st.write("Segmenta il testo in frammenti coerenti analizzando la similarità semantica vettoriale tra le frasi. " \
-            "A differenza di un taglio rigido per numero di parole, questo approccio garantisce che i concetti non vengano interrotti bruscamente, " \
-            "ottimizzando il contesto per l'LLM.")
         if st.session_state.pipeline_stage >= 1:
             chunks = st.session_state.chunks
-            st.markdown(f"""
-            <div class="success-box">
-                <b>Chunking completato!</b> Generati {len(chunks)} frammenti semantici.
-            </div>
-            """, unsafe_allow_html=True)
-            with st.expander("Vedi dettagli frammenti"):
-                st.json(chunks)
         else:
             if st.button("Avvia Semantic Splitter", type="primary"):
                 with st.spinner("Creazione chunks in corso..."):
                     try:
                         splitter = get_splitter()
                         chunks, _, _ = splitter.create_chunks(input_text, percentile_threshold=90)
-                        # Salvataggio in-memory
                         st.session_state.chunks = chunks
                         st.session_state.pipeline_stage = 1
                         st.rerun()
@@ -193,110 +186,120 @@ with tab_gen:
     st.markdown("⬇️")
     # ==========================
-    # FASE 2: EXTRACTION
     # ==========================
     is_step_b_unlocked = st.session_state.pipeline_stage >= 1
     with st.container():
         color = "white" if is_step_b_unlocked else "gray"
         icon = "✅" if st.session_state.pipeline_stage >= 2 else ("2️⃣" if is_step_b_unlocked else "🔒")
-        st.markdown(f"<h3 style='color:{color}'>{icon} Fase 2: Neuro-Symbolic Extraction</h3>", unsafe_allow_html=True)
         with st.expander("ℹ️ Cosa fa questa fase?"):
-            st.write("Invia i frammenti al Large Language Model (es. Llama 3) per estrarre dinamicamente entità e relazioni. " \
-            "L'approccio Neuro-Simbolico forza l'output del modello a rispettare una struttura dati rigorosa (JSON tipizzato) prima di procedere.")
         if not is_step_b_unlocked:
             st.caption("Completa la Fase 1 per sbloccare l'estrazione.")
         elif st.session_state.pipeline_stage >= 2:
             data = st.session_state.extraction_data
-            st.markdown(f"""
-            <div class="success-box">
-                <b>Estrazione completata!</b> Identificate {len(data['entities'])} entità e {len(data['triples'])} triple.
-            </div>
-            """, unsafe_allow_html=True)
-            with st.expander("Vedi dati estratti"):
-                st.write("Entità Trovate:", data['entities'])
-                st.dataframe(pd.DataFrame(data['triples']), hide_index=True)
         else:
-            if st.button("Avvia Estrazione Ontologica", type="primary"):
-                with st.spinner("Invocazione modello sui frammenti..."):
-                    try:
-                        chunks = st.session_state.chunks
-                        extractor = get_extractor()
-                        all_triples = []
-                        all_entities = []
-                        prog_bar = st.progress(0)
-                        for i, chunk in enumerate(chunks):
-                            chunk_id = f"st_req_chunk_{i+1}"
-                            res = extractor.extract(chunk, source_id=chunk_id)
-                            if res:
-                                if res.triples: all_triples.extend([t.model_dump() for t in res.triples])
-                                if res.entities: all_entities.extend(res.entities)
-                            prog_bar.progress((i+1)/len(chunks))
-                        # Salvataggio in-memory
-                        st.session_state.extraction_data = {"entities": all_entities, "triples": all_triples}
-                        st.session_state.pipeline_stage = 2
-                        st.rerun()
-                    except Exception as e:
-                        st.error(f"Errore: {e}")
     st.markdown("⬇️")
     # ==========================
-    # FASE 3: RESOLUTION & PERSISTENCE
     # ==========================
     is_step_c_unlocked = st.session_state.pipeline_stage >= 2
     with st.container():
         color = "white" if is_step_c_unlocked else "gray"
         icon = "✅" if st.session_state.pipeline_stage >= 3 else ("3️⃣" if is_step_c_unlocked else "🔒")
-        st.markdown(f"<h3 style='color:{color}'>{icon} Fase 3: Resolution, Validation & Graph Population</h3>", unsafe_allow_html=True)
         with st.expander("ℹ️ Cosa fa questa fase?"):
-            st.write("Unisce ed elimina i duplicati delle entità (Entity Resolution) sfruttando i Vector Index di Neo4j e chiamate esterne. " \
-            "Successivamente, applica regole deterministiche (SHACL) per validare le triple estratte e le salva permanentemente nel database a grafo.")
         if not is_step_c_unlocked:
             st.caption("Completa la Fase 2 per procedere.")
         elif st.session_state.pipeline_stage >= 3:
-            st.markdown("""
-            <div class="success-box">
-                <b>Grafo Aggiornato!</b> I dati sono stati validati e caricati su Neo4j.
-            </div>
-            """, unsafe_allow_html=True)
         else:
             if not driver:
                 st.error("⚠️ Connettiti a Neo4j (nella sidebar) per procedere.")
             else:
-                if st.button("Genera e Valida Knowledge Graph", type="primary"):
-                    with st.spinner("Risoluzione entità, validazione SHACL e scrittura..."):
                         try:
                             raw_data = st.session_state.extraction_data
                             all_entities = raw_data.get("entities", [])
-                            all_triples = [GraphTriple(**t) for t in raw_data.get("triples", [])]
                             resolver = get_resolver()
                             resolver.driver = driver
-                            all_entities, all_triples, entities_to_save = resolver.resolve_entities(all_entities, all_triples)
                             validator = get_validator()
-                            is_valid, report, _ = validator.validate_batch(entities_to_save, all_triples)
-                            if not is_valid:
-                                st.markdown(f"""
-                                <div class="warning-box">
-                                    <b>Attenzione:</b> La validazione SHACL ha rilevato violazioni. Guarda il log console per i dettagli.
-                                </div>
-                                """, unsafe_allow_html=True)
-                            persister = KnowledgeGraphPersister()
-                            persister.save_entities_and_triples(entities_to_save, all_triples)
-                            persister.close()
                             st.session_state.pipeline_stage = 3
                             st.rerun()
@@ -304,7 +307,7 @@ with tab_gen:
                             st.error(f"Errore critico: {e}")
 # ==============================================================================
-# TAB 2 & 3: VALIDAZIONE E VISUALIZZAZIONE
 # ==============================================================================
 with tab_val:
     st.header("Curation & Feedback Loop")
@@ -320,9 +323,10 @@ with tab_val:
         RETURN elementId(r) as id,
                COALESCE(s.label, s.name, head(labels(s))) as Soggetto,
                type(r) as Predicato,
-               COALESCE(o.label, o.name, head(labels(o))) as Oggetto,
-               COALESCE(r.confidence, 1.0) as Confidenza
-        ORDER BY Confidenza ASC
         """
         triples_data = run_query(driver, cypher_val)
@@ -330,10 +334,13 @@ with tab_val:
             df = pd.DataFrame(triples_data)
             st.dataframe(df.drop(columns=["id"]), width='stretch', hide_index=True)
         else:
-            st.info("Grafo vuoto.")
     else:
         st.warning("Database non connesso.")
 with tab_vis:
     st.header("Esplorazione Topologica")
     if driver:
@@ -349,6 +356,7 @@ with tab_vis:
                 RETURN COALESCE(s.label, s.name, head(labels(s))) as src,
                        type(r) as rel,
                        COALESCE(o.label, o.name, head(labels(o))) as dst
                 """
                 graph_data = run_query(driver, cypher_vis)

 import streamlit as st
 import os
+import time
 import tempfile
 import pandas as pd
+from pymongo import MongoClient
 from neo4j import GraphDatabase
 from pyvis.network import Network
 import streamlit.components.v1 as components
 )
 def local_css(file_name):
+    if os.path.exists(file_name):
+        with open(file_name, "r") as f:
+            st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
 local_css("assets/style.css")
+# --- SESSION STATE MANAGEMENT ---
 if 'pipeline_stage' not in st.session_state:
     st.session_state.pipeline_stage = 0
 if 'document_text' not in st.session_state:
 @st.cache_resource
 def get_extractor():
+    return NeuroSymbolicExtractor(index_path="ontology/domain_index.json")
 @st.cache_resource(show_spinner="🧩 Inizializzazione Entity Resolver...")
 def get_resolver():
 @st.cache_resource
 def get_validator():
+    return SemanticValidator(
+        ontology_dir="ontology",
+        shapes_file="ontology/shapes/auto_constraints.ttl"
+    )
+# Pre-load dei modelli in memoria
 _ = get_splitter()
 _ = get_extractor()
 _ = get_validator()
 env_uri = os.getenv("NEO4J_URI", "")
 env_user = os.getenv("NEO4J_USER", "neo4j")
 env_password = os.getenv("NEO4J_PASSWORD", "")
+env_google_key = os.getenv("GOOGLE_API_KEY", "")
+st.sidebar.subheader("Backend AI (TDDT)")
+if env_google_key:
+    st.sidebar.success("✅ Google API Key: Configurata")
 else:
+    google_key_input = st.sidebar.text_input("Inserisci GOOGLE_API_KEY", type="password")
+    if google_key_input: os.environ["GOOGLE_API_KEY"] = google_key_input
 st.sidebar.subheader("Knowledge Graph")
+uri = st.sidebar.text_input("URI Neo4j", value=env_uri)
+user = st.sidebar.text_input("User Neo4j", value=env_user)
+pwd_placeholder = "✅ Configurato (Lascia vuoto)" if env_password else "Inserisci Password Neo4j"
+password_input = st.sidebar.text_input("Password Neo4j", type="password", placeholder=pwd_placeholder)
 password = password_input if password_input else env_password
 driver = None
         os.environ["NEO4J_USER"] = user
         os.environ["NEO4J_PASSWORD"] = password
     else:
+        st.sidebar.error("🔴 Errore connessione Neo4j")
 st.sidebar.divider()
 if st.sidebar.button("🔄 Reset Pipeline", on_click=reset_pipeline):
 # --- MAIN HEADER ---
 st.title("🧠 Automated Semantic Discovery Prototype")
+st.markdown("**Type-Driven Domain Traversal (TDDT) & OWL RL Validation**")
 tab_gen, tab_val, tab_vis = st.tabs([
     "⚙️ 1. Pipeline Generativa",
+    "🔍 2. Dati e DLQ",
     "🕸️ 3. Esplorazione Grafo"
 ])
         st.markdown(f"### {'✅' if st.session_state.pipeline_stage >= 1 else '1️⃣'} Fase 1: Semantic Chunking")
         with st.expander("ℹ️ Cosa fa questa fase?"):
+            st.write("Segmenta il testo in frammenti coerenti analizzando la similarità semantica vettoriale tra le frasi.")
         if st.session_state.pipeline_stage >= 1:
             chunks = st.session_state.chunks
+            st.success(f"Chunking completato! Generati {len(chunks)} frammenti semantici.")
         else:
             if st.button("Avvia Semantic Splitter", type="primary"):
                 with st.spinner("Creazione chunks in corso..."):
                     try:
                         splitter = get_splitter()
                         chunks, _, _ = splitter.create_chunks(input_text, percentile_threshold=90)
                         st.session_state.chunks = chunks
                         st.session_state.pipeline_stage = 1
                         st.rerun()
     st.markdown("⬇️")
     # ==========================
+    # FASE 2: EXTRACTION (TDDT)
     # ==========================
     is_step_b_unlocked = st.session_state.pipeline_stage >= 1
     with st.container():
         color = "white" if is_step_b_unlocked else "gray"
         icon = "✅" if st.session_state.pipeline_stage >= 2 else ("2️⃣" if is_step_b_unlocked else "🔒")
+        st.markdown(f"<h3 style='color:{color}'>{icon} Fase 2: TDDT Extraction (Gemini)</h3>", unsafe_allow_html=True)
         with st.expander("ℹ️ Cosa fa questa fase?"):
+            st.write("Esegue l'estrazione gerarchica in due passaggi: prima classifica le entità usando le root dell'ontologia, poi estrae le relazioni passando all'LLM solo le proprietà ammesse (Domain Index).")
         if not is_step_b_unlocked:
             st.caption("Completa la Fase 1 per sbloccare l'estrazione.")
         elif st.session_state.pipeline_stage >= 2:
             data = st.session_state.extraction_data
+            st.success(f"Estrazione TDDT completata! Identificate {len(data['entities'])} entità e {len(data['triples'])} triple.")
+            with st.expander("Vedi dati estratti (Pre-Validazione)"):
+                st.write("Entità Inferite:", [e.model_dump() for e in data['entities']])
+                if data['triples']:
+                    st.dataframe(pd.DataFrame([t.model_dump() for t in data['triples']]), hide_index=True)
         else:
+            if st.button("Avvia Estrazione TDDT", type="primary"):
+                if not os.getenv("GOOGLE_API_KEY"):
+                    st.error("⚠️ GOOGLE_API_KEY mancante. Inseriscila nella sidebar.")
+                else:
+                    with st.spinner("Classificazione ed estrazione gerarchica in corso..."):
+                        try:
+                            chunks = st.session_state.chunks
+                            extractor = get_extractor()
+                            all_triples = []
+                            all_entities = []
+                            prog_bar = st.progress(0)
+                            for i, chunk in enumerate(chunks):
+                                chunk_id = f"st_req_chunk_{i+1}"
+                                res = extractor.extract(chunk, source_id=chunk_id)
+                                if res:
+                                    if res.triples: all_triples.extend(res.triples)
+                                prog_bar.progress((i+1)/len(chunks))
+                            # Estraggo le entità univoche dalle triple per il Resolver
+                            unique_entities = list(set([t.subject for t in all_triples] + [t.object for t in all_triples]))
+                            st.session_state.extraction_data = {"entities": unique_entities, "triples": all_triples}
+                            st.session_state.pipeline_stage = 2
+                            st.rerun()
+                        except Exception as e:
+                            st.error(f"Errore: {e}")
     st.markdown("⬇️")
     # ==========================
+    # FASE 3: RESOLUTION & VALIDATION (BLOCCANTE)
     # ==========================
     is_step_c_unlocked = st.session_state.pipeline_stage >= 2
     with st.container():
         color = "white" if is_step_c_unlocked else "gray"
         icon = "✅" if st.session_state.pipeline_stage >= 3 else ("3️⃣" if is_step_c_unlocked else "🔒")
+        st.markdown(f"<h3 style='color:{color}'>{icon} Fase 3: Resolution & SHACL Blocking</h3>", unsafe_allow_html=True)
         with st.expander("ℹ️ Cosa fa questa fase?"):
+            st.write("Risolve le entità (Entity Linking) e applica la validazione OWL RL. Le triple non conformi vengono scartate e salvate nella Dead Letter Queue (MongoDB), mentre quelle valide popolano Neo4j.")
         if not is_step_c_unlocked:
             st.caption("Completa la Fase 2 per procedere.")
         elif st.session_state.pipeline_stage >= 3:
+            st.success("Grafo Aggiornato! Le triple conformi sono su Neo4j, gli scarti su Mongo (se configurato).")
         else:
             if not driver:
                 st.error("⚠️ Connettiti a Neo4j (nella sidebar) per procedere.")
             else:
+                if st.button("Valida e Scrivi su Grafo", type="primary"):
+                    with st.spinner("Risoluzione, validazione logica e persistenza..."):
                         try:
                             raw_data = st.session_state.extraction_data
                             all_entities = raw_data.get("entities", [])
+                            all_triples = raw_data.get("triples", [])
                             resolver = get_resolver()
                             resolver.driver = driver
+                            all_entities, resolved_triples, entities_to_save = resolver.resolve_entities(all_entities, all_triples)
                             validator = get_validator()
+                            valid_triples, invalid_triples, report = validator.filter_valid_triples(entities_to_save, resolved_triples)
+                            if invalid_triples:
+                                st.warning(f"Rilevate {len(invalid_triples)} violazioni ontologiche. Scartate dalla persistenza.")
+                                # Salvataggio in DLQ (MongoDB)
+                                mongo_ur = os.getenv("MONGO_URI")
+                                mongo_user = os.getenv("MONGO_USER")
+                                mongo_pass = os.getenv("MONGO_PASS")
+                                if mongo_ur:
+                                    try:
+                                        client = MongoClient(mongo_ur, username=mongo_user, password=mongo_pass)
+                                        db = client["semantic_discovery"]["rejected_triples"]
+                                        docs = []
+                                        for doc in invalid_triples:
+                                            doc["timestamp"] = time.time()
+                                            docs.append(doc)
+                                        db.insert_many(docs)
+                                        st.info("💾 Scarti archiviati correttamente su MongoDB.")
+                                    except Exception as e:
+                                        st.error(f"Errore scrittura DLQ: {e}")
+                            persister = KnowledgeGraphPersister()
+                            persister.driver = driver # Inietto il driver testato
+                            # Salviamo SOLO le valide
+                            persister.save_entities_and_triples(entities_to_save, valid_triples)
                             st.session_state.pipeline_stage = 3
                             st.rerun()
                             st.error(f"Errore critico: {e}")
 # ==============================================================================
+# TAB 2: VALIDAZIONE E DLQ (Aggiornato per 1.4)
 # ==============================================================================
 with tab_val:
     st.header("Curation & Feedback Loop")
         RETURN elementId(r) as id,
                COALESCE(s.label, s.name, head(labels(s))) as Soggetto,
                type(r) as Predicato,
+               COALESCE(o.label, o.name, head(labels(o))) as Oggetto,
+               COALESCE(r.evidence, 'N/A') as Evidenza,
+               COALESCE(r.reasoning, 'N/A') as Ragionamento
+        LIMIT 100
         """
         triples_data = run_query(driver, cypher_val)
             df = pd.DataFrame(triples_data)
             st.dataframe(df.drop(columns=["id"]), width='stretch', hide_index=True)
         else:
+            st.info("Grafo vuoto o relazioni senza nuovi attributi.")
     else:
         st.warning("Database non connesso.")
+# ==============================================================================
+# TAB 3: ESPLORAZIONE GRAFO
+# ==============================================================================
 with tab_vis:
     st.header("Esplorazione Topologica")
     if driver:
                 RETURN COALESCE(s.label, s.name, head(labels(s))) as src,
                        type(r) as rel,
                        COALESCE(o.label, o.name, head(labels(o))) as dst
+                LIMIT 300
                 """
                 graph_data = run_query(driver, cypher_vis)

{data/ontologie_raw/ARCO → ontology}/ArCo.owl RENAMED Viewed

File without changes

{data/ontologie_raw/ARCO → ontology}/arco.owl RENAMED Viewed

File without changes

{data/ontologie_raw/ARCO → ontology}/context-description.owl RENAMED Viewed

File without changes

{data/ontologie_raw/ARCO → ontology}/core.owl RENAMED Viewed

File without changes

{data/ontologie_raw/ARCO → ontology}/location.owl RENAMED Viewed

File without changes

requirements.txt CHANGED Viewed

@@ -1,9 +1,8 @@
 # --- Core Framework & Orchestration ---
 langchain>=0.3.0
 langchain-community>=0.3.0
-langchain-ollama>=0.2.0
-langchain-huggingface>=0.1.0
-langchain-groq
 langchain-core
 huggingface_hub
@@ -22,6 +21,9 @@ spacy
 neo4j>=5.0.0
 rdflib
 # --- Web & API ---
 fastapi
 uvicorn

 # --- Core Framework & Orchestration ---
 langchain>=0.3.0
 langchain-community>=0.3.0
+langchain-google-genai>=2.0.0  # Per Gemini 2.0 Flash (TDDT)
+langchain-huggingface>=0.1.0   # Mantenuto per il Semantic Splitter
 langchain-core
 huggingface_hub
 neo4j>=5.0.0
 rdflib
+# --- Storage & DLQ ---
+pymongo
 # --- Web & API ---
 fastapi
 uvicorn

src/extraction/extractor.py CHANGED Viewed

@@ -1,238 +1,224 @@
-import json
 import os
-import numpy as np
-from typing import List, Optional
 from pydantic import BaseModel, Field, ValidationError
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
-from langchain_groq import ChatGroq
-from langchain_ollama import ChatOllama
-from langchain_huggingface import HuggingFaceEmbeddings, ChatHuggingFace, HuggingFaceEndpoint
-from sklearn.metrics.pairwise import cosine_similarity
 from dotenv import load_dotenv
-# Carico le variabili d'ambiente. Su HF Spaces non trova il .env ma pesca in automatico dai secrets.
-load_dotenv()
-# Modelli Pydantic per blindare l'output dell'LLM.
 class GraphTriple(BaseModel):
-    subject: str = Field(..., description="Entità sorgente.")
-    predicate: str = Field(..., description="Relazione (es. arco:hasCurrentLocation).")
-    object: str = Field(..., description="Entità target.")
-    confidence: float = Field(..., description="Confidenza (0.0 - 1.0).")
-    source: Optional[str] = Field(None)
 class KnowledgeGraphExtraction(BaseModel):
-    reasoning: Optional[str] = Field(None, description="Breve ragionamento logico.")
-    entities: List[str] = Field(default_factory=list, description="TUTTE le entità estratte, incluse quelle isolate/orfane.")
     triples: List[GraphTriple]
 class NeuroSymbolicExtractor:
-    def __init__(self, model_name="llama3", temperature=0, schema_path=None):
-        hf_token = os.getenv("HF_TOKEN")
-        groq_api_key = os.getenv("GROQ_API_KEY")
-        # Setup del provider LLM a cascata: do priorità ai servizi cloud ad alte performance,
-        # se mancano le key faccio un fallback sull'istanza locale di Ollama.
-        if groq_api_key:
-            print("☁️ Rilevato ambiente Groq Cloud!")
-            try:
-                self.llm = ChatGroq(
-                    temperature=0,
-                    model="llama-3.3-70b-versatile",
-                    api_key=groq_api_key
-                )
-            except Exception as e:
-                print(f"❌ Errore Groq API {e}")
-        elif hf_token:
-            print("☁️ Rilevato ambiente Cloud (HF Spaces). Utilizzo HuggingFace Inference API.")
-            repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"
-            try:
-                endpoint = HuggingFaceEndpoint(
-                    repo_id=repo_id,
-                    task="text-generation",
-                    max_new_tokens=1024,
-                    temperature=0.1,
-                    huggingfacehub_api_token=hf_token
-                )
-                self.llm = ChatHuggingFace(llm=endpoint)
-                print(f"✅ Connesso a {repo_id} via API.")
-            except Exception as e:
-                print(f"❌ Errore connessione HF API: {e}. Fallback su CPU locale (sconsigliato).")
-                raise e
         else:
-            print(f"🏠 Ambiente Locale rilevato. Inizializzazione Ollama: {model_name}...")
             try:
-                self.llm = ChatOllama(
-                    model=model_name,
-                    temperature=temperature,
-                    format="json",
-                    base_url="http://localhost:11434"
-                )
             except Exception as e:
-                 print(f"⚠️ Errore Ollama: {e}")
-        # Carico il modello leggero per fare l'embedding delle query e matchare l'ontologia al volo
-        print("🧠 Caricamento modello embedding per Dynamic Selection...")
-        self.embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
-        self.ontology_elements = []
-        self.ontology_embeddings = None
-        # Se ho passato il dizionario json generato da ArCo, lo calcolo e lo tengo in RAM
-        if schema_path and os.path.exists(schema_path):
-            print(f"🌟 Indicizzazione vettoriale Ontologia da: {schema_path}")
-            self._index_ontology(schema_path)
-        # Prompt di sistema: le regole di Graceful Degradation qui sono critiche
-        # altrimenti il modello inizia a inventare predicati e inquina il grafo.
-        self.system_template_base = """Sei un esperto di Ingegneria della Conoscenza specializzato nell'Ontologia ArCo (Patrimonio Culturale Italiano).
-        Il tuo compito è analizzare il testo e generare un JSON contenente entità e relazioni.
-        REGOLE FONDAMENTALI:
-        1. Estrai TUTTI i reperti, luoghi, materiali, tecniche, concetti e persone e inseriscili nell'array "entities".
-        2. Crea le "triples" usando ESCLUSIVAMENTE le seguenti Classi (per rdf:type) e Proprietà, recuperate dall'ontologia:
-        CLASSI ARCO CONSENTITE (da usare come oggetto quando predicate = rdf:type):
-        {retrieved_classes}
-        PROPRIETÀ ARCO CONSENTITE (da usare come predicate):
-        {retrieved_properties}
-        REGOLE DI CLASSIFICAZIONE E ANTI-ALLUCINAZIONE (CRITICO):
-        - rdf:type: Sforzati di usare le classi ArCo specifiche fornite sopra (es. 'arco:HistoricOrArtisticProperty', 'cis:ArchaeologicalSite').
-        - Divieto di uso improprio di core:Concept: NON classificare materiali (es. marmo), tecniche costruttive (es. opera laterizia) o dettagli architettonici (es. capitello) come 'core:Concept'. Se non c'è una classe perfetta, classificali come 'arco:ArchaeologicalPropertySurveyType' o lasciali nell'array "entities" senza rdf:type.
-        - Usa 'core:Agent' SOLO per persone, famiglie storiche o organizzazioni (es. Antichi Romani, Canova, Imperatore Domiziano).
-        - Relazioni: Se due entità sono connesse ma nessuna delle proprietà fornite descrive il legame in modo accurato, usa il predicato generico 'skos:related'.
-        Rispondi SOLO ed ESCLUSIVAMENTE con un JSON valido strutturato così:
-        {{
-            "reasoning": "Breve logica delle estrazioni fatte...",
-            "entities": ["Entità 1", "Entità orfana", "Marmo"],
-            "triples": [
-                {{"subject": "Entità 1", "predicate": "rdf:type", "object": "arco:HistoricOrArtisticProperty", "confidence": 0.9}},
-                {{"subject": "Entità 1", "predicate": "a-loc:isLocatedIn", "object": "Entità 2", "confidence": 0.8}}
-            ]
-        }}
-        """
-    def _index_ontology(self, path: str):
-        """Vettorizza le descrizioni delle classi per permettere allo Schema-RAG di pescare solo quelle utili."""
-        try:
-            with open(path, 'r', encoding='utf-8') as f:
-                self.ontology_elements = json.load(f)
-            texts = [el['description'] for el in self.ontology_elements]
-            self.ontology_embeddings = self.embedding_model.embed_documents(texts)
-            print(f"✅ Indicizzati {len(self.ontology_elements)} elementi dell'ontologia.")
-        except Exception as e:
-            print(f"❌ Errore indicizzazione Ontologia: {e}")
-    def _retrieve_schema(self, query_text: str, top_k_classes=10, top_k_props=8):
-        """Calcola la cosine similarity tra il testo in ingresso e le voci dell'ontologia."""
-        if not self.ontology_elements or self.ontology_embeddings is None:
-            return "Nessuna classe specifica.", "skos:related"
-        query_embedding = self.embedding_model.embed_query(query_text)
-        similarities = cosine_similarity([query_embedding], self.ontology_embeddings)[0]
-        # Ordino per beccare i match migliori
-        sorted_indices = np.argsort(similarities)[::-1]
-        classes = []
-        properties = []
-        for idx in sorted_indices:
-            element = self.ontology_elements[idx]
-            if element["type"] == "Class" and len(classes) < top_k_classes:
-                classes.append(f"- {element['id']}: {element['description']}")
-            elif element["type"] == "Property" and len(properties) < top_k_props:
-                # N.B. Inietto Domain e Range estratti dallo script build_schema
-                # per dare all'LLM i paletti relazionali esatti.
-                prop_str = f"- {element['id']}: {element['description']}"
-                dom = element.get("domain")
-                rng = element.get("range")
-                if dom or rng:
-                    prop_str += f" [VINCOLO -> Soggetto: {dom or 'Qualsiasi'}, Oggetto: {rng or 'Qualsiasi'}]"
-                properties.append(prop_str)
-        return "\n".join(classes), "\n".join(properties)
-    def extract(self, text_chunk: str, source_id: str = "unknown", max_retries=3) -> KnowledgeGraphExtraction:
-        print(f"🧠 Processing {source_id} (Schema-RAG Mode)...")
-        # 1. Recupero dinamico (pesco solo lo schema utile per questo specifico frammento di testo)
-        retrieved_classes, retrieved_properties = self._retrieve_schema(text_chunk)
-        # 2. Inietto i paletti nel system prompt
-        final_sys_text = self.system_template_base.format(
-            retrieved_classes=retrieved_classes,
-            retrieved_properties=retrieved_properties
         )
-        sys_msg = SystemMessage(content=final_sys_text)
-        prompt = ChatPromptTemplate.from_messages([sys_msg, ("human", "{text}")])
-        chain = prompt | self.llm
-        for attempt in range(max_retries):
-            try:
-                response = chain.invoke({"text": text_chunk})
-                content = response.content
-                # I LLM a volte ci mettono i backtick markdown anche se chiedi solo JSON puro. Li elimino.
-                if "```json" in content:
-                    content = content.split("```json")[1].split("```")[0].strip()
-                elif "```" in content:
-                    content = content.split("```")[1].split("```")[0].strip()
-                if not content:
-                    raise ValueError("Il modello ha restituito una stringa vuota o un formato non parsabile.")
-                data = json.loads(content)
-                # Normalizzo l'output per gestire eventuali fluttuazioni della risposta
-                if isinstance(data, list):
-                    validated_data = KnowledgeGraphExtraction(triples=data, reasoning="Direct list output")
-                else:
-                    # Filtro eventuali chiavi fittizie inventate dal modello per rispettare strettamente Pydantic
-                    triples = [GraphTriple(**t) for t in data.get("triples", [])]
-                    validated_data = KnowledgeGraphExtraction(
-                        reasoning=data.get("reasoning", "N/A"),
-                        entities=data.get("entities", []),
-                        triples=triples
-                    )
-                for t in validated_data.triples:
-                    t.source = source_id
-                return validated_data
-            except (json.JSONDecodeError, ValidationError) as e:
-                print(f"⚠️ Errore Validazione (Tentativo {attempt+1}/{max_retries}): {e}")
-                # SELF-CORRECTION LOOP: Se l'LLM sbaglia la struttura del JSON,
-                # non butto via tutto ma gli ridò in pasto l'errore per farglielo correggere.
-                prev_content = locals().get('content', 'No content')
-                correction_prompt = ChatPromptTemplate.from_messages([
-                    sys_msg,
-                    HumanMessage(content=text_chunk),
-                    AIMessage(content=prev_content),
-                    HumanMessage(content=f"Errore nel JSON precedente: {e}. Correggi e restituisci SOLO JSON valido senza markdown.")
-                ])
-                chain = correction_prompt | self.llm
-            except Exception as e:
-                print(f"❌ Errore critico: {e}")
-                break
         return KnowledgeGraphExtraction(triples=[])

 import os
+import json
+from typing import List, Optional, Dict, Any
 from pydantic import BaseModel, Field, ValidationError
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
+from langchain_google_genai import ChatGoogleGenerativeAI
 from dotenv import load_dotenv
+load_dotenv()
+# --- MODELLI PYDANTIC (Contratti Formali) ---
+# PASS 1 - Livello 1
+class MacroCategoryCandidate(BaseModel):
+    category: str = Field(description="URI della macro-categoria (es. arco:CulturalProperty)")
+    reasoning: str = Field(description="Perché questa macro-categoria è appropriata per l'entità")
+class EntityMacroClassification(BaseModel):
+    name: str = Field(description="Nome dell'entità come appare nel testo")
+    candidates: List[MacroCategoryCandidate] = Field(
+        description="1-2 macro-categorie candidate, ordinate per preferenza (la prima è la più probabile)",
+        min_length=1,
+        max_length=2
+    )
+class MacroClassificationResult(BaseModel):
+    """Output del Livello 1"""
+    entities: List[EntityMacroClassification]
+# PASS 1 - Livello 2
+class TypedEntity(BaseModel):
+    name: str = Field(description="Nome dell'entità come appare nel testo")
+    type: str = Field(description="URI del tipo ontologico finale (es. arco:ArchaeologicalProperty)")
+class TypeInferenceResult(BaseModel):
+    """Output del Livello 2"""
+    entities: List[TypedEntity]
+# PASS 2 - Extraction
 class GraphTriple(BaseModel):
+    subject: str
+    subject_type: str = Field(description="Tipo ontologico del soggetto (da Pass 1)")
+    predicate: str
+    object: str
+    object_type: str = Field(description="Tipo ontologico dell'oggetto (da Pass 1)")
+    evidence: str = Field(description="Span testuale esatto dal chunk da cui la relazione è estratta")
+    reasoning: str = Field(description="Perché questo predicato è stato scelto per questa coppia di entità")
+    source: Optional[str] = Field(None) # Mantenuto per compatibilità con il batching Neo4j a valle
 class KnowledgeGraphExtraction(BaseModel):
     triples: List[GraphTriple]
 class NeuroSymbolicExtractor:
+    def __init__(self, index_path="../../ontology/schemas/domain_index.json"):
+        print("🧠 Inizializzazione TDDT Extractor (Type-Driven Domain Traversal)...")
+        google_api_key = os.getenv("GOOGLE_API_KEY")
+        if not google_api_key:
+            raise ValueError("❌ GOOGLE_API_KEY mancante. Richiesta per Gemini 2.0 Flash.")
+        # Inizializzo l'LLM primario. Temperatura 0 per massimizzare il determinismo.
+        self.llm = ChatGoogleGenerativeAI(
+            model="gemini-2.0-flash",
+            temperature=0,
+            api_key=google_api_key
+        )
+        # Inizializzo le chain con structured output
+        self.chain_pass1_l1 = self.llm.with_structured_output(MacroClassificationResult)
+        self.chain_pass1_l2 = self.llm.with_structured_output(TypeInferenceResult)
+        self.chain_pass2 = self.llm.with_structured_output(KnowledgeGraphExtraction)
+        # Caricamento del Domain Index in RAM
+        self.domain_index = {"classes": {}, "properties_by_domain": {}}
+        if os.path.exists(index_path):
+            with open(index_path, 'r', encoding='utf-8') as f:
+                self.domain_index = json.load(f)
+            print(f"✅ Domain Index caricato: {len(self.domain_index['classes'])} classi disponibili.")
         else:
+            print(f"⚠️ Attenzione: Domain Index non trovato al percorso {index_path}")
+        self.root_classes = self._extract_root_classes()
+    def _extract_root_classes(self) -> Dict[str, Any]:
+        """Estrae il primo livello ontologico per la macro-categorizzazione."""
+        roots = {}
+        for uri, data in self.domain_index["classes"].items():
+            # Consideriamo root le classi senza padri o figlie dirette di owl:Thing / l0:Entity
+            if not data["parents"] or "owl:Thing" in data["parents"] or "l0:Entity" in data["parents"]:
+                roots[uri] = data
+        return roots
+    def _get_subclasses(self, parent_uris: List[str]) -> Dict[str, Any]:
+        """Recupera tutte le sottoclassi dirette (e se stesse) dai rami indicati."""
+        subclasses = {}
+        for uri, data in self.domain_index["classes"].items():
+            if uri in parent_uris or any(p in parent_uris for p in data["parents"]):
+                subclasses[uri] = data
+        return subclasses
+    def _execute_with_retry(self, chain, prompt_messages, max_retries=3):
+        """Self-correction loop unificato."""
+        for attempt in range(max_retries):
             try:
+                result = chain.invoke(prompt_messages)
+                return result
             except Exception as e:
+                print(f"⚠️ Errore (Tentativo {attempt+1}/{max_retries}): {e}")
+                if attempt == max_retries - 1:
+                    print("❌ Fallimento critico del task LLM.")
+                    return None
+        return None
+    def extract(self, text_chunk: str, source_id: str = "unknown") -> KnowledgeGraphExtraction:
+        print(f"\n🧩 Processing {source_id} (TDDT Mode)...")
+        # ==========================================
+        # PASS 1 - LIVELLO 1: Macro-Categorizzazione
+        # ==========================================
+        roots_text = "\n".join([f"- {uri} — \"{data['label']}: {data['description']}\"" for uri, data in self.root_classes.items()])
+        sys_l1 = f"""Identifica le entità principali nel testo e assegna a ciascuna la macro-categoria più appropriata.
+Puoi assegnare fino a 2 candidati per entità se sei incerto, ordinandoli per confidenza.
+MACRO-CATEGORIE DISPONIBILI:
+{roots_text}"""
+        res_l1: MacroClassificationResult = self._execute_with_retry(
+            self.chain_pass1_l1,
+            [SystemMessage(content=sys_l1), HumanMessage(content=text_chunk)]
+        )
+        if not res_l1 or not res_l1.entities:
+            print("   -> Nessuna entità trovata al Livello 1.")
+            return KnowledgeGraphExtraction(triples=[])
+        # ==========================================
+        # PASS 1 - LIVELLO 2: Specializzazione
+        # ==========================================
+        # Raccogliamo tutti i rami candidati da esplorare
+        candidate_uris = set()
+        for ent in res_l1.entities:
+            for cand in ent.candidates:
+                candidate_uris.add(cand.category)
+        subclasses = self._get_subclasses(list(candidate_uris))
+        # Raggruppo le sottoclassi per visualizzarle ordinate nel prompt
+        subs_text_blocks = []
+        for parent in candidate_uris:
+            subs_text_blocks.append(f"\n[{parent} →]")
+            children = {k: v for k, v in subclasses.items() if parent in v["parents"] or k == parent}
+            for uri, data in children.items():
+                subs_text_blocks.append(f"- {uri} — \"{data['label']}: {data['description']}\"")
+        subs_text = "\n".join(subs_text_blocks)
+        ent_text = "\n".join([f"- '{e.name}': " + ", ".join([f"{c.category}" for c in e.candidates]) for e in res_l1.entities])
+        sys_l2 = f"""Per ciascuna entità identificata, scegli il sotto-tipo più specifico tra quelli elencati.
+Se non c'è un sotto-tipo rilevante per un'entità, conferma la sua macro-categoria.
+ENTITÀ IDENTIFICATE (con macro-categorie candidate):
+{ent_text}
+SOTTO-TIPI DISPONIBILI:
+{subs_text}"""
+        res_l2: TypeInferenceResult = self._execute_with_retry(
+            self.chain_pass1_l2,
+            [SystemMessage(content=sys_l2), HumanMessage(content=text_chunk)]
         )
+        if not res_l2 or not res_l2.entities:
+            return KnowledgeGraphExtraction(triples=[])
+        # ==========================================
+        # PASS 2: Estrazione Relazionale
+        # ==========================================
+        # Mappa dei tipi finali
+        typed_entities_map = {e.name: e.type for e in res_l2.entities}
+        # Recupero deterministico delle proprietà
+        valid_properties = []
+        seen_props = set()
+        for ent_type in typed_entities_map.values():
+            props = self.domain_index["properties_by_domain"].get(ent_type, [])
+            for p in props:
+                if p["id"] not in seen_props:
+                    valid_properties.append(f"- {p['id']}: {p['inherited_from']} → {p['range']} (Label: {p['label']})")
+                    seen_props.add(p["id"])
+        props_text = "\n".join(valid_properties) if valid_properties else "- (Nessuna proprietà specifica trovata. Usa skos:related)"
+        ent_final_text = "\n".join([f"- {name} ({uri_type})" for name, uri_type in typed_entities_map.items()])
+        sys_ext = f"""Estrai le relazioni semantiche tra le entità presenti nel testo.
+ENTITÀ IDENTIFICATE (con il loro tipo):
+{ent_final_text}
+PROPRIETÀ CONSENTITE (con vincoli domain → range):
+{props_text}
+- skos:related: Qualsiasi → Qualsiasi (Usa SOLO se nessuna proprietà sopra descrive accuratamente il legame)
+REGOLE CRITICHE:
+1. Usa SOLO le proprietà elencate sopra.
+2. Rispetta rigorosamente i vincoli ontologici: il tipo del 'subject' DEVE essere compatibile con il domain, e il tipo dell''object' con il range.
+3. Compila sempre i campi 'evidence' citando esattamente il testo, e 'reasoning' spiegando la scelta logica.
+"""
+        final_res: KnowledgeGraphExtraction = self._execute_with_retry(
+            self.chain_pass2,
+            [SystemMessage(content=sys_ext), HumanMessage(content=text_chunk)]
+        )
+        if final_res and final_res.triples:
+            # Propago il source_id prima di inviare l'output
+            for t in final_res.triples:
+                t.source = source_id
+            return final_res
         return KnowledgeGraphExtraction(triples=[])

src/utils/build_schema.py CHANGED Viewed

@@ -1,12 +1,10 @@
 import os
 import json
 from pathlib import Path
-from rdflib import Graph
-# --- MAPPA FORZATA DEI NAMESPACE ARCO E ONTOPIA ---
-# rdflib spesso fa casini con i prefissi di default (generando ID vuoti tipo ':Acquisition').
-# Forziamo la mano con un dizionario hardcoded per avere sempre QName puliti
-# e standardizzati, fondamentali per non confondere l'LLM durante lo Schema-RAG.
 ARCO_NAMESPACES = {
     "https://w3id.org/arco/ontology/arco/": "arco",
     "https://w3id.org/arco/ontology/core/": "core",
@@ -14,165 +12,208 @@ ARCO_NAMESPACES = {
     "https://w3id.org/arco/ontology/context-description/": "a-cd",
     "https://w3id.org/arco/ontology/denotative-description/": "a-dd",
     "https://w3id.org/arco/ontology/cultural-event/": "a-ce",
     "http://dati.beniculturali.it/cis/": "cis",
     "https://w3id.org/italia/onto/l0/": "l0",
     "https://w3id.org/italia/onto/CLV/": "clv",
     "https://w3id.org/italia/onto/TI/": "ti",
     "https://w3id.org/italia/onto/RO/": "ro",
     "https://w3id.org/italia/onto/SM/": "sm",
     "http://www.w3.org/2002/07/owl#": "owl"
 }
-def uri_to_qname(uri: str) -> str:
-    """
-    Prende un URI chilometrico e lo riduce a un QName compatto (es. arco:CulturalProperty).
-    L'LLM impazzirebbe a leggere URL completi nel prompt, sprecando token inutilmente.
-    """
-    if not uri:
         return None
     uri_str = str(uri)
-    # Match sulla base dei namespace noti (cerco la radice più lunga)
     best_match = ""
     for ns_uri in ARCO_NAMESPACES.keys():
         if uri_str.startswith(ns_uri) and len(ns_uri) > len(best_match):
             best_match = ns_uri
     if best_match:
-        prefix = ARCO_NAMESPACES[best_match]
-        name = uri_str[len(best_match):].lstrip('#')
-        return f"{prefix}:{name}"
-    # Fallback drastico se peschiamo qualcosa fuori dai radar: tengo solo l'ultimo pezzetto
-    if '#' in uri_str:
-        return uri_str.split('#')[-1]
     return uri_str.split('/')[-1]
-def build_schema_from_ontology(owl_folder_path: str, output_json_path: str):
-    print(f"⏳ Inizializzazione Graph e caricamento file .owl da {owl_folder_path}...")
-    # Creo un mega-grafo in memoria. Caricando tutti i file .owl insieme,
-    # risolvo automaticamente i cross-reference (es. una proprietà di 'location.owl'
-    # che punta a una classe di 'core.owl').
     g = Graph()
-    # 1. Caricamento Moduli
-    owl_files = list(Path(owl_folder_path).glob('**/*.owl'))
-    if not owl_files:
-        print("❌ Nessun file .owl trovato nella directory specificata.")
-        return
     for file_path in owl_files:
         try:
             g.parse(file_path, format="xml")
-            print(f"  -> Caricato (XML): {file_path.name}")
-        except Exception as e_xml:
-            print(f"  ⚠️ Impossibile parsare {file_path.name}. XML err: {e_xml}")
-    print("✅ Ontologia caricata in memoria. Esecuzione query SPARQL...")
-    # 2. Query SPARQL
-    # Estrazione massiva. Ho rimosso i FILTER(isIRI) su domain e range perché ArCo
-    # fa largo uso di Blank Nodes per definire le UNION di classi. Se li filtro,
-    # perdo un sacco di vincoli relazionali utili per l'estrattore LLM.
-    sparql_query = """
-    PREFIX owl: <http://www.w3.org/2002/07/owl#>
-    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
-    SELECT DISTINCT ?entity ?type ?label ?comment ?domain ?range
-    WHERE {
-      {
-        ?entity a owl:Class .
-        BIND("Class" AS ?type)
-      } UNION {
-        ?entity a owl:ObjectProperty .
-        BIND("Property" AS ?type)
-      }
-      OPTIONAL {
-          ?entity rdfs:label ?label .
-          FILTER(LANGMATCHES(LANG(?label), "it") || LANG(?label) = "")
-      }
-      OPTIONAL {
-          ?entity rdfs:comment ?comment .
-          FILTER(LANGMATCHES(LANG(?comment), "it") || LANG(?comment) = "")
-      }
-      OPTIONAL { ?entity rdfs:domain ?domain . }
-      OPTIONAL { ?entity rdfs:range ?range . }
-      FILTER(isIRI(?entity))
-    }
-    """
-    results = g.query(sparql_query)
-    schema_elements = {}
-    # 3. Formattazione e Pulizia
-    for row in results:
-        entity_uri = row.entity
-        entity_type = str(row.type)
-        label = str(row.label) if row.label else ""
-        comment = str(row.comment) if row.comment else ""
-        qname = uri_to_qname(entity_uri)
-        # Gestione Blank Nodes: se il dominio o range non è un URI netto (inizia con http),
-        # significa che l'ontologia sta usando una costruzione logica complessa (es. unione di classi).
-        # Metto "Mixed/Union" come fallback per avvisare l'LLM che accetta tipi misti.
-        domain_str = uri_to_qname(row.domain) if (row.domain and str(row.domain).startswith("http")) else ("Mixed/Union" if row.domain else None)
-        range_str = uri_to_qname(row.range) if (row.range and str(row.range).startswith("http")) else ("Mixed/Union" if row.range else None)
-        description_parts = []
-        if label: description_parts.append(label)
-        if comment: description_parts.append(comment)
-        final_description = " - ".join(description_parts)
-        # Scarto le voci senza documentazione testuale. Se non hanno un commento,
-        # l'LLM non capirebbe mai come usarle e farebbe solo allucinazioni.
-        if not final_description.strip():
-            continue
-        # Se l'entità non è ancora nel dizionario, la creiamo
-        if qname not in schema_elements:
-            element_data = {
-                "id": qname,
-                "type": entity_type,
-                "description": final_description.strip()
-            }
-            # Strutturo domain e range come chiavi a sé stanti per poterle iniettare facilmente nel prompt
-            if entity_type == "Property":
-                element_data["domain"] = domain_str
-                element_data["range"] = range_str
-            schema_elements[qname] = element_data
-        else:
-            # Deduplica intelligente: poiché i file OWL si sovrappongono, potrei leggere la stessa
-            # proprietà due volte (una volta vuota, una volta con i vincoli).
-            # Se trovo i vincoli al secondo giro, aggiorno il dizionario per non perdere dati preziosi.
-            if entity_type == "Property":
-                if domain_str and not schema_elements[qname].get("domain"):
-                    schema_elements[qname]["domain"] = domain_str
-                if range_str and not schema_elements[qname].get("range"):
-                    schema_elements[qname]["range"] = range_str
-    # 4. Salvataggio su disco
-    output_list = list(schema_elements.values())
-    with open(output_json_path, 'w', encoding='utf-8') as f:
-        json.dump(output_list, f, ensure_ascii=False, indent=2)
-    print(f"🎉 Finito! Generato dizionario con {len(output_list)} elementi.")
-    print(f"💾 Salvato in: {output_json_path}")
 if __name__ == "__main__":
-    NOME_ONTOLOGIA = "ARCO"
-    INPUT_FOLDER = f"data/ontologie_raw/{NOME_ONTOLOGIA}"
-    OUTPUT_FILE = f"data/schemas/{NOME_ONTOLOGIA}_schema.json"
-    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
-    build_schema_from_ontology(INPUT_FOLDER, OUTPUT_FILE)

 import os
 import json
 from pathlib import Path
+from collections import defaultdict
+from rdflib import Graph, URIRef, BNode, RDF, RDFS, OWL, Namespace
+# --- MAPPA DEI NAMESPACE (Estesa con CIDOC-CRM) ---
 ARCO_NAMESPACES = {
     "https://w3id.org/arco/ontology/arco/": "arco",
     "https://w3id.org/arco/ontology/core/": "core",
     "https://w3id.org/arco/ontology/context-description/": "a-cd",
     "https://w3id.org/arco/ontology/denotative-description/": "a-dd",
     "https://w3id.org/arco/ontology/cultural-event/": "a-ce",
+    "https://w3id.org/arco/ontology/catalogue/": "a-cat",
     "http://dati.beniculturali.it/cis/": "cis",
     "https://w3id.org/italia/onto/l0/": "l0",
     "https://w3id.org/italia/onto/CLV/": "clv",
     "https://w3id.org/italia/onto/TI/": "ti",
     "https://w3id.org/italia/onto/RO/": "ro",
     "https://w3id.org/italia/onto/SM/": "sm",
+    "https://w3id.org/italia/onto/MU/": "mu",
+    "http://www.cidoc-crm.org/cidoc-crm/": "crm",  # Aggiunto CIDOC-CRM
     "http://www.w3.org/2002/07/owl#": "owl"
 }
+def uri_to_qname(uri: URIRef) -> str:
+    if not uri or isinstance(uri, BNode):
         return None
     uri_str = str(uri)
     best_match = ""
     for ns_uri in ARCO_NAMESPACES.keys():
         if uri_str.startswith(ns_uri) and len(ns_uri) > len(best_match):
             best_match = ns_uri
     if best_match:
+        return f"{ARCO_NAMESPACES[best_match]}:{uri_str[len(best_match):].lstrip('#')}"
+    if '#' in uri_str: return uri_str.split('#')[-1]
     return uri_str.split('/')[-1]
+def get_union_classes(g: Graph, bnode: BNode):
+    """Estrae le classi da un costrutto owl:unionOf (usato spesso in ArCo per domini/range multipli)."""
+    union_list = g.value(bnode, OWL.unionOf)
+    classes = []
+    if union_list:
+        # Naviga la lista RDF
+        current = union_list
+        while current and current != RDF.nil:
+            item = g.value(current, RDF.first)
+            if isinstance(item, URIRef):
+                classes.append(uri_to_qname(item))
+            current = g.value(current, RDF.rest)
+    return [c for c in classes if c]
def _localized_text(g, subject, predicate, default):
    """Pick the Italian value of `predicate`, else English, else any non-empty one."""
    by_lang = {}
    for value in g.objects(subject, predicate):
        # getattr: the object may be a URIRef rather than a Literal.
        by_lang.setdefault(getattr(value, "language", None), value)
    for lang in ("it", "en"):
        if lang in by_lang:
            return str(by_lang[lang])
    for value in by_lang.values():
        if value:
            return str(value)
    return default


def _class_refs(g, node):
    """Resolve a domain/range node into a list of class qnames (unions expanded)."""
    if isinstance(node, URIRef):
        qname = uri_to_qname(node)
        return [qname] if qname else []
    if isinstance(node, BNode):
        return get_union_classes(g, node)
    return []


def _is_turtle_qname(name, known_prefixes):
    """Tell whether `name` is a `prefix:local` that can be written verbatim in Turtle."""
    prefix, sep, local = name.partition(":")
    if not sep or not local or prefix not in known_prefixes:
        return False
    if local[0] in "-." or local.endswith("."):
        return False
    return all(ch.isalnum() or ch in "_-." for ch in local)


def _inherited_properties(class_qname, classes_dict, properties_by_domain):
    """Collect the properties of a class and of all its ancestors, de-duplicated by id."""
    collected = []
    seen_ids = set()
    seen_classes = set()
    # Depth-first, own properties first, parents in declaration order.
    stack = [class_qname]
    while stack:
        current = stack.pop()
        if current in seen_classes:
            continue
        seen_classes.add(current)
        for prop in properties_by_domain.get(current, []):
            if prop["id"] not in seen_ids:
                seen_ids.add(prop["id"])
                collected.append(prop)
        stack.extend(reversed(classes_dict.get(current, {}).get("parents", [])))
    return collected


def build_domain_index_and_shacl(ontology_dir: str, output_json: str, output_shacl: str):
    """Compile the OWL ontologies into a JSON domain index and SHACL shapes.

    Every ``*.owl`` file found recursively under ``ontology_dir`` is parsed as
    RDF/XML into one graph. From that graph the function derives:

    * ``output_json``: classes (label, description, direct parents,
      namespace prefix), the properties applicable to each class including
      those inherited from ancestors, and a ``label - description`` text per
      class that has a description;
    * ``output_shacl``: one domain shape and/or one range shape per property
      that declares exactly one domain / one range.

    Files that fail to parse are reported and skipped.

    Args:
        ontology_dir: Directory scanned recursively for ``.owl`` files.
        output_json: Destination path of the JSON index.
        output_shacl: Destination path of the Turtle shapes file.
    """
    print(f"⏳ Inizializzazione Graph e caricamento da {ontology_dir}...")
    g = Graph()

    # 1. Load every .owl file (sorted so the log is deterministic).
    owl_files = sorted(Path(ontology_dir).glob('**/*.owl'))
    if not owl_files:
        print(f"  ⚠️ Nessun file .owl trovato in {ontology_dir}")
    for file_path in owl_files:
        try:
            g.parse(file_path, format="xml")
            print(f"  -> Caricato: {file_path.name}")
        except Exception as e:
            # Best effort: one broken ontology must not abort the whole build.
            print(f"  ⚠️ Errore parsing {file_path.name}: {e}")
    print("✅ Ontologie caricate in memoria. Compilazione indici in corso...")

    # 2. Classes and their direct hierarchy.
    classes_dict = {}
    for s in g.subjects(RDF.type, OWL.Class):
        if isinstance(s, BNode):
            continue
        qname = uri_to_qname(s)
        if not qname:
            continue

        parents = []
        for parent in g.objects(s, RDFS.subClassOf):
            # Anonymous superclasses (restrictions) are not part of the index.
            if isinstance(parent, URIRef):
                parent_qname = uri_to_qname(parent)
                if parent_qname:
                    parents.append(parent_qname)

        classes_dict[qname] = {
            "label": _localized_text(g, s, RDFS.label, qname),
            "description": _localized_text(g, s, RDFS.comment, ""),
            "parents": parents,
            "namespace": qname.split(":")[0] if ":" in qname else "unknown"
        }

    # 3. Object and datatype properties with their declared domain/range.
    properties_list = []
    for prop_type in [OWL.ObjectProperty, OWL.DatatypeProperty]:
        for s in g.subjects(RDF.type, prop_type):
            if isinstance(s, BNode):
                continue
            qname = uri_to_qname(s)
            if not qname:
                continue
            label = g.value(s, RDFS.label)
            properties_list.append({
                "id": qname,
                "label": str(label) if label else qname,
                "domains": _class_refs(g, g.value(s, RDFS.domain)),
                "ranges": _class_refs(g, g.value(s, RDFS.range))
            })

    # 4. Domain index: explicit domains first, then transitive inheritance.
    properties_by_domain = defaultdict(list)
    for prop in properties_list:
        for d in prop["domains"]:
            properties_by_domain[d].append({
                "id": prop["id"],
                "label": prop["label"],
                "range": prop["ranges"][0] if prop["ranges"] else "Mixed/Union",
                "inherited_from": d
            })

    final_properties_by_domain = {}
    for cls in classes_dict:
        all_props = _inherited_properties(cls, classes_dict, properties_by_domain)
        if all_props:
            final_properties_by_domain[cls] = all_props

    # 5. Texts to be embedded (only classes that have a description).
    class_embeddings_texts = {
        k: f"{v['label']} - {v['description']}" for k, v in classes_dict.items() if v['description']
    }

    # 6. Write domain_index.json. Path.parent is "." for a bare filename, so
    #    mkdir never receives an empty path.
    domain_index = {
        "classes": classes_dict,
        "properties_by_domain": final_properties_by_domain,
        "class_embeddings_texts": class_embeddings_texts
    }
    Path(output_json).parent.mkdir(parents=True, exist_ok=True)
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(domain_index, f, ensure_ascii=False, indent=2)
    print(f"💾 Salvato Indice di Dominio in: {output_json}")

    # 7. Write the SHACL shapes. Only names carrying a declared prefix are
    #    emitted: un-prefixed fallbacks from uri_to_qname (e.g. XSD datatypes
    #    reduced to "string") would produce Turtle that does not parse.
    known_prefixes = set(ARCO_NAMESPACES.values())
    lines = [
        "@prefix sh: <http://www.w3.org/ns/shacl#> .\n",
        "@prefix ex: <http://activadigital.it/ontology/> .\n",
    ]
    lines.extend(f"@prefix {prefix}: <{ns_uri}> .\n" for ns_uri, prefix in ARCO_NAMESPACES.items())
    lines.append("\n")

    shape_count = 0
    for prop in properties_list:
        if not _is_turtle_qname(prop["id"], known_prefixes):
            continue
        safe_id = prop["id"].replace(":", "_").replace("-", "_")

        # Domain shape: only for a single explicit domain, to avoid
        # contradicting union domains.
        if len(prop["domains"]) == 1 and _is_turtle_qname(prop["domains"][0], known_prefixes):
            dom = prop["domains"][0]
            lines.append(f"ex:{safe_id}_DomainShape a sh:NodeShape ;\n")
            lines.append(f"    sh:targetSubjectsOf {prop['id']} ;\n")
            lines.append(f"    sh:class {dom} .\n\n")
            shape_count += 1

        # Range shape: only for a single explicit class range.
        if len(prop["ranges"]) == 1 and _is_turtle_qname(prop["ranges"][0], known_prefixes):
            rng = prop["ranges"][0]
            lines.append(f"ex:{safe_id}_RangeShape a sh:NodeShape ;\n")
            lines.append(f"    sh:targetObjectsOf {prop['id']} ;\n")
            lines.append(f"    sh:class {rng} .\n\n")
            shape_count += 1

    Path(output_shacl).parent.mkdir(parents=True, exist_ok=True)
    with open(output_shacl, 'w', encoding='utf-8') as f:
        f.writelines(lines)
    print(f"🛡️ Generato SHACL auto_constraints.ttl con {shape_count} regole rigorose in: {output_shacl}")
 if __name__ == "__main__":
+    ONTOLOGY_FOLDER = "../../ontology/"
+    OUTPUT_JSON = "../../ontology/schemas/domain_index.json"
+    OUTPUT_SHACL = "../../ontology/schemas/auto_constraints.ttl"
+    build_domain_index_and_shacl(ONTOLOGY_FOLDER, OUTPUT_JSON, OUTPUT_SHACL)

src/validation/validator.py CHANGED Viewed

@@ -1,108 +1,124 @@
 import os
 from rdflib import Graph, Literal, RDF, URIRef, Namespace
-from rdflib.namespace import SKOS, XSD
 from pyshacl import validate
 class SemanticValidator:
-    def __init__(self):
-        # Load the SHACL rules.
-        # If the LLM hallucinates and invents absurd relations, SHACL blocks them here.
-        self.shapes_file = os.path.join(os.path.dirname(__file__), "shapes/schema_constraints.ttl")
-        # ArCo namespace mapping.
-        # The 'ex' namespace is the catch-all fallback for every plain-text entity
-        # (e.g. "Colosseo", "Monumento") that the LLM could not anchor to an official URI.
         self.namespaces = {
             "arco": Namespace("https://w3id.org/arco/ontology/arco/"),
             "core": Namespace("https://w3id.org/arco/ontology/core/"),
             "a-loc": Namespace("https://w3id.org/arco/ontology/location/"),
             "cis": Namespace("http://dati.beniculturali.it/cis/"),
             "ex": Namespace("http://activadigital.it/ontology/")
         }
         if os.path.exists(self.shapes_file):
             self.shacl_graph = Graph()
             self.shacl_graph.parse(self.shapes_file, format="turtle")
-            print("🛡️  SHACL Constraints caricati.")
         else:
             print("⚠️  File SHACL non trovato. Validazione disabilitata (pericoloso in prod!).")
             self.shacl_graph = None
     def _get_uri(self, text_val):
-        # The LLM returns strings such as "arco:CulturalProperty" or plain text like "Statua di bronzo".
-        # rdflib needs real URIRef objects, so the value is parsed and converted here.
         if ":" in text_val and not text_val.startswith("http"):
             prefix, name = text_val.split(":", 1)
             if prefix in self.namespaces:
                 return self.namespaces[prefix][name]
-        # Free text without a known namespace: strip spaces and quotes so they
-        # do not break the URI, then force it into the custom 'ex' namespace.
         clean_name = text_val.replace(" ", "_").replace("'", "").replace('"', "")
         return self.namespaces["ex"][clean_name]
     def _json_to_rdf(self, entities, triples):
-        # pyshacl cannot consume the Pydantic objects or native JSON directly.
-        # A small RDF graph is rebuilt on the fly just for the formal check.
         g = Graph()
-        # Register the prefixes on the graph to ease textual debugging.
         for prefix, ns in self.namespaces.items():
             g.bind(prefix, ns)
         g.bind("skos", SKOS)
-        # 1. Orphan entities (found in the text but not attached to any triple).
         if entities:
             for ent in entities:
-                # The entity may be a dict with a "label" key or any other value, which is stringified.
                 label = ent["label"] if isinstance(ent, dict) else str(ent)
                 ent_uri = self._get_uri(label)
                 g.add((ent_uri, SKOS.prefLabel, Literal(label, lang="it")))
-        # 2. Rebuild the relational triples.
         if triples:
             for t in triples:
                 subj_uri = self._get_uri(t.subject)
-                # The SHACL rules (schema_constraints.ttl) typically require nodes not to be
-                # empty shells (NodeLabelShape), so an Italian prefLabel is always attached.
                 g.add((subj_uri, SKOS.prefLabel, Literal(t.subject, lang="it")))
-                # Separate type assertions from ordinary relations.
                 if t.predicate.lower() in ["rdf:type", "a", "type", "rdf_type"]:
                     obj_uri = self._get_uri(t.object)
                     g.add((subj_uri, RDF.type, obj_uri))
                 else:
-                    # Ordinary relation (e.g. a-loc:hasCurrentLocation).
                     pred_uri = self._get_uri(t.predicate)
                     obj_uri = self._get_uri(t.object)
                     g.add((subj_uri, pred_uri, obj_uri))
-                    # The target node gets a human-readable label as well.
                     g.add((obj_uri, SKOS.prefLabel, Literal(t.object, lang="it")))
         return g
-    def validate_batch(self, entities, triples):
         """
-        Run the SHACL rule engine on both the isolated entities and the triples.
-        Returns the outcome, the textual error report, and the temporary graph.
         """
-        if not self.shacl_graph:
-            return True, "No Constraints", None
-        # Convert the Pydantic payload into an actual RDF graph.
-        data_graph = self._json_to_rdf(entities, triples)
-        print("🔍 Esecuzione Validazione SHACL...")
-        # inference='rdfs' is enabled so that rules targeting a super-class
-        # also apply to its sub-classes.
         conforms, report_graph, report_text = validate(
-            data_graph,
             shacl_graph=self.shacl_graph,
-            inference='rdfs',
-            serialize_report_graph=True
         )
-        return conforms, report_text, data_graph

 import os
+from pathlib import Path
 from rdflib import Graph, Literal, RDF, URIRef, Namespace
+from rdflib.namespace import SKOS, OWL
 from pyshacl import validate
 class SemanticValidator:
+    def __init__(self, ontology_dir="../../ontology", shapes_file="../../ontology/shapes/auto_constraints.ttl"):
+        self.shapes_file = shapes_file
+        # Mappatura namespace
         self.namespaces = {
             "arco": Namespace("https://w3id.org/arco/ontology/arco/"),
             "core": Namespace("https://w3id.org/arco/ontology/core/"),
             "a-loc": Namespace("https://w3id.org/arco/ontology/location/"),
+            "a-cd": Namespace("https://w3id.org/arco/ontology/context-description/"),
             "cis": Namespace("http://dati.beniculturali.it/cis/"),
+            "crm": Namespace("http://www.cidoc-crm.org/cidoc-crm/"),
             "ex": Namespace("http://activadigital.it/ontology/")
         }
+        print("🛡️  Inizializzazione Semantic Validator (OWL RL)...")
+        # Caricamento massivo dell'Ontologia in memoria per il Reasoner
+        self.ont_graph = Graph()
+        arco_path = Path(ontology_dir) / "arco"
+        if arco_path.exists():
+            for owl_file in arco_path.glob("*.owl"):
+                self.ont_graph.parse(str(owl_file), format="xml")
+        cidoc_path = Path(ontology_dir) / "cidoc-crm" / "cidoc-crm.owl"
+        if cidoc_path.exists():
+            self.ont_graph.parse(str(cidoc_path), format="xml")
+        print(f"✅ Ontologia completa caricata nel reasoner ({len(self.ont_graph)} triple).")
         if os.path.exists(self.shapes_file):
             self.shacl_graph = Graph()
             self.shacl_graph.parse(self.shapes_file, format="turtle")
+            print("🛡️  SHACL Auto-Constraints caricati.")
         else:
             print("⚠️  File SHACL non trovato. Validazione disabilitata (pericoloso in prod!).")
             self.shacl_graph = None
     def _get_uri(self, text_val):
         if ":" in text_val and not text_val.startswith("http"):
             prefix, name = text_val.split(":", 1)
             if prefix in self.namespaces:
                 return self.namespaces[prefix][name]
         clean_name = text_val.replace(" ", "_").replace("'", "").replace('"', "")
         return self.namespaces["ex"][clean_name]
     def _json_to_rdf(self, entities, triples):
         g = Graph()
         for prefix, ns in self.namespaces.items():
             g.bind(prefix, ns)
         g.bind("skos", SKOS)
         if entities:
             for ent in entities:
                 label = ent["label"] if isinstance(ent, dict) else str(ent)
                 ent_uri = self._get_uri(label)
                 g.add((ent_uri, SKOS.prefLabel, Literal(label, lang="it")))
         if triples:
             for t in triples:
                 subj_uri = self._get_uri(t.subject)
                 g.add((subj_uri, SKOS.prefLabel, Literal(t.subject, lang="it")))
                 if t.predicate.lower() in ["rdf:type", "a", "type", "rdf_type"]:
                     obj_uri = self._get_uri(t.object)
                     g.add((subj_uri, RDF.type, obj_uri))
                 else:
                     pred_uri = self._get_uri(t.predicate)
                     obj_uri = self._get_uri(t.object)
                     g.add((subj_uri, pred_uri, obj_uri))
                     g.add((obj_uri, SKOS.prefLabel, Literal(t.object, lang="it")))
         return g
+    def filter_valid_triples(self, entities, triples):
         """
+        Esegue la validazione bloccante (OWL RL).
+        Ritorna le triple valide da salvare su Neo4j e quelle invalide da buttare su Mongo.
         """
+        if not self.shacl_graph or not triples:
+            return triples, [], "No Validation"
+        # 1. Testiamo l'intero batch in un colpo solo per massima velocità
+        batch_graph = self._json_to_rdf(entities, triples)
         conforms, report_graph, report_text = validate(
+            batch_graph,
             shacl_graph=self.shacl_graph,
+            ont_graph=self.ont_graph,
+            inference='owlrl'
         )
+        if conforms:
+            return triples, [], "All valid"
+        print("⚠️ Rilevate violazioni SHACL nel blocco. Isolamento colpevoli...")
+        # 2. Se fallisce, isoliamo chirurgicamente le triple non conformi
+        valid_triples = []
+        invalid_triples = []
+        for t in triples:
+            single_graph = self._json_to_rdf(entities, [t])
+            t_conforms, _, t_report = validate(
+                single_graph,
+                shacl_graph=self.shacl_graph,
+                ont_graph=self.ont_graph,
+                inference='owlrl'
+            )
+            if t_conforms:
+                valid_triples.append(t)
+            else:
+                invalid_triples.append({
+                    "triple": t.model_dump() if hasattr(t, 'model_dump') else t,
+                    "violation_report": t_report
+                })
+        return valid_triples, invalid_triples, report_text