GaetanoParente committed on
Commit
9fb3deb
·
1 Parent(s): b70d82f

refactoring

Browse files
api.py CHANGED
@@ -120,6 +120,10 @@ def run_discovery(payload: DiscoveryRequest):
120
  all_triples.extend(extraction_result.triples)
121
  if hasattr(extraction_result, 'entities') and extraction_result.entities:
122
  all_entities.extend(extraction_result.entities)
 
 
 
 
123
 
124
  if not all_triples:
125
  return {
 
120
  all_triples.extend(extraction_result.triples)
121
  if hasattr(extraction_result, 'entities') and extraction_result.entities:
122
  all_entities.extend(extraction_result.entities)
123
+
124
+ if i < len(chunks) - 1:
125
+ print(f"⏳ Pacing per Groq API: attesa 20s per non sforare i 30K TPM...")
126
+ time.sleep(20)
127
 
128
  if not all_triples:
129
  return {
app.py CHANGED
@@ -171,6 +171,8 @@ with tab_gen:
171
  if st.session_state.pipeline_stage >= 1:
172
  chunks = st.session_state.chunks
173
  st.success(f"Chunking completato! Generati {len(chunks)} frammenti semantici.")
 
 
174
  else:
175
  if st.button("Avvia Semantic Splitter", type="primary"):
176
  with st.spinner("Creazione chunks in corso..."):
@@ -193,7 +195,7 @@ with tab_gen:
193
  with st.container():
194
  color = "white" if is_step_b_unlocked else "gray"
195
  icon = "✅" if st.session_state.pipeline_stage >= 2 else ("2️⃣" if is_step_b_unlocked else "🔒")
196
- st.markdown(f"<h3 style='color:{color}'>{icon} Fase 2: TDDT Extraction (Gemini)</h3>", unsafe_allow_html=True)
197
 
198
  with st.expander("ℹ️ Cosa fa questa fase?"):
199
  st.write("Esegue l'estrazione gerarchica in due passaggi: prima classifica le entità usando le root dell'ontologia, poi estrae le relazioni passando all'LLM solo le proprietà ammesse (Domain Index).")
@@ -204,7 +206,7 @@ with tab_gen:
204
  data = st.session_state.extraction_data
205
  st.success(f"Estrazione TDDT completata! Identificate {len(data['entities'])} entità e {len(data['triples'])} triple.")
206
  with st.expander("Vedi dati estratti (Pre-Validazione)"):
207
- st.write("Entità Inferite:", [e.model_dump() for e in data['entities']])
208
  if data['triples']:
209
  st.dataframe(pd.DataFrame([t.model_dump() for t in data['triples']]), hide_index=True)
210
  else:
@@ -228,6 +230,10 @@ with tab_gen:
228
  if res.triples: all_triples.extend(res.triples)
229
 
230
  prog_bar.progress((i+1)/len(chunks))
 
 
 
 
231
 
232
  # Estraggo le entità univoche dalle triple per il Resolver
233
  unique_entities = list(set([t.subject for t in all_triples] + [t.object for t in all_triples]))
@@ -267,6 +273,10 @@ with tab_gen:
267
  raw_data = st.session_state.extraction_data
268
  all_entities = raw_data.get("entities", [])
269
  all_triples = raw_data.get("triples", [])
 
 
 
 
270
 
271
  resolver = get_resolver()
272
  resolver.driver = driver
@@ -296,8 +306,6 @@ with tab_gen:
296
  except Exception as e:
297
  st.error(f"Errore scrittura DLQ: {e}")
298
 
299
- persister = KnowledgeGraphPersister()
300
- persister.driver = driver # Inietto il driver testato
301
  # Salviamo SOLO le valide
302
  persister.save_entities_and_triples(entities_to_save, valid_triples)
303
 
@@ -321,12 +329,11 @@ with tab_val:
321
  cypher_val = """
322
  MATCH (s)-[r]->(o)
323
  RETURN elementId(r) as id,
324
- COALESCE(s.label, s.name, head(labels(s))) as Soggetto,
325
  type(r) as Predicato,
326
- COALESCE(o.label, o.name, head(labels(o))) as Oggetto,
327
- COALESCE(r.evidence, 'N/A') as Evidenza,
328
- COALESCE(r.reasoning, 'N/A') as Ragionamento
329
- LIMIT 100
330
  """
331
  triples_data = run_query(driver, cypher_val)
332
 
@@ -353,9 +360,9 @@ with tab_vis:
353
  with st.spinner("Estrazione dati e generazione del grafo interattivo..."):
354
  cypher_vis = """
355
  MATCH (s)-[r]->(o)
356
- RETURN COALESCE(s.label, s.name, head(labels(s))) as src,
357
  type(r) as rel,
358
- COALESCE(o.label, o.name, head(labels(o))) as dst
359
  LIMIT 300
360
  """
361
  graph_data = run_query(driver, cypher_vis)
 
171
  if st.session_state.pipeline_stage >= 1:
172
  chunks = st.session_state.chunks
173
  st.success(f"Chunking completato! Generati {len(chunks)} frammenti semantici.")
174
+ with st.expander("Vedi dettagli frammenti"):
175
+ st.json(chunks)
176
  else:
177
  if st.button("Avvia Semantic Splitter", type="primary"):
178
  with st.spinner("Creazione chunks in corso..."):
 
195
  with st.container():
196
  color = "white" if is_step_b_unlocked else "gray"
197
  icon = "✅" if st.session_state.pipeline_stage >= 2 else ("2️⃣" if is_step_b_unlocked else "🔒")
198
+ st.markdown(f"<h3 style='color:{color}'>{icon} Fase 2: TDDT Extraction</h3>", unsafe_allow_html=True)
199
 
200
  with st.expander("ℹ️ Cosa fa questa fase?"):
201
  st.write("Esegue l'estrazione gerarchica in due passaggi: prima classifica le entità usando le root dell'ontologia, poi estrae le relazioni passando all'LLM solo le proprietà ammesse (Domain Index).")
 
206
  data = st.session_state.extraction_data
207
  st.success(f"Estrazione TDDT completata! Identificate {len(data['entities'])} entità e {len(data['triples'])} triple.")
208
  with st.expander("Vedi dati estratti (Pre-Validazione)"):
209
+ st.write("Entità Inferite:", data['entities'])
210
  if data['triples']:
211
  st.dataframe(pd.DataFrame([t.model_dump() for t in data['triples']]), hide_index=True)
212
  else:
 
230
  if res.triples: all_triples.extend(res.triples)
231
 
232
  prog_bar.progress((i+1)/len(chunks))
233
+
234
+ if i < len(chunks) - 1:
235
+ print(f"⏳ Pacing per Groq API: attesa 20s per non sforare i 30K TPM...")
236
+ time.sleep(20)
237
 
238
  # Estraggo le entità univoche dalle triple per il Resolver
239
  unique_entities = list(set([t.subject for t in all_triples] + [t.object for t in all_triples]))
 
273
  raw_data = st.session_state.extraction_data
274
  all_entities = raw_data.get("entities", [])
275
  all_triples = raw_data.get("triples", [])
276
+
277
+ persister = KnowledgeGraphPersister()
278
+ persister.driver = driver
279
+ persister._create_constraints()
280
 
281
  resolver = get_resolver()
282
  resolver.driver = driver
 
306
  except Exception as e:
307
  st.error(f"Errore scrittura DLQ: {e}")
308
 
 
 
309
  # Salviamo SOLO le valide
310
  persister.save_entities_and_triples(entities_to_save, valid_triples)
311
 
 
329
  cypher_val = """
330
  MATCH (s)-[r]->(o)
331
  RETURN elementId(r) as id,
332
+ COALESCE(s["label"], s["name"], head(labels(s))) as Soggetto,
333
  type(r) as Predicato,
334
+ COALESCE(o["label"], o["name"], head(labels(o))) as Oggetto,
335
+ COALESCE(r["evidence"], 'N/A') as Evidenza,
336
+ COALESCE(r["reasoning"], 'N/A') as Ragionamento
 
337
  """
338
  triples_data = run_query(driver, cypher_val)
339
 
 
360
  with st.spinner("Estrazione dati e generazione del grafo interattivo..."):
361
  cypher_vis = """
362
  MATCH (s)-[r]->(o)
363
+ RETURN COALESCE(s["label"], s["name"], head(labels(s))) as src,
364
  type(r) as rel,
365
+ COALESCE(o["label"], o["name"], head(labels(o))) as dst
366
  LIMIT 300
367
  """
368
  graph_data = run_query(driver, cypher_vis)
data/gold_standard/examples.json DELETED
@@ -1,70 +0,0 @@
1
- [
2
- {
3
- "text": "Il Menhir di Canne, situato lungo la strada provinciale, è un monolite calcareo che fungeva da segnacolo funerario o, secondo una teoria recente e dibattuta, da confine territoriale.",
4
- "reasoning": "L'entità fisica e la localizzazione sono fatti certi (1.0). L'uso come segnacolo è consolidato ma non assoluto (0.9), mentre l'uso come confine è esplicitamente presentato come teoria incerta, quindi assegno un'ipotesi (0.6).",
5
- "entities": [
6
- "Menhir di Canne",
7
- "Strada Provinciale",
8
- "Segnacolo funerario",
9
- "Confine territoriale"
10
- ],
11
- "triples": [
12
- {"subject": "Menhir di Canne", "predicate": "core:hasType", "object": "arco:ArchaeologicalProperty", "confidence": 1.0},
13
- {"subject": "Menhir di Canne", "predicate": "a-loc:isLocatedIn", "object": "Strada Provinciale", "confidence": 1.0},
14
- {"subject": "Menhir di Canne", "predicate": "core:hasConcept", "object": "Segnacolo funerario", "confidence": 0.9},
15
- {"subject": "Menhir di Canne", "predicate": "core:hasConcept", "object": "Confine territoriale", "confidence": 0.6}
16
- ]
17
- },
18
- {
19
- "text": "La Battaglia di Canne del 216 a.C. vide la vittoria dell'esercito cartaginese guidato da Annibale. Le dinamiche dell'accerchiamento fanno presumere una conoscenza pregressa del terreno fangoso da parte dei comandanti.",
20
- "reasoning": "La battaglia, la data e gli agenti coinvolti sono certi (1.0). La conoscenza del terreno da parte di Annibale è una deduzione forte derivata dalle tattiche, quindi è un'inferenza logica (0.85).",
21
- "entities": [
22
- "Battaglia di Canne",
23
- "216 a.C.",
24
- "Esercito Cartaginese",
25
- "Annibale",
26
- "Conoscenza del terreno"
27
- ],
28
- "triples": [
29
- {"subject": "Battaglia di Canne", "predicate": "core:hasType", "object": "core:Event", "confidence": 1.0},
30
- {"subject": "Battaglia di Canne", "predicate": "ti:atTime", "object": "216 a.C.", "confidence": 1.0},
31
- {"subject": "Battaglia di Canne", "predicate": "ro:involvesAgent", "object": "Esercito Cartaginese", "confidence": 1.0},
32
- {"subject": "Annibale", "predicate": "core:hasConcept", "object": "Conoscenza del terreno", "confidence": 0.85}
33
- ]
34
- },
35
- {
36
- "text": "L'Antiquarium custodisce un prezioso corredo funerario proveniente dalla necropoli dauna. Alcuni dettagli pittorici sui vasi a figure rosse fanno sospettare un'influenza diretta della bottega del Pittore di Dario. All'ingresso della struttura è esposta anche una piccola stele iscritta.",
37
- "reasoning": "Aggiunta un'entità isolata ('stele iscritta') che non ha relazioni esplicite nel testo con gli altri reperti, ma va comunque tracciata. L'attribuzione alla bottega rimane un'ipotesi (0.5).",
38
- "entities": [
39
- "Antiquarium",
40
- "Corredo funerario",
41
- "Vasi a figure rosse",
42
- "Bottega del Pittore di Dario",
43
- "Stele iscritta"
44
- ],
45
- "triples": [
46
- {"subject": "Antiquarium", "predicate": "core:hasType", "object": "cis:CulturalInstituteOrSite", "confidence": 1.0},
47
- {"subject": "Corredo funerario", "predicate": "a-loc:hasCurrentLocation", "object": "Antiquarium", "confidence": 1.0},
48
- {"subject": "Corredo funerario", "predicate": "core:hasPart", "object": "Vasi a figure rosse", "confidence": 1.0},
49
- {"subject": "Vasi a figure rosse", "predicate": "ro:hasAuthor", "object": "Bottega del Pittore di Dario", "confidence": 0.5}
50
- ]
51
- },
52
- {
53
- "text": "Durante i recenti scavi nell'area nord, sono state rinvenute tre monete puniche d'argento mescolate a ceneri vicino a una struttura di accampamento. In un settore adiacente è stato trovato un elmo in bronzo frammentario.",
54
- "reasoning": "L'elmo in bronzo è un reperto rilevante ma nel testo non è relazionato direttamente a ceneri o monete. Lo estraggo come entità isolata. Le monete e le ceneri suggeriscono un accampamento cartaginese (0.8).",
55
- "entities": [
56
- "Area nord",
57
- "Monete puniche d'argento",
58
- "Struttura di accampamento",
59
- "Accampamento Cartaginese",
60
- "Evento di incendio",
61
- "Elmo in bronzo"
62
- ],
63
- "triples": [
64
- {"subject": "Area nord", "predicate": "core:hasPart", "object": "Monete puniche d'argento", "confidence": 1.0},
65
- {"subject": "Area nord", "predicate": "core:hasPart", "object": "Struttura di accampamento", "confidence": 1.0},
66
- {"subject": "Struttura di accampamento", "predicate": "core:hasConcept", "object": "Accampamento Cartaginese", "confidence": 0.8},
67
- {"subject": "Area nord", "predicate": "core:hasConcept", "object": "Evento di incendio", "confidence": 0.75}
68
- ]
69
- }
70
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ontology/ArCo.owl DELETED
The diff for this file is too large to render. See raw diff
 
ontology/{arco.owl → arco/arco.owl} RENAMED
File without changes
ontology/arco/cis.owl ADDED
The diff for this file is too large to render. See raw diff
 
ontology/{context-description.owl → arco/context-description.owl} RENAMED
File without changes
ontology/{core.owl → arco/core.owl} RENAMED
File without changes
ontology/{location.owl → arco/location.owl} RENAMED
File without changes
ontology/cidoc-crm/cidoc-crm.owl ADDED
The diff for this file is too large to render. See raw diff
 
ontology/domain_index.json ADDED
The diff for this file is too large to render. See raw diff
 
ontology/shapes/auto_constraints.ttl ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -4,6 +4,7 @@ langchain-community>=0.3.0
4
  langchain-google-genai>=2.0.0 # Per Gemini 2.0 Flash (TDDT)
5
  langchain-huggingface>=0.1.0 # Mantenuto per il Semantic Splitter
6
  langchain-core
 
7
  huggingface_hub
8
 
9
  # --- Data Validation ---
@@ -12,6 +13,7 @@ pyshacl
12
 
13
  # --- NLP & Semantic Chunking ---
14
  sentence-transformers
 
15
  scikit-learn
16
  numpy
17
  nltk
 
4
  langchain-google-genai>=2.0.0 # Per Gemini 2.0 Flash (TDDT)
5
  langchain-huggingface>=0.1.0 # Mantenuto per il Semantic Splitter
6
  langchain-core
7
+ langchain-groq
8
  huggingface_hub
9
 
10
  # --- Data Validation ---
 
13
 
14
  # --- NLP & Semantic Chunking ---
15
  sentence-transformers
16
+ torchvision
17
  scikit-learn
18
  numpy
19
  nltk
src/extraction/extractor.py CHANGED
@@ -1,10 +1,12 @@
1
  import os
2
  import json
 
3
  from typing import List, Optional, Dict, Any
4
  from pydantic import BaseModel, Field, ValidationError
5
  from langchain_core.prompts import ChatPromptTemplate
6
  from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
7
  from langchain_google_genai import ChatGoogleGenerativeAI
 
8
  from dotenv import load_dotenv
9
 
10
  load_dotenv()
@@ -53,18 +55,30 @@ class KnowledgeGraphExtraction(BaseModel):
53
 
54
 
55
  class NeuroSymbolicExtractor:
56
- def __init__(self, index_path="../../ontology/schemas/domain_index.json"):
57
  print("🧠 Inizializzazione TDDT Extractor (Type-Driven Domain Traversal)...")
58
 
59
- google_api_key = os.getenv("GOOGLE_API_KEY")
60
- if not google_api_key:
61
- raise ValueError("❌ GOOGLE_API_KEY mancante. Richiesta per Gemini 2.0 Flash.")
62
-
63
- # Inizializzo l'LLM primario. Temperatura 0 per massimizzare il determinismo.
64
- self.llm = ChatGoogleGenerativeAI(
65
- model="gemini-2.0-flash",
 
 
 
 
 
 
 
 
 
 
 
66
  temperature=0,
67
- api_key=google_api_key
 
68
  )
69
 
70
  # Inizializzo le chain con structured output
@@ -100,14 +114,24 @@ class NeuroSymbolicExtractor:
100
  subclasses[uri] = data
101
  return subclasses
102
 
103
- def _execute_with_retry(self, chain, prompt_messages, max_retries=3):
104
- """Self-correction loop unificato."""
 
 
105
  for attempt in range(max_retries):
106
  try:
107
  result = chain.invoke(prompt_messages)
108
  return result
109
  except Exception as e:
110
- print(f"⚠️ Errore (Tentativo {attempt+1}/{max_retries}): {e}")
 
 
 
 
 
 
 
 
111
  if attempt == max_retries - 1:
112
  print("❌ Fallimento critico del task LLM.")
113
  return None
 
1
  import os
2
  import json
3
+ import time
4
  from typing import List, Optional, Dict, Any
5
  from pydantic import BaseModel, Field, ValidationError
6
  from langchain_core.prompts import ChatPromptTemplate
7
  from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
8
  from langchain_google_genai import ChatGoogleGenerativeAI
9
+ from langchain_groq import ChatGroq
10
  from dotenv import load_dotenv
11
 
12
  load_dotenv()
 
55
 
56
 
57
  class NeuroSymbolicExtractor:
58
+ def __init__(self, index_path="./ontology/domain_index.json"):
59
  print("🧠 Inizializzazione TDDT Extractor (Type-Driven Domain Traversal)...")
60
 
61
+ # google_api_key = os.getenv("GOOGLE_API_KEY")
62
+ # if not google_api_key:
63
+ # raise ValueError("❌ GOOGLE_API_KEY mancante. Richiesta per Gemini 2.0 Flash.")
64
+
65
+ # # Inizializzo l'LLM primario. Temperatura 0 per massimizzare il determinismo.
66
+ # self.llm = ChatGoogleGenerativeAI(
67
+ # model="gemini-2.0-flash",
68
+ # temperature=0,
69
+ # api_key=google_api_key
70
+ # )
71
+
72
+ groq_api_key = os.getenv("GROQ_API_KEY")
73
+ if not groq_api_key:
74
+ raise ValueError("❌ GROQ_API_KEY mancante nel file .env.")
75
+
76
+ # Inizializzo l'LLM primario su Groq.
77
+ self.llm = ChatGroq(
78
+ model="meta-llama/llama-4-scout-17b-16e-instruct",
79
  temperature=0,
80
+ api_key=groq_api_key,
81
+ max_retries=5 # Aumentiamo i retry interni di LangChain
82
  )
83
 
84
  # Inizializzo le chain con structured output
 
114
  subclasses[uri] = data
115
  return subclasses
116
 
117
+ def _execute_with_retry(self, chain, prompt_messages, max_retries=4):
118
+ """Self-correction loop con Exponential Backoff per Rate Limits."""
119
+ base_delay = 5
120
+
121
  for attempt in range(max_retries):
122
  try:
123
  result = chain.invoke(prompt_messages)
124
  return result
125
  except Exception as e:
126
+ error_msg = str(e).upper()
127
+ print(error_msg)
128
+ if "429" in error_msg or "RESOURCE_EXHAUSTED" in error_msg:
129
+ wait_time = base_delay * (2 ** attempt)
130
+ print(f"⏳ [Rate Limit] Quota superata. Attendo {wait_time}s prima di riprovare (Tentativo {attempt+1}/{max_retries})...")
131
+ time.sleep(wait_time)
132
+ else:
133
+ print(f"⚠️ Errore (Tentativo {attempt+1}/{max_retries}): {e}")
134
+
135
  if attempt == max_retries - 1:
136
  print("❌ Fallimento critico del task LLM.")
137
  return None
src/graph/graph_loader.py CHANGED
@@ -83,8 +83,6 @@ class KnowledgeGraphPersister:
83
 
84
  print(f"💾 Preparazione Batch di {len(triples)} triple...")
85
 
86
- # Visto che non posso parametrizzare il predicato nella query Cypher,
87
- # raggruppo le triple per tipo di relazione e lancio un batch per ognuna.
88
  batched_by_pred = defaultdict(list)
89
 
90
  for t in triples:
@@ -93,10 +91,13 @@ class KnowledgeGraphPersister:
93
  item = {
94
  "subj_uri": self.sanitize_name(t.subject),
95
  "subj_label": t.subject,
 
96
  "obj_uri": self.sanitize_name(t.object),
97
  "obj_label": t.object,
98
- "conf": float(t.confidence),
99
- "src": t.source or "unknown"
 
 
100
  }
101
  batched_by_pred[safe_pred].append(item)
102
 
@@ -154,7 +155,7 @@ class KnowledgeGraphPersister:
154
  "MERGE (s:Resource {uri: row.subj_uri}) "
155
  "ON CREATE SET s.label = row.subj_label, s.last_updated = datetime() "
156
  "WITH s, row "
157
- "CALL apoc.create.addLabels(s, [replace(row.obj_label, ':', '_')]) YIELD node "
158
  "RETURN count(node)"
159
  )
160
  tx.run(query, batch=batch_data)
@@ -168,8 +169,13 @@ class KnowledgeGraphPersister:
168
  f"ON CREATE SET s.label = row.subj_label "
169
  f"MERGE (o:Resource {{uri: row.obj_uri}}) "
170
  f"ON CREATE SET o.label = row.obj_label "
 
 
 
 
171
  f"MERGE (s)-[r:`{predicate}`]->(o) "
172
- f"SET r.confidence = row.conf, "
 
173
  f" r.source = row.src, "
174
  f" r.last_updated = datetime()"
175
  )
 
83
 
84
  print(f"💾 Preparazione Batch di {len(triples)} triple...")
85
 
 
 
86
  batched_by_pred = defaultdict(list)
87
 
88
  for t in triples:
 
91
  item = {
92
  "subj_uri": self.sanitize_name(t.subject),
93
  "subj_label": t.subject,
94
+ "subj_type": getattr(t, 'subject_type', '').replace(":", "_").replace("-", "_"),
95
  "obj_uri": self.sanitize_name(t.object),
96
  "obj_label": t.object,
97
+ "obj_type": getattr(t, 'object_type', '').replace(":", "_").replace("-", "_"),
98
+ "evidence": getattr(t, 'evidence', 'N/A'),
99
+ "reasoning": getattr(t, 'reasoning', 'N/A'),
100
+ "src": getattr(t, 'source', 'unknown') or 'unknown'
101
  }
102
  batched_by_pred[safe_pred].append(item)
103
 
 
155
  "MERGE (s:Resource {uri: row.subj_uri}) "
156
  "ON CREATE SET s.label = row.subj_label, s.last_updated = datetime() "
157
  "WITH s, row "
158
+ "SET s:$( [replace(row.obj_label, ':', '_')] ) "
159
  "RETURN count(node)"
160
  )
161
  tx.run(query, batch=batch_data)
 
169
  f"ON CREATE SET s.label = row.subj_label "
170
  f"MERGE (o:Resource {{uri: row.obj_uri}}) "
171
  f"ON CREATE SET o.label = row.obj_label "
172
+ f"WITH s, o, row, "
173
+ f" CASE WHEN row.subj_type <> '' THEN [row.subj_type] ELSE [] END AS s_labels, "
174
+ f" CASE WHEN row.obj_type <> '' THEN [row.obj_type] ELSE [] END AS o_labels "
175
+ f"SET s:$(s_labels), o:$(o_labels) "
176
  f"MERGE (s)-[r:`{predicate}`]->(o) "
177
+ f"SET r.evidence = row.evidence, "
178
+ f" r.reasoning = row.reasoning, "
179
  f" r.source = row.src, "
180
  f" r.last_updated = datetime()"
181
  )
src/utils/build_schema.py CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
4
  from collections import defaultdict
5
  from rdflib import Graph, URIRef, BNode, RDF, RDFS, OWL, Namespace
6
 
7
- # --- MAPPA DEI NAMESPACE (Estesa con CIDOC-CRM) ---
8
  ARCO_NAMESPACES = {
9
  "https://w3id.org/arco/ontology/arco/": "arco",
10
  "https://w3id.org/arco/ontology/core/": "core",
@@ -20,8 +20,12 @@ ARCO_NAMESPACES = {
20
  "https://w3id.org/italia/onto/RO/": "ro",
21
  "https://w3id.org/italia/onto/SM/": "sm",
22
  "https://w3id.org/italia/onto/MU/": "mu",
23
- "http://www.cidoc-crm.org/cidoc-crm/": "crm", # Aggiunto CIDOC-CRM
24
- "http://www.w3.org/2002/07/owl#": "owl"
 
 
 
 
25
  }
26
 
27
  def uri_to_qname(uri: URIRef) -> str:
@@ -39,11 +43,9 @@ def uri_to_qname(uri: URIRef) -> str:
39
  return uri_str.split('/')[-1]
40
 
41
  def get_union_classes(g: Graph, bnode: BNode):
42
- """Estrae le classi da un costrutto owl:unionOf (usato spesso in ArCo per domini/range multipli)."""
43
  union_list = g.value(bnode, OWL.unionOf)
44
  classes = []
45
  if union_list:
46
- # Naviga la lista RDF
47
  current = union_list
48
  while current and current != RDF.nil:
49
  item = g.value(current, RDF.first)
@@ -56,7 +58,6 @@ def build_domain_index_and_shacl(ontology_dir: str, output_json: str, output_sha
56
  print(f"⏳ Inizializzazione Graph e caricamento da {ontology_dir}...")
57
  g = Graph()
58
 
59
- # 1. Carica tutti i file .owl
60
  owl_files = list(Path(ontology_dir).glob('**/*.owl'))
61
  for file_path in owl_files:
62
  try:
@@ -70,7 +71,6 @@ def build_domain_index_and_shacl(ontology_dir: str, output_json: str, output_sha
70
  classes_dict = {}
71
  properties_list = []
72
 
73
- # 2. Estrazione Classi e Gerarchia
74
  for s in g.subjects(RDF.type, OWL.Class):
75
  if isinstance(s, BNode): continue
76
 
@@ -78,7 +78,6 @@ def build_domain_index_and_shacl(ontology_dir: str, output_json: str, output_sha
78
  label = g.value(s, RDFS.label)
79
  comment = g.value(s, RDFS.comment)
80
 
81
- # Filtro lingua: preferisco italiano, altrimenti inglese (per CIDOC-CRM)
82
  label_str = str(label) if label else qname
83
  for lang_label in g.objects(s, RDFS.label):
84
  if lang_label.language == 'it': label_str = str(lang_label)
@@ -87,7 +86,6 @@ def build_domain_index_and_shacl(ontology_dir: str, output_json: str, output_sha
87
  for lang_comment in g.objects(s, RDFS.comment):
88
  if lang_comment.language == 'it': desc_str = str(lang_comment)
89
 
90
- # Trova parent diretti
91
  parents = [uri_to_qname(p) for p in g.objects(s, RDFS.subClassOf) if isinstance(p, URIRef)]
92
 
93
  classes_dict[qname] = {
@@ -97,7 +95,6 @@ def build_domain_index_and_shacl(ontology_dir: str, output_json: str, output_sha
97
  "namespace": qname.split(":")[0] if ":" in qname else "unknown"
98
  }
99
 
100
- # 3. Estrazione Proprietà
101
  for prop_type in [OWL.ObjectProperty, OWL.DatatypeProperty]:
102
  for s in g.subjects(RDF.type, prop_type):
103
  if isinstance(s, BNode): continue
@@ -106,7 +103,6 @@ def build_domain_index_and_shacl(ontology_dir: str, output_json: str, output_sha
106
  label = g.value(s, RDFS.label)
107
  label_str = str(label) if label else qname
108
 
109
- # Dominio
110
  domain_node = g.value(s, RDFS.domain)
111
  domains = []
112
  if isinstance(domain_node, URIRef):
@@ -114,7 +110,6 @@ def build_domain_index_and_shacl(ontology_dir: str, output_json: str, output_sha
114
  elif isinstance(domain_node, BNode):
115
  domains.extend(get_union_classes(g, domain_node))
116
 
117
- # Range
118
  range_node = g.value(s, RDFS.range)
119
  ranges = []
120
  if isinstance(range_node, URIRef):
@@ -129,10 +124,7 @@ def build_domain_index_and_shacl(ontology_dir: str, output_json: str, output_sha
129
  "ranges": ranges
130
  })
131
 
132
- # 4. Calcolo Ereditarietà Transitiva per il Domain Index
133
  properties_by_domain = defaultdict(list)
134
-
135
- # Mappo prima le proprietà ai domini espliciti
136
  for prop in properties_list:
137
  for d in prop["domains"]:
138
  properties_by_domain[d].append({
@@ -142,7 +134,6 @@ def build_domain_index_and_shacl(ontology_dir: str, output_json: str, output_sha
142
  "inherited_from": d
143
  })
144
 
145
- # Funzione ricorsiva per raccogliere proprietà dai parent
146
  def get_inherited_properties(class_qname, visited=None):
147
  if visited is None: visited = set()
148
  if class_qname in visited: return []
@@ -152,7 +143,6 @@ def build_domain_index_and_shacl(ontology_dir: str, output_json: str, output_sha
152
  for parent in classes_dict.get(class_qname, {}).get("parents", []):
153
  inherited = get_inherited_properties(parent, visited)
154
  for p in inherited:
155
- # Evito duplicati
156
  if not any(existing["id"] == p["id"] for existing in props):
157
  props.append(p)
158
  return props
@@ -163,12 +153,10 @@ def build_domain_index_and_shacl(ontology_dir: str, output_json: str, output_sha
163
  if all_props:
164
  final_properties_by_domain[cls] = all_props
165
 
166
- # 5. Generazione Text Embeddings Dictionary
167
  class_embeddings_texts = {
168
  k: f"{v['label']} - {v['description']}" for k, v in classes_dict.items() if v['description']
169
  }
170
 
171
- # 6. Salvataggio domain_index.json
172
  domain_index = {
173
  "classes": classes_dict,
174
  "properties_by_domain": final_properties_by_domain,
@@ -180,7 +168,6 @@ def build_domain_index_and_shacl(ontology_dir: str, output_json: str, output_sha
180
  json.dump(domain_index, f, ensure_ascii=False, indent=2)
181
  print(f"💾 Salvato Indice di Dominio in: {output_json}")
182
 
183
- # 7. Generazione auto_constraints.ttl per SHACL
184
  os.makedirs(os.path.dirname(output_shacl), exist_ok=True)
185
  with open(output_shacl, 'w', encoding='utf-8') as f:
186
  f.write("@prefix sh: <http://www.w3.org/ns/shacl#> .\n")
@@ -192,28 +179,31 @@ def build_domain_index_and_shacl(ontology_dir: str, output_json: str, output_sha
192
  shape_count = 0
193
  for prop in properties_list:
194
  safe_id = prop["id"].replace(":", "_").replace("-", "_")
195
-
196
- # Domain Shape (solo se domain esplicito singolo per non creare conflitti con le Union)
197
  if len(prop["domains"]) == 1:
198
  dom = prop["domains"][0]
199
- f.write(f"ex:{safe_id}_DomainShape a sh:NodeShape ;\n")
200
- f.write(f" sh:targetSubjectsOf {prop['id']} ;\n")
201
- f.write(f" sh:class {dom} .\n\n")
202
- shape_count += 1
 
203
 
204
- # Range Shape (solo se range esplicito singolo)
205
- if len(prop["ranges"]) == 1 and "http" not in prop["ranges"][0]: # Evito XSD datatypes complessi
206
  rng = prop["ranges"][0]
207
- f.write(f"ex:{safe_id}_RangeShape a sh:NodeShape ;\n")
208
- f.write(f" sh:targetObjectsOf {prop['id']} ;\n")
209
- f.write(f" sh:class {rng} .\n\n")
210
- shape_count += 1
 
 
 
 
211
 
212
  print(f"🛡️ Generato SHACL auto_constraints.ttl con {shape_count} regole rigorose in: {output_shacl}")
213
 
214
  if __name__ == "__main__":
215
- ONTOLOGY_FOLDER = "../../ontology/"
216
- OUTPUT_JSON = "../../ontology/schemas/domain_index.json"
217
- OUTPUT_SHACL = "../../ontology/schemas/auto_constraints.ttl"
218
 
219
  build_domain_index_and_shacl(ONTOLOGY_FOLDER, OUTPUT_JSON, OUTPUT_SHACL)
 
4
  from collections import defaultdict
5
  from rdflib import Graph, URIRef, BNode, RDF, RDFS, OWL, Namespace
6
 
7
+ # --- MAPPA DEI NAMESPACE--
8
  ARCO_NAMESPACES = {
9
  "https://w3id.org/arco/ontology/arco/": "arco",
10
  "https://w3id.org/arco/ontology/core/": "core",
 
20
  "https://w3id.org/italia/onto/RO/": "ro",
21
  "https://w3id.org/italia/onto/SM/": "sm",
22
  "https://w3id.org/italia/onto/MU/": "mu",
23
+ "http://www.cidoc-crm.org/cidoc-crm/": "crm",
24
+ "http://www.w3.org/2002/07/owl#": "owl",
25
+ "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
26
+ "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
27
+ "http://www.w3.org/2001/XMLSchema#": "xsd",
28
+ "http://www.w3.org/2004/02/skos/core#": "skos"
29
  }
30
 
31
  def uri_to_qname(uri: URIRef) -> str:
 
43
  return uri_str.split('/')[-1]
44
 
45
  def get_union_classes(g: Graph, bnode: BNode):
 
46
  union_list = g.value(bnode, OWL.unionOf)
47
  classes = []
48
  if union_list:
 
49
  current = union_list
50
  while current and current != RDF.nil:
51
  item = g.value(current, RDF.first)
 
58
  print(f"⏳ Inizializzazione Graph e caricamento da {ontology_dir}...")
59
  g = Graph()
60
 
 
61
  owl_files = list(Path(ontology_dir).glob('**/*.owl'))
62
  for file_path in owl_files:
63
  try:
 
71
  classes_dict = {}
72
  properties_list = []
73
 
 
74
  for s in g.subjects(RDF.type, OWL.Class):
75
  if isinstance(s, BNode): continue
76
 
 
78
  label = g.value(s, RDFS.label)
79
  comment = g.value(s, RDFS.comment)
80
 
 
81
  label_str = str(label) if label else qname
82
  for lang_label in g.objects(s, RDFS.label):
83
  if lang_label.language == 'it': label_str = str(lang_label)
 
86
  for lang_comment in g.objects(s, RDFS.comment):
87
  if lang_comment.language == 'it': desc_str = str(lang_comment)
88
 
 
89
  parents = [uri_to_qname(p) for p in g.objects(s, RDFS.subClassOf) if isinstance(p, URIRef)]
90
 
91
  classes_dict[qname] = {
 
95
  "namespace": qname.split(":")[0] if ":" in qname else "unknown"
96
  }
97
 
 
98
  for prop_type in [OWL.ObjectProperty, OWL.DatatypeProperty]:
99
  for s in g.subjects(RDF.type, prop_type):
100
  if isinstance(s, BNode): continue
 
103
  label = g.value(s, RDFS.label)
104
  label_str = str(label) if label else qname
105
 
 
106
  domain_node = g.value(s, RDFS.domain)
107
  domains = []
108
  if isinstance(domain_node, URIRef):
 
110
  elif isinstance(domain_node, BNode):
111
  domains.extend(get_union_classes(g, domain_node))
112
 
 
113
  range_node = g.value(s, RDFS.range)
114
  ranges = []
115
  if isinstance(range_node, URIRef):
 
124
  "ranges": ranges
125
  })
126
 
 
127
  properties_by_domain = defaultdict(list)
 
 
128
  for prop in properties_list:
129
  for d in prop["domains"]:
130
  properties_by_domain[d].append({
 
134
  "inherited_from": d
135
  })
136
 
 
137
  def get_inherited_properties(class_qname, visited=None):
138
  if visited is None: visited = set()
139
  if class_qname in visited: return []
 
143
  for parent in classes_dict.get(class_qname, {}).get("parents", []):
144
  inherited = get_inherited_properties(parent, visited)
145
  for p in inherited:
 
146
  if not any(existing["id"] == p["id"] for existing in props):
147
  props.append(p)
148
  return props
 
153
  if all_props:
154
  final_properties_by_domain[cls] = all_props
155
 
 
156
  class_embeddings_texts = {
157
  k: f"{v['label']} - {v['description']}" for k, v in classes_dict.items() if v['description']
158
  }
159
 
 
160
  domain_index = {
161
  "classes": classes_dict,
162
  "properties_by_domain": final_properties_by_domain,
 
168
  json.dump(domain_index, f, ensure_ascii=False, indent=2)
169
  print(f"💾 Salvato Indice di Dominio in: {output_json}")
170
 
 
171
  os.makedirs(os.path.dirname(output_shacl), exist_ok=True)
172
  with open(output_shacl, 'w', encoding='utf-8') as f:
173
  f.write("@prefix sh: <http://www.w3.org/ns/shacl#> .\n")
 
179
  shape_count = 0
180
  for prop in properties_list:
181
  safe_id = prop["id"].replace(":", "_").replace("-", "_")
182
+
 
183
  if len(prop["domains"]) == 1:
184
  dom = prop["domains"][0]
185
+ if ":" in dom and ":" in prop["id"]:
186
+ f.write(f"ex:{safe_id}_DomainShape a sh:NodeShape ;\n")
187
+ f.write(f" sh:targetSubjectsOf {prop['id']} ;\n")
188
+ f.write(f" sh:class {dom} .\n\n")
189
+ shape_count += 1
190
 
191
+ if len(prop["ranges"]) == 1:
 
192
  rng = prop["ranges"][0]
193
+ if ":" in rng and ":" in prop["id"]:
194
+ f.write(f"ex:{safe_id}_RangeShape a sh:NodeShape ;\n")
195
+ f.write(f" sh:targetObjectsOf {prop['id']} ;\n")
196
+ if rng.startswith("xsd:") or rng == "rdfs:Literal":
197
+ f.write(f" sh:datatype {rng} .\n\n")
198
+ else:
199
+ f.write(f" sh:class {rng} .\n\n")
200
+ shape_count += 1
201
 
202
  print(f"🛡️ Generato SHACL auto_constraints.ttl con {shape_count} regole rigorose in: {output_shacl}")
203
 
204
  if __name__ == "__main__":
205
+ ONTOLOGY_FOLDER = "./ontology/"
206
+ OUTPUT_JSON = "./ontology/domain_index.json"
207
+ OUTPUT_SHACL = "./ontology/shapes/auto_constraints.ttl"
208
 
209
  build_domain_index_and_shacl(ONTOLOGY_FOLDER, OUTPUT_JSON, OUTPUT_SHACL)
src/validation/validator.py CHANGED
@@ -1,11 +1,12 @@
1
  import os
 
2
  from pathlib import Path
3
  from rdflib import Graph, Literal, RDF, URIRef, Namespace
4
  from rdflib.namespace import SKOS, OWL
5
  from pyshacl import validate
6
 
7
  class SemanticValidator:
8
- def __init__(self, ontology_dir="../../ontology", shapes_file="../../ontology/shapes/auto_constraints.ttl"):
9
  self.shapes_file = shapes_file
10
 
11
  # Mappatura namespace
@@ -49,7 +50,12 @@ class SemanticValidator:
49
  if prefix in self.namespaces:
50
  return self.namespaces[prefix][name]
51
 
52
- clean_name = text_val.replace(" ", "_").replace("'", "").replace('"', "")
 
 
 
 
 
53
  return self.namespaces["ex"][clean_name]
54
 
55
  def _json_to_rdf(self, entities, triples):
 
1
  import os
2
+ import re
3
  from pathlib import Path
4
  from rdflib import Graph, Literal, RDF, URIRef, Namespace
5
  from rdflib.namespace import SKOS, OWL
6
  from pyshacl import validate
7
 
8
  class SemanticValidator:
9
+ def __init__(self, ontology_dir="./ontology", shapes_file="./ontology/shapes/auto_constraints.ttl"):
10
  self.shapes_file = shapes_file
11
 
12
  # Mappatura namespace
 
50
  if prefix in self.namespaces:
51
  return self.namespaces[prefix][name]
52
 
53
+ clean_name = text_val.replace(" ", "_")
54
+ clean_name = re.sub(r'[^a-zA-Z0-9_]', '', clean_name)
55
+
56
+ if not clean_name:
57
+ clean_name = "UnknownEntity"
58
+
59
  return self.namespaces["ex"][clean_name]
60
 
61
  def _json_to_rdf(self, entities, triples):