"""
Ontology Flattener

Converts hierarchical JSON-LD plant data to flat fact lists for the OG-RAG
HyperGraph.
"""

import sys
from pathlib import Path

# Make the project root importable so the `utils` package resolves when this
# module is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent))

from typing import List, Dict, Any
import json

from utils.key_normalizer import normalize_key
from utils.chunker import chunk_long_value, estimate_tokens
from utils.data_loader import PlantDataLoader
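
# Each flat fact is a small dict keyed by human-readable Vietnamese labels,
# e.g. {"Tên": ..., "Mục": ..., <field>: ..., "_is_chunked": bool}, with an
# extra "_chunk_id" on chunked facts; this is the shape the downstream OG-RAG
# HyperGraph builder is expected to consume.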


def flatten_plant_ontology(
    plant_data: Dict[str, Any],
    chunk_threshold: int = 250
) -> List[Dict[str, Any]]:
    """
    Convert a nested JSON-LD record into a flat fact list with chunking.

    Args:
        plant_data: Nested plant ontology data
        chunk_threshold: Maximum tokens per field value before chunking
            (default: 250)

    Returns:
        List of flat facts suitable for the HyperGraph
    """
    facts = []
    plant_name = plant_data.get("ten", "")  # "ten" = plant name

    if not plant_name:
        return facts

    # Identity fields: "Tên" (name), "Tên khoa học" (scientific name),
    # "Họ" (family).
    basic_fact = {
        "Tên": plant_name,
        "Tên khoa học": plant_data.get("ten_khoa_hoc", ""),
        "Họ": plant_data.get("ho", "")
    }

    # Drop empty fields so the fact stays compact.
    basic_fact = {k: v for k, v in basic_fact.items() if v}
    if basic_fact:
        basic_fact["_is_chunked"] = False
        facts.append(basic_fact)

    # Narrative sections to flatten: description, distribution, uses,
    # preparation, parts used, and other notes.
    sections = [
        "Mô tả", "Phân bố",
        "Công dụng", "Cách dùng", "Bộ phận dùng",
        "Thông tin khác"
    ]

    for section in sections:
        if section not in plant_data:
            continue

        section_data = plant_data[section]
        if not isinstance(section_data, dict):
            continue

        section_label = normalize_key(section)  # "Mục" value for this section

        for field_key, field_value in section_data.items():
            if not field_value:  # skips None, "" and other empty values
                continue

            normalized_key = normalize_key(field_key)
            value_str = str(field_value)

            if estimate_tokens(value_str) > chunk_threshold:
                # Long values are split into (chunk_key, chunk_value, chunk_id)
                # tuples; each chunk becomes its own fact.
                chunks = chunk_long_value(
                    normalized_key,
                    value_str,
                    max_tokens=chunk_threshold
                )

                for chunk_key, chunk_value, chunk_id in chunks:
                    fact = {
                        "Tên": plant_name,
                        "Mục": section_label,  # "Mục" = section
                        chunk_key: chunk_value,
                        "_chunk_id": chunk_id,
                        "_is_chunked": True
                    }
                    facts.append(fact)
            else:
                fact = {
                    "Tên": plant_name,
                    "Mục": section_label,
                    normalized_key: value_str,
                    "_is_chunked": False
                }
                facts.append(fact)

    return facts
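

# A minimal usage sketch, kept as a private helper and not called on import.
# The sample record below is hypothetical (real field names come from the
# JSON-LD files), but the top-level keys mirror the ones handled above.
def _demo_flatten() -> None:
    sample = {
        "ten": "Đinh lăng",                             # name
        "ten_khoa_hoc": "Polyscias fruticosa",          # scientific name
        "Công dụng": {"cong_dung": "bồi bổ, lợi sữa"},  # uses section
    }
    # Prints one identity fact plus one fact per non-empty section field.
    for fact in flatten_plant_ontology(sample):
        print(fact)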


def build_all_plant_facts(
    data_dir: str = "data",
    output_file: str = "plant_facts.json",
    chunk_threshold: int = 250
) -> List[Dict]:
    """
    Process all plants and generate flat facts.

    Args:
        data_dir: Directory containing the JSON-LD files
        output_file: Where to write the facts as JSON; pass a falsy value
            to skip writing
        chunk_threshold: Token threshold for chunking

    Returns:
        List of all facts from all plants
    """
    from tqdm import tqdm  # lazy import: the module works without tqdm

    loader = PlantDataLoader(data_dir)
    all_facts = []

    jsonld_files = sorted(Path(data_dir).glob("ontology_node_*.jsonld"))

    print(f"\nProcessing {len(jsonld_files)} plant files...")

    for jsonld_file in tqdm(jsonld_files, desc="Flattening plants"):
        plant_data = loader._load_jsonld_file(jsonld_file)
        if not plant_data:
            continue

        plant_facts = flatten_plant_ontology(plant_data, chunk_threshold)
        all_facts.extend(plant_facts)

    if output_file:
        print(f"\nSaving {len(all_facts)} facts to {output_file}...")
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(all_facts, f, ensure_ascii=False, indent=2)

    print(f"\n{'=' * 60}")
    print("STATISTICS")
    print(f"{'=' * 60}")
    print(f"Total plants processed: {len(jsonld_files)}")
    print(f"Total facts generated: {len(all_facts)}")
    if jsonld_files:  # guard against division by zero on an empty directory
        print(f"Avg facts per plant: {len(all_facts) / len(jsonld_files):.1f}")

    chunked = [f for f in all_facts if f.get("_is_chunked", False)]
    if all_facts:  # guard against division by zero when no facts were generated
        print(f"Chunked facts: {len(chunked)} "
              f"({len(chunked) / len(all_facts) * 100:.1f}%)")
    print(f"Unchunked facts: {len(all_facts) - len(chunked)}")

    # Tally facts per section ("Mục") for a coverage overview.
    section_counts = {}
    for fact in all_facts:
        if "Mục" in fact:
            section = fact["Mục"]
            section_counts[section] = section_counts.get(section, 0) + 1

    print("\nSection coverage:")
    for section, count in sorted(section_counts.items(), key=lambda x: -x[1]):
        print(f"  {section}: {count}")

    print(f"{'=' * 60}\n")

    return all_facts
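

# Example invocation (the script filename is assumed; substitute this
# module's actual name):
#
#     python flatten_ontology.py data plant_facts.json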


if __name__ == "__main__":
    # Optional CLI arguments: data directory and output file.
    # `sys` is already imported at module level.
    data_dir = sys.argv[1] if len(sys.argv) > 1 else "data"
    output_file = sys.argv[2] if len(sys.argv) > 2 else "plant_facts.json"

    facts = build_all_plant_facts(data_dir, output_file)

    print(f"✅ Done! Generated {len(facts)} facts")
    print(f"📄 Saved to {output_file}")