"""
Ontology Flattener

Converts hierarchical JSON-LD plant data to flat fact lists for the OG-RAG
HyperGraph.
"""

import sys
from pathlib import Path

# Make the project root importable so the `utils` package resolves when this
# module is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent))

from typing import List, Dict, Any
import json

from utils.key_normalizer import normalize_key
from utils.chunker import chunk_long_value, estimate_tokens
from utils.data_loader import PlantDataLoader
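
# Each flat fact is a small dict keyed by human-readable Vietnamese labels,
# e.g. {"Tên": ..., "Mục": ..., <field>: ..., "_is_chunked": bool}, with an
# extra "_chunk_id" on chunked facts; this is the shape the downstream OG-RAG
# HyperGraph builder is expected to consume.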


def flatten_plant_ontology(
    plant_data: Dict[str, Any],
    chunk_threshold: int = 250
) -> List[Dict[str, Any]]:
    """
    Convert a nested JSON-LD record into a flat fact list with chunking.

    Args:
        plant_data: Nested plant ontology data
        chunk_threshold: Maximum tokens per field value before chunking
            (default: 250)

    Returns:
        List of flat facts suitable for the HyperGraph
    """
    facts = []
    plant_name = plant_data.get("ten", "")  # "ten" = plant name

    if not plant_name:
        return facts

    # Identity fields: "Tên" (name), "Tên khoa học" (scientific name),
    # "Họ" (family).
    basic_fact = {
        "Tên": plant_name,
        "Tên khoa học": plant_data.get("ten_khoa_hoc", ""),
        "Họ": plant_data.get("ho", "")
    }

    # Drop empty fields so the fact stays compact.
    basic_fact = {k: v for k, v in basic_fact.items() if v}
    if basic_fact:
        basic_fact["_is_chunked"] = False
        facts.append(basic_fact)

    # Narrative sections to flatten: description, distribution, uses,
    # preparation, parts used, and other notes.
    sections = [
        "Mô tả", "Phân bố",
        "Công dụng", "Cách dùng", "Bộ phận dùng",
        "Thông tin khác"
    ]

    for section in sections:
        if section not in plant_data:
            continue

        section_data = plant_data[section]
        if not isinstance(section_data, dict):
            continue

        section_label = normalize_key(section)  # "Mục" value for this section

        for field_key, field_value in section_data.items():
            if not field_value:  # skips None, "" and other empty values
                continue

            normalized_key = normalize_key(field_key)
            value_str = str(field_value)

            if estimate_tokens(value_str) > chunk_threshold:
                # Long values are split into (chunk_key, chunk_value, chunk_id)
                # tuples; each chunk becomes its own fact.
                chunks = chunk_long_value(
                    normalized_key,
                    value_str,
                    max_tokens=chunk_threshold
                )

                for chunk_key, chunk_value, chunk_id in chunks:
                    fact = {
                        "Tên": plant_name,
                        "Mục": section_label,  # "Mục" = section
                        chunk_key: chunk_value,
                        "_chunk_id": chunk_id,
                        "_is_chunked": True
                    }
                    facts.append(fact)
            else:
                fact = {
                    "Tên": plant_name,
                    "Mục": section_label,
                    normalized_key: value_str,
                    "_is_chunked": False
                }
                facts.append(fact)

    return facts
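

# A minimal usage sketch, kept as a private helper and not called on import.
# The sample record below is hypothetical (real field names come from the
# JSON-LD files), but the top-level keys mirror the ones handled above.
def _demo_flatten() -> None:
    sample = {
        "ten": "Đinh lăng",                             # name
        "ten_khoa_hoc": "Polyscias fruticosa",          # scientific name
        "Công dụng": {"cong_dung": "bồi bổ, lợi sữa"},  # uses section
    }
    # Prints one identity fact plus one fact per non-empty section field.
    for fact in flatten_plant_ontology(sample):
        print(fact)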


def build_all_plant_facts(
    data_dir: str = "data",
    output_file: str = "plant_facts.json",
    chunk_threshold: int = 250
) -> List[Dict]:
    """
    Process all plants and generate flat facts.

    Args:
        data_dir: Directory containing the JSON-LD files
        output_file: Where to write the facts as JSON; pass a falsy value
            to skip writing
        chunk_threshold: Token threshold for chunking

    Returns:
        List of all facts from all plants
    """
    from tqdm import tqdm  # lazy import: the module works without tqdm

    loader = PlantDataLoader(data_dir)
    all_facts = []

    jsonld_files = sorted(Path(data_dir).glob("ontology_node_*.jsonld"))

    print(f"\nProcessing {len(jsonld_files)} plant files...")

    for jsonld_file in tqdm(jsonld_files, desc="Flattening plants"):
        plant_data = loader._load_jsonld_file(jsonld_file)
        if not plant_data:
            continue

        plant_facts = flatten_plant_ontology(plant_data, chunk_threshold)
        all_facts.extend(plant_facts)

    if output_file:
        print(f"\nSaving {len(all_facts)} facts to {output_file}...")
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(all_facts, f, ensure_ascii=False, indent=2)

    print(f"\n{'=' * 60}")
    print("STATISTICS")
    print(f"{'=' * 60}")
    print(f"Total plants processed: {len(jsonld_files)}")
    print(f"Total facts generated: {len(all_facts)}")
    if jsonld_files:  # guard against division by zero on an empty directory
        print(f"Avg facts per plant: {len(all_facts) / len(jsonld_files):.1f}")

    chunked = [f for f in all_facts if f.get("_is_chunked", False)]
    if all_facts:  # guard against division by zero when no facts were generated
        print(f"Chunked facts: {len(chunked)} "
              f"({len(chunked) / len(all_facts) * 100:.1f}%)")
    print(f"Unchunked facts: {len(all_facts) - len(chunked)}")

    # Tally facts per section ("Mục") for a coverage overview.
    section_counts = {}
    for fact in all_facts:
        if "Mục" in fact:
            section = fact["Mục"]
            section_counts[section] = section_counts.get(section, 0) + 1

    print("\nSection coverage:")
    for section, count in sorted(section_counts.items(), key=lambda x: -x[1]):
        print(f"  {section}: {count}")

    print(f"{'=' * 60}\n")

    return all_facts
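

# Example invocation (the script filename is assumed; substitute this
# module's actual name):
#
#     python flatten_ontology.py data plant_facts.json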


if __name__ == "__main__":
    # Optional CLI arguments: data directory and output file.
    # `sys` is already imported at module level.
    data_dir = sys.argv[1] if len(sys.argv) > 1 else "data"
    output_file = sys.argv[2] if len(sys.argv) > 2 else "plant_facts.json"

    facts = build_all_plant_facts(data_dir, output_file)

    print(f"✅ Done! Generated {len(facts)} facts")
    print(f"📄 Saved to {output_file}")