Spaces:

hirthickraj2015
/

graphwiz-ireland

Sleeping

graphwiz-ireland / src /dataset_loader.py

hirthickraj2015

Use older compatible versions: sentence-transformers 2.2.2, transformers 4.30.2, torch 2.0.1, huggingface-hub 0.16.4 - fix dataset loader for older API

1586c67 12 days ago

raw

history blame

3.36 kB

	"""
	Dataset Loader for Hugging Face Datasets
	Downloads dataset files from HF Datasets repository if not present locally
	"""

	import os
	from pathlib import Path
	from huggingface_hub import hf_hub_download
	import streamlit as st

	# Dataset configuration
	DATASET_REPO = "hirthickraj2015/graphwiz-ireland-dataset"
	DATASET_FILES = [
	"chunks.json",
	"graphrag_index.json",
	"graphrag_graphs.pkl",
	"hybrid_hnsw_index.bin",
	"hybrid_indexes.pkl",
	"ireland_articles.json",
	"page_titles.json",
	"chunk_stats.json",
	"graphrag_stats.json",
	"extraction_stats.json",
	"extraction_progress.json"
	]

	def ensure_dataset_files(dataset_dir: str = "dataset/wikipedia_ireland") -> bool:
	"""
	Ensure all dataset files are available locally.
	Downloads from HF Datasets if missing.

	Args:
	dataset_dir: Local directory for dataset files

	Returns:
	True if all files are available, False otherwise
	"""
	dataset_path = Path(dataset_dir)
	dataset_path.mkdir(parents=True, exist_ok=True)

	missing_files = []
	for filename in DATASET_FILES:
	file_path = dataset_path / filename
	if not file_path.exists():
	missing_files.append(filename)

	if not missing_files:
	print(f"[INFO] All dataset files present locally in {dataset_dir}")
	return True

	print(f"[INFO] Missing {len(missing_files)} files, downloading from HF Datasets...")

	# Download missing files
	import shutil
	try:
	for filename in missing_files:
	print(f"[INFO] Downloading {filename}...")
	if hasattr(st, 'status'):
	with st.status(f"Downloading {filename}...", expanded=True) as status:
	downloaded_path = hf_hub_download(
	repo_id=DATASET_REPO,
	filename=filename,
	repo_type="dataset"
	)
	# Move to target directory
	target_path = dataset_path / filename
	shutil.copy2(downloaded_path, target_path)
	status.update(label=f"✓ Downloaded {filename}", state="complete")
	else:
	downloaded_path = hf_hub_download(
	repo_id=DATASET_REPO,
	filename=filename,
	repo_type="dataset"
	)
	# Move to target directory
	target_path = dataset_path / filename
	shutil.copy2(downloaded_path, target_path)
	print(f"[SUCCESS] Downloaded {filename}")

	print("[SUCCESS] All dataset files downloaded successfully!")
	return True

	except Exception as e:
	print(f"[ERROR] Failed to download dataset files: {e}")
	if hasattr(st, 'error'):
	st.error(f"Failed to download dataset files: {e}")
	return False


	def get_dataset_path(filename: str, dataset_dir: str = "dataset/wikipedia_ireland") -> str:
	"""
	Get full path to a dataset file, downloading if necessary.

	Args:
	filename: Name of the dataset file
	dataset_dir: Local directory for dataset files

	Returns:
	Full path to the dataset file
	"""
	# Ensure dataset files are available
	ensure_dataset_files(dataset_dir)

	return str(Path(dataset_dir) / filename)