graphwiz-ireland / src /dataset_loader.py
hirthickraj2015's picture
Use older compatible versions: sentence-transformers 2.2.2, transformers 4.30.2, torch 2.0.1, huggingface-hub 0.16.4 - fix dataset loader for older API
1586c67
raw
history blame
3.36 kB
"""
Dataset Loader for Hugging Face Datasets
Downloads dataset files from HF Datasets repository if not present locally
"""
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
import streamlit as st
# Dataset configuration
DATASET_REPO = "hirthickraj2015/graphwiz-ireland-dataset"
DATASET_FILES = [
"chunks.json",
"graphrag_index.json",
"graphrag_graphs.pkl",
"hybrid_hnsw_index.bin",
"hybrid_indexes.pkl",
"ireland_articles.json",
"page_titles.json",
"chunk_stats.json",
"graphrag_stats.json",
"extraction_stats.json",
"extraction_progress.json"
]
def ensure_dataset_files(dataset_dir: str = "dataset/wikipedia_ireland") -> bool:
"""
Ensure all dataset files are available locally.
Downloads from HF Datasets if missing.
Args:
dataset_dir: Local directory for dataset files
Returns:
True if all files are available, False otherwise
"""
dataset_path = Path(dataset_dir)
dataset_path.mkdir(parents=True, exist_ok=True)
missing_files = []
for filename in DATASET_FILES:
file_path = dataset_path / filename
if not file_path.exists():
missing_files.append(filename)
if not missing_files:
print(f"[INFO] All dataset files present locally in {dataset_dir}")
return True
print(f"[INFO] Missing {len(missing_files)} files, downloading from HF Datasets...")
# Download missing files
import shutil
try:
for filename in missing_files:
print(f"[INFO] Downloading {filename}...")
if hasattr(st, 'status'):
with st.status(f"Downloading {filename}...", expanded=True) as status:
downloaded_path = hf_hub_download(
repo_id=DATASET_REPO,
filename=filename,
repo_type="dataset"
)
# Move to target directory
target_path = dataset_path / filename
shutil.copy2(downloaded_path, target_path)
status.update(label=f"βœ“ Downloaded {filename}", state="complete")
else:
downloaded_path = hf_hub_download(
repo_id=DATASET_REPO,
filename=filename,
repo_type="dataset"
)
# Move to target directory
target_path = dataset_path / filename
shutil.copy2(downloaded_path, target_path)
print(f"[SUCCESS] Downloaded {filename}")
print("[SUCCESS] All dataset files downloaded successfully!")
return True
except Exception as e:
print(f"[ERROR] Failed to download dataset files: {e}")
if hasattr(st, 'error'):
st.error(f"Failed to download dataset files: {e}")
return False
def get_dataset_path(filename: str, dataset_dir: str = "dataset/wikipedia_ireland") -> str:
"""
Get full path to a dataset file, downloading if necessary.
Args:
filename: Name of the dataset file
dataset_dir: Local directory for dataset files
Returns:
Full path to the dataset file
"""
# Ensure dataset files are available
ensure_dataset_files(dataset_dir)
return str(Path(dataset_dir) / filename)