YanBoChen
WIP: Remove obsolete files and implement cloud data loading for customization and retrieval systems
d603ef9
"""Customization System Cloud Configuration"""
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
import logging
logger = logging.getLogger(__name__)
class CustomizationCloudLoader:
"""Customization-specific cloud data loader"""
def __init__(self):
self.dataset_repo = "ybchen928/oncall-guide-ai-models"
self.use_cloud = os.getenv('USE_CLOUD_DATA', 'true').lower() == 'true'
def get_processing_file_path(self, relative_path: str) -> str:
"""Get processing file path for Customization Pipeline"""
if self.use_cloud:
return hf_hub_download(
repo_id=self.dataset_repo,
filename=f"customization_data/processing/{relative_path}",
repo_type="dataset"
)
else:
# Local development mode - correct path to processing folder
base_path = Path(__file__).parent.parent.parent / "customization" / "processing"
return str(base_path / relative_path)
def preload_all_processing_files(self) -> tuple:
"""Preload all processing files and return directory paths"""
if self.use_cloud:
# Download all required files
files_to_download = [
"embeddings/document_index.json",
"embeddings/tag_embeddings.json",
"embeddings/document_tag_mapping.json",
"embeddings/chunk_embeddings.json",
"indices/chunk_mappings.json",
"indices/tag_mappings.json",
"indices/annoy_metadata.json",
"indices/chunk_embeddings.ann",
"indices/tag_embeddings.ann",
"mapping.json"
]
# Download each file to ensure they're all cached
for file_path in files_to_download:
try:
self.get_processing_file_path(file_path)
logger.info(f"Downloaded: {file_path}")
except Exception as e:
logger.warning(f"Failed to download {file_path}: {e}")
# Get directory paths from downloaded files
embeddings_dir = Path(self.get_processing_file_path("embeddings/document_index.json")).parent
indices_dir = Path(self.get_processing_file_path("indices/chunk_mappings.json")).parent
return str(embeddings_dir), str(indices_dir)
else:
# Local development mode
base_path = Path(__file__).parent.parent.parent / "customization" / "processing"
return str(base_path / "embeddings"), str(base_path / "indices")
# Global instance
customization_loader = CustomizationCloudLoader()