Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Prepares the repository for deployment to Hugging Face Spaces. | |
| This script: | |
| 1. Verifies that pre-processed data exists | |
| 2. Checks that PDF files aren't included in the repository | |
| 3. Lists the files that will be included in the deployment | |
| 4. Provides instructions for deploying to Hugging Face | |
| Run this after successfully pre-processing your data with preprocess_data.py | |
| """ | |
| import os | |
| import sys | |
| from pathlib import Path | |
| import shutil | |
# Locations of the pre-processed artifacts this script verifies. The paths are
# relative, so they resolve against the current working directory.
PROCESSED_DATA_DIR = Path("processed_data")
CHUNKS_FILE = PROCESSED_DATA_DIR.joinpath("document_chunks.pkl")
QDRANT_DIR = PROCESSED_DATA_DIR.joinpath("qdrant_vectorstore")

# Directory that is scanned for PDF files.
DATA_DIR = Path("data")
def check_preprocessed_data(processed_dir=None, chunks_file=None, qdrant_dir=None):
    """Check that the pre-processed data exists and looks ready for deployment.

    Args:
        processed_dir: Directory expected to hold the processed data.
            Defaults to ``PROCESSED_DATA_DIR``.
        chunks_file: Document chunks file. Defaults to ``CHUNKS_FILE``.
        qdrant_dir: Vector store directory. Defaults to ``QDRANT_DIR``.

    Returns:
        True when the three paths exist with the expected kind (directory,
        file, directory); False as soon as one of them is missing or has the
        wrong kind. A sparsely populated vector store only triggers a warning
        and does not fail the check.
    """
    processed_dir = PROCESSED_DATA_DIR if processed_dir is None else Path(processed_dir)
    chunks_file = CHUNKS_FILE if chunks_file is None else Path(chunks_file)
    qdrant_dir = QDRANT_DIR if qdrant_dir is None else Path(qdrant_dir)

    print("\n=== Checking for pre-processed data ===")

    # is_dir()/is_file() rather than exists(): a directory sitting where the
    # chunks file should be (or a file where a directory should be) is not
    # usable data and must not pass the check.
    requirements = (
        ("processed data directory", processed_dir, Path.is_dir),
        ("document chunks file", chunks_file, Path.is_file),
        ("vector store directory", qdrant_dir, Path.is_dir),
    )
    for label, path, has_expected_kind in requirements:
        if not has_expected_kind(path):
            print(f"❌ ERROR: {label.capitalize()} not found: {path}")
            print(" Please run scripts/preprocess_data.py first.")
            return False

    for label, path, _ in requirements:
        print(f"✅ Found {label}: {path}")

    # Heuristic only: a populated vector store is expected to contain at least
    # a handful of entries. The count includes sub-directories as well as files.
    min_entries = 5
    entries = list(qdrant_dir.glob("**/*"))
    if len(entries) < min_entries:
        print("⚠️ WARNING: Vector store directory might be empty or incomplete.")
        print(f" Only found {len(entries)} files/directories in {qdrant_dir}")
    else:
        print(f"✅ Vector store directory contains {len(entries)} files/directories")
    return True
def check_for_pdf_files(data_dir=None):
    """Warn about PDF files present in the data directory.

    The check is advisory: it reports what it finds but never fails.

    Args:
        data_dir: Directory to scan recursively. Defaults to ``DATA_DIR``.

    Returns:
        Always True.
    """
    data_dir = DATA_DIR if data_dir is None else Path(data_dir)

    print("\n=== Checking for PDF files ===")

    # Compare a lower-cased suffix instead of globbing for "*.pdf": that glob
    # is case-sensitive on POSIX file systems and would miss "REPORT.PDF".
    # Sorting keeps the "first five" preview stable between runs.
    pdf_files = sorted(
        path
        for path in data_dir.glob("**/*")
        if path.is_file() and path.suffix.lower() == ".pdf"
    )

    if not pdf_files:
        print("✅ No PDF files found in the data directory - good!")
        return True

    preview_limit = 5
    print(f"⚠️ WARNING: Found {len(pdf_files)} PDF files in the data directory.")
    print(" These files will NOT be committed to the repository if you follow the instructions below.")
    # NOTE(review): the .gitignore claim below is not verified by this script.
    print(" PDFs in the data directory are excluded in .gitignore.")
    for pdf in pdf_files[:preview_limit]:
        print(f" - {pdf}")
    if len(pdf_files) > preview_limit:
        print(f" - ... and {len(pdf_files) - preview_limit} more")
    return True
def list_deployment_files(root=None):
    """List the essential deployment files and report which ones are present.

    Args:
        root: Directory the entries are resolved against. Defaults to the
            current working directory.

    Returns:
        Always True; missing entries are reported but do not fail the run.
    """
    base = Path(".") if root is None else Path(root)

    print("\n=== Files to include in deployment ===")
    essential_files = [
        "app.py",
        "requirements.txt",
        "Dockerfile",
        "docker-compose.yml",
        "README.md",
        "scripts/docker-entrypoint.sh",
        ".dockerignore",
        "processed_data/",
    ]
    print("The following files and directories should be included in your deployment:")
    for entry in essential_files:
        # pathlib drops a trailing "/" when building the path, so directory
        # entries such as "processed_data/" need no special handling.
        if (base / entry).exists():
            print(f"✅ {entry}")
        else:
            print(f"❌ {entry} - Not found!")
    return True
def provide_deployment_instructions():
    """Print the step-by-step guide for pushing the app to Hugging Face Spaces.

    Returns:
        Always True; this step only prints text.
    """
    # The empty first and last entries reproduce the blank line that precedes
    # and follows the guide once the lines are joined.
    guide_lines = (
        "",
        "To deploy to Hugging Face Spaces:",
        "1. Ensure you have the Hugging Face CLI installed:",
        "pip install huggingface_hub",
        "2. Log in to Hugging Face:",
        "huggingface-cli login",
        "3. Create a new Hugging Face Space with Docker deployment from the Hugging Face website",
        "4. Add your repository as a remote:",
        "git remote add huggingface https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME",
        "5. Stage the necessary files (do NOT include PDF files):",
        "git add app.py requirements.txt Dockerfile docker-compose.yml README.md scripts/docker-entrypoint.sh .dockerignore processed_data/",
        "6. Commit the changes:",
        'git commit -m "Prepare for Hugging Face deployment with pre-processed data"',
        "7. Push to Hugging Face:",
        "git push huggingface main",
        "8. Set your OpenAI API key in the Hugging Face Space settings",
        "",
    )
    heading = "\n=== Deployment Instructions ==="
    print(heading)
    print("\n".join(guide_lines))
    return True
def main():
    """Run every preparation step and exit with status 1 if any step fails.

    All steps are executed even when an earlier one fails, so the user sees
    the complete report in a single run.
    """
    banner = "=" * 80
    print(banner)
    print("PREPARING FOR HUGGING FACE DEPLOYMENT")
    print(banner)

    steps = (
        check_preprocessed_data,
        check_for_pdf_files,
        list_deployment_files,
        provide_deployment_instructions,
    )
    # Collect into a list first: feeding a generator to all() would stop at
    # the first failing step and silently skip the remaining reports.
    results = [step() for step in steps]

    if all(results):
        print("\n✅ All checks passed! Your repository is ready for deployment to Hugging Face Spaces.")
        print(" Follow the deployment instructions above to deploy your application.")
    else:
        print("\n❌ Some checks failed. Please fix the issues before deploying.")
        sys.exit(1)


if __name__ == "__main__":
    main()