Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| import json | |
| import argparse | |
| from pathlib import Path | |
| import chromadb | |
| from chromadb.config import Settings | |
def export_collection(collection, output_dir: Path, include_embeddings: bool = False) -> None:
    """Export one ChromaDB collection to ``<output_dir>/<collection.name>.json``.

    The file contains a JSON array with one object per record holding
    ``id``, ``document``, ``metadata`` and, when requested, ``embedding``.

    Args:
        collection: Object exposing ``name`` and ``get(include=[...])``; the
            result must be a mapping with an ``"ids"`` list plus one parallel
            sequence (or ``None``) for every included field.
        output_dir: Existing directory the JSON file is written into.
        include_embeddings: Also fetch and export each record's embedding.
    """
    # Pull everything in one call (large collections may need pagination).
    include_fields = ["documents", "metadatas"]
    if include_embeddings:
        include_fields.append("embeddings")
    items = collection.get(include=include_fields)

    # Compare against None instead of relying on truthiness: the embeddings
    # container may be a NumPy array, whose truth value is ambiguous.
    documents = items.get("documents")
    metadatas = items.get("metadatas")
    embeddings = items.get("embeddings") if include_embeddings else None

    data = []
    for idx, record_id in enumerate(items["ids"]):
        record = {
            "id": record_id,
            "document": documents[idx] if documents is not None else None,
            "metadata": metadatas[idx] if metadatas is not None else None,
        }
        if include_embeddings:
            vector = embeddings[idx] if embeddings is not None else None
            # ndarray rows are not JSON serializable; plain lists pass through.
            record["embedding"] = vector.tolist() if hasattr(vector, "tolist") else vector
        data.append(record)

    # Write to <collection>.json
    out_path = Path(output_dir) / f"{collection.name}.json"
    out_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"✅ Exported {collection.name} → {out_path}")
def main(argv=None):
    """Parse command-line options and export every collection to JSON.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read the process arguments, as before.

    Raises:
        SystemExit: Raised by argparse on invalid arguments, including a
            ``--db-path`` that is not an existing directory.
    """
    parser = argparse.ArgumentParser(description="Export ChromaDB collections to JSON.")
    parser.add_argument(
        "--db-path",
        type=str,
        required=True,
        help="Path to the chromadb_store folder (where the DB is persisted)",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="chroma_exports",
        help="Output folder for json files",
    )
    parser.add_argument(
        "--include-embeddings",
        action="store_true",
        help="Include embeddings in the export (off by default)",
    )
    args = parser.parse_args(argv)

    db_path = Path(args.db_path).expanduser().resolve()
    if not db_path.is_dir():
        # Fail before touching the filesystem: a mistyped path would otherwise
        # be reported as a successful export with nothing in it.
        parser.error(f"--db-path is not an existing directory: {db_path}")

    output_dir = Path(args.output).expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Connect to the persistent ChromaDB store
    client = chromadb.PersistentClient(
        path=str(db_path),
        settings=Settings(anonymized_telemetry=False),
    )

    # Iterate collections. Depending on the chromadb release, the listing
    # yields either Collection objects or plain collection names.
    for entry in client.list_collections():
        name = entry if isinstance(entry, str) else entry.name
        collection = client.get_collection(name)
        export_collection(collection, output_dir, args.include_embeddings)

    print("\n🎉 All collections exported!")
# Script entry point: run the exporter only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()