Spaces:

vikramvasudevan
/

sanatan_ai

Running on CPU Upgrade

App Files Files Community

vikramvasudevan committed 8 days ago

Commit

8b5be8c

verified ·

1 Parent(s): 0c928ad

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

.gitignore +2 -0
modules/db/export.py +80 -0
modules/db/import.py +83 -0

.gitignore CHANGED Viewed

@@ -14,3 +14,5 @@ chromadb-store.zip
 outputs/
 chromadb-store_20251112.zip
 chromadb-store_20251118.zip

 outputs/
 chromadb-store_20251112.zip
 chromadb-store_20251118.zip
+chroma_exports/
+chroma_exports.zip

modules/db/export.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import json
+import argparse
+from pathlib import Path
+import chromadb
+from chromadb.config import Settings
+def _has_values(values):
+    """Return True when *values* is a non-empty sequence (list or array)."""
+    # ``len`` works for lists and numpy arrays alike; plain truthiness does
+    # not, because a multi-element numpy array refuses to be used as a bool.
+    return values is not None and len(values) > 0
+def export_collection(collection, output_dir: Path, include_embeddings=False):
+    """Export one ChromaDB collection to ``<output_dir>/<collection.name>.json``.
+
+    The file holds a JSON list with one object per record, carrying the keys
+    ``id``, ``document``, ``metadata`` and, when requested, ``embedding``.
+
+    Args:
+        collection: Collection object exposing ``get(include=...)`` and
+            ``name``.
+        output_dir: Folder that receives the JSON file; created if missing.
+        include_embeddings: When true, embeddings are fetched and written too.
+    """
+    # Pull everything in one call (large collections may need pagination)
+    include_fields = ["documents", "metadatas"]
+    if include_embeddings:
+        include_fields.append("embeddings")
+    items = collection.get(include=include_fields)
+    documents = items.get("documents")
+    metadatas = items.get("metadatas")
+    embeddings = items.get("embeddings") if include_embeddings else None
+    has_documents = _has_values(documents)
+    has_metadatas = _has_values(metadatas)
+    has_embeddings = _has_values(embeddings)
+    data = []
+    for idx, _id in enumerate(items["ids"]):
+        record = {
+            "id": _id,
+            "document": documents[idx] if has_documents else None,
+            "metadata": metadatas[idx] if has_metadatas else None,
+        }
+        if include_embeddings:
+            embedding = embeddings[idx] if has_embeddings else None
+            # Embeddings may come back as numpy rows, which json cannot
+            # serialize; convert them to plain lists of floats first.
+            if hasattr(embedding, "tolist"):
+                embedding = embedding.tolist()
+            record["embedding"] = embedding
+        data.append(record)
+    # Write to <collection>.json; make sure the target folder exists so the
+    # function also works when called without going through main().
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out_path = output_dir / f"{collection.name}.json"
+    out_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
+    print(f"✔ Exported {collection.name} → {out_path}")
+def main():
+    """Command-line entry point: export every collection of a store to JSON.
+
+    Reads ``--db-path`` (required), ``--output`` (default ``chroma_exports``)
+    and ``--include-embeddings`` from the command line, then writes one JSON
+    file per collection via ``export_collection``.
+    """
+    parser = argparse.ArgumentParser(description="Export ChromaDB collections to JSON.")
+    parser.add_argument(
+        "--db-path",
+        type=str,
+        required=True,
+        help="Path to the chromadb_store folder (where the DB is persisted)",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="chroma_exports",
+        help="Output folder for json files",
+    )
+    parser.add_argument(
+        "--include-embeddings",
+        action="store_true",
+        help="Include embeddings in the export (off by default)",
+    )
+    args = parser.parse_args()
+    db_path = Path(args.db_path).expanduser().resolve()
+    # Refuse a missing store up front: opening a persistent client on a
+    # non-existent path would otherwise export nothing and still report success.
+    if not db_path.is_dir():
+        print(f"❌ DB folder does not exist: {db_path}")
+        return
+    output_dir = Path(args.output).expanduser().resolve()
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # Connect to the persistent ChromaDB store
+    client = chromadb.PersistentClient(
+        path=str(db_path),
+        settings=Settings(anonymized_telemetry=False)
+    )
+    # Iterate collections
+    for entry in client.list_collections():
+        # Depending on the chromadb release, list_collections() yields either
+        # Collection objects or plain collection names; accept both.
+        name = entry if isinstance(entry, str) else entry.name
+        collection = client.get_collection(name)
+        export_collection(collection, output_dir, args.include_embeddings)
+    print("\n🎉 All collections exported!")
+if __name__ == "__main__":
+    main()

modules/db/import.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import json
+import argparse
+from pathlib import Path
+import chromadb
+from chromadb.config import Settings
+def import_collection(client, json_file: Path, include_embeddings=False, batch_size=1000):
+    """Import a JSON file into a ChromaDB collection named after the file.
+
+    The file is expected to hold a JSON list of objects with an ``id`` key and
+    optional ``document``, ``metadata`` and ``embedding`` keys. The collection
+    name is the file name without its extension.
+
+    Args:
+        client: ChromaDB client exposing ``get_or_create_collection``.
+        json_file: Path of the JSON file to load.
+        include_embeddings: When true, stored embeddings are passed to Chroma;
+            otherwise ``embeddings=None`` is passed.
+        batch_size: Upper bound on records sent per ``add`` call.
+
+    Raises:
+        ValueError: If ``batch_size`` is smaller than 1, or if embeddings are
+            requested but at least one record has none.
+    """
+    if batch_size < 1:
+        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
+    collection_name = json_file.stem
+    print(f"📥 Importing {collection_name} from {json_file}")
+    # Load JSON
+    data = json.loads(json_file.read_text(encoding="utf-8"))
+    # Extract fields
+    ids = [item["id"] for item in data]
+    documents = [item.get("document") for item in data]
+    metadatas = [item.get("metadata") for item in data]
+    if include_embeddings:
+        embeddings = [item.get("embedding") for item in data]
+        # Fail early with a readable message instead of handing None entries
+        # to Chroma (e.g. the export was made without --include-embeddings).
+        if any(embedding is None for embedding in embeddings):
+            raise ValueError(
+                f"{json_file} has records without an embedding; "
+                "re-export with --include-embeddings or import without it"
+            )
+    else:
+        embeddings = None
+    # Create or get collection
+    collection = client.get_or_create_collection(collection_name)
+    # Respect the client's own per-call limit when it advertises one.
+    get_max_batch_size = getattr(client, "get_max_batch_size", None)
+    if callable(get_max_batch_size):
+        max_batch_size = get_max_batch_size()
+        if max_batch_size and max_batch_size > 0:
+            batch_size = min(batch_size, max_batch_size)
+    # Add to collection in slices; an empty file results in no add() call at
+    # all, so an exported empty collection is simply recreated empty.
+    for start in range(0, len(ids), batch_size):
+        stop = start + batch_size
+        collection.add(
+            ids=ids[start:stop],
+            documents=documents[start:stop],
+            metadatas=metadatas[start:stop],
+            embeddings=embeddings[start:stop] if embeddings is not None else None,
+        )
+    print(f"✔ Imported {len(ids)} items into {collection_name}")
+def main():
+    """Command-line entry point: import every ``*.json`` file into a store.
+
+    Reads ``--db-path`` (required), ``--input`` (default ``chroma_exports``)
+    and ``--include-embeddings`` from the command line, then loads each JSON
+    file of the input folder via ``import_collection``.
+    """
+    parser = argparse.ArgumentParser(description="Import JSON files into ChromaDB collections.")
+    parser.add_argument(
+        "--db-path",
+        type=str,
+        required=True,
+        help="Path to the target chromadb_store folder",
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        default="chroma_exports",
+        help="Folder containing JSON files to import",
+    )
+    parser.add_argument(
+        "--include-embeddings",
+        action="store_true",
+        help="Load embeddings from JSON (off by default)",
+    )
+    args = parser.parse_args()
+    db_path = Path(args.db_path).expanduser().resolve()
+    input_dir = Path(args.input).expanduser().resolve()
+    # is_dir() also rejects a regular file passed by mistake as --input.
+    if not input_dir.is_dir():
+        print(f"❌ Input folder does not exist: {input_dir}")
+        return
+    # Sort so the import order is the same on every run and platform.
+    json_files = sorted(input_dir.glob("*.json"))
+    if not json_files:
+        # Do not open the store or claim success when there is nothing to load.
+        print(f"❌ No JSON files found in: {input_dir}")
+        return
+    # Connect to ChromaDB
+    client = chromadb.PersistentClient(
+        path=str(db_path),
+        settings=Settings(anonymized_telemetry=False)
+    )
+    # Iterate JSON files
+    for json_file in json_files:
+        import_collection(client, json_file, args.include_embeddings)
+    print("\n🎉 All JSON files imported!")
+if __name__ == "__main__":
+    main()