vikramvasudevan committed on
Commit
8b5be8c
·
verified ·
1 Parent(s): 0c928ad

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. .gitignore +2 -0
  2. modules/db/export.py +80 -0
  3. modules/db/import.py +83 -0
.gitignore CHANGED
@@ -14,3 +14,5 @@ chromadb-store.zip
14
  outputs/
15
  chromadb-store_20251112.zip
16
  chromadb-store_20251118.zip
 
 
 
14
  outputs/
15
  chromadb-store_20251112.zip
16
  chromadb-store_20251118.zip
17
+ chroma_exports/
18
+ chroma_exports.zip
modules/db/export.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import argparse
3
+ from pathlib import Path
4
+ import chromadb
5
+ from chromadb.config import Settings
6
+
7
+
8
def _field_at(values, idx):
    """Return ``values[idx]``, or ``None`` when the field is missing or empty."""
    # Use explicit None/len checks instead of truthiness: array-like results
    # (e.g. NumPy arrays) raise ValueError when evaluated as a boolean.
    if values is None or len(values) == 0:
        return None
    return values[idx]


def _jsonable_vector(vector):
    """Convert an array-like embedding to plain lists so ``json`` can encode it."""
    if vector is not None and hasattr(vector, "tolist"):
        return vector.tolist()
    return vector


def export_collection(collection, output_dir: Path, include_embeddings=False):
    """Export one ChromaDB collection to ``<output_dir>/<collection.name>.json``.

    The file contains a JSON array with one object per record, holding the
    keys ``id``, ``document`` and ``metadata`` (plus ``embedding`` when
    ``include_embeddings`` is true).  A field that the store did not return
    is written as ``null``.

    Args:
        collection: Object exposing ``name`` and ``get(include=[...])``; the
            result of ``get`` must be a mapping with an ``"ids"`` entry.
        output_dir: Directory that receives the JSON file; it is not created
            here.
        include_embeddings: Also request the embeddings and export them.
    """
    # Pull everything in one call (large collections may need pagination).
    include_fields = ["documents", "metadatas"]
    if include_embeddings:
        include_fields.append("embeddings")

    items = collection.get(include=include_fields)

    documents = items.get("documents")
    metadatas = items.get("metadatas")
    embeddings = items.get("embeddings") if include_embeddings else None

    data = []
    for idx, _id in enumerate(items["ids"]):
        record = {
            "id": _id,
            "document": _field_at(documents, idx),
            "metadata": _field_at(metadatas, idx),
        }

        if include_embeddings:
            # Embeddings may arrive as array objects rather than lists;
            # normalise them so json.dumps does not fail.
            record["embedding"] = _jsonable_vector(_field_at(embeddings, idx))

        data.append(record)

    # Write to <collection>.json
    out_path = output_dir / f"{collection.name}.json"
    out_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"✔ Exported {collection.name} → {out_path}")
37
+
38
+
39
def main():
    """Command-line entry point: export every collection of a store to JSON.

    Options:
        --db-path: folder of the persisted ChromaDB store (required).
        --output: folder that receives one ``<collection>.json`` per
            collection (default ``chroma_exports``; created if missing).
        --include-embeddings: also export the embedding vectors.
    """
    parser = argparse.ArgumentParser(description="Export ChromaDB collections to JSON.")
    parser.add_argument(
        "--db-path",
        type=str,
        required=True,
        help="Path to the chromadb_store folder (where the DB is persisted)",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="chroma_exports",
        help="Output folder for json files",
    )
    parser.add_argument(
        "--include-embeddings",
        action="store_true",
        help="Include embeddings in the export (off by default)",
    )

    args = parser.parse_args()

    db_path = Path(args.db_path).expanduser().resolve()
    # Fail early on a mistyped path instead of reporting a successful but
    # empty export.  NOTE(review): PersistentClient appears to initialise a
    # fresh store when the folder is missing -- confirm for the pinned version.
    if not db_path.is_dir():
        print(f"❌ DB path does not exist: {db_path}")
        return

    output_dir = Path(args.output).expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Connect to the persistent ChromaDB store
    client = chromadb.PersistentClient(
        path=str(db_path),
        settings=Settings(anonymized_telemetry=False),
    )

    # Iterate collections
    for entry in client.list_collections():
        # Depending on the chromadb release, list_collections() yields either
        # collection objects or plain collection names; accept both.
        name = entry if isinstance(entry, str) else entry.name
        collection = client.get_collection(name)
        export_collection(collection, output_dir, args.include_embeddings)

    print("\n🎉 All collections exported!")
77
+
78
+
79
# Run the exporter only when this file is executed as a script.
if __name__ == "__main__":
    main()
modules/db/import.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import argparse
3
+ from pathlib import Path
4
+ import chromadb
5
+ from chromadb.config import Settings
6
+
7
+
8
def import_collection(client, json_file: Path, include_embeddings=False, batch_size=1000):
    """Import a JSON export file into a ChromaDB collection.

    The collection is named after the file stem and is created when it does
    not exist yet.  The file must contain a JSON array of objects with an
    ``id`` key and optional ``document``, ``metadata`` and ``embedding`` keys.

    Args:
        client: Object exposing ``get_or_create_collection(name)``.
        json_file: Path of the JSON file to load (read as UTF-8).
        include_embeddings: Pass the stored ``embedding`` values to ``add``;
            otherwise ``embeddings=None`` is passed.
        batch_size: Maximum number of records sent per ``add`` call, so a
            large export is not submitted as one oversized request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    collection_name = json_file.stem
    print(f"📥 Importing {collection_name} from {json_file}")

    # Load JSON
    data = json.loads(json_file.read_text(encoding="utf-8"))

    # Extract fields
    ids = [item["id"] for item in data]
    documents = [item.get("document") for item in data]
    metadatas = [item.get("metadata") for item in data]
    embeddings = [item.get("embedding") for item in data] if include_embeddings else None

    # Create or get collection (also for empty exports, so the collection
    # itself is still restored).
    collection = client.get_or_create_collection(collection_name)

    # Add in slices.  With no records the loop body never runs, which avoids
    # calling add() with empty lists.
    for start in range(0, len(ids), batch_size):
        stop = start + batch_size
        collection.add(
            ids=ids[start:stop],
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
            embeddings=embeddings[start:stop] if embeddings is not None else None,
        )

    print(f"✔ Imported {len(ids)} items into {collection_name}")
39
+
40
+
41
def main():
    """Command-line entry point: load every ``*.json`` file of a folder.

    Options:
        --db-path: folder of the target ChromaDB store (required).
        --input: folder containing the JSON files (default ``chroma_exports``).
        --include-embeddings: pass the embeddings stored in the JSON files.

    Prints an error and returns without importing when the input folder does
    not exist.
    """
    cli = argparse.ArgumentParser(description="Import JSON files into ChromaDB collections.")
    cli.add_argument("--db-path", type=str, required=True,
                     help="Path to the target chromadb_store folder")
    cli.add_argument("--input", type=str, default="chroma_exports",
                     help="Folder containing JSON files to import")
    cli.add_argument("--include-embeddings", action="store_true",
                     help="Load embeddings from JSON (off by default)")
    opts = cli.parse_args()

    store_path = Path(opts.db_path).expanduser().resolve()
    source_dir = Path(opts.input).expanduser().resolve()

    # Guard clause: nothing to do without an input folder.
    if not source_dir.exists():
        print(f"❌ Input folder does not exist: {source_dir}")
        return

    # Open the persistent store with telemetry switched off.
    telemetry_off = Settings(anonymized_telemetry=False)
    client = chromadb.PersistentClient(path=str(store_path), settings=telemetry_off)

    # Each JSON file is handed to import_collection in turn.
    for export_file in source_dir.glob("*.json"):
        import_collection(client, export_file, opts.include_embeddings)

    print("\n🎉 All JSON files imported!")
80
+
81
+
82
# Run the importer only when this file is executed as a script.
if __name__ == "__main__":
    main()