Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Upload folder using huggingface_hub
Browse files- .gitignore +2 -0
- modules/db/export.py +80 -0
- modules/db/import.py +83 -0
.gitignore
CHANGED
|
@@ -14,3 +14,5 @@ chromadb-store.zip
|
|
| 14 |
outputs/
|
| 15 |
chromadb-store_20251112.zip
|
| 16 |
chromadb-store_20251118.zip
|
|
|
|
|
|
|
|
|
| 14 |
outputs/
|
| 15 |
chromadb-store_20251112.zip
|
| 16 |
chromadb-store_20251118.zip
|
| 17 |
+
chroma_exports/
|
| 18 |
+
chroma_exports.zip
|
modules/db/export.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import argparse
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import chromadb
|
| 5 |
+
from chromadb.config import Settings
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def export_collection(collection, output_dir: Path, include_embeddings=False):
    """Export one ChromaDB collection to a JSON file.

    The file is written to ``<output_dir>/<collection.name>.json`` and holds a
    JSON array with one object per record: ``id``, ``document``, ``metadata``
    and, when requested, ``embedding``.

    Args:
        collection: Object exposing ``name`` and ``get(include=[...])`` whose
            result maps ``"ids"``, ``"documents"``, ``"metadatas"`` and
            optionally ``"embeddings"`` to parallel sequences.
        output_dir: Directory the JSON file is written into (must exist).
        include_embeddings: Also export each record's embedding vector.
    """
    # Pull everything in one call (large collections may need pagination).
    include_fields = ["documents", "metadatas"]
    if include_embeddings:
        include_fields.append("embeddings")

    items = collection.get(include=include_fields)

    def column(key):
        # Explicit None/len checks instead of truthiness: array-like columns
        # (e.g. NumPy arrays) raise ValueError when evaluated as a bool.
        values = items.get(key)
        if values is None or len(values) == 0:
            return None
        return values

    documents = column("documents")
    metadatas = column("metadatas")
    embeddings = column("embeddings") if include_embeddings else None

    data = []
    for idx, _id in enumerate(items["ids"]):
        record = {
            "id": _id,
            "document": documents[idx] if documents is not None else None,
            "metadata": metadatas[idx] if metadatas is not None else None,
        }

        if include_embeddings:
            vector = embeddings[idx] if embeddings is not None else None
            # Array-like vectors are not JSON-serializable; convert to lists.
            if hasattr(vector, "tolist"):
                vector = vector.tolist()
            record["embedding"] = vector

        data.append(record)

    # Write to <collection>.json
    out_path = output_dir / f"{collection.name}.json"
    out_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"✔ Exported {collection.name} → {out_path}")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def main(argv=None):
    """Command-line entry point: export every collection of a store to JSON.

    Args:
        argv: Optional argument list; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Raises:
        SystemExit: If ``--db-path`` is not an existing directory (or on
            argparse errors).
    """
    parser = argparse.ArgumentParser(description="Export ChromaDB collections to JSON.")
    parser.add_argument(
        "--db-path",
        type=str,
        required=True,
        help="Path to the chromadb_store folder (where the DB is persisted)",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="chroma_exports",
        help="Output folder for json files",
    )
    parser.add_argument(
        "--include-embeddings",
        action="store_true",
        help="Include embeddings in the export (off by default)",
    )

    args = parser.parse_args(argv)

    db_path = Path(args.db_path).expanduser().resolve()
    if not db_path.is_dir():
        # Fail fast: a mistyped path would otherwise be opened as an empty
        # store and reported as a successful export of nothing.
        raise SystemExit(f"❌ ChromaDB path does not exist: {db_path}")

    output_dir = Path(args.output).expanduser().resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    # Connect to the persistent ChromaDB store
    client = chromadb.PersistentClient(
        path=str(db_path),
        settings=Settings(anonymized_telemetry=False),
    )

    # Iterate collections. Depending on the chromadb release,
    # list_collections() yields Collection objects or plain name strings.
    for entry in client.list_collections():
        name = entry if isinstance(entry, str) else entry.name
        collection = client.get_collection(name)
        export_collection(collection, output_dir, args.include_embeddings)

    print("\n🎉 All collections exported!")
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
if __name__ == "__main__":
|
| 80 |
+
main()
|
modules/db/import.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import argparse
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import chromadb
|
| 5 |
+
from chromadb.config import Settings
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def import_collection(client, json_file: Path, include_embeddings=False, batch_size=1000):
    """Import a JSON file into a ChromaDB collection.

    The collection is named after the file stem and is created if missing.
    The file must hold a JSON array of objects with an ``id`` key and optional
    ``document``, ``metadata`` and ``embedding`` keys.

    Args:
        client: Object exposing ``get_or_create_collection(name)``.
        json_file: Path of the JSON file to load.
        include_embeddings: Pass the stored ``embedding`` vectors to the
            collection; otherwise ``embeddings=None`` is passed.
        batch_size: Maximum number of records per ``add`` call.

    Raises:
        ValueError: If ``batch_size`` is below 1, or embeddings were requested
            but some records do not carry one.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    collection_name = json_file.stem
    print(f"📥 Importing {collection_name} from {json_file}")

    # Load JSON
    data = json.loads(json_file.read_text(encoding="utf-8"))

    # Create or get collection (also for empty exports, so it still exists)
    collection = client.get_or_create_collection(collection_name)

    # Extract fields
    ids = [item["id"] for item in data]
    documents = [item.get("document") for item in data]
    metadatas = [item.get("metadata") for item in data]

    if include_embeddings:
        embeddings = [item.get("embedding") for item in data]
        missing = [item["id"] for item in data if item.get("embedding") is None]
        if missing:
            raise ValueError(
                f"{json_file}: {len(missing)} record(s) have no embedding "
                f"(first id: {missing[0]!r}); re-export with --include-embeddings "
                "or import without it"
            )
    else:
        embeddings = None

    # Add in bounded batches: one add() with every record can exceed the
    # store's maximum batch size, and an empty file simply adds nothing.
    for start in range(0, len(ids), batch_size):
        stop = start + batch_size
        collection.add(
            ids=ids[start:stop],
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
            embeddings=embeddings[start:stop] if embeddings is not None else None,
        )

    print(f"✔ Imported {len(ids)} items into {collection_name}")
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def main(argv=None):
    """Command-line entry point: import every ``*.json`` file of a folder.

    Each file is loaded into the collection named after its stem.

    Args:
        argv: Optional argument list; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Raises:
        SystemExit: If ``--input`` is not an existing directory (or on
            argparse errors), so the process exits with a non-zero status.
    """
    parser = argparse.ArgumentParser(description="Import JSON files into ChromaDB collections.")
    parser.add_argument(
        "--db-path",
        type=str,
        required=True,
        help="Path to the target chromadb_store folder",
    )
    parser.add_argument(
        "--input",
        type=str,
        default="chroma_exports",
        help="Folder containing JSON files to import",
    )
    parser.add_argument(
        "--include-embeddings",
        action="store_true",
        help="Load embeddings from JSON (off by default)",
    )

    args = parser.parse_args(argv)

    db_path = Path(args.db_path).expanduser().resolve()
    input_dir = Path(args.input).expanduser().resolve()
    if not input_dir.is_dir():
        # Exit with a failure status instead of returning as if all went well.
        raise SystemExit(f"❌ Input folder does not exist: {input_dir}")

    # Sorted for a deterministic import order across platforms.
    json_files = sorted(input_dir.glob("*.json"))
    if not json_files:
        print(f"⚠ No JSON files found in {input_dir}")
        return

    # Connect to ChromaDB
    client = chromadb.PersistentClient(
        path=str(db_path),
        settings=Settings(anonymized_telemetry=False),
    )

    # Iterate JSON files
    for json_file in json_files:
        import_collection(client, json_file, args.include_embeddings)

    print("\n🎉 All JSON files imported!")
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
if __name__ == "__main__":
|
| 83 |
+
main()
|