Spaces:
Sleeping
Sleeping
import os | |
import json | |
import chromadb | |
# Open the on-disk Chroma database stored under the given path.
client = chromadb.PersistentClient(path="/home/johannes/Desktop/proj/Datenbank/chroma")

# Fetch the "phil_en" collection for the books, creating it on first use.
# The "hnsw:space" metadata entry requests cosine distance for the index.
collection = client.get_or_create_collection(
    name="phil_en",
    metadata={"hnsw:space": "cosine"},
)
def get_metadata(entry, key):
    """Return the value stored under *key*, or ``"Unknown"`` if it is absent or ``None``.

    Args:
        entry: Object exposing a dict-style ``.get`` (here, one decoded JSON object).
        key: Name of the field to read.

    Returns:
        The stored value unchanged -- falsy values such as ``""`` or ``0`` are
        kept as they are -- or the string ``"Unknown"`` when the key is
        missing or maps to ``None``.
    """
    value = entry.get(key)  # look the key up once instead of twice
    return "Unknown" if value is None else value
# Directory containing the JSON files with pre-computed embeddings
json_dir = "/home/johannes/Documents/Datenbank/bücher/en/verarbeitet/ready_for_chroma/"
def load_json_data(file_path):
    """Parse the JSON file at *file_path* and return the decoded content.

    The file is read as UTF-8 and closed again before the function returns.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
# Collect every JSON file in the directory. os.listdir returns names in
# arbitrary order, so the list is sorted to make runs reproducible.
json_files = sorted(
    os.path.join(json_dir, file_name)
    for file_name in os.listdir(json_dir)
    if file_name.endswith('.json')
)

# Import the files one by one. A failure in one file is reported and does not
# stop the remaining files from being processed.
for file_path in json_files:
    try:
        data = load_json_data(file_path)

        # Nothing to add for an empty file; say so instead of handing an
        # empty batch to the collection.
        if not data:
            print(f"Skipped {os.path.basename(file_path)}: no entries")
            continue

        documents = []
        embeddings = []
        metadatas = []
        ids = []

        # Extract entry information and embeddings from each object in the
        # JSON file. The batch is only sent after every entry has been read,
        # so an entry with a missing key rejects the whole file.
        for entry in data:
            documents.append(entry['text'])
            embeddings.append(entry['embedding'])  # Assume embeddings are stored under the key 'embedding'
            metadatas.append({
                'author': get_metadata(entry, 'autor'),
                'book': get_metadata(entry, 'buch'),
                'section': get_metadata(entry, 'abschnitt'),
                'title': get_metadata(entry, 'titel'),
            })
            # The ID is the entry number rendered as a string; each JSON
            # object must therefore carry an 'entry_number'.
            # NOTE(review): if numbering restarts in every file, these ids
            # collide across files -- confirm entry_number is unique across
            # the whole directory.
            ids.append(str(entry['entry_number']))

        # Add the entries to the collection with pre-computed embeddings
        collection.add(documents=documents, embeddings=embeddings, metadatas=metadatas, ids=ids)
        print(f"Added {len(documents)} documents from {os.path.basename(file_path)}")
    except Exception as e:
        # Best-effort import: report the problem and move on to the next file.
        print(f"Failed to process {file_path}: {e}")