broadfield-dev committed on
Commit
a66e49f
·
verified ·
1 Parent(s): 64ba224

Update process_hf_dataset.py

Browse files
Files changed (1) hide show
  1. process_hf_dataset.py +4 -4
process_hf_dataset.py CHANGED
@@ -186,11 +186,11 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
186
  # Do not clear or populate with defaults here—let UI buttons handle this
187
  try:
188
  collection = client.get_or_create_collection(DB_NAME)
189
- logger.info(f"Using existing or new ChromaDB collection: {DB_NAME}")
190
  # Verify collection is valid
191
  if collection is None or not hasattr(collection, 'add'):
192
  raise ValueError("ChromaDB collection access failed")
193
- logger.info(f"ChromaDB collection verified, contains {collection.count()} entries")
194
  except Exception as e:
195
  logger.error(f"Error accessing ChromaDB collection: {e}")
196
  raise
@@ -277,8 +277,8 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY"))
277
  logger.info(f"Created Hugging Face Dataset with {len(data['code'])} entries")
278
 
279
  # Push to Hugging Face Hub
280
- dataset.push_to_hub(dataset_name, token=token)
281
- logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
282
  # Verify push (optional, could check dataset on Hub)
283
  logger.info(f"Verified Hugging Face dataset push with {len(dataset)} entries")
284
  except Exception as e:
 
186
  # Do not clear or populate with defaults here—let UI buttons handle this
187
  try:
188
  collection = client.get_or_create_collection(DB_NAME)
189
+ logger.info(f"Using existing or new ChromaDB collection: {DB_NAME}, contains {collection.count()} entries")
190
  # Verify collection is valid
191
  if collection is None or not hasattr(collection, 'add'):
192
  raise ValueError("ChromaDB collection access failed")
193
+ logger.info("Verified ChromaDB collection is valid")
194
  except Exception as e:
195
  logger.error(f"Error accessing ChromaDB collection: {e}")
196
  raise
 
277
  logger.info(f"Created Hugging Face Dataset with {len(data['code'])} entries")
278
 
279
  # Push to Hugging Face Hub
280
+ dataset.push_to_hub(dataset_name, token=token, exist_ok=True) # Allow overwriting existing dataset
281
+ logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}, overwriting existing dataset")
282
  # Verify push (optional, could check dataset on Hub)
283
  logger.info(f"Verified Hugging Face dataset push with {len(dataset)} entries")
284
  except Exception as e: