Spaces:

broadfield-dev
/

parse_py

Sleeping

broadfield-dev committed on Mar 5

Commit

16ea922

verified ·

1 Parent(s): 65c01b6

Update process_hf_dataset.py

Files changed (1) hide show

process_hf_dataset.py CHANGED Viewed

@@ -14,7 +14,7 @@ import time
 # Load environment variables
 load_dotenv()
-# Cache CodeBERT model globally to avoid repeated loading
 model_name = "microsoft/codebert-base"
 tokenizer = None
 model = None
@@ -148,6 +148,9 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
     # Load the dataset
     dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
     # Initialize ChromaDB client
     client = init_chromadb()
@@ -159,9 +162,9 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
     collection = client.create_collection(DB_NAME)
     # Process in batches with progress bar
-    total_entries = len(dataset)
     for i in tqdm(range(0, total_entries, batch_size), desc="Processing Hugging Face Dataset"):
-        batch = dataset[i:i + batch_size]
         batch_programs = []
         batch_ids = []
         batch_documents = []

 # Load environment variables
 load_dotenv()
+# Cache CodeBERT model globally to avoid repeated loading and reduce freezing
 model_name = "microsoft/codebert-base"
 tokenizer = None
 model = None
     # Load the dataset
     dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
+    # Convert dataset to a list of row dictionaries so that slicing yields rows (slicing a Dataset returns a dict of columns)
+    dataset_list = list(dataset)
     # Initialize ChromaDB client
     client = init_chromadb()
     collection = client.create_collection(DB_NAME)
     # Process in batches with progress bar
+    total_entries = len(dataset_list)
     for i in tqdm(range(0, total_entries, batch_size), desc="Processing Hugging Face Dataset"):
+        batch = dataset_list[i:i + batch_size]
         batch_programs = []
         batch_ids = []
         batch_documents = []