broadfield-dev committed on
Commit 16ea922 · verified · 1 Parent(s): 65c01b6

Update process_hf_dataset.py

Files changed (1)
  1. process_hf_dataset.py +6 -3
process_hf_dataset.py CHANGED
@@ -14,7 +14,7 @@ import time
 # Load environment variables
 load_dotenv()
 
-# Cache CodeBERT model globally to avoid repeated loading
+# Cache CodeBERT model globally to avoid repeated loading and reduce freezing
 model_name = "microsoft/codebert-base"
 tokenizer = None
 model = None
@@ -148,6 +148,9 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
     # Load the dataset
     dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
 
+    # Convert dataset to list of dictionaries for iteration
+    dataset_list = list(dataset)
+
     # Initialize ChromaDB client
     client = init_chromadb()
 
@@ -159,9 +162,9 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
     collection = client.create_collection(DB_NAME)
 
     # Process in batches with progress bar
-    total_entries = len(dataset)
+    total_entries = len(dataset_list)
     for i in tqdm(range(0, total_entries, batch_size), desc="Processing Hugging Face Dataset"):
-        batch = dataset[i:i + batch_size]
+        batch = dataset_list[i:i + batch_size]
         batch_programs = []
         batch_ids = []
         batch_documents = []
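
Why the `list(dataset)` conversion matters: slicing a Hugging Face `datasets.Dataset` returns a dict of column lists rather than a list of row dicts, so the old `dataset[i:i + batch_size]` handed each loop iteration column names instead of examples. A minimal sketch of the difference (illustrative only, with made-up sample rows; not part of the commit):

```python
from datasets import Dataset

# Tiny stand-in for the real 18k-example dataset (hypothetical sample rows).
ds = Dataset.from_dict({
    "instruction": ["add two numbers", "reverse a string"],
    "output": ["def add(a, b): return a + b", "def rev(s): return s[::-1]"],
})

# Slicing a Dataset yields a dict of columns, not a list of rows:
batch = ds[0:2]
print(type(batch))   # <class 'dict'>
print(list(batch))   # ['instruction', 'output'] -- column names, not examples

# After list(ds), each element is a row dict, so slices iterate per example:
rows = list(ds)
print(rows[0]["instruction"])   # 'add two numbers'
```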
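
The first hunk only rewords a comment, but the pattern it refers to — module-level `tokenizer = None` / `model = None` globals filled in on first use, so the CodeBERT weights load once per process — would look roughly like the sketch below. The `get_codebert()` helper is hypothetical; the rest of the file is not shown in this diff:

```python
from transformers import AutoModel, AutoTokenizer

# Cache CodeBERT model globally to avoid repeated loading (a sketch of the
# lazy-init pattern implied by the module-level globals in the diff).
model_name = "microsoft/codebert-base"
tokenizer = None
model = None

def get_codebert():
    """Load the tokenizer/model on first call, then reuse the cached objects."""
    global tokenizer, model
    if model is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        model.eval()  # inference only
    return tokenizer, model
```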