Spaces:
Sleeping
Sleeping
Update process_hf_dataset.py
Browse files- process_hf_dataset.py +6 -3
process_hf_dataset.py
CHANGED
@@ -14,7 +14,7 @@ import time
|
|
14 |
# Load environment variables
|
15 |
load_dotenv()
|
16 |
|
17 |
-
# Cache CodeBERT model globally to avoid repeated loading
|
18 |
model_name = "microsoft/codebert-base"
|
19 |
tokenizer = None
|
20 |
model = None
|
@@ -148,6 +148,9 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
|
|
148 |
# Load the dataset
|
149 |
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
|
150 |
|
|
|
|
|
|
|
151 |
# Initialize ChromaDB client
|
152 |
client = init_chromadb()
|
153 |
|
@@ -159,9 +162,9 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
|
|
159 |
collection = client.create_collection(DB_NAME)
|
160 |
|
161 |
# Process in batches with progress bar
|
162 |
-
total_entries = len(dataset)
|
163 |
for i in tqdm(range(0, total_entries, batch_size), desc="Processing Hugging Face Dataset"):
|
164 |
-
batch = dataset[i:i + batch_size]
|
165 |
batch_programs = []
|
166 |
batch_ids = []
|
167 |
batch_documents = []
|
|
|
14 |
# Load environment variables
|
15 |
load_dotenv()
|
16 |
|
17 |
+
# Cache CodeBERT model globally to avoid repeated loading and reduce freezing
|
18 |
model_name = "microsoft/codebert-base"
|
19 |
tokenizer = None
|
20 |
model = None
|
|
|
148 |
# Load the dataset
|
149 |
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
|
150 |
|
151 |
+
# Convert dataset to list of dictionaries for iteration
|
152 |
+
dataset_list = list(dataset)
|
153 |
+
|
154 |
# Initialize ChromaDB client
|
155 |
client = init_chromadb()
|
156 |
|
|
|
162 |
collection = client.create_collection(DB_NAME)
|
163 |
|
164 |
# Process in batches with progress bar
|
165 |
+
total_entries = len(dataset_list)
|
166 |
for i in tqdm(range(0, total_entries, batch_size), desc="Processing Hugging Face Dataset"):
|
167 |
+
batch = dataset_list[i:i + batch_size]
|
168 |
batch_programs = []
|
169 |
batch_ids = []
|
170 |
batch_documents = []
|