broadfield-dev committed on
Commit 6fa17d2 · verified · 1 Parent(s): c157f01

Update process_hf_dataset.py

Files changed (1)
  1. process_hf_dataset.py +54 -36
process_hf_dataset.py CHANGED
@@ -10,6 +10,11 @@ from transformers import AutoTokenizer, AutoModel
 import torch
 from tqdm import tqdm # For progress bar
 import time
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
 
 # Load environment variables
 load_dotenv()
@@ -28,9 +33,9 @@ def load_codebert_model(use_gpu=False):
         device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModel.from_pretrained(model_name).to(device)
-        print(f"CodeBERT model loaded on {device}")
+        logger.info(f"CodeBERT model loaded on {device}")
     except Exception as e:
-        print(f"Error loading CodeBERT model: {e}")
+        logger.error(f"Error loading CodeBERT model: {e}")
         raise
     return tokenizer, model, device
 
@@ -143,6 +148,7 @@ def generate_semantic_vector(description, total_lines=100, use_gpu=False):
 
     # Ensure vector isn’t all zeros or defaults
    if all(v == 0 for v in vector):
+        logger.warning(f"Default vector detected for description: {description}")
         # Fallback: Use heuristic if CodeBERT fails to generate meaningful embeddings
         category_map = {
             'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
@@ -160,15 +166,19 @@ def generate_semantic_vector(description, total_lines=100, use_gpu=False):
                 vector[5] = cat_id / len(category_map) # parent_weight
                 break
 
+    logger.debug(f"Generated semantic vector for '{description}': {vector}")
     return vector
 
 def process_hf_dataset(batch_size=100, use_gpu=False):
     """Process the Hugging Face dataset in batches and store programs in ChromaDB, aligning with vector categories."""
     # Load the dataset
-    dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
-
-    # Convert dataset to list of dictionaries for iteration
-    dataset_list = list(dataset)
+    try:
+        dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
+        dataset_list = list(dataset)
+        logger.info(f"Loaded dataset with {len(dataset_list)} entries")
+    except Exception as e:
+        logger.error(f"Error loading dataset: {e}")
+        raise
 
     # Initialize ChromaDB client
     client = init_chromadb()
@@ -187,33 +197,36 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
         batch_embeddings = []
 
         for entry in batch:
-            instruction = entry['instruction']
-            output = entry['output']
-
-            # Rename variables to align with vector categories
-            processed_code, var_map = rename_variables(output)
-
-            # Parse the code to get parts and sequence, generating our 6D vectors
-            parts, sequence = parse_python_code(processed_code)
-            program_vectors = [part['vector'] for part in parts] # Use parser's 6D vectors for program structure
-
-            # Generate description tokens including variable roles
-            description_tokens = f"task:{instruction.replace(' ', '_')}"
-            description_tokens_list = generate_description_tokens(sequence, program_vectors, var_map)
-            description_tokens += " " + " ".join(description_tokens_list)
-
-            # Generate a 6D semantic vector for the instruction
-            semantic_vector = generate_semantic_vector(instruction, use_gpu=use_gpu)
-
-            # Store program data
-            program_id = str(hash(processed_code))
-            batch_ids.append(program_id)
-            batch_documents.append(processed_code)
-            batch_metadatas.append({"sequence": ",".join(sequence), "description_tokens": description_tokens, "program_vectors": str(program_vectors)})
-            batch_embeddings.append(semantic_vector)
-
-            # Add small delay to prevent freezing (optional, adjust as needed)
-            time.sleep(0.01)
+            try:
+                instruction = entry['instruction']
+                output = entry['output']
+
+                # Rename variables to align with vector categories
+                processed_code, var_map = rename_variables(output)
+
+                # Parse the code to get parts and sequence, generating our 6D vectors
+                parts, sequence = parse_python_code(processed_code)
+                program_vectors = [part['vector'] for part in parts] # Use parser's 6D vectors for program structure
+
+                # Generate description tokens including variable roles
+                description_tokens = f"task:{instruction.replace(' ', '_')}"
+                description_tokens_list = generate_description_tokens(sequence, program_vectors, var_map)
+                description_tokens += " " + " ".join(description_tokens_list)
+
+                # Generate a 6D semantic vector for the instruction
+                semantic_vector = generate_semantic_vector(instruction, use_gpu=use_gpu)
+
+                # Store program data
+                program_id = str(hash(processed_code))
+                batch_ids.append(program_id)
+                batch_documents.append(processed_code)
+                batch_metadatas.append({"sequence": ",".join(sequence), "description_tokens": description_tokens, "program_vectors": str(program_vectors)})
+                batch_embeddings.append(semantic_vector)
+
+                logger.debug(f"Processed entry: {program_id}, Vector: {semantic_vector}")
+            except Exception as e:
+                logger.error(f"Error processing entry {i}: {e}")
+                continue # Skip failed entries but continue processing
 
         # Batch add to ChromaDB
         try:
@@ -223,8 +236,9 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
                 ids=batch_ids,
                 embeddings=batch_embeddings
             )
+            logger.info(f"Added batch {i//batch_size + 1} to ChromaDB with {len(batch_ids)} entries")
         except Exception as e:
-            print(f"Error adding batch to ChromaDB: {e}")
+            logger.error(f"Error adding batch to ChromaDB: {e}")
             raise
 
     # Save to Hugging Face Dataset
@@ -249,8 +263,12 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY"))
     dataset = Dataset.from_dict(data)
 
     # Push to Hugging Face Hub
-    dataset.push_to_hub(dataset_name, token=token)
-    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
+    try:
+        dataset.push_to_hub(dataset_name, token=token)
+        logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
+    except Exception as e:
+        logger.error(f"Error pushing dataset to Hugging Face Hub: {e}")
+        raise
 
 if __name__ == "__main__":
     process_hf_dataset(batch_size=100, use_gpu=False)
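
Note: the basicConfig call added above pins the root logger to INFO, so the new per-entry logger.debug messages are suppressed by default. A minimal sketch of running the updated pipeline with DEBUG verbosity from another script (this assumes the file is importable as a module named process_hf_dataset; the batch_size and use_gpu values are purely illustrative):

    import logging
    import process_hf_dataset as phd  # hypothetical import; assumes the file is on the import path

    # basicConfig runs at import time with level=INFO; raising the root logger
    # to DEBUG lets the per-entry debug records through the existing handler
    logging.getLogger().setLevel(logging.DEBUG)

    # Illustrative arguments; the script's own __main__ block uses batch_size=100, use_gpu=False
    phd.process_hf_dataset(batch_size=50, use_gpu=True)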