broadfield-dev committed
Commit 0df5c07 · verified · Parent: b284540

Update process_hf_datasets.py

Files changed (1)
  1. process_hf_datasets.py +66 -10
process_hf_datasets.py CHANGED
@@ -5,14 +5,21 @@ from parser import parse_python_code
 from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, HF_KEY
 import chromadb
 
-def rename_variables(code):
-    """Rename variables in Python code to input_var1, input_var2, etc."""
+def rename_variables(code, variable_prefixes=None):
+    """Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
+    if variable_prefixes is None:
+        variable_prefixes = {
+            'input': 'input_variable',
+            'assigned': 'assigned_variable',
+            'returned': 'returned_variable'
+        }
+
     # Simple variable name detection and renaming
     pattern = r'\b[a-zA-Z_]\w*\b'  # Match variable names (simple heuristic)
     variables = set()
     code_lines = code.split('\n')
 
-    # Find all variable names (simplified approach)
+    # Find all variable names (simplified approach, could improve with AST)
     for line in code_lines:
         matches = re.findall(pattern, line)
         for match in matches:
@@ -21,17 +28,64 @@ def rename_variables(code):
 
     # Sort variables by first appearance (simplified, could improve with AST)
     sorted_vars = sorted(list(variables))
-    var_map = {var: f"input_var{i+1}" for i, var in enumerate(sorted_vars)}
+    var_map = {}
+    var_count = {prefix: 1 for prefix in variable_prefixes.values()}
+
+    # Assign variables based on context (simplified heuristic)
+    for var in sorted_vars:
+        # Determine variable role based on context (simplified)
+        if var in ['expression', 'input']:  # Assume input parameters or initial variables
+            role = 'input_variable'
+        elif any('return' in line for line in code_lines if var in line):  # Returned variables
+            role = 'returned_variable'
+        else:  # Default to assigned variables
+            role = 'assigned_variable'
+
+        new_name = f"{role}{var_count[role]}"
+        var_map[var] = new_name
+        var_count[role] += 1
 
     # Replace variables in code
     new_code = code
     for old_var, new_var in var_map.items():
         new_code = re.sub(r'\b' + old_var + r'\b', new_var, new_code)
 
-    return new_code
+    return new_code, var_map
+
+def generate_description_tokens(sequence, vectors, var_map=None):
+    """Generate semantic description tokens for a program, including variable roles."""
+    tokens = []
+    category_descriptions = {
+        'import': 'imports module',
+        'function': 'defines function',
+        'assigned_variable': 'assigns variable',
+        'input_variable': 'input parameter',
+        'returned_variable': 'returns value',
+        'if': 'conditional statement',
+        'return': 'returns result',
+        'try': 'try block',
+        'except': 'exception handler',
+        'expression': 'expression statement',
+        'spacer': 'empty line or comment'
+    }
+
+    for cat, vec in zip(sequence, vectors):
+        if cat in category_descriptions:
+            tokens.append(f"{category_descriptions[cat]}:{cat}")
+            # Add vector-derived features (e.g., level, span) as tokens
+            tokens.append(f"level:{vec[1]}")
+            tokens.append(f"span:{vec[3]:.2f}")
+
+    # Add variable role tokens if var_map exists
+    if var_map:
+        for old_var, new_var in var_map.items():
+            role = new_var.split('variable')[0] + 'variable'  # Extract role (e.g., 'input_variable')
+            tokens.append(f"variable:{old_var}={new_var}:{role}")
+
+    return tokens
 
 def process_hf_dataset():
-    """Process the Hugging Face dataset and store programs in ChromaDB."""
+    """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories."""
     # Load the dataset
     dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
 
@@ -43,20 +97,22 @@ def process_hf_dataset():
         instruction = entry['instruction']
         output = entry['output']
 
-        # Rename variables in the output code
-        processed_code = rename_variables(output)
+        # Rename variables to align with vector categories
+        processed_code, var_map = rename_variables(output)
 
         # Parse the code to get parts and sequence
         parts, sequence = parse_python_code(processed_code)
         vectors = [part['vector'] for part in parts]
 
-        # Generate description tokens from instruction
+        # Generate description tokens including variable roles
         description_tokens = f"task:{instruction.replace(' ', '_')}"
+        description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
+        description_tokens += " " + " ".join(description_tokens_list)
 
         # Store in ChromaDB with description
         store_program(client, processed_code, sequence, vectors, DB_NAME)
 
-        # Update metadata with instruction as description
+        # Update metadata with instruction and variable roles as description
         collection = client.get_collection(DB_NAME)
         program_id = str(hash(processed_code))
         collection.update(
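
A quick sanity check of the two helpers this commit touches could look like the following. Illustrative only: it assumes process_hf_datasets.py and its own imports (datasets, chromadb, parser, database) are importable, and the toy snippet, sequence and vectors stand in for real dataset entries and parse_python_code() output.

from process_hf_datasets import rename_variables, generate_description_tokens

# Toy program; real entries come from entry['output'] in the dataset.
snippet = "def add(number):\n    result = number + 1\n    return result"

renamed_code, var_map = rename_variables(snippet)
print(renamed_code)  # names rewritten to e.g. input_variable1 / assigned_variable1 / returned_variable1
print(var_map)       # original-name -> new-name mapping now returned alongside the code

# Hand-built stand-ins for parse_python_code() output: generate_description_tokens()
# only reads vec[1] (level) and vec[3] (span) from each vector.
sequence = ["function", "assigned_variable", "return"]
vectors = [[1, 0, 0, 1.0], [1, 1, 0, 0.33], [1, 1, 0, 0.33]]
print(generate_description_tokens(sequence, vectors, var_map))

Because the keyword filtering inside the match loop sits outside these hunks, the exact renamed output depends on that code; what the example exercises is the new call shape and the (new_code, var_map) return value.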
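
The comments above note that the regex heuristic "could improve with AST". A minimal sketch of that direction, standard library only; classify_variable_roles and the example snippet are hypothetical, not part of this commit.

import ast

def classify_variable_roles(code):
    """Map each name to 'input_variable', 'returned_variable' or 'assigned_variable'."""
    roles = {}
    tree = ast.parse(code)
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            # Function parameters are the natural input_variable candidates.
            for arg in node.args.args:
                roles.setdefault(arg.arg, 'input_variable')
        elif isinstance(node, ast.Return) and isinstance(node.value, ast.Name):
            # A bare name in a return statement is a returned_variable.
            roles[node.value.id] = 'returned_variable'
        elif isinstance(node, ast.Assign):
            # Remaining assignment targets default to assigned_variable.
            for target in node.targets:
                if isinstance(target, ast.Name):
                    roles.setdefault(target.id, 'assigned_variable')
    return roles

print(classify_variable_roles("def f(x):\n    y = x + 1\n    return y"))
# {'x': 'input_variable', 'y': 'returned_variable'}

Parameter names come from the signature, names that appear in a return statement take precedence as returned_variable, and other assignment targets fall back to assigned_variable, mirroring the three roles rename_variables uses.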
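
The collection.update( call is cut off in this diff, so its exact arguments are not visible here. As a generic illustration of attaching the description string as ChromaDB metadata (client setup, collection name, id and document below are placeholders, not the committed values):

import chromadb

client = chromadb.Client()  # the real script gets its client from init_chromadb()
collection = client.get_or_create_collection("example_programs")

program_id = "12345"  # placeholder; the commit derives it from str(hash(processed_code))
description_tokens = "task:Add_two_numbers defines function:function level:1 span:1.00"

# Make sure the id exists, then attach or refresh its metadata.
collection.upsert(ids=[program_id], embeddings=[[0.0, 0.0, 0.0]], documents=["def add(a, b): return a + b"])
collection.update(ids=[program_id], metadatas=[{"description": description_tokens}])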