Hugging Face Space commit view — "Update process_hf_datasets.py" (process_hf_datasets.py, +66 −10, CHANGED). The sections below show the old and new versions of the file as rendered by the diff viewer.
@@ -5,14 +5,21 @@ from parser import parse_python_code
|
|
5 |
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, HF_KEY
|
6 |
import chromadb
|
7 |
|
8 |
-
def rename_variables(code):
|
9 |
-
"""Rename variables in Python code to
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
# Simple variable name detection and renaming
|
11 |
pattern = r'\b[a-zA-Z_]\w*\b' # Match variable names (simple heuristic)
|
12 |
variables = set()
|
13 |
code_lines = code.split('\n')
|
14 |
|
15 |
-
# Find all variable names (simplified approach)
|
16 |
for line in code_lines:
|
17 |
matches = re.findall(pattern, line)
|
18 |
for match in matches:
|
@@ -21,17 +28,64 @@ def rename_variables(code):
|
|
21 |
|
22 |
# Sort variables by first appearance (simplified, could improve with AST)
|
23 |
sorted_vars = sorted(list(variables))
|
24 |
-
var_map = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
# Replace variables in code
|
27 |
new_code = code
|
28 |
for old_var, new_var in var_map.items():
|
29 |
new_code = re.sub(r'\b' + old_var + r'\b', new_var, new_code)
|
30 |
|
31 |
-
return new_code
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
def process_hf_dataset():
|
34 |
-
"""Process the Hugging Face dataset and store programs in ChromaDB."""
|
35 |
# Load the dataset
|
36 |
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
|
37 |
|
@@ -43,20 +97,22 @@ def process_hf_dataset():
|
|
43 |
instruction = entry['instruction']
|
44 |
output = entry['output']
|
45 |
|
46 |
-
# Rename variables
|
47 |
-
processed_code = rename_variables(output)
|
48 |
|
49 |
# Parse the code to get parts and sequence
|
50 |
parts, sequence = parse_python_code(processed_code)
|
51 |
vectors = [part['vector'] for part in parts]
|
52 |
|
53 |
-
# Generate description tokens
|
54 |
description_tokens = f"task:{instruction.replace(' ', '_')}"
|
|
|
|
|
55 |
|
56 |
# Store in ChromaDB with description
|
57 |
store_program(client, processed_code, sequence, vectors, DB_NAME)
|
58 |
|
59 |
-
# Update metadata with instruction as description
|
60 |
collection = client.get_collection(DB_NAME)
|
61 |
program_id = str(hash(processed_code))
|
62 |
collection.update(
|
|
|
5 |
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, HF_KEY
|
6 |
import chromadb
|
7 |
|
8 |
+
def rename_variables(code, variable_prefixes=None):
    """Rename variables in Python code to align with vector categories.

    Identifiers are detected with a regex heuristic (not an AST), classified
    into input / returned / assigned roles, and rewritten to canonical names
    such as ``input_variable1``. Because the detection is textual, names that
    appear inside string literals or comments are rewritten as well.

    Args:
        code: Python source text to rewrite.
        variable_prefixes: Optional mapping with keys ``'input'``,
            ``'assigned'`` and ``'returned'`` giving the canonical name stem
            for each role. Defaults to the vector-category names
            (``input_variable``, ``assigned_variable``, ``returned_variable``).

    Returns:
        Tuple ``(new_code, var_map)`` where ``new_code`` is the rewritten
        source and ``var_map`` maps each original identifier to its
        canonical replacement.
    """
    import keyword  # stdlib-only local import; leaves module-level imports untouched

    if variable_prefixes is None:
        variable_prefixes = {
            'input': 'input_variable',
            'assigned': 'assigned_variable',
            'returned': 'returned_variable'
        }

    # Simple variable name detection (regex heuristic, could improve with AST).
    pattern = r'\b[a-zA-Z_]\w*\b'
    variables = set()
    code_lines = code.split('\n')

    for line in code_lines:
        for match in re.findall(pattern, line):
            # Python keywords (def, return, if, ...) are never variables.
            # NOTE(review): the diff hid the original filter at this point;
            # keyword exclusion is the minimal filter that keeps the rest sound.
            if not keyword.iskeyword(match):
                variables.add(match)

    # Deterministic alphabetical order (could improve with first-appearance order via AST).
    sorted_vars = sorted(variables)
    var_map = {}
    var_count = {prefix: 1 for prefix in variable_prefixes.values()}

    for var in sorted_vars:
        # Classify the variable's role with simplified heuristics; roles come
        # from variable_prefixes so caller-supplied mappings work (previously
        # the role strings were hard-coded and custom prefixes raised KeyError).
        if var in ('expression', 'input'):  # assume input parameters / initial variables
            role = variable_prefixes['input']
        elif any('return' in line and var in line for line in code_lines):
            # Fix: the previous condition was a bare generator expression used
            # directly as an elif test (a SyntaxError); any() tests all lines.
            role = variable_prefixes['returned']
        else:  # default to assigned variables
            role = variable_prefixes['assigned']

        var_map[var] = f"{role}{var_count[role]}"
        var_count[role] += 1

    # Replace whole-word occurrences only. Sequential substitution assumes no
    # original identifier equals an already-assigned canonical name.
    new_code = code
    for old_var, new_var in var_map.items():
        new_code = re.sub(r'\b' + re.escape(old_var) + r'\b', new_var, new_code)

    return new_code, var_map
|
54 |
+
|
55 |
+
def generate_description_tokens(sequence, vectors, var_map=None):
    """Generate semantic description tokens for a program, including variable roles."""
    # Human-readable label for each parser category; categories not listed
    # here contribute no tokens at all (not even level/span).
    labels = {
        'import': 'imports module',
        'function': 'defines function',
        'assigned_variable': 'assigns variable',
        'input_variable': 'input parameter',
        'returned_variable': 'returns value',
        'if': 'conditional statement',
        'return': 'returns result',
        'try': 'try block',
        'except': 'exception handler',
        'expression': 'expression statement',
        'spacer': 'empty line or comment',
    }

    tokens = []
    for category, vector in zip(sequence, vectors):
        label = labels.get(category)
        if label is None:
            continue  # unknown category: skip entirely
        tokens.append(f"{label}:{category}")
        # Vector-derived features as tokens.
        # NOTE(review): assumes vector[1] is nesting level and vector[3] is a
        # numeric span fraction — confirm against parse_python_code's vectors.
        tokens.append(f"level:{vector[1]}")
        tokens.append(f"span:{vector[3]:.2f}")

    # Append one token per renamed variable, recovering the role stem
    # (e.g. 'input_variable') from the canonical name.
    if var_map:
        for source_name, renamed in var_map.items():
            role = renamed.split('variable')[0] + 'variable'
            tokens.append(f"variable:{source_name}={renamed}:{role}")

    return tokens
|
86 |
|
87 |
def process_hf_dataset():
|
88 |
+
"""Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories."""
|
89 |
# Load the dataset
|
90 |
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
|
91 |
|
|
|
97 |
instruction = entry['instruction']
|
98 |
output = entry['output']
|
99 |
|
100 |
+
# Rename variables to align with vector categories
|
101 |
+
processed_code, var_map = rename_variables(output)
|
102 |
|
103 |
# Parse the code to get parts and sequence
|
104 |
parts, sequence = parse_python_code(processed_code)
|
105 |
vectors = [part['vector'] for part in parts]
|
106 |
|
107 |
+
# Generate description tokens including variable roles
|
108 |
description_tokens = f"task:{instruction.replace(' ', '_')}"
|
109 |
+
description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
|
110 |
+
description_tokens += " " + " ".join(description_tokens_list)
|
111 |
|
112 |
# Store in ChromaDB with description
|
113 |
store_program(client, processed_code, sequence, vectors, DB_NAME)
|
114 |
|
115 |
+
# Update metadata with instruction and variable roles as description
|
116 |
collection = client.get_collection(DB_NAME)
|
117 |
program_id = str(hash(processed_code))
|
118 |
collection.update(
|