Hugging Face Space commit view — "Update process_hf_datasets.py" (process_hf_datasets.py, +66 −10, CHANGED). The sections below show the old and new versions of the file as rendered by the diff viewer.
@@ -5,14 +5,21 @@ from parser import parse_python_code
|
|
5 |
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, HF_KEY
|
6 |
import chromadb
|
7 |
|
8 |
-
def rename_variables(code):
|
9 |
-
"""Rename variables in Python code to
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
# Simple variable name detection and renaming
|
11 |
pattern = r'\b[a-zA-Z_]\w*\b' # Match variable names (simple heuristic)
|
12 |
variables = set()
|
13 |
code_lines = code.split('\n')
|
14 |
|
15 |
-
# Find all variable names (simplified approach)
|
16 |
for line in code_lines:
|
17 |
matches = re.findall(pattern, line)
|
18 |
for match in matches:
|
@@ -21,17 +28,64 @@ def rename_variables(code):
|
|
21 |
|
22 |
# Sort variables by first appearance (simplified, could improve with AST)
|
23 |
sorted_vars = sorted(list(variables))
|
24 |
-
var_map = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
# Replace variables in code
|
27 |
new_code = code
|
28 |
for old_var, new_var in var_map.items():
|
29 |
new_code = re.sub(r'\b' + old_var + r'\b', new_var, new_code)
|
30 |
|
31 |
-
return new_code
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
def process_hf_dataset():
|
34 |
-
"""Process the Hugging Face dataset and store programs in ChromaDB."""
|
35 |
# Load the dataset
|
36 |
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
|
37 |
|
@@ -43,20 +97,22 @@ def process_hf_dataset():
|
|
43 |
instruction = entry['instruction']
|
44 |
output = entry['output']
|
45 |
|
46 |
-
# Rename variables
|
47 |
-
processed_code = rename_variables(output)
|
48 |
|
49 |
# Parse the code to get parts and sequence
|
50 |
parts, sequence = parse_python_code(processed_code)
|
51 |
vectors = [part['vector'] for part in parts]
|
52 |
|
53 |
-
# Generate description tokens
|
54 |
description_tokens = f"task:{instruction.replace(' ', '_')}"
|
|
|
|
|
55 |
|
56 |
# Store in ChromaDB with description
|
57 |
store_program(client, processed_code, sequence, vectors, DB_NAME)
|
58 |
|
59 |
-
# Update metadata with instruction as description
|
60 |
collection = client.get_collection(DB_NAME)
|
61 |
program_id = str(hash(processed_code))
|
62 |
collection.update(
|
|
|
5 |
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, HF_KEY
|
6 |
import chromadb
|
7 |
|
8 |
+
def rename_variables(code, variable_prefixes=None):
    """Rename variables in Python code to align with vector categories.

    Identifiers are detected with a regex heuristic (not an AST), classified
    into input / returned / assigned roles, and rewritten to canonical names
    such as ``input_variable1``. Because the detection is textual, names that
    appear inside string literals or comments are rewritten as well.

    Args:
        code: Python source text to rewrite.
        variable_prefixes: Optional mapping with keys ``'input'``,
            ``'assigned'`` and ``'returned'`` giving the canonical name stem
            for each role. Defaults to the vector-category names
            (``input_variable``, ``assigned_variable``, ``returned_variable``).

    Returns:
        Tuple ``(new_code, var_map)`` where ``new_code`` is the rewritten
        source and ``var_map`` maps each original identifier to its
        canonical replacement.
    """
    import keyword  # stdlib-only local import; leaves module-level imports untouched

    if variable_prefixes is None:
        variable_prefixes = {
            'input': 'input_variable',
            'assigned': 'assigned_variable',
            'returned': 'returned_variable'
        }

    # Simple variable name detection (regex heuristic, could improve with AST).
    pattern = r'\b[a-zA-Z_]\w*\b'
    variables = set()
    code_lines = code.split('\n')

    for line in code_lines:
        for match in re.findall(pattern, line):
            # Python keywords (def, return, if, ...) are never variables.
            # NOTE(review): the diff hid the original filter at this point;
            # keyword exclusion is the minimal filter that keeps the rest sound.
            if not keyword.iskeyword(match):
                variables.add(match)

    # Deterministic alphabetical order (could improve with first-appearance order via AST).
    sorted_vars = sorted(variables)
    var_map = {}
    var_count = {prefix: 1 for prefix in variable_prefixes.values()}

    for var in sorted_vars:
        # Classify the variable's role with simplified heuristics; roles come
        # from variable_prefixes so caller-supplied mappings work (previously
        # the role strings were hard-coded and custom prefixes raised KeyError).
        if var in ('expression', 'input'):  # assume input parameters / initial variables
            role = variable_prefixes['input']
        elif any('return' in line and var in line for line in code_lines):
            # Fix: the previous condition was a bare generator expression used
            # directly as an elif test (a SyntaxError); any() tests all lines.
            role = variable_prefixes['returned']
        else:  # default to assigned variables
            role = variable_prefixes['assigned']

        var_map[var] = f"{role}{var_count[role]}"
        var_count[role] += 1

    # Replace whole-word occurrences only. Sequential substitution assumes no
    # original identifier equals an already-assigned canonical name.
    new_code = code
    for old_var, new_var in var_map.items():
        new_code = re.sub(r'\b' + re.escape(old_var) + r'\b', new_var, new_code)

    return new_code, var_map
|
54 |
+
|
55 |
+
def generate_description_tokens(sequence, vectors, var_map=None):
    """Generate semantic description tokens for a program, including variable roles."""
    # Human-readable label for each parser category; categories not listed
    # here contribute no tokens at all (not even level/span).
    labels = {
        'import': 'imports module',
        'function': 'defines function',
        'assigned_variable': 'assigns variable',
        'input_variable': 'input parameter',
        'returned_variable': 'returns value',
        'if': 'conditional statement',
        'return': 'returns result',
        'try': 'try block',
        'except': 'exception handler',
        'expression': 'expression statement',
        'spacer': 'empty line or comment',
    }

    tokens = []
    for category, vector in zip(sequence, vectors):
        label = labels.get(category)
        if label is None:
            continue  # unknown category: skip entirely
        tokens.append(f"{label}:{category}")
        # Vector-derived features as tokens.
        # NOTE(review): assumes vector[1] is nesting level and vector[3] is a
        # numeric span fraction — confirm against parse_python_code's vectors.
        tokens.append(f"level:{vector[1]}")
        tokens.append(f"span:{vector[3]:.2f}")

    # Append one token per renamed variable, recovering the role stem
    # (e.g. 'input_variable') from the canonical name.
    if var_map:
        for source_name, renamed in var_map.items():
            role = renamed.split('variable')[0] + 'variable'
            tokens.append(f"variable:{source_name}={renamed}:{role}")

    return tokens
|
86 |
|
87 |
def process_hf_dataset():
|
88 |
+
"""Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories."""
|
89 |
# Load the dataset
|
90 |
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
|
91 |
|
|
|
97 |
instruction = entry['instruction']
|
98 |
output = entry['output']
|
99 |
|
100 |
+
# Rename variables to align with vector categories
|
101 |
+
processed_code, var_map = rename_variables(output)
|
102 |
|
103 |
# Parse the code to get parts and sequence
|
104 |
parts, sequence = parse_python_code(processed_code)
|
105 |
vectors = [part['vector'] for part in parts]
|
106 |
|
107 |
+
# Generate description tokens including variable roles
|
108 |
description_tokens = f"task:{instruction.replace(' ', '_')}"
|
109 |
+
description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
|
110 |
+
description_tokens += " " + " ".join(description_tokens_list)
|
111 |
|
112 |
# Store in ChromaDB with description
|
113 |
store_program(client, processed_code, sequence, vectors, DB_NAME)
|
114 |
|
115 |
+
# Update metadata with instruction and variable roles as description
|
116 |
collection = client.get_collection(DB_NAME)
|
117 |
program_id = str(hash(processed_code))
|
118 |
collection.update(
|