# process_hf_dataset.py
from datasets import load_dataset, Dataset  # Dataset is needed by save_chromadb_to_hf()
import re
import ast
import hashlib
import keyword
from parser import parse_python_code
from database import init_chromadb, DB_NAME, HF_DATASET_NAME
import os
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm  # For progress bar
import time
import time

# Load environment variables
load_dotenv()

# Cache CodeBERT model globally to avoid repeated loading
model_name = "microsoft/codebert-base"
tokenizer = None
model = None
device = None

def load_codebert_model(use_gpu=False):
    """Load and cache the CodeBERT model, handling GPU/CPU options."""
    global tokenizer, model, device
    if tokenizer is None or model is None:
        try:
            device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModel.from_pretrained(model_name).to(device)
            print(f"CodeBERT model loaded on {device}")
        except Exception as e:
            print(f"Error loading CodeBERT model: {e}")
            raise
    return tokenizer, model, device
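
# Illustrative usage (a sketch; the first call downloads the checkpoint from the
# Hugging Face Hub, so it is not free):
#   tok, mdl, dev = load_codebert_model(use_gpu=True)
#   print(dev)  # "cuda" if a GPU is requested and available, otherwise "cpu"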

def rename_variables(code, variable_prefixes=None):
    """Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
    if variable_prefixes is None:
        variable_prefixes = {
            'input': 'input_variable',
            'assigned': 'assigned_variable',
            'returned': 'returned_variable'
        }
    
    # Simple variable name detection and renaming
    pattern = r'\b[a-zA-Z_]\w*\b'  # Match variable names (simple heuristic)
    variables = set()
    code_lines = code.split('\n')
    
    # Find all identifier-like tokens (simplified approach; an AST walk would be more precise)
    for line in code_lines:
        matches = re.findall(pattern, line)
        for match in matches:
            if match not in keyword.kwlist and match not in {'print', 'eval', 'str', 'int'}:  # Exclude keywords and common builtins
                variables.add(match)
    
    # Sort variables alphabetically for deterministic naming (tracking first
    # appearance would need an AST pass)
    sorted_vars = sorted(variables)
    var_map = {}
    var_count = {'input_variable': 1, 'assigned_variable': 1, 'returned_variable': 1}
    
    # Assign a role to each variable based on context (word-boundary matching
    # avoids false positives such as 'a' matching inside 'add')
    for var in sorted_vars:
        var_re = re.compile(r'\b' + re.escape(var) + r'\b')
        is_input = any(var_re.search(line) and line.lstrip().startswith('def ') for line in code_lines)  # Function parameter?
        is_returned = any('return' in line and var_re.search(line) for line in code_lines)  # Used in a return statement?
        is_assigned = any('=' in line and var_re.search(line.split('=')[0]) for line in code_lines)  # Assigned on the left-hand side?
        
        if is_input:
            role = 'input_variable'
        elif is_returned:
            role = 'returned_variable'
        elif is_assigned:
            role = 'assigned_variable'
        else:
            role = 'assigned_variable'  # Default to assigned if unclear
        
        new_name = f"{role}{var_count[role]}"
        var_map[var] = new_name
        var_count[role] += 1
    
    # Replace variables in a single pass so a freshly assigned name is never rewritten again
    if var_map:
        name_re = re.compile(r'\b(' + '|'.join(map(re.escape, var_map)) + r')\b')
        new_code = name_re.sub(lambda m: var_map[m.group(0)], code)
    else:
        new_code = code
    
    return new_code, var_map
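
# Example of the renaming heuristic on a toy function (illustrative; exact names
# follow from the alphabetical ordering and role checks above):
#   new_code, mapping = rename_variables("def add(a, b):\n    c = a + b\n    return c")
#   # mapping -> {'a': 'input_variable1', 'add': 'input_variable2',
#   #             'b': 'input_variable3', 'c': 'returned_variable1'}
#   # Note: the function name itself is renamed; a known limitation of the
#   # regex-based approach that an AST pass would avoid.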

def generate_description_tokens(sequence, vectors, var_map=None):
    """Generate semantic description tokens for a program, including variable roles."""
    tokens = []
    category_descriptions = {
        'import': 'imports module',
        'function': 'defines function',
        'assigned_variable': 'assigns variable',
        'input_variable': 'input parameter',
        'returned_variable': 'returns value',
        'if': 'conditional statement',
        'return': 'returns result',
        'try': 'try block',
        'except': 'exception handler',
        'expression': 'expression statement',
        'spacer': 'empty line or comment'
    }
    
    for cat, vec in zip(sequence, vectors):
        if cat in category_descriptions:
            tokens.append(f"{category_descriptions[cat]}:{cat}")
            # Add vector-derived features (e.g., level, span) as tokens
            tokens.append(f"level:{vec[1]}")
            tokens.append(f"span:{vec[3]:.2f}")
    
    # Add variable role tokens if var_map exists
    if var_map:
        for old_var, new_var in var_map.items():
            role = new_var.rstrip('0123456789')  # Strip the trailing counter to recover the role (e.g., 'input_variable')
            tokens.append(f"variable:{old_var}={new_var}:{role}")
    
    return tokens
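
# Illustrative output for a one-part program (assumes the parser's 6D vectors
# put the nesting level at index 1 and the span at index 3, as used above):
#   generate_description_tokens(['function'], [[1, 0, 0.0, 0.25, 0.0, 0.0]])
#   # -> ['defines function:function', 'level:0', 'span:0.25']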

def generate_semantic_vector(description, use_gpu=False):
    """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
    global tokenizer, model, device
    if tokenizer is None or model is None:
        tokenizer, model, device = load_codebert_model(use_gpu)
    
    # Tokenize and encode the description
    inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Generate embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        # Use mean pooling of the last hidden states
        vector = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist()
    
    # Crude projection to 6D: keep the first six of CodeBERT's 768 hidden dimensions
    # (a learned projection would preserve far more of the embedding)
    if len(vector) < 6:
        vector.extend([0] * (6 - len(vector)))
    elif len(vector) > 6:
        vector = vector[:6]
    
    return vector
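
# Illustrative call (a sketch; triggers the CodeBERT load on first use):
#   vec = generate_semantic_vector("Write a function to add two numbers")
#   len(vec)  # -> 6: the first six dimensions of a 768-dim mean-pooled embedding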

def process_hf_dataset(batch_size=100, use_gpu=False):
    """Process the Hugging Face dataset in batches and store programs in ChromaDB, aligning with vector categories."""
    # Load the dataset
    dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
    
    # Initialize ChromaDB client
    client = init_chromadb()
    
    # Clear any existing collection so each run starts fresh
    try:
        client.delete_collection(DB_NAME)
    except Exception:
        pass  # Collection may not exist yet
    collection = client.create_collection(DB_NAME)
    
    # Process in batches with progress bar
    total_entries = len(dataset)
    for i in tqdm(range(0, total_entries, batch_size), desc="Processing Hugging Face Dataset"):
        # Slicing a Dataset returns a dict of columns, not rows; select() keeps row-wise dict access
        batch = dataset.select(range(i, min(i + batch_size, total_entries)))
        batch_ids = []
        batch_documents = []
        batch_metadatas = []
        batch_embeddings = []
        
        for entry in batch:
            instruction = entry['instruction']
            output = entry['output']
            
            # Rename variables to align with vector categories
            processed_code, var_map = rename_variables(output)
            
            # Parse the code to get parts and sequence, generating our 6D vectors
            parts, sequence = parse_python_code(processed_code)
            program_vectors = [part['vector'] for part in parts]  # Use parser's 6D vectors for program structure
            
            # Generate description tokens including variable roles
            description_tokens = f"task:{instruction.replace(' ', '_')}"
            description_tokens_list = generate_description_tokens(sequence, program_vectors, var_map)
            description_tokens += " " + " ".join(description_tokens_list)
            
            # Generate a 6D semantic vector for the instruction
            semantic_vector = generate_semantic_vector(instruction, use_gpu=use_gpu)
            
            # Store program data, keyed by a stable content hash (the built-in
            # hash() is salted per process, so it is not reproducible across runs)
            program_id = hashlib.sha256(processed_code.encode("utf-8")).hexdigest()
            if program_id in batch_ids:
                continue  # ChromaDB rejects duplicate IDs within one add() call
            batch_ids.append(program_id)
            batch_documents.append(processed_code)
            batch_metadatas.append({"sequence": ",".join(sequence), "description_tokens": description_tokens, "program_vectors": str(program_vectors)})
            batch_embeddings.append(semantic_vector)
            
            # Small optional delay to keep the machine responsive during long runs; remove for maximum throughput
            time.sleep(0.01)
        
        # Batch add to ChromaDB
        try:
            collection.add(
                documents=batch_documents,
                metadatas=batch_metadatas,
                ids=batch_ids,
                embeddings=batch_embeddings
            )
        except Exception as e:
            print(f"Error adding batch to ChromaDB: {e}")
            raise
    
    # Save to Hugging Face Dataset
    save_chromadb_to_hf()

def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=None):
    """Save ChromaDB data to a Hugging Face Dataset and push it to the Hub."""
    token = token or os.getenv("HF_KEY")  # Resolve at call time, not at import time
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)
    
    # Fetch all data from ChromaDB
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in results["metadatas"]],
        "vectors": results["embeddings"],  # Semantic 6D vectors
        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]],
        "program_vectors": [eval(meta.get('program_vectors', '[]')) for meta in results["metadatas"]]  # Store structural vectors
    }
    
    # Create a Hugging Face Dataset
    dataset = Dataset.from_dict(data)
    
    # Push to Hugging Face Hub
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")

if __name__ == "__main__":
    process_hf_dataset(batch_size=100, use_gpu=False)