# Fine-tuning a Code LLM with CoVe (Chain of Verification) on Hugging Face Spaces
# Implements Chain of Verification prompting for better code reasoning and verification
import os
import random
from typing import Optional

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from torch.utils.data import IterableDataset
from huggingface_hub import login

# Log in to the Hugging Face Hub (required for push_to_hub below)
if os.getenv("HF_TOKEN"):
    login(token=os.getenv("HF_TOKEN"))
# Model and dataset configuration
MODEL = "codellama/CodeLlama-7b-Instruct-hf"  # Instruct version performs better on CoVe prompts

# Multiple datasets give broader coverage for CoVe training
DATASETS = [
    "smangrul/hf-stack-v1",                          # Code repository data
    "iamtarun/python_code_instructions_18k_alpaca",  # Code instructions
    "nickrosh/Evol-Instruct-Code-80k-v1",            # Evolved code instructions
]

# CoVe-specific parameters (reserved for sampling logic; not yet used below)
COVE_VERIFICATION_RATE = 0.7  # Proportion of samples intended to get verification steps
COVE_EXPLANATION_RATE = 0.8   # Proportion of samples intended to get explanations

# Training parameters tuned for CoVe
SEQ_LENGTH = 3072  # Longer sequences to fit verification chains
MAX_STEPS = 1500
BATCH_SIZE = 2     # Smaller batch to offset the longer sequences
GR_ACC_STEPS = 8   # Higher gradient accumulation to keep the effective batch size up
LR = 1e-4
LR_SCHEDULER_TYPE = "cosine"
WEIGHT_DECAY = 0.01
NUM_WARMUP_STEPS = 100
EVAL_FREQ = 150
SAVE_FREQ = 300
LOG_FREQ = 25
OUTPUT_DIR = "codellama-7b-cove-finetuned"
BF16 = True
FP16 = False

# LoRA parameters
LORA_R = 32  # Higher rank for complex reasoning
LORA_ALPHA = 64
LORA_DROPOUT = 0.1
LORA_TARGET_MODULES = "q_proj,v_proj,k_proj,o_proj,gate_proj,up_proj,down_proj"

# Quantization config
USE_NESTED_QUANT = True
BNB_4BIT_COMPUTE_DTYPE = "bfloat16"

SEED = 42
set_seed(SEED)
# CoVe prompt templates (Llama-2 [INST] chat format)
COVE_TEMPLATES = {
    "code_explanation": """<s>[INST] Explain the following code step by step, then verify your explanation:
Code:
{code}
Provide:
1. Step-by-step explanation
2. Verification of each step
3. Final summary [/INST]
## Step-by-step Explanation:
{explanation}
## Verification:
{verification}
## Summary:
{summary}</s>""",
    "code_generation": """<s>[INST] {instruction}
Use Chain of Verification:
1. Generate the solution
2. Verify it works correctly
3. Check for edge cases
4. Provide final verified solution [/INST]
## Initial Solution:
{initial_solution}
## Verification Steps:
{verification_steps}
## Edge Case Analysis:
{edge_cases}
## Final Verified Solution:
{final_solution}</s>""",
    "code_debugging": """<s>[INST] Debug the following code and explain your reasoning:
Code:
{buggy_code}
Problem: {problem_description}
Use verification to ensure your fix is correct. [/INST]
## Problem Analysis:
{analysis}
## Proposed Fix:
{fix}
## Verification:
{verification}
## Final Corrected Code:
{corrected_code}</s>""",
    "code_review": """<s>[INST] Review this code and provide feedback:
{code}
Provide:
1. Initial assessment
2. Verify your observations
3. Specific improvement suggestions [/INST]
## Initial Assessment:
{assessment}
## Verification of Issues:
{verification}
## Improvement Suggestions:
{suggestions}</s>""",
}
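
# Illustrative only: a minimal sketch of how one of the templates above is
# filled. The toy values here are hypothetical and are not part of the
# training data; the function is never called by the training pipeline.
def preview_cove_template() -> str:
    """Return an example CoVe prompt built from the code_explanation template."""
    return COVE_TEMPLATES["code_explanation"].format(
        code="def add(a, b):\n    return a + b",
        explanation="Step 1: Define function with parameters\nStep 2: Return computed result",
        verification="✓ Syntax check: Code follows Python syntax rules",
        summary="This function adds two values and returns the result.",
    )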
class CoVeDataProcessor:
    """Processes various datasets into CoVe format"""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def create_code_explanation_sample(self, code_content: str) -> Optional[str]:
        """Create a CoVe sample with code explanation and verification"""
        # Extract meaningful code blocks (top-level functions and classes)
        lines = code_content.split('\n')
        code_blocks = []
        current_block = []
        indent_level = 0
        for line in lines:
            if not line.strip():
                continue
            indent = len(line) - len(line.lstrip())
            if line.startswith(('def ', 'class ', 'async def ')):
                if current_block:
                    code_blocks.append('\n'.join(current_block))
                current_block = [line]
                indent_level = indent
            elif current_block:
                # A dedent back to the opening indent level ends the block
                if indent <= indent_level:
                    if len(current_block) > 3:  # Only keep substantial blocks
                        code_blocks.append('\n'.join(current_block))
                    current_block = []
                else:
                    current_block.append(line)
        if current_block and len(current_block) > 3:
            code_blocks.append('\n'.join(current_block))
        if not code_blocks:
            return None

        # Select a random code block
        code_block = random.choice(code_blocks)
        # Generate explanation, verification, and summary
        explanation = self._generate_explanation(code_block)
        verification = self._generate_verification(code_block, explanation)
        summary = self._generate_summary(code_block)
        return COVE_TEMPLATES["code_explanation"].format(
            code=code_block,
            explanation=explanation,
            verification=verification,
            summary=summary,
        )
    def _generate_explanation(self, code: str) -> str:
        """Generate a step-by-step explanation"""
        lines = [line for line in code.split('\n') if line.strip()]
        explanations = []
        for i, line in enumerate(lines[:8]):  # Limit to the first 8 lines
            line = line.strip()
            if line.startswith('def '):
                explanations.append(f"Step {i+1}: Define function with parameters")
            elif line.startswith('class '):
                explanations.append(f"Step {i+1}: Define class structure")
            elif 'return' in line:
                explanations.append(f"Step {i+1}: Return computed result")
            elif '=' in line and not line.startswith('if'):
                explanations.append(f"Step {i+1}: Variable assignment and computation")
            elif line.startswith('if '):
                explanations.append(f"Step {i+1}: Conditional logic check")
            elif line.startswith('for ') or line.startswith('while '):
                explanations.append(f"Step {i+1}: Loop iteration")
            else:
                explanations.append(f"Step {i+1}: Execute operation")
        return '\n'.join(explanations)

    def _generate_verification(self, code: str, explanation: str) -> str:
        """Generate verification steps"""
        verifications = [
            "✓ Syntax check: Code follows Python syntax rules",
            "✓ Logic check: Each step follows logically from the previous",
            "✓ Variable usage: All variables are properly defined before use",
            "✓ Return value: Function returns appropriate type and value",
        ]
        if 'def ' in code:
            verifications.append("✓ Function definition: Parameters and return type are clear")
        if 'for ' in code or 'while ' in code:
            verifications.append("✓ Loop logic: Iteration bounds and exit conditions are correct")
        if 'if ' in code:
            verifications.append("✓ Conditional logic: All branches are handled appropriately")
        return '\n'.join(verifications)

    def _generate_summary(self, code: str) -> str:
        """Generate a summary of the code"""
        if 'def ' in code:
            return "This function implements a specific algorithm with clear input/output behavior and proper error handling."
        elif 'class ' in code:
            return "This class defines a data structure with methods for manipulation and access."
        else:
            return "This code block performs a specific computational task with clear logic flow."
    def create_instruction_sample(self, instruction: str, code: str) -> str:
        """Create a CoVe sample from an instruction-code pair"""
        # Generate verification components
        initial_solution = code
        verification_steps = [
            "1. Check syntax correctness",
            "2. Verify logic flow",
            "3. Test with sample inputs",
            "4. Confirm output format",
        ]
        edge_cases = [
            "- Empty input handling",
            "- Boundary value testing",
            "- Type validation",
            "- Error condition handling",
        ]
        return COVE_TEMPLATES["code_generation"].format(
            instruction=instruction,
            initial_solution=initial_solution,
            verification_steps='\n'.join(verification_steps),
            edge_cases='\n'.join(edge_cases),
            final_solution=code,
        )
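
# Usage sketch (hypothetical inputs): shows how the processor turns an
# instruction/answer pair into a single CoVe training string. Not called
# during training; kept for manual experimentation.
def demo_instruction_sample(tokenizer) -> str:
    processor = CoVeDataProcessor(tokenizer)
    return processor.create_instruction_sample(
        "Write a function that reverses a string.",
        "def reverse_string(s):\n    return s[::-1]",
    )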
class CoVeDataset(IterableDataset):
    """Dataset that generates CoVe-formatted training examples"""

    def __init__(self, datasets, tokenizer, max_samples=10000, seq_length=3072):
        self.datasets = datasets
        self.tokenizer = tokenizer
        self.max_samples = max_samples
        self.seq_length = seq_length
        self.processor = CoVeDataProcessor(tokenizer)
        self.samples_generated = 0

    def __iter__(self):
        for dataset in self.datasets:
            if self.samples_generated >= self.max_samples:
                break
            try:
                ds = load_dataset(dataset, streaming=True, split='train')
                for example in ds:
                    if self.samples_generated >= self.max_samples:
                        break
                    # Process based on the dataset's schema
                    if 'content' in example:
                        # Repository code
                        cove_sample = self.processor.create_code_explanation_sample(
                            example['content']
                        )
                    elif 'instruction' in example and 'output' in example:
                        # Instruction-following dataset
                        cove_sample = self.processor.create_instruction_sample(
                            example['instruction'], example['output']
                        )
                    else:
                        continue
                    if cove_sample and len(cove_sample) > 100:
                        # Tokenize and create a training example
                        tokenized = self.tokenizer(
                            cove_sample,
                            max_length=self.seq_length,
                            truncation=True,
                            padding=False,
                            return_tensors="pt",
                        )
                        if tokenized['input_ids'].shape[1] > 512:  # Ensure substantial content
                            # Labels are created by the causal-LM data collator,
                            # so only inputs and the attention mask are yielded
                            # (yielding ragged 'labels' would break padding)
                            yield {
                                'input_ids': tokenized['input_ids'].squeeze(0),
                                'attention_mask': tokenized['attention_mask'].squeeze(0),
                            }
                            self.samples_generated += 1
                            if self.samples_generated % 100 == 0:
                                print(f"Generated {self.samples_generated} CoVe samples")
            except Exception as e:
                print(f"Error processing dataset {dataset}: {e}")
                continue
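
# Smoke test (not run automatically): pulls a handful of samples from the
# streaming CoVe dataset and prints their token lengths. Assumes the Hub
# datasets listed above are reachable; network access is required.
def smoke_test_cove_dataset(tokenizer, n: int = 3):
    ds = CoVeDataset(DATASETS, tokenizer, max_samples=n, seq_length=SEQ_LENGTH)
    for sample in ds:
        print("input_ids length:", sample["input_ids"].shape[0])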
def setup_model_and_tokenizer():
    """Set up the quantized model and tokenizer"""
    print(f"Loading model: {MODEL}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # 4-bit quantization config
    compute_dtype = getattr(torch, BNB_4BIT_COMPUTE_DTYPE)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=USE_NESTED_QUANT,
    )

    # Load the quantized model
    model = AutoModelForCausalLM.from_pretrained(
        MODEL,
        quantization_config=bnb_config,
        device_map="auto",
        use_cache=False,
        trust_remote_code=True,
        torch_dtype=compute_dtype,
    )
    model = prepare_model_for_kbit_training(model)
    return model, tokenizer
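
# Rough memory arithmetic (assumption: 7B parameters stored as 4-bit NF4).
# Weights alone take about 7e9 * 0.5 bytes ≈ 3.5 GB, before quantization
# constants, LoRA weights, and activations; useful for sizing Spaces GPUs.
def estimate_4bit_weight_gb(n_params: float = 7e9) -> float:
    return n_params * 0.5 / 1e9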
def setup_lora(model):
    """Set up the LoRA configuration"""
    peft_config = LoraConfig(
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        r=LORA_R,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=LORA_TARGET_MODULES.split(","),
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model
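
# Back-of-the-envelope check (assumes CodeLlama-7B shapes: hidden size 4096,
# intermediate size 11008, 32 layers). Each LoRA adapter adds roughly
# r * (d_in + d_out) parameters per targeted matrix, so this estimates the
# trainable-parameter count that print_trainable_parameters() reports
# (about 80M at r=32 with the target modules above).
def estimate_lora_params(r: int = LORA_R, hidden: int = 4096,
                         intermediate: int = 11008, layers: int = 32) -> int:
    attn = 4 * r * (hidden + hidden)       # q_proj, k_proj, v_proj, o_proj
    mlp = 2 * r * (hidden + intermediate)  # gate_proj, up_proj
    mlp += r * (intermediate + hidden)     # down_proj
    return layers * (attn + mlp)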
def prepare_cove_datasets(tokenizer):
    """Prepare the CoVe training and validation datasets"""
    print("Preparing CoVe datasets...")
    # Training dataset
    train_dataset = CoVeDataset(
        DATASETS,
        tokenizer,
        max_samples=8000,
        seq_length=SEQ_LENGTH,
    )
    # Smaller validation dataset. Note: both streams start from the beginning
    # of the same datasets, so the validation samples overlap the training
    # data; treat eval loss as a sanity check rather than a held-out metric.
    eval_dataset = CoVeDataset(
        DATASETS,
        tokenizer,
        max_samples=1000,
        seq_length=SEQ_LENGTH,
    )
    return train_dataset, eval_dataset
def train_cove_model():
    """Main training function for CoVe"""
    print("Setting up model and tokenizer...")
    model, tokenizer = setup_model_and_tokenizer()

    print("Setting up LoRA...")
    model = setup_lora(model)

    print("Preparing CoVe datasets...")
    train_dataset, eval_dataset = prepare_cove_datasets(tokenizer)

    # Data collator for causal language modeling
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
        return_tensors="pt",
        pad_to_multiple_of=8,
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        dataloader_drop_last=True,
        eval_strategy="steps",
        save_strategy="steps",
        max_steps=MAX_STEPS,
        eval_steps=EVAL_FREQ,
        save_steps=SAVE_FREQ,
        logging_steps=LOG_FREQ,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LR,
        lr_scheduler_type=LR_SCHEDULER_TYPE,
        warmup_steps=NUM_WARMUP_STEPS,
        gradient_accumulation_steps=GR_ACC_STEPS,
        gradient_checkpointing=True,
        fp16=FP16,
        bf16=BF16,
        weight_decay=WEIGHT_DECAY,
        push_to_hub=True,
        hub_model_id=OUTPUT_DIR,
        hub_strategy="every_save",
        include_tokens_per_second=True,
        remove_unused_columns=False,
        report_to="tensorboard",
        dataloader_num_workers=2,
    )

    print("Starting CoVe training...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )
    trainer.train()
    trainer.save_model()
    trainer.push_to_hub()
    print("CoVe training completed!")
    return model, tokenizer
def test_cove_inference(model_path=None):
    """Test CoVe inference"""
    if model_path is None:
        model_path = OUTPUT_DIR
    print("Loading CoVe model for inference...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL,
        quantization_config=None,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )
    model = PeftModel.from_pretrained(base_model, model_path)
    model = model.merge_and_unload()

    def generate_with_cove(prompt, max_new_tokens=512):
        model.eval()
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=0.3,
                top_k=50,
                top_p=0.9,
                do_sample=True,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
            )
        # Decode only the newly generated tokens, not the prompt
        new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        return tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Test CoVe reasoning
    test_prompt = """<s>[INST] Explain the following code step by step, then verify your explanation:
Code:
def fibonacci(n):
    if n <= 1:
        return n
    a, b = 0, 1
    for i in range(2, n + 1):
        a, b = b, a + b
    return b
Provide:
1. Step-by-step explanation
2. Verification of each step
3. Final summary [/INST]"""
    print("CoVe Test Prompt:")
    print(test_prompt)
    print("\n" + "=" * 80)
    print("Generated CoVe Response:")
    result = generate_with_cove(test_prompt)
    print(result)
if __name__ == "__main__":
    print("Starting CoVe Fine-tuning Process...")
    if os.getenv("SPACE_ID"):
        print("Running in Hugging Face Spaces")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    else:
        print("WARNING: No GPU available!")

    # Train the CoVe model
    model, tokenizer = train_cove_model()

    # Test CoVe inference
    print("\n" + "=" * 80)
    print("Testing CoVe Inference...")
    test_cove_inference()