"""Mine bug-fix commits from a local Linux kernel checkout and emit a JSONL
dataset of (original code, instruction) -> diff-code pairs."""

import json
import os
import re
from multiprocessing import Pool

from pydriller import Repository
from tqdm import tqdm

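# Paths and run configuration. TEST_MODE restricts the run to
# MAX_COMMITS_TEST commits for a quick smoke test.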
REPO_PATH = '../linux'
OUTPUT_FILE = './output/linux_bugfix_dataset.jsonl'

TEST_MODE = False
MAX_COMMITS_TEST = 50
NUM_WORKERS = 16

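# A commit counts as a bug fix when its message contains any of these
# substrings. This is a deliberately broad heuristic, so some non-fix
# commits will also match.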
BUGFIX_KEYWORDS = [
    'fix', 'bug', 'leak', 'null', 'overflow', 'error', 'failure',
    'crash', 'panic', 'memory', 'race', 'deadlock', 'corruption',
    'security', 'vulnerability', 'exploit', 'buffer', 'stack'
]


def is_bugfix_commit(msg):
    """Heuristic: the commit message mentions at least one bug-fix keyword."""
    msg_lower = msg.lower()
    return any(keyword in msg_lower for keyword in BUGFIX_KEYWORDS)


def extract_instruction_from_commit_msg(msg):
    """Return the first meaningful line of the commit message, skipping
    kernel metadata trailers (Signed-off-by, Fixes:, Link:, ...)."""
    for line in msg.strip().splitlines():
        line = line.strip()
        if len(line) < 5 or not any(c.isalpha() for c in line):
            continue
        if line.lower().startswith((
            '[patch]', 'signed-off-by', 'reviewed-by', 'tested-by', 'acked-by',
            'reported-by', 'cc:', 'co-authored-by', 'patchwork-id',
            'suggested-by', 'fixes:', 'link:', 'cherry picked from commit'
        )):
            continue
        return line
    return msg.strip().splitlines()[0] if msg.strip() else "fix"


def extract_code_context(code, line_number, context_lines=10):
    """Return +/- context_lines of source around a 1-based line number."""
    if not code:
        return ""
    lines = code.split('\n')
    idx = max(0, line_number - 1)  # diff hunk headers are 1-based
    start = max(0, idx - context_lines)
    end = min(len(lines), idx + context_lines + 1)
    return '\n'.join(lines[start:end])


def extract_diff_context(diff_text, context_lines=5):
    """Trim a unified diff to the span between its first and last changed
    lines, keeping context_lines of context on either side."""
    if not diff_text:
        return ""
    lines = diff_text.split('\n')
    change_lines = [i for i, line in enumerate(lines)
                    if line.startswith(('+', '-'))]
    if not change_lines:
        return diff_text
    start = max(0, change_lines[0] - context_lines)
    end = min(len(lines), change_lines[-1] + context_lines + 1)
    return '\n'.join(lines[start:end])


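# Each dataset record has the following shape (values illustrative):
#   {"input": {"original code": "...", "instruction": "..."},
#    "output": {"diff codes": "..."}}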
def create_dataset_entry(original_code, commit_msg, diff_code):
    """Assemble one JSONL record: (original code, instruction) -> diff."""
    return {
        "input": {
            "original code": original_code.strip(),
            "instruction": extract_instruction_from_commit_msg(commit_msg)
        },
        "output": {
            "diff codes": diff_code.strip()
        }
    }


def process_commit(commit):
    """Extract dataset entries from each modified C file in a bug-fix commit."""
    entries = []
    if not is_bugfix_commit(commit.msg):
        return entries

    for mod in commit.modified_files:
        # Keep only modified .c/.h files that have both a diff and the
        # pre-change source available.
        if not mod.new_path or not mod.new_path.endswith(('.c', '.h')):
            continue
        if mod.change_type.name != "MODIFY":
            continue
        if not mod.diff or not mod.source_code_before:
            continue

        focused_diff = extract_diff_context(mod.diff)

        # Collect the old-file start line of each hunk header,
        # e.g. "@@ -12,7 +12,8 @@" -> 12.
        line_numbers = []
        for line in mod.diff.split('\n'):
            if line.startswith('@@'):
                match = re.search(r'@@ -(\d+)(?:,\d+)? \+\d+(?:,\d+)? @@', line)
                if match:
                    line_numbers.append(int(match.group(1)))

        if line_numbers:
            focused_code = extract_code_context(mod.source_code_before, line_numbers[0])
        else:
            # No hunk header found; fall back to the top of the file.
            focused_code = '\n'.join(mod.source_code_before.split('\n')[:50])

        entry = create_dataset_entry(
            original_code=focused_code,
            commit_msg=commit.msg,
            diff_code=focused_diff
        )
        entries.append(entry)

    return entries


def collect_entries_from_hash(commit_hash):
    """Worker: load one commit by hash and process it. Each call opens its
    own Repository handle, so this is safe to run under multiprocessing."""
    try:
        commit = next(Repository(REPO_PATH, only_commits=[commit_hash]).traverse_commits())
        return process_commit(commit)
    except Exception:
        # Skip commits that fail to load or parse rather than aborting the run.
        return []


def main():
    if not os.path.exists(REPO_PATH):
        print("[ERROR] Repository not found at:", REPO_PATH)
        return

    os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

| print("[INFO] Building Linux kernel bug-fix dataset...") |
| print("[INFO] Repository:", REPO_PATH) |
| print("[INFO] Output file:", OUTPUT_FILE) |
|
|
| output_file = OUTPUT_FILE.replace('.jsonl', '_test.jsonl') if TEST_MODE else OUTPUT_FILE |
|
|
| all_hashes = [c.hash for c in Repository(REPO_PATH).traverse_commits()] |
| if TEST_MODE and MAX_COMMITS_TEST: |
| all_hashes = all_hashes[:MAX_COMMITS_TEST] |
|
|
    dataset_entries = []
    with Pool(NUM_WORKERS) as pool:
        results = list(tqdm(pool.imap_unordered(collect_entries_from_hash, all_hashes),
                            total=len(all_hashes)))

    # Flatten the per-commit result lists into one dataset.
    for entries in results:
        dataset_entries.extend(entries)

    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in dataset_entries:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

| print("[DONE] Dataset creation completed!") |
| print("[INFO] Total commits processed:", len(all_hashes)) |
| print("[INFO] Total dataset entries:", len(dataset_entries)) |
| print("[INFO] Saved to:", output_file) |
|
|
| if dataset_entries: |
| print("[INFO] Sample dataset entry:") |
| sample = dataset_entries[0] |
| print(json.dumps(sample, indent=2, ensure_ascii=False)[:800] + "...") |
| print("[INFO] Dataset structure:") |
| print(" - Input: original code + instruction") |
| print(" - Output: diff codes") |
| print(" - Format: JSONL (one JSON object per line)") |
|
|
| if __name__ == "__main__": |
| main() |
|
|