ArchibaldAI
/

ruvector-fixed

Model card Files Files and versions

ruvector-fixed / dist /core /diff-embeddings.js

Archie

Fix dimension/dimensions bug and positional insert/search args

40d7073 1 day ago

history blame contribute delete

10.7 kB

	"use strict";
	/**
	* Diff Embeddings - Semantic encoding of git diffs
	*
	* Generates embeddings for code changes to enable:
	* - Change classification (feature, bugfix, refactor)
	* - Similar change detection
	* - Risk assessment
	* - Review prioritization
	*/
	// Set the `__esModule` marker on the CommonJS exports object, then register
	// each public function as a named export. The assignments may precede the
	// definitions because function declarations are hoisted.
	Object.defineProperty(exports, "__esModule", { value: true });
	exports.parseDiff = parseDiff;
	exports.classifyChange = classifyChange;
	exports.calculateRiskScore = calculateRiskScore;
	exports.analyzeFileDiff = analyzeFileDiff;
	exports.getCommitDiff = getCommitDiff;
	exports.getStagedDiff = getStagedDiff;
	exports.getUnstagedDiff = getUnstagedDiff;
	exports.analyzeCommit = analyzeCommit;
	exports.findSimilarCommits = findSimilarCommits;
	const child_process_1 = require("child_process");
	const onnx_embedder_1 = require("./onnx-embedder");
	/**
	 * Parse a unified diff into hunks.
	 *
	 * Each hunk records the file it belongs to (taken from the most recent
	 * `diff --git a/<path> b/<path>` header, or '' when none was seen), the
	 * start/count numbers from its `@@ -old[,count] +new[,count] @@` header,
	 * the raw added/removed/context lines (`content`), and the added and
	 * removed lines with their leading marker stripped.
	 *
	 * @param {string} diff - Unified diff text.
	 * @returns {Array<{file: string, oldStart: number, oldLines: number,
	 *   newStart: number, newLines: number, content: string,
	 *   additions: string[], deletions: string[]}>} Hunks in input order.
	 */
	function parseDiff(diff) {
	    const hunks = [];
	    let currentFile = '';
	    let currentHunk = null;
	    // Emit the hunk being built (if any) and clear it, so a hunk can never
	    // be pushed twice or keep absorbing lines after it was emitted.
	    const flush = () => {
	        if (currentHunk) {
	            hunks.push(currentHunk);
	            currentHunk = null;
	        }
	    };
	    for (const line of diff.split('\n')) {
	        // File header: closes the previous file's last hunk.
	        if (line.startsWith('diff --git')) {
	            flush();
	            const fileMatch = line.match(/diff --git a\/(.+) b\/(.+)/);
	            if (fileMatch) {
	                currentFile = fileMatch[2];
	            }
	            continue;
	        }
	        // Hunk header. The line counts are optional and may have several
	        // digits; an omitted count means 1.
	        if (line.startsWith('@@')) {
	            flush();
	            const header = line.match(/^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@/);
	            if (header) {
	                currentHunk = {
	                    file: currentFile,
	                    oldStart: Number.parseInt(header[1], 10),
	                    oldLines: header[2] === undefined ? 1 : Number.parseInt(header[2], 10),
	                    newStart: Number.parseInt(header[3], 10),
	                    newLines: header[4] === undefined ? 1 : Number.parseInt(header[4], 10),
	                    content: '',
	                    additions: [],
	                    deletions: [],
	                };
	            }
	            continue;
	        }
	        if (!currentHunk) {
	            continue;
	        }
	        // Content lines. Lines starting with '+++' / '---' are skipped so
	        // that file markers in header-less diffs are not counted as changes.
	        if (line.startsWith('+') && !line.startsWith('+++')) {
	            currentHunk.additions.push(line.substring(1));
	            currentHunk.content += line + '\n';
	        }
	        else if (line.startsWith('-') && !line.startsWith('---')) {
	            currentHunk.deletions.push(line.substring(1));
	            currentHunk.content += line + '\n';
	        }
	        else if (line.startsWith(' ')) {
	            currentHunk.content += line + '\n';
	        }
	    }
	    flush();
	    return hunks;
	}
	/**
	 * Classify a change based on keyword and filename patterns.
	 *
	 * Rules are tried in order and the first match wins:
	 *   1. keywords in the lower-cased commit message,
	 *   2. file-extension patterns tested against `diff` as given,
	 *   3. code keywords in the lower-cased diff text.
	 *
	 * @param {string} diff - Diff text (or a path) to inspect.
	 * @param {string} [message=''] - Commit message, if available.
	 * @returns {'bugfix'|'feature'|'refactor'|'docs'|'test'|'config'|'unknown'}
	 */
	function classifyChange(diff, message = '') {
	    const lowerMessage = message.toLowerCase();
	    const lowerDiff = diff.toLowerCase();
	    // Message keywords, highest priority first.
	    const messageRules = [
	        [/\b(fix|bug|issue|error|crash|patch)\b/, 'bugfix'],
	        [/\b(feat|feature|add|new|implement)\b/, 'feature'],
	        [/\b(refactor|clean|improve|optimize)\b/, 'refactor'],
	        [/\b(doc|readme|comment|jsdoc)\b/, 'docs'],
	        [/\b(test|spec|coverage)\b/, 'test'],
	        [/\b(config|ci|cd|build|deps)\b/, 'config'],
	    ];
	    for (const [pattern, category] of messageRules) {
	        if (pattern.test(lowerMessage)) {
	            return category;
	        }
	    }
	    // Filename patterns. The `$`-anchored ones only match at the very end
	    // of `diff` (no multiline flag), e.g. when a bare path is passed in.
	    const pathRules = [
	        [/\.(md|txt|rst)$/, 'docs'],
	        [/\.(test|spec)\.[jt]sx?/, 'test'],
	        [/\.(json|ya?ml|toml|ini)$/, 'config'],
	    ];
	    for (const [pattern, category] of pathRules) {
	        if (pattern.test(diff)) {
	            return category;
	        }
	    }
	    // Content keywords: error handling together with the word "fix" reads
	    // as a bugfix; new declarations read as a feature.
	    if (/\bcatch\b|\btry\b|\berror\b/.test(lowerDiff) && /\bfix\b/.test(lowerDiff)) {
	        return 'bugfix';
	    }
	    if (/\bfunction\b|\bclass\b|\bexport\b/.test(lowerDiff)) {
	        return 'feature';
	    }
	    return 'unknown';
	}
	/**
	 * Calculate a heuristic risk score for a file-level diff analysis.
	 *
	 * The score is the sum of independent contributions, capped at 1:
	 *   - size: total added + removed lines (> 50, > 200, > 500),
	 *   - complexity: `analysis.complexity` (> 10, > 20),
	 *   - path: sensitive substrings in `analysis.file`,
	 *   - removed safeguards: deleted lines mentioning error handling,
	 *     validation, or null/undefined checks.
	 *
	 * @param {{file: string, totalAdditions: number, totalDeletions: number,
	 *   complexity: number, hunks: Array<{deletions: string[]}>}} analysis
	 * @returns {number} Risk in the range [0, 1].
	 */
	function calculateRiskScore(analysis) {
	    let risk = 0;
	    // Size risk
	    const totalChanges = analysis.totalAdditions + analysis.totalDeletions;
	    if (totalChanges > 500) {
	        risk += 0.3;
	    }
	    else if (totalChanges > 200) {
	        risk += 0.2;
	    }
	    else if (totalChanges > 50) {
	        risk += 0.1;
	    }
	    // Complexity risk
	    if (analysis.complexity > 20) {
	        risk += 0.2;
	    }
	    else if (analysis.complexity > 10) {
	        risk += 0.1;
	    }
	    // File type risk: plain substring checks on the path, each group
	    // counted at most once.
	    const pathMentions = (...needles) => needles.some((needle) => analysis.file.includes(needle));
	    if (pathMentions('auth', 'security')) {
	        risk += 0.2;
	    }
	    if (pathMentions('database', 'migration')) {
	        risk += 0.15;
	    }
	    if (pathMentions('api', 'endpoint')) {
	        risk += 0.1;
	    }
	    // Pattern risk (deletions of error handling, etc.), counted per line.
	    const removedHandling = /\bcatch\b|\berror\b|\bvalidat/;
	    // Alternation binds loosest: this matches `if ... null` on one line,
	    // or the word `undefined` anywhere in the line.
	    const removedNullCheck = /\bif\b.*\bnull\b|\bundefined\b/;
	    for (const hunk of analysis.hunks) {
	        for (const del of hunk.deletions) {
	            if (removedHandling.test(del)) {
	                risk += 0.1;
	            }
	            if (removedNullCheck.test(del)) {
	                risk += 0.05;
	            }
	        }
	    }
	    return Math.min(1, risk);
	}
	/**
	 * Analyze the changes a diff makes to a single file.
	 *
	 * Parses `diff`, keeps the hunks attributed to `file` (plus hunks with no
	 * file attribution), counts added/removed lines, estimates complexity as
	 * the number of added lines containing a branching keyword or `?`,
	 * classifies the change and scores its risk. When the ONNX embedder is
	 * ready, an embedding of a short textual summary is attached as well.
	 *
	 * @param {string} file - Path of the file to analyze.
	 * @param {string} diff - Unified diff text (may cover several files).
	 * @param {string} [message=''] - Commit message used for classification.
	 * @returns {Promise<object>} Analysis with `file`, `hunks`,
	 *   `totalAdditions`, `totalDeletions`, `complexity`, `riskScore`,
	 *   `category` and, when available, `embedding`.
	 */
	async function analyzeFileDiff(file, diff, message = '') {
	    const hunks = parseDiff(diff).filter((h) => h.file === file || h.file === '');
	    const totalAdditions = hunks.reduce((sum, h) => sum + h.additions.length, 0);
	    const totalDeletions = hunks.reduce((sum, h) => sum + h.deletions.length, 0);
	    // Calculate complexity (branch keywords in additions)
	    const branchPattern = /\bif\b|\belse\b|\bfor\b|\bwhile\b|\bswitch\b|\bcatch\b|\?/;
	    let complexity = 0;
	    for (const hunk of hunks) {
	        for (const add of hunk.additions) {
	            if (branchPattern.test(add)) {
	                complexity++;
	            }
	        }
	    }
	    const category = classifyChange(diff, message);
	    const analysis = {
	        file,
	        hunks,
	        totalAdditions,
	        totalDeletions,
	        complexity,
	        riskScore: 0,
	        category,
	    };
	    // The score is derived from the analysis itself, so it is filled in
	    // after the object has been assembled.
	    analysis.riskScore = calculateRiskScore(analysis);
	    // Generate embedding for the diff (only the first 500 characters of
	    // the hunk content are included in the embedded text).
	    if ((0, onnx_embedder_1.isReady)()) {
	        const diffText = hunks.map((h) => h.content).join('\n');
	        const result = await (0, onnx_embedder_1.embed)(`${category} change in ${file}: ${diffText.substring(0, 500)}`);
	        analysis.embedding = result.embedding;
	    }
	    return analysis;
	}
	/**
	 * Get the diff introduced by a commit (`git show <commit> --format=`).
	 *
	 * Git is invoked directly with an argument vector rather than through a
	 * shell, so `commitHash` cannot inject shell syntax; values that look
	 * like command-line options are rejected for the same reason.
	 *
	 * @param {string} [commitHash='HEAD'] - Revision to show.
	 * @returns {string} The diff text, or '' if the revision is rejected or
	 *   git fails for any reason.
	 */
	function getCommitDiff(commitHash = 'HEAD') {
	    const revision = String(commitHash);
	    // A leading '-' would be parsed by git as an option, not a revision.
	    if (revision.startsWith('-')) {
	        return '';
	    }
	    try {
	        return (0, child_process_1.execFileSync)('git', ['show', revision, '--format='], {
	            encoding: 'utf8',
	            maxBuffer: 10 * 1024 * 1024,
	            // Discard stderr (replaces the shell-specific `2>/dev/null`).
	            stdio: ['ignore', 'pipe', 'ignore'],
	        });
	    }
	    catch {
	        // Best effort: callers treat an empty string as "no diff available".
	        return '';
	    }
	}
	/**
	 * Get the diff of the staged changes (`git diff --cached`).
	 *
	 * @returns {string} The command output decoded as UTF-8, or '' when the
	 *   command throws.
	 */
	function getStagedDiff() {
	    // Allow up to 10 MiB of output before execSync gives up.
	    const execOptions = { encoding: 'utf8', maxBuffer: 10 * 1024 * 1024 };
	    let stagedDiff = '';
	    try {
	        stagedDiff = (0, child_process_1.execSync)('git diff --cached 2>/dev/null', execOptions);
	    }
	    catch {
	        stagedDiff = '';
	    }
	    return stagedDiff;
	}
	/**
	 * Get the diff of the unstaged changes (`git diff`).
	 *
	 * @returns {string} The command output decoded as UTF-8, or '' when the
	 *   command throws.
	 */
	function getUnstagedDiff() {
	    // Allow up to 10 MiB of output before execSync gives up.
	    const execOptions = { encoding: 'utf8', maxBuffer: 10 * 1024 * 1024 };
	    let unstagedDiff = '';
	    try {
	        unstagedDiff = (0, child_process_1.execSync)('git diff 2>/dev/null', execOptions);
	    }
	    catch {
	        unstagedDiff = '';
	    }
	    return unstagedDiff;
	}
	/**
	 * Analyze a commit: metadata, per-file analyses, totals and average risk.
	 *
	 * Subject, author name and author date are read with `git log -1`; if
	 * that fails they stay empty strings. Every file that owns at least one
	 * parsed hunk is analyzed with `analyzeFileDiff`, in order of first
	 * appearance. When the ONNX embedder is ready, an embedding of a short
	 * commit summary is attached.
	 *
	 * @param {string} [commitHash='HEAD'] - Revision to analyze.
	 * @returns {Promise<object>} `{ hash, message, author, date, files,
	 *   totalAdditions, totalDeletions, riskScore, embedding }`, where
	 *   `embedding` is undefined if the embedder is not ready.
	 */
	async function analyzeCommit(commitHash = 'HEAD') {
	    const diff = getCommitDiff(commitHash);
	    // Get commit metadata
	    let message = '';
	    let author = '';
	    let date = '';
	    try {
	        const revision = String(commitHash);
	        // A leading '-' would be parsed by git as an option, not a revision.
	        if (!revision.startsWith('-')) {
	            // Fields are separated by the unit-separator byte (%x1f) so a
	            // '|' in the subject or author name cannot shift the fields.
	            // Git is run without a shell, so the revision cannot inject
	            // shell syntax.
	            const info = (0, child_process_1.execFileSync)('git', ['log', '-1', '--format=%s%x1f%an%x1f%aI', revision], {
	                encoding: 'utf8',
	                stdio: ['ignore', 'pipe', 'ignore'],
	            }).trim();
	            [message = '', author = '', date = ''] = info.split('\x1f');
	        }
	    }
	    catch {
	        // Best effort: metadata stays empty when git is unavailable or the
	        // revision does not exist.
	    }
	    // Collect the distinct files that own hunks, in first-seen order.
	    const fileNames = [...new Set(parseDiff(diff).map((hunk) => hunk.file))];
	    // Analyze each file. The full diff is passed on purpose:
	    // analyzeFileDiff selects the hunks for `file` itself. Runs
	    // sequentially, one embedding request at a time.
	    const files = [];
	    for (const file of fileNames) {
	        files.push(await analyzeFileDiff(file, diff, message));
	    }
	    const totalAdditions = files.reduce((sum, f) => sum + f.totalAdditions, 0);
	    const totalDeletions = files.reduce((sum, f) => sum + f.totalDeletions, 0);
	    // Commit risk is the mean of the per-file risks (0 for an empty commit).
	    const riskScore = files.length > 0
	        ? files.reduce((sum, f) => sum + f.riskScore, 0) / files.length
	        : 0;
	    // Generate commit embedding
	    let embedding;
	    if ((0, onnx_embedder_1.isReady)()) {
	        const commitText = `${message}\n\nFiles changed: ${files.map((f) => f.file).join(', ')}\n+${totalAdditions} -${totalDeletions}`;
	        const result = await (0, onnx_embedder_1.embed)(commitText);
	        embedding = result.embedding;
	    }
	    return {
	        hash: commitHash,
	        message,
	        author,
	        date,
	        files,
	        totalAdditions,
	        totalDeletions,
	        riskScore,
	        embedding,
	    };
	}
	/**
	 * Find similar past commits based on diff embeddings.
	 *
	 * Embeds the first 1000 characters of `currentDiff`, analyzes up to
	 * `recentCommits` of the most recent commits, and ranks them by cosine
	 * similarity between the two embeddings. The embedder is initialized
	 * first if it is not ready yet.
	 *
	 * @param {string} currentDiff - Diff text to compare against history.
	 * @param {number} [recentCommits=50] - How many recent commits to scan.
	 * @param {number} [topK=5] - Maximum number of results to return.
	 * @returns {Promise<Array<{hash: string, similarity: number,
	 *   message: string}>>} Best matches, most similar first; [] when the
	 *   commit list cannot be read.
	 */
	async function findSimilarCommits(currentDiff, recentCommits = 50, topK = 5) {
	    if (!(0, onnx_embedder_1.isReady)()) {
	        await (0, onnx_embedder_1.initOnnxEmbedder)();
	    }
	    // Get current diff embedding
	    const currentEmbedding = (await (0, onnx_embedder_1.embed)(currentDiff.substring(0, 1000))).embedding;
	    // Coerce the limit to an integer so nothing but a number can end up in
	    // the git argument list.
	    const limit = Math.trunc(Number(recentCommits));
	    if (!Number.isFinite(limit) || limit <= 0) {
	        return [];
	    }
	    // Get recent commits
	    let commits;
	    try {
	        commits = (0, child_process_1.execFileSync)('git', ['log', `-${limit}`, '--format=%H'], {
	            encoding: 'utf8',
	            stdio: ['ignore', 'pipe', 'ignore'],
	        })
	            .split('\n')
	            .map((line) => line.trim())
	            // Empty output would otherwise yield a single bogus '' hash.
	            .filter(Boolean);
	    }
	    catch {
	        return [];
	    }
	    // Analyze and compare
	    const results = [];
	    for (const hash of commits.slice(0, limit)) {
	        const analysis = await analyzeCommit(hash);
	        if (analysis.embedding) {
	            const similarity = cosineSimilarity(currentEmbedding, analysis.embedding);
	            results.push({ hash, similarity, message: analysis.message });
	        }
	    }
	    return results
	        .sort((a, b) => b.similarity - a.similarity)
	        .slice(0, topK);
	}
	/**
	 * Cosine similarity of two numeric vectors.
	 *
	 * @param {ArrayLike<number>} a - First vector.
	 * @param {ArrayLike<number>} b - Second vector.
	 * @returns {number} dot(a, b) / (|a| * |b|); 0 when the lengths differ
	 *   or when the product of the magnitudes is zero.
	 */
	function cosineSimilarity(a, b) {
	    if (a.length !== b.length) {
	        return 0;
	    }
	    let dot = 0;
	    let sumSquaresA = 0;
	    let sumSquaresB = 0;
	    let idx = 0;
	    while (idx < a.length) {
	        const x = a[idx];
	        const y = b[idx];
	        dot += x * y;
	        sumSquaresA += x * x;
	        sumSquaresB += y * y;
	        idx += 1;
	    }
	    const denominator = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB);
	    if (denominator === 0) {
	        return 0;
	    }
	    return dot / denominator;
	}
	// Default export: one object bundling the same functions that are also
	// registered as named exports. cosineSimilarity is not part of it.
	exports.default = {
	parseDiff,
	classifyChange,
	calculateRiskScore,
	analyzeFileDiff,
	analyzeCommit,
	getCommitDiff,
	getStagedDiff,
	getUnstagedDiff,
	findSimilarCommits,
	};