Files
dotfiles/home/dot_local/bin/executable_commit-helper
Xevion fa4ffe8dce improve commit-helper: add smart truncation for large/generated files and binary detection
- Add per-file character/line limits to prevent massive diffs
- Detect and aggressively truncate lockfiles, minified, and base64-heavy files
- Expand binary file detection and format binary summaries
- Add global output safety limit (150K chars) to prevent context overflow
2026-01-06 12:59:16 -06:00

616 lines
19 KiB
Plaintext

#!/usr/bin/env bun
/**
* commit-helper - Efficient git context gathering for AI-assisted commits
*
* Provides optimized git context with smart truncation and filtering.
* Designed to give AI assistants the right amount of context without overwhelming them.
*
* Usage:
* commit-helper --staged [maxLines] # For committing staged changes
* commit-helper --amend [maxLines] # For amending last commit
*
* Default maxLines: 1000
*/
import { $ } from "bun";
// Files to ignore in diffs (lockfiles, generated files, etc.)
// Patterns are tested against the repo-relative path from `git diff --numstat`.
const IGNORE_PATTERNS = [
  /package-lock\.json$/,
  /yarn\.lock$/,
  /pnpm-lock\.yaml$/,
  /Cargo\.lock$/,
  /poetry\.lock$/,
  /bun\.lockb?$/, // matches both bun.lock (text) and bun.lockb (binary)
  /\.min\.(js|css)$/, // minified assets
  /\.bundle\.(js|css)$/, // bundler output
  /dist\/.*\.map$/, // source maps in build output
];
// Binary/large file extensions to skip in diffs
// Consumed via endsWith() checks (see isBinaryFile / parseNumstat).
const BINARY_EXTENSIONS = [
  '.png', '.jpg', '.jpeg', '.gif', '.ico', '.webp', '.avif', '.bmp',
  '.svg', // Often base64-encoded or huge
  '.pdf', '.zip', '.tar', '.gz', '.woff', '.woff2', '.ttf', '.eot', '.otf',
  '.mp4', '.mp3', '.wav', '.avi', '.mov', '.webm',
  '.so', '.dylib', '.dll', '.exe', '.bin',
  '.wasm', '.pyc', '.class',
  '.db', '.sqlite', '.sqlite3',
  '.lockb', // Bun binary lockfile
];
// Lockfiles - always truncate to first 100 lines
const LOCKFILE_PATTERNS = [
  /\.lock$/,
  /lock\.(json|yaml)$/,
  // NOTE(review): anchored at start of the whole path, so this only matches
  // lockfiles at the repo root, not in subdirectories - confirm intended.
  /^(package|pnpm|yarn|bun|composer|Cargo|Gemfile|Pipfile|poetry)[-.]lock/,
  /^go\.sum$/,
];
/** Aggregate totals across a set of changed files. */
interface ChangeStats {
  files: number;     // number of files changed
  additions: number; // total lines added
  deletions: number; // total lines deleted
}
/** One file's entry parsed from `git diff --numstat` output. */
interface FileChange {
  path: string;          // repo-relative path
  additions: number;     // lines added (0 when git reports "-")
  deletions: number;     // lines deleted (0 when git reports "-")
  isBinary: boolean;     // git reported "-" for both counts
  shouldIgnore: boolean; // matched IGNORE_PATTERNS or a binary extension
}
/**
 * Parse git diff numstat output into structured data.
 *
 * Each numstat line is "<added>\t<deleted>\t<path>"; git prints "-" for
 * both counts on binary files.
 *
 * @param numstat Raw stdout of `git diff --numstat`.
 * @returns One FileChange per non-empty line.
 */
function parseNumstat(numstat: string): FileChange[] {
  return numstat
    .split('\n')
    .filter(line => line.trim())
    .map(line => {
      const parts = line.split('\t');
      // "-" (binary) or malformed counts parse to NaN; coerce to 0 so
      // downstream arithmetic never sees NaN.
      const additions = parseInt(parts[0], 10) || 0;
      const deletions = parseInt(parts[1], 10) || 0;
      const path = parts[2] || '';
      const isBinary = parts[0] === '-' && parts[1] === '-';
      // Fix: compare extensions case-insensitively, consistent with
      // isBinaryFile() (the original missed e.g. "IMAGE.PNG" here).
      const lowerPath = path.toLowerCase();
      const shouldIgnore = IGNORE_PATTERNS.some(pattern => pattern.test(path)) ||
        BINARY_EXTENSIONS.some(ext => lowerPath.endsWith(ext));
      return { path, additions, deletions, isBinary, shouldIgnore };
    });
}
/**
 * Heuristic check for predominantly base64-encoded text.
 * True when runs of 100+ base64 characters account for more than 30%
 * of the total content length.
 */
function hasBase64Pattern(content: string): boolean {
  const runs = content.match(/[A-Za-z0-9+/=]{100,}/g);
  let encodedChars = 0;
  if (runs) {
    for (const run of runs) {
      encodedChars += run.length;
    }
  }
  return encodedChars > content.length * 0.3; // >30% base64
}
/**
 * Decide whether a file's content deserves aggressive truncation.
 * Triggers on lockfiles, minified/generated line shapes, base64-heavy
 * content, generated-looking filenames, or explicit generated markers.
 */
function shouldCullAggressively(file: string, content: string): boolean {
  const contentLines = content.split('\n');
  const avgLineLength = content.length / Math.max(contentLines.length, 1);
  // Known lockfiles
  if (LOCKFILE_PATTERNS.some(p => p.test(file))) return true;
  // Extremely long average line length (minified/generated)
  if (avgLineLength > 200) return true;
  // Any single line over 5000 chars
  if (contentLines.some(line => line.length > 5000)) return true;
  // Predominantly base64 content
  if (hasBase64Pattern(content)) return true;
  // Known generated filename patterns
  if (/\.generated\./i.test(file) || /\.min\./i.test(file) || /\.bundle\./i.test(file)) {
    return true;
  }
  // Explicit generated-code markers in the content
  return (
    content.includes('/* @generated */') ||
    content.includes('// Auto-generated') ||
    content.includes('@autogenerated')
  );
}
/**
 * True when the path's extension (compared case-insensitively) is one
 * of the known binary extensions.
 */
function isBinaryFile(path: string): boolean {
  const lower = path.toLowerCase();
  for (const ext of BINARY_EXTENSIONS) {
    if (lower.endsWith(ext)) return true;
  }
  return false;
}
/**
 * Render a byte count as a human-readable size: B below 1 KiB,
 * one-decimal KB below 1 MiB, otherwise one-decimal MB.
 */
function formatBytes(bytes: number): string {
  const KB = 1024;
  const MB = KB * 1024;
  if (bytes >= MB) return `${(bytes / MB).toFixed(1)} MB`;
  if (bytes >= KB) return `${(bytes / KB).toFixed(1)} KB`;
  return `${bytes} B`;
}
/**
 * Count newline-separated lines in text.
 * Matches String.split('\n').length semantics: '' counts as 1 line and
 * a trailing newline adds a final empty line.
 */
function countLines(text: string): number {
  let total = 1;
  for (const ch of text) {
    if (ch === '\n') total += 1;
  }
  return total;
}
/**
 * Keep at most maxLines lines of text; any remainder is dropped without
 * appending a marker (callers add their own truncation notices).
 */
function truncateToLines(text: string, maxLines: number): string {
  const allLines = text.split('\n');
  if (allLines.length > maxLines) {
    return allLines.slice(0, maxLines).join('\n');
  }
  return text;
}
/**
 * Two-line placeholder emitted in place of a binary file's diff.
 */
function formatBinarySummary(filePath: string): string {
  const header = `Binary file: ${filePath}`;
  return header + '\n(content omitted)';
}
/**
 * Sum per-file additions/deletions into overall change totals.
 */
function getChangeStats(files: FileChange[]): ChangeStats {
  const totals: ChangeStats = { files: 0, additions: 0, deletions: 0 };
  for (const change of files) {
    totals.files += 1;
    totals.additions += change.additions;
    totals.deletions += change.deletions;
  }
  return totals;
}
/**
 * Summarize changed files by extension, most frequent first.
 * Files without a dot are grouped under "(no extension)". Each row is
 * " <count padded to 3> .<ext>".
 */
function getFileTypeDistribution(files: FileChange[]): string {
  const counts = new Map<string, number>();
  for (const f of files) {
    const dotMatch = f.path.match(/\.([^.]+)$/);
    const ext = dotMatch ? dotMatch[1] : '(no extension)';
    counts.set(ext, (counts.get(ext) ?? 0) + 1);
  }
  // Stable sort keeps first-seen order among equal counts.
  const ranked = [...counts.entries()].sort((a, b) => b[1] - a[1]);
  const rows: string[] = [];
  for (const [ext, count] of ranked) {
    rows.push(` ${count.toString().padStart(3)} .${ext}`);
  }
  return rows.join('\n');
}
/**
 * Build a markdown summary of changed files in three groups: included
 * (shown in the diff), ignored (lockfiles/generated), and binary.
 */
function getFileSummary(files: FileChange[]): string {
  const included: FileChange[] = [];
  const ignored: FileChange[] = [];
  const binary: FileChange[] = [];
  // Single-pass partition: shouldIgnore wins over isBinary.
  for (const f of files) {
    if (f.shouldIgnore) {
      ignored.push(f);
    } else if (f.isBinary) {
      binary.push(f);
    } else {
      included.push(f);
    }
  }
  let summary = '';
  if (included.length > 0) {
    const rows = included.map(f => {
      const delta = `(+${f.additions}/-${f.deletions})`;
      return ` ${delta.padEnd(12)} ${f.path}`;
    });
    summary += '**Included changes:**\n' + rows.join('\n');
  }
  if (ignored.length > 0) {
    summary += '\n\n**Ignored files (lockfiles/generated):**\n';
    summary += ignored.map(f => ` ${f.path}`).join('\n');
    summary += '\n _(Changes to these files are omitted from diff output)_';
  }
  if (binary.length > 0) {
    summary += '\n\n**Binary files:**\n' + binary.map(f => ` ${f.path}`).join('\n');
  }
  return summary;
}
/**
 * Get filtered diff output (excluding ignored files).
 *
 * Runs `git diff` (index when staged, otherwise HEAD~1..HEAD) and uses
 * git pathspec `:(exclude)` entries to drop ignored/binary files. Falls
 * back to the unfiltered diff if the exclusion invocation throws.
 *
 * @param staged true = diff the index (--staged); false = last commit.
 * @returns Raw diff text.
 */
async function getFilteredDiff(staged: boolean): Promise<string> {
  // Get list of files to exclude
  // NOTE(review): HEAD~1..HEAD fails in a repo with a single commit - confirm acceptable.
  const numstat = staged
    ? await $`git diff --staged --numstat`.text()
    : await $`git diff HEAD~1..HEAD --numstat`.text();
  const files = parseNumstat(numstat);
  const filesToExclude = files
    .filter(f => f.shouldIgnore || f.isBinary)
    .map(f => f.path);
  // Build diff command with exclusions; nothing to exclude -> plain diff.
  if (filesToExclude.length === 0) {
    return staged
      ? await $`git diff --staged`.text()
      : await $`git diff HEAD~1..HEAD`.text();
  }
  // Git diff with pathspec exclusions - construct as array to avoid shell quoting issues
  try {
    const baseArgs = staged ? ['diff', '--staged'] : ['diff', 'HEAD~1..HEAD'];
    const args = [...baseArgs, '--', '.', ...filesToExclude.map(f => `:(exclude)${f}`)];
    // Use Bun.spawn to call git with proper argument handling
    const proc = Bun.spawn(['git', ...args], {
      cwd: process.cwd(),
      stdout: 'pipe',
      stderr: 'pipe',
    });
    const output = await new Response(proc.stdout).text();
    // NOTE(review): exit code is awaited but never checked - a failing git
    // run here returns empty/partial output instead of hitting the fallback.
    await proc.exited;
    return output;
  } catch {
    // If exclusion fails, return diff without exclusions
    return staged
      ? await $`git diff --staged`.text()
      : await $`git diff HEAD~1..HEAD`.text();
  }
}
/**
 * Truncate diff with per-file character/line limits and smart culling.
 *
 * Splits the unified diff on "diff --git" boundaries, then per file:
 * binary files become a short placeholder; files over 10K chars or 1500
 * lines are truncated (lockfiles -> first 100 lines, generated/noisy
 * files -> first 30, other large files -> first 300); once the global
 * line budget is spent, whole files are dropped and listed in a
 * trailing note.
 *
 * @param diff      Full diff text.
 * @param maxLines  Global output budget in lines.
 * @param filesInfo NOTE(review): currently unused - kept to preserve the
 *                  call signature; confirm whether it was meant to feed
 *                  the omitted-files note.
 */
function truncateDiff(diff: string, maxLines: number, filesInfo: string): string {
  const lines = diff.split('\n');
  // Fast path: whole diff already fits the budget.
  if (lines.length <= maxLines) {
    return diff;
  }
  // Parse into individual file diffs
  const fileDiffs: Array<{
    header: string;
    content: string;
    lineCount: number;
    path: string;
  }> = [];
  let currentFile: { header: string; lines: string[]; path: string } | null = null;
  for (const line of lines) {
    if (line.startsWith('diff --git')) {
      // Flush the previous file's accumulated lines before starting a new one.
      if (currentFile) {
        fileDiffs.push({
          header: currentFile.header,
          content: currentFile.lines.join('\n'),
          lineCount: currentFile.lines.length,
          path: currentFile.path,
        });
      }
      // Extract file path from "diff --git a/path b/path"
      // NOTE(review): the lazy match misparses paths containing " b/" - confirm acceptable.
      const match = line.match(/diff --git a\/(.*?) b\//);
      const path = match ? match[1] : 'unknown';
      currentFile = { header: line, lines: [line], path };
    } else if (currentFile) {
      currentFile.lines.push(line);
    }
  }
  // Flush the final file.
  if (currentFile) {
    fileDiffs.push({
      header: currentFile.header,
      content: currentFile.lines.join('\n'),
      lineCount: currentFile.lines.length,
      path: currentFile.path,
    });
  }
  // Process each file with per-file limits and smart culling
  let totalLines = 0;
  const includedDiffs: string[] = [];
  const omittedFiles: Array<{file: string, reason: string}> = [];
  for (const fileDiff of fileDiffs) {
    // Check if binary file: replace its diff with a placeholder summary.
    if (isBinaryFile(fileDiff.path)) {
      const summary = formatBinarySummary(fileDiff.path);
      includedDiffs.push(summary);
      totalLines += countLines(summary);
      continue;
    }
    let content = fileDiff.content;
    const fileLines = fileDiff.lineCount;
    const fileChars = content.length;
    let truncationNotice = '';
    // Apply per-file safety limits
    const CHAR_THRESHOLD = 10000;
    const LINE_THRESHOLD = 1500;
    if (fileChars > CHAR_THRESHOLD || fileLines > LINE_THRESHOLD) {
      // File exceeded threshold - check if it should be culled
      if (shouldCullAggressively(fileDiff.path, content)) {
        // Check if it's a lockfile (special handling)
        if (LOCKFILE_PATTERNS.some(p => p.test(fileDiff.path))) {
          content = truncateToLines(content, 100);
          truncationNotice = `\n... (lockfile truncated - showing first 100 of ${fileLines} lines)`;
        } else {
          // Other noise - aggressive truncation
          content = truncateToLines(content, 30);
          truncationNotice = `\n... (generated/noisy file truncated - showing first 30 of ${fileLines} lines, ${formatBytes(fileChars)} total)`;
        }
      } else {
        // Legitimate large file - more generous truncation
        content = truncateToLines(content, 300);
        truncationNotice = `\n... (large file truncated - showing first 300 of ${fileLines} lines, ${formatBytes(fileChars)} total)`;
      }
    }
    // Check if it fits in global budget
    const contentLines = countLines(content);
    if (totalLines + contentLines <= maxLines - 10) { // Reserve space for summary
      includedDiffs.push(content + truncationNotice);
      totalLines += contentLines;
    } else {
      // Out of budget: drop this file entirely and record it for the note.
      omittedFiles.push({
        file: fileDiff.path,
        reason: 'global line budget exceeded'
      });
    }
  }
  let result = includedDiffs.join('\n\n');
  if (omittedFiles.length > 0) {
    result += '\n\n---\n';
    result += `**Note:** ${omittedFiles.length} file(s) omitted due to output size limit:\n`;
    result += omittedFiles.map(f => ` - ${f.file} (${f.reason})`).join('\n');
    result += '\n\n_Full changes visible in git status/stat output above._';
  }
  return result;
}
/**
 * Get preview of newly added (staged) files.
 *
 * Applies three limits: at most `maxFiles` files, at most
 * `maxLinesPerFile` lines per file, and per-file/global character caps.
 *
 * Fixes vs. original: every preview (binary placeholders and oversized-
 * file placeholders included) now counts against the 30K global budget -
 * the original's `continue` branches bypassed the accounting entirely -
 * and the trailing "more files" note uses an explicit flag instead of
 * sniffing the last preview's text for the word "omitted".
 *
 * @param maxFiles        Maximum number of files to preview.
 * @param maxLinesPerFile Maximum lines shown per file.
 * @returns Formatted previews, or '' when there are no new files.
 */
async function getNewFilesPreviews(maxFiles: number = 5, maxLinesPerFile: number = 50): Promise<string> {
  try {
    // Get list of new files (A = added)
    const newFiles = await $`git diff --staged --name-only --diff-filter=A`.text();
    const files = newFiles.trim().split('\n').filter(f => f);
    if (files.length === 0) {
      return '';
    }
    const previews: string[] = [];
    const filesToShow = files.slice(0, maxFiles);
    let totalChars = 0;
    let budgetExhausted = false;
    const MAX_TOTAL_CHARS = 30000;
    const MAX_CHARS_PER_FILE = 10000;
    for (const file of filesToShow) {
      let preview: string;
      if (isBinaryFile(file)) {
        // Skip binary file content
        preview = `=== ${file} ===\n(binary file)`;
      } else {
        try {
          const content = await Bun.file(file).text();
          if (content.length > MAX_CHARS_PER_FILE) {
            // Apply per-file char limit FIRST (prevents single-line disasters)
            if (shouldCullAggressively(file, content)) {
              preview = `=== ${file} ===\n(generated/noisy file - preview omitted)\nSize: ${formatBytes(content.length)}`;
            } else {
              const truncated = content.slice(0, MAX_CHARS_PER_FILE);
              preview = `=== ${file} ===\n${truncated}\n... (truncated from ${formatBytes(content.length)})`;
            }
          } else {
            // Apply line limit
            const lines = content.split('\n');
            const truncated = lines.slice(0, maxLinesPerFile).join('\n');
            const notice = lines.length > maxLinesPerFile
              ? `\n... (${lines.length - maxLinesPerFile} more lines)`
              : '';
            preview = `=== ${file} ===\n${truncated}${notice}`;
          }
        } catch {
          preview = `=== ${file} ===\n(unreadable)`;
        }
      }
      // Check total budget - covers every preview kind.
      if (totalChars + preview.length > MAX_TOTAL_CHARS) {
        const remaining = files.length - previews.length;
        previews.push(`\n... (${remaining} more file(s) omitted - preview size limit reached)`);
        budgetExhausted = true;
        break;
      }
      previews.push(preview);
      totalChars += preview.length;
    }
    if (files.length > maxFiles && !budgetExhausted) {
      previews.push(`\n... (${files.length - maxFiles} more new file(s) not shown)`);
    }
    return previews.join('\n\n');
  } catch {
    // Best-effort: previews are optional context, never fatal.
    return '';
  }
}
/**
 * Generate context for staged changes.
 *
 * Assembles git status, change summary, filtered/truncated diff,
 * new-file previews, and recent commit subjects into one markdown report.
 *
 * @param maxLines Global line budget (headers reserve ~50 lines; the
 *                 diff section gets the remainder, minimum 100).
 * @throws Error when nothing is staged, or on unexpected git failures.
 */
async function stagedContext(maxLines: number): Promise<string> {
  // `git diff --staged --quiet` exits 0 when nothing is staged and 1 when
  // there are changes.
  //
  // Fix: the original threw 'No staged changes to commit' from inside the
  // same try block whose catch filters on `exitCode`; a plain Error has no
  // exitCode property, so the catch silently swallowed it and the guard
  // never fired. Decide via a flag and throw outside the try instead.
  let hasStagedChanges = true;
  try {
    await $`git diff --staged --quiet`;
    hasStagedChanges = false; // exit 0 => no staged changes
  } catch (err) {
    // Exit code 1 means there are changes (expected); rethrow anything else.
    if (err && typeof err === 'object' && 'exitCode' in err && err.exitCode !== 1) {
      throw err;
    }
  }
  if (!hasStagedChanges) {
    throw new Error('No staged changes to commit');
  }
  // Gather all git information in parallel
  const [status, numstat, recentCommits] = await Promise.all([
    $`git status`.text(),
    $`git diff --staged --numstat`.text(),
    $`git log --format='%h %s' -10`.text(),
  ]);
  const files = parseNumstat(numstat);
  const stats = getChangeStats(files);
  const fileSummary = getFileSummary(files);
  const fileTypes = getFileTypeDistribution(files);
  // Calculate how many lines we can use for diff
  const headerLines = 50; // Approximate lines for headers/summaries
  const diffMaxLines = Math.max(100, maxLines - headerLines);
  const diff = await getFilteredDiff(true);
  const truncatedDiff = truncateDiff(diff, diffMaxLines, fileSummary);
  const newFilesPreviews = await getNewFilesPreviews(5, 50);
  // Build output
  let output = '# Git Commit Context (Staged Changes)\n\n';
  output += '## Status\n```\n' + status.trim() + '\n```\n\n';
  output += '## Change Summary\n';
  output += `**Files:** ${stats.files} | **Additions:** ${stats.additions} | **Deletions:** ${stats.deletions}\n\n`;
  output += '## Files Changed\n' + fileSummary + '\n\n';
  output += '## File Types Modified\n```\n' + fileTypes + '\n```\n\n';
  output += '## Staged Changes (Diff)\n';
  output += '```diff\n' + truncatedDiff.trim() + '\n```\n\n';
  if (newFilesPreviews) {
    output += '## New Files Preview\n```\n' + newFilesPreviews + '\n```\n\n';
  }
  output += '## Recent Commit Style\n```\n' + recentCommits.trim() + '\n```\n';
  // Final safety: ensure total output doesn't exceed safe limit
  const MAX_TOTAL_OUTPUT = 150000; // 150K chars, leaves 50K headroom
  if (output.length > MAX_TOTAL_OUTPUT) {
    // Cut at the last newline inside the limit so we never split a line.
    const lastNewline = output.slice(0, MAX_TOTAL_OUTPUT).lastIndexOf('\n');
    output = output.slice(0, lastNewline) +
      '\n\n[OUTPUT TRUNCATED - Exceeds safe character limit for AI context]\n' +
      `(Shown ${lastNewline.toLocaleString()} of ${output.length.toLocaleString()} chars)`;
  }
  return output;
}
/**
 * Generate context for amending the most recent commit: current staged
 * stat, the last commit's file stat, and recent history for style.
 *
 * @param maxLines Accepted for signature parity with stagedContext;
 *                 NOTE(review): not consulted here - confirm intended.
 * @throws Error when the repository has no commits.
 */
async function amendContext(maxLines: number): Promise<string> {
  // Amending requires at least one existing commit.
  try {
    await $`git rev-parse HEAD`;
  } catch {
    throw new Error('No commits to amend');
  }
  // Gather git information in parallel.
  const [stagedStat, lastCommitStat, recentCommits] = await Promise.all([
    $`git diff --staged --stat`.text(),
    $`git show --stat --pretty=format: HEAD`.text().then(s => s.split('\n').filter(l => l.trim()).join('\n')),
    $`git log --oneline -5`.text(),
  ]);
  const stagedSection = stagedStat.trim()
    ? '```\n' + stagedStat.trim() + '\n```\n\n'
    : '_No staged changes (message-only amendment)_\n\n';
  return (
    '# Git Commit Context (Amend)\n\n' +
    '## Current Staged Changes\n' +
    stagedSection +
    '## Files in Most Recent Commit\n' +
    '```\n' + lastCommitStat.trim() + '\n```\n\n' +
    '## Recent Commit History (for style reference)\n' +
    '```\n' + recentCommits.trim() + '\n```\n'
  );
}
/**
 * Main entry point.
 *
 * Usage: commit-helper --staged [maxLines] | --amend [maxLines]
 * Prints the assembled context to stdout; exits 1 with a usage or error
 * message on invalid arguments or git failures.
 */
async function main() {
  const args = Bun.argv.slice(2);
  const mode = args[0];
  // Fix: the original validated with startsWith(), so inputs like
  // "--staged=5" passed validation only to die later as "Unknown mode".
  // Exact matching routes every bad mode to the usage message, which also
  // makes the later "Unknown mode" branch unreachable.
  if (mode !== '--staged' && mode !== '--amend') {
    console.error('Usage: commit-helper --staged [maxLines] | --amend [maxLines]');
    console.error(' Default maxLines: 1000');
    process.exit(1);
  }
  const maxLines = args[1] ? parseInt(args[1], 10) : 1000;
  if (isNaN(maxLines) || maxLines < 100) {
    console.error('Error: maxLines must be a number >= 100');
    process.exit(1);
  }
  try {
    const output = mode === '--staged'
      ? await stagedContext(maxLines)
      : await amendContext(maxLines);
    console.log(output);
  } catch (error) {
    // Surface as much detail as possible; shell errors may be plain objects.
    if (error instanceof Error) {
      console.error(`Error: ${error.message}`);
      if (error.stack) {
        console.error('\nStack trace:');
        console.error(error.stack);
      }
    } else if (error && typeof error === 'object') {
      console.error('Error details:', JSON.stringify(error, null, 2));
    } else {
      console.error('Error: Unknown error occurred:', error);
    }
    process.exit(1);
  }
}
main();