improve commit-helper: add smart truncation for large/generated files and binary detection

- Add per-file character/line limits to prevent massive diffs
- Detect and aggressively truncate lockfiles, minified, and base64-heavy files
- Expand binary file detection and format binary summaries
- Add global output safety limit (150K chars) to prevent context overflow
This commit is contained in:
2026-01-06 12:59:16 -06:00
parent df6b36be41
commit fa4ffe8dce
+229 -49
View File
@@ -26,15 +26,26 @@ const IGNORE_PATTERNS = [
/\.min\.(js|css)$/, /\.min\.(js|css)$/,
/\.bundle\.(js|css)$/, /\.bundle\.(js|css)$/,
/dist\/.*\.map$/, /dist\/.*\.map$/,
/\.svg$/, // Often generated or binary-like
]; ];
// Binary/large file extensions to skip in diffs // Binary/large file extensions to skip in diffs
const BINARY_EXTENSIONS = [ const BINARY_EXTENSIONS = [
'.png', '.jpg', '.jpeg', '.gif', '.ico', '.webp', '.png', '.jpg', '.jpeg', '.gif', '.ico', '.webp', '.avif', '.bmp',
'.pdf', '.zip', '.tar', '.gz', '.woff', '.woff2', '.ttf', '.eot', '.svg', // Often base64-encoded or huge
'.mp4', '.mp3', '.wav', '.avi', '.mov', '.pdf', '.zip', '.tar', '.gz', '.woff', '.woff2', '.ttf', '.eot', '.otf',
'.so', '.dylib', '.dll', '.exe', '.mp4', '.mp3', '.wav', '.avi', '.mov', '.webm',
'.so', '.dylib', '.dll', '.exe', '.bin',
'.wasm', '.pyc', '.class',
'.db', '.sqlite', '.sqlite3',
'.lockb', // Bun binary lockfile
];
// Lockfiles - always truncate to first 100 lines
const LOCKFILE_PATTERNS = [
/\.lock$/,
/lock\.(json|yaml)$/,
/^(package|pnpm|yarn|bun|composer|Cargo|Gemfile|Pipfile|poetry)[-.]lock/,
/^go\.sum$/,
]; ];
interface ChangeStats { interface ChangeStats {
@@ -72,6 +83,84 @@ function parseNumstat(numstat: string): FileChange[] {
}); });
} }
/**
 * Report whether base64-looking runs make up more than 30% of `content`.
 *
 * A "run" is 100 or more consecutive characters drawn from the base64
 * alphabet (`A-Z`, `a-z`, `0-9`, `+`, `/`, `=`).
 *
 * @param content - Text to inspect.
 * @returns `true` when the combined length of all runs is strictly greater
 *   than 30% of the total content length; `false` otherwise (including for
 *   empty content).
 */
function hasBase64Pattern(content: string): boolean {
  const runs = content.match(/[A-Za-z0-9+/=]{100,}/g);
  if (runs === null) {
    // No qualifying run at all, so the base64 share is zero.
    return false;
  }

  let runChars = 0;
  for (const run of runs) {
    runChars += run.length;
  }

  return runChars > content.length * 0.3; // >30% base64
}
/**
 * Decide whether a file looks like generated/noisy content that deserves
 * aggressive truncation.
 *
 * A file qualifies when any of the following holds:
 * - its path or its basename matches one of `LOCKFILE_PATTERNS`;
 * - its path contains `.generated.`, `.min.` or `.bundle.` (case-insensitive);
 * - its content carries a known "generated" marker comment;
 * - its average line length exceeds 200 characters;
 * - any single line is longer than 5000 characters;
 * - `hasBase64Pattern` reports the content as base64-heavy.
 *
 * @param file - Path of the file (as reported by git, `/`-separated).
 * @param content - Text content (or diff text) of the file.
 * @returns `true` when the file should be aggressively truncated.
 */
function shouldCullAggressively(file: string, content: string): boolean {
  // Some lockfile patterns are anchored with `^` (e.g. /^go\.sum$/) and can
  // only match a bare file name. Testing the basename as well catches
  // lockfiles that live in subdirectories, such as `services/api/go.sum`.
  const baseName = file.slice(file.lastIndexOf('/') + 1);
  if (LOCKFILE_PATTERNS.some(p => p.test(file) || p.test(baseName))) {
    return true;
  }

  // Known generated naming patterns: cheap checks, so run them before
  // touching the content.
  if (
    /\.generated\./i.test(file) ||
    /\.min\./i.test(file) ||
    /\.bundle\./i.test(file)
  ) {
    return true;
  }

  // Marker comments emitted by code generators.
  if (
    content.includes('/* @generated */') ||
    content.includes('// Auto-generated') ||
    content.includes('@autogenerated')
  ) {
    return true;
  }

  // Content-shape heuristics. The split is deferred to here so it is only
  // paid for when the cheaper checks above did not already decide.
  const lines = content.split('\n');
  const avgLineLength = content.length / Math.max(lines.length, 1);

  // Extremely long average line length (minified/generated)
  if (avgLineLength > 200) {
    return true;
  }

  // Any single line over 5000 chars
  if (lines.some(line => line.length > 5000)) {
    return true;
  }

  // Predominantly base64 content
  return hasBase64Pattern(content);
}
/**
 * Tell whether a path should be treated as a binary file, judged purely by
 * its extension.
 *
 * The path is lowercased before comparison, so `IMAGE.PNG` is matched by a
 * `.png` entry in `BINARY_EXTENSIONS`.
 *
 * @param path - File path to classify.
 * @returns `true` when the lowercased path ends with any entry of
 *   `BINARY_EXTENSIONS`.
 */
function isBinaryFile(path: string): boolean {
  const lowered = path.toLowerCase();
  for (const ext of BINARY_EXTENSIONS) {
    if (lowered.endsWith(ext)) {
      return true;
    }
  }
  return false;
}
/**
 * Format a byte count as a short human-readable string.
 *
 * - below 1024: whole bytes, e.g. `512 B`
 * - below 1 MiB: kibibytes with one decimal, e.g. `1.5 KB`
 * - otherwise: mebibytes with one decimal, e.g. `5.0 MB`
 *
 * @param bytes - Size to format.
 * @returns The formatted size with its unit suffix.
 */
function formatBytes(bytes: number): string {
  if (bytes < 1024) return `${bytes} B`;

  const kbText = (bytes / 1024).toFixed(1);
  // `toFixed` rounds, so values just under 1 MiB (e.g. 1048575 bytes) would
  // otherwise print as "1024.0 KB". Promote those to the MB branch instead.
  if (Number(kbText) < 1024) return `${kbText} KB`;

  return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
}
/**
 * Count the newline-separated segments in a piece of text.
 *
 * The result is always one more than the number of `\n` characters, so an
 * empty string counts as 1 line and a trailing newline adds a final empty
 * line to the count.
 *
 * @param text - Text to measure.
 * @returns Number of `\n`-delimited segments.
 */
function countLines(text: string): number {
  let total = 1;
  let at = text.indexOf('\n');
  while (at !== -1) {
    total += 1;
    at = text.indexOf('\n', at + 1);
  }
  return total;
}
/**
 * Keep at most the first `maxLines` newline-separated lines of `text`.
 *
 * @param text - Text to shorten.
 * @param maxLines - Maximum number of lines to keep.
 * @returns `text` unchanged when it already has `maxLines` lines or fewer;
 *   otherwise the leading `maxLines` lines joined with `\n` (no trailing
 *   newline or truncation marker is appended).
 */
function truncateToLines(text: string, maxLines: number): string {
  const pieces = text.split('\n');
  const fitsAlready = pieces.length <= maxLines;
  return fitsAlready ? text : pieces.slice(0, maxLines).join('\n');
}
/**
 * Build the two-line placeholder that stands in for a binary file's diff.
 *
 * @param filePath - Path of the binary file being summarized.
 * @returns A line naming the file followed by a line stating that its
 *   content was omitted.
 */
function formatBinarySummary(filePath: string): string {
  const summaryLines = [`Binary file: ${filePath}`, '(content omitted)'];
  return summaryLines.join('\n');
}
/** /**
* Get overall change statistics * Get overall change statistics
*/ */
@@ -139,16 +228,11 @@ function getFileSummary(files: FileChange[]): string {
* Get filtered diff output (excluding ignored files) * Get filtered diff output (excluding ignored files)
*/ */
async function getFilteredDiff(staged: boolean): Promise<string> { async function getFilteredDiff(staged: boolean): Promise<string> {
const command = staged
? 'git diff --staged'
: 'git diff HEAD~1..HEAD';
// Get list of files to exclude // Get list of files to exclude
const numstatCmd = staged const numstat = staged
? 'git diff --staged --numstat' ? await $`git diff --staged --numstat`.text()
: 'git diff HEAD~1..HEAD --numstat'; : await $`git diff HEAD~1..HEAD --numstat`.text();
const numstat = await $`sh -c ${numstatCmd}`.text();
const files = parseNumstat(numstat); const files = parseNumstat(numstat);
const filesToExclude = files const filesToExclude = files
.filter(f => f.shouldIgnore || f.isBinary) .filter(f => f.shouldIgnore || f.isBinary)
@@ -156,23 +240,37 @@ async function getFilteredDiff(staged: boolean): Promise<string> {
// Build diff command with exclusions // Build diff command with exclusions
if (filesToExclude.length === 0) { if (filesToExclude.length === 0) {
return await $`sh -c ${command}`.text(); return staged
? await $`git diff --staged`.text()
: await $`git diff HEAD~1..HEAD`.text();
} }
// Git diff with pathspec exclusions // Git diff with pathspec exclusions - construct as array to avoid shell quoting issues
const excludeArgs = filesToExclude.map(f => `:(exclude)${f}`).join(' ');
const fullCommand = `${command} -- . ${excludeArgs}`;
try { try {
return await $`sh -c ${fullCommand}`.text(); const baseArgs = staged ? ['diff', '--staged'] : ['diff', 'HEAD~1..HEAD'];
const args = [...baseArgs, '--', '.', ...filesToExclude.map(f => `:(exclude)${f}`)];
// Use Bun.spawn to call git with proper argument handling
const proc = Bun.spawn(['git', ...args], {
cwd: process.cwd(),
stdout: 'pipe',
stderr: 'pipe',
});
const output = await new Response(proc.stdout).text();
await proc.exited;
return output;
} catch { } catch {
// If exclusion fails, just return full diff // If exclusion fails, return diff without exclusions
return await $`sh -c ${command}`.text(); return staged
? await $`git diff --staged`.text()
: await $`git diff HEAD~1..HEAD`.text();
} }
} }
/** /**
* Truncate diff to fit within line budget * Truncate diff with per-file character/line limits and smart culling
*/ */
function truncateDiff(diff: string, maxLines: number, filesInfo: string): string { function truncateDiff(diff: string, maxLines: number, filesInfo: string): string {
const lines = diff.split('\n'); const lines = diff.split('\n');
@@ -181,9 +279,14 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
return diff; return diff;
} }
// Try to include complete file diffs rather than cutting mid-file // Parse into individual file diffs
const fileDiffs: Array<{ header: string; content: string; lineCount: number }> = []; const fileDiffs: Array<{
let currentFile: { header: string; lines: string[] } | null = null; header: string;
content: string;
lineCount: number;
path: string;
}> = [];
let currentFile: { header: string; lines: string[]; path: string } | null = null;
for (const line of lines) { for (const line of lines) {
if (line.startsWith('diff --git')) { if (line.startsWith('diff --git')) {
@@ -192,9 +295,13 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
header: currentFile.header, header: currentFile.header,
content: currentFile.lines.join('\n'), content: currentFile.lines.join('\n'),
lineCount: currentFile.lines.length, lineCount: currentFile.lines.length,
path: currentFile.path,
}); });
} }
currentFile = { header: line, lines: [line] }; // Extract file path from "diff --git a/path b/path"
const match = line.match(/diff --git a\/(.*?) b\//);
const path = match ? match[1] : 'unknown';
currentFile = { header: line, lines: [line], path };
} else if (currentFile) { } else if (currentFile) {
currentFile.lines.push(line); currentFile.lines.push(line);
} }
@@ -205,24 +312,62 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
header: currentFile.header, header: currentFile.header,
content: currentFile.lines.join('\n'), content: currentFile.lines.join('\n'),
lineCount: currentFile.lines.length, lineCount: currentFile.lines.length,
path: currentFile.path,
}); });
} }
// Include files until we hit the limit // Process each file with per-file limits and smart culling
let includedLines = 0; let totalLines = 0;
const includedDiffs: string[] = []; const includedDiffs: string[] = [];
const omittedFiles: string[] = []; const omittedFiles: Array<{file: string, reason: string}> = [];
for (const fileDiff of fileDiffs) { for (const fileDiff of fileDiffs) {
if (includedLines + fileDiff.lineCount <= maxLines - 10) { // Reserve space for summary // Check if binary file
includedDiffs.push(fileDiff.content); if (isBinaryFile(fileDiff.path)) {
includedLines += fileDiff.lineCount; const summary = formatBinarySummary(fileDiff.path);
} else { includedDiffs.push(summary);
// Extract filename from diff header totalLines += countLines(summary);
const match = fileDiff.header.match(/diff --git a\/(.*?) b\//); continue;
if (match) {
omittedFiles.push(match[1]);
} }
let content = fileDiff.content;
const fileLines = fileDiff.lineCount;
const fileChars = content.length;
let truncationNotice = '';
// Apply per-file safety limits
const CHAR_THRESHOLD = 10000;
const LINE_THRESHOLD = 1500;
if (fileChars > CHAR_THRESHOLD || fileLines > LINE_THRESHOLD) {
// File exceeded threshold - check if it should be culled
if (shouldCullAggressively(fileDiff.path, content)) {
// Check if it's a lockfile (special handling)
if (LOCKFILE_PATTERNS.some(p => p.test(fileDiff.path))) {
content = truncateToLines(content, 100);
truncationNotice = `\n... (lockfile truncated - showing first 100 of ${fileLines} lines)`;
} else {
// Other noise - aggressive truncation
content = truncateToLines(content, 30);
truncationNotice = `\n... (generated/noisy file truncated - showing first 30 of ${fileLines} lines, ${formatBytes(fileChars)} total)`;
}
} else {
// Legitimate large file - more generous truncation
content = truncateToLines(content, 300);
truncationNotice = `\n... (large file truncated - showing first 300 of ${fileLines} lines, ${formatBytes(fileChars)} total)`;
}
}
// Check if it fits in global budget
const contentLines = countLines(content);
if (totalLines + contentLines <= maxLines - 10) { // Reserve space for summary
includedDiffs.push(content + truncationNotice);
totalLines += contentLines;
} else {
omittedFiles.push({
file: fileDiff.path,
reason: 'global line budget exceeded'
});
} }
} }
@@ -231,7 +376,7 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
if (omittedFiles.length > 0) { if (omittedFiles.length > 0) {
result += '\n\n---\n'; result += '\n\n---\n';
result += `**Note:** ${omittedFiles.length} file(s) omitted due to output size limit:\n`; result += `**Note:** ${omittedFiles.length} file(s) omitted due to output size limit:\n`;
result += omittedFiles.map(f => ` - ${f}`).join('\n'); result += omittedFiles.map(f => ` - ${f.file} (${f.reason})`).join('\n');
result += '\n\n_Full changes visible in git status/stat output above._'; result += '\n\n_Full changes visible in git status/stat output above._';
} }
@@ -239,7 +384,7 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
} }
/** /**
* Get preview of new files being added * Get preview of new files being added (with per-file and total limits)
*/ */
async function getNewFilesPreviews(maxFiles: number = 5, maxLinesPerFile: number = 50): Promise<string> { async function getNewFilesPreviews(maxFiles: number = 5, maxLinesPerFile: number = 50): Promise<string> {
try { try {
@@ -253,34 +398,60 @@ async function getNewFilesPreviews(maxFiles: number = 5, maxLinesPerFile: number
const previews: string[] = []; const previews: string[] = [];
const filesToShow = files.slice(0, maxFiles); const filesToShow = files.slice(0, maxFiles);
let totalChars = 0;
const MAX_TOTAL_CHARS = 30000;
const MAX_CHARS_PER_FILE = 10000;
for (const file of filesToShow) { for (const file of filesToShow) {
// Skip binary files // Skip binary files
if (BINARY_EXTENSIONS.some(ext => file.endsWith(ext))) { if (isBinaryFile(file)) {
previews.push(`=== ${file} ===\n(binary file)`); previews.push(`=== ${file} ===\n(binary file)`);
continue; continue;
} }
try { try {
const content = await Bun.file(file).text(); const content = await Bun.file(file).text();
const lines = content.split('\n').slice(0, maxLinesPerFile);
const truncated = lines.length < content.split('\n').length // Apply per-file char limit FIRST (prevents single-line disasters)
? `\n... (${content.split('\n').length - lines.length} more lines)` if (content.length > MAX_CHARS_PER_FILE) {
if (shouldCullAggressively(file, content)) {
previews.push(`=== ${file} ===\n(generated/noisy file - preview omitted)\nSize: ${formatBytes(content.length)}`);
} else {
const truncated = content.slice(0, MAX_CHARS_PER_FILE);
previews.push(`=== ${file} ===\n${truncated}\n... (truncated from ${formatBytes(content.length)})`);
}
continue;
}
// Apply line limit
const lines = content.split('\n');
const truncatedLines = lines.slice(0, maxLinesPerFile);
const truncated = truncatedLines.join('\n');
const notice = lines.length > maxLinesPerFile
? `\n... (${lines.length - maxLinesPerFile} more lines)`
: ''; : '';
previews.push(`=== ${file} ===\n${lines.join('\n')}${truncated}`); const preview = `=== ${file} ===\n${truncated}${notice}`;
// Check total budget
if (totalChars + preview.length > MAX_TOTAL_CHARS) {
const remaining = files.length - previews.length;
previews.push(`\n... (${remaining} more file(s) omitted - preview size limit reached)`);
break;
}
previews.push(preview);
totalChars += preview.length;
} catch { } catch {
previews.push(`=== ${file} ===\n(unreadable)`); previews.push(`=== ${file} ===\n(unreadable)`);
} }
} }
let result = previews.join('\n\n'); if (files.length > maxFiles && previews[previews.length - 1]?.includes('omitted') === false) {
previews.push(`\n... (${files.length - maxFiles} more new file(s) not shown)`);
if (files.length > maxFiles) {
result += `\n\n_... and ${files.length - maxFiles} more new file(s)_`;
} }
return result; return previews.join('\n\n');
} catch { } catch {
return ''; return '';
} }
@@ -345,6 +516,15 @@ async function stagedContext(maxLines: number): Promise<string> {
output += '## Recent Commit Style\n```\n' + recentCommits.trim() + '\n```\n'; output += '## Recent Commit Style\n```\n' + recentCommits.trim() + '\n```\n';
// Final safety: ensure total output doesn't exceed safe limit
const MAX_TOTAL_OUTPUT = 150000; // 150K chars, leaves 50K headroom
if (output.length > MAX_TOTAL_OUTPUT) {
const lastNewline = output.slice(0, MAX_TOTAL_OUTPUT).lastIndexOf('\n');
output = output.slice(0, lastNewline) +
'\n\n[OUTPUT TRUNCATED - Exceeds safe character limit for AI context]\n' +
`(Shown ${lastNewline.toLocaleString()} of ${output.length.toLocaleString()} chars)`;
}
return output; return output;
} }