improve commit-helper: add smart truncation for large/generated files and binary detection

- Add per-file character/line limits to keep oversized diffs from flooding the output
- Detect and aggressively truncate lockfiles, minified/bundled output, and base64-heavy files
- Expand binary file detection and format binary summaries
- Add global output safety limit (150K chars) to prevent context overflow
2026-01-06 12:59:16 -06:00
parent df6b36be41
commit fa4ffe8dce
+229 -49
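For orientation before the diff: a minimal standalone sketch of the per-file decision the new limits implement, using the threshold values this change introduces. `lineBudgetFor` is an illustrative name, not a function in this commit, and the lockfile check is simplified.

```ts
// Editorial sketch, not part of the commit: pick a per-file line budget the way
// the new truncation logic does. Threshold values match the ones added below.
const CHAR_THRESHOLD = 10_000; // per-file character limit
const LINE_THRESHOLD = 1_500;  // per-file line limit

function lineBudgetFor(path: string, content: string): number | null {
  const lines = content.split('\n');
  if (content.length <= CHAR_THRESHOLD && lines.length <= LINE_THRESHOLD) {
    return null; // small enough: keep the whole file diff
  }
  if (/lock/i.test(path)) return 100;                // lockfiles: first 100 lines
  const noisy = content.length / lines.length > 200  // minified/generated heuristics
    || lines.some(l => l.length > 5000);
  return noisy ? 30 : 300;                           // noise: 30 lines; legitimate large file: 300
}

console.log(lineBudgetFor('pnpm-lock.yaml', 'x\n'.repeat(20_000))); // 100
console.log(lineBudgetFor('src/app.ts', 'const x = 1;\n'));         // null
```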
@@ -26,15 +26,26 @@ const IGNORE_PATTERNS = [
/\.min\.(js|css)$/,
/\.bundle\.(js|css)$/,
/dist\/.*\.map$/,
/\.svg$/, // Often generated or binary-like
];
// Binary/large file extensions to skip in diffs
const BINARY_EXTENSIONS = [
'.png', '.jpg', '.jpeg', '.gif', '.ico', '.webp',
'.pdf', '.zip', '.tar', '.gz', '.woff', '.woff2', '.ttf', '.eot',
'.mp4', '.mp3', '.wav', '.avi', '.mov',
'.so', '.dylib', '.dll', '.exe',
'.png', '.jpg', '.jpeg', '.gif', '.ico', '.webp', '.avif', '.bmp',
'.svg', // Often base64-encoded or huge
'.pdf', '.zip', '.tar', '.gz', '.woff', '.woff2', '.ttf', '.eot', '.otf',
'.mp4', '.mp3', '.wav', '.avi', '.mov', '.webm',
'.so', '.dylib', '.dll', '.exe', '.bin',
'.wasm', '.pyc', '.class',
'.db', '.sqlite', '.sqlite3',
'.lockb', // Bun binary lockfile
];
// Lockfiles - always truncate to first 100 lines
const LOCKFILE_PATTERNS = [
/\.lock$/,
/lock\.(json|yaml)$/,
/^(package|pnpm|yarn|bun|composer|Cargo|Gemfile|Pipfile|poetry)[-.]lock/,
/^go\.sum$/,
];
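Since these patterns are matched against the path git reports, which may include directories, a couple of illustrative checks (file names are made up):

```ts
// Editorial examples: which paths the lockfile patterns catch.
const isLockfile = (p: string) => LOCKFILE_PATTERNS.some(re => re.test(p));
console.log(isLockfile('Cargo.lock'));              // true  - /\.lock$/
console.log(isLockfile('frontend/pnpm-lock.yaml')); // true  - /lock\.(json|yaml)$/
console.log(isLockfile('go.sum'));                  // true  - /^go\.sum$/
console.log(isLockfile('src/locksmith.ts'));        // false
```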
interface ChangeStats {
@@ -72,6 +83,84 @@ function parseNumstat(numstat: string): FileChange[] {
});
}
/**
* Check if content is predominantly base64-encoded
*/
function hasBase64Pattern(content: string): boolean {
const base64Chunks = content.match(/[A-Za-z0-9+/=]{100,}/g) || [];
const base64Length = base64Chunks.reduce((sum, chunk) => sum + chunk.length, 0);
return base64Length > content.length * 0.3; // >30% base64
}
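A quick illustration of the heuristic above, assuming `hasBase64Pattern` is in scope; the sample strings are made up:

```ts
// Editorial example: content dominated by a long base64 run (e.g. an inlined data URI).
const inlined = `const icon = "data:image/png;base64,${'A'.repeat(400)}";`;
console.log(hasBase64Pattern(inlined));        // true  - one 400-char base64 run is well over 30%
console.log(hasBase64Pattern('const x = 1;')); // false - no run of 100+ base64 characters
```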
/**
* Determine if a file should be aggressively truncated
*/
function shouldCullAggressively(file: string, content: string): boolean {
const lines = content.split('\n');
const avgLineLength = content.length / Math.max(lines.length, 1);
return (
// Known lockfiles
LOCKFILE_PATTERNS.some(p => p.test(file)) ||
// Extremely long average line length (minified/generated)
avgLineLength > 200 ||
// Any single line over 5000 chars
lines.some(line => line.length > 5000) ||
// Predominantly base64 content
hasBase64Pattern(content) ||
// Known generated patterns
/\.generated\./i.test(file) ||
/\.min\./i.test(file) ||
/\.bundle\./i.test(file) ||
content.includes('/* @generated */') ||
content.includes('// Auto-generated') ||
content.includes('@autogenerated')
);
}
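A few illustrative calls, assuming the helpers above are in scope; file names and contents are made up:

```ts
// Editorial examples of what gets flagged for aggressive truncation.
const minified = 'x'.repeat(6000); // a single 6000-char line
console.log(shouldCullAggressively('dist/app.bundle.js', minified));           // true  - .bundle. name, >5000-char line
console.log(shouldCullAggressively('pnpm-lock.yaml', 'lockfileVersion: 9\n')); // true  - matches LOCKFILE_PATTERNS
console.log(shouldCullAggressively('src/index.ts', 'export const x = 1;\n'));  // false - ordinary source file
```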
/**
* Check if file is binary based on extension
*/
function isBinaryFile(path: string): boolean {
return BINARY_EXTENSIONS.some(ext => path.toLowerCase().endsWith(ext));
}
/**
* Format file size in human-readable format
*/
function formatBytes(bytes: number): string {
if (bytes < 1024) return `${bytes} B`;
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
}
/**
* Count lines in text
*/
function countLines(text: string): number {
return text.split('\n').length;
}
/**
* Truncate text to maximum number of lines
*/
function truncateToLines(text: string, maxLines: number): string {
const lines = text.split('\n');
if (lines.length <= maxLines) return text;
return lines.slice(0, maxLines).join('\n');
}
/**
* Format binary file summary for diff
*/
function formatBinarySummary(filePath: string): string {
return `Binary file: ${filePath}\n(content omitted)`;
}
/**
* Get overall change statistics
*/
@@ -139,16 +228,11 @@ function getFileSummary(files: FileChange[]): string {
* Get filtered diff output (excluding ignored files)
*/
async function getFilteredDiff(staged: boolean): Promise<string> {
const command = staged
? 'git diff --staged'
: 'git diff HEAD~1..HEAD';
// Get list of files to exclude
const numstatCmd = staged
? 'git diff --staged --numstat'
: 'git diff HEAD~1..HEAD --numstat';
const numstat = staged
? await $`git diff --staged --numstat`.text()
: await $`git diff HEAD~1..HEAD --numstat`.text();
const numstat = await $`sh -c ${numstatCmd}`.text();
const files = parseNumstat(numstat);
const filesToExclude = files
.filter(f => f.shouldIgnore || f.isBinary)
@@ -156,23 +240,37 @@ async function getFilteredDiff(staged: boolean): Promise<string> {
// Build diff command with exclusions
if (filesToExclude.length === 0) {
return await $`sh -c ${command}`.text();
return staged
? await $`git diff --staged`.text()
: await $`git diff HEAD~1..HEAD`.text();
}
// Git diff with pathspec exclusions
const excludeArgs = filesToExclude.map(f => `:(exclude)${f}`).join(' ');
const fullCommand = `${command} -- . ${excludeArgs}`;
// Git diff with pathspec exclusions - construct as array to avoid shell quoting issues
try {
return await $`sh -c ${fullCommand}`.text();
const baseArgs = staged ? ['diff', '--staged'] : ['diff', 'HEAD~1..HEAD'];
const args = [...baseArgs, '--', '.', ...filesToExclude.map(f => `:(exclude)${f}`)];
// Use Bun.spawn to call git with proper argument handling
const proc = Bun.spawn(['git', ...args], {
cwd: process.cwd(),
stdout: 'pipe',
stderr: 'pipe',
});
const output = await new Response(proc.stdout).text();
await proc.exited;
return output;
} catch {
// If exclusion fails, just return full diff
return await $`sh -c ${command}`.text();
// If exclusion fails, return diff without exclusions
return staged
? await $`git diff --staged`.text()
: await $`git diff HEAD~1..HEAD`.text();
}
}
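One property of the `Bun.spawn` path worth noting: the exclusions travel as separate argv entries instead of being spliced into a shell string, so paths with spaces or parentheses need no quoting. A small sketch of the resulting argument shape; the paths are illustrative:

```ts
// Editorial sketch of the argv built above; no `sh -c`, so no word-splitting or quoting issues.
const filesToExclude = ['assets/logo large.png', 'pnpm-lock.yaml'];
const args = ['diff', '--staged', '--', '.', ...filesToExclude.map(f => `:(exclude)${f}`)];
console.log(args);
// [ 'diff', '--staged', '--', '.',
//   ':(exclude)assets/logo large.png', ':(exclude)pnpm-lock.yaml' ]
```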
/**
* Truncate diff to fit within line budget
* Truncate diff with per-file character/line limits and smart culling
*/
function truncateDiff(diff: string, maxLines: number, filesInfo: string): string {
const lines = diff.split('\n');
@@ -181,9 +279,14 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
return diff;
}
// Try to include complete file diffs rather than cutting mid-file
const fileDiffs: Array<{ header: string; content: string; lineCount: number }> = [];
let currentFile: { header: string; lines: string[] } | null = null;
// Parse into individual file diffs
const fileDiffs: Array<{
header: string;
content: string;
lineCount: number;
path: string;
}> = [];
let currentFile: { header: string; lines: string[]; path: string } | null = null;
for (const line of lines) {
if (line.startsWith('diff --git')) {
@@ -192,9 +295,13 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
header: currentFile.header,
content: currentFile.lines.join('\n'),
lineCount: currentFile.lines.length,
path: currentFile.path,
});
}
currentFile = { header: line, lines: [line] };
// Extract file path from "diff --git a/path b/path"
const match = line.match(/diff --git a\/(.*?) b\//);
const path = match ? match[1] : 'unknown';
currentFile = { header: line, lines: [line], path };
} else if (currentFile) {
currentFile.lines.push(line);
}
@@ -205,25 +312,63 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
header: currentFile.header,
content: currentFile.lines.join('\n'),
lineCount: currentFile.lines.length,
path: currentFile.path,
});
}
// Include files until we hit the limit
let includedLines = 0;
// Process each file with per-file limits and smart culling
let totalLines = 0;
const includedDiffs: string[] = [];
const omittedFiles: string[] = [];
const omittedFiles: Array<{file: string, reason: string}> = [];
for (const fileDiff of fileDiffs) {
if (includedLines + fileDiff.lineCount <= maxLines - 10) { // Reserve space for summary
includedDiffs.push(fileDiff.content);
includedLines += fileDiff.lineCount;
} else {
// Extract filename from diff header
const match = fileDiff.header.match(/diff --git a\/(.*?) b\//);
if (match) {
omittedFiles.push(match[1]);
// Check if binary file
if (isBinaryFile(fileDiff.path)) {
const summary = formatBinarySummary(fileDiff.path);
includedDiffs.push(summary);
totalLines += countLines(summary);
continue;
}
let content = fileDiff.content;
const fileLines = fileDiff.lineCount;
const fileChars = content.length;
let truncationNotice = '';
// Apply per-file safety limits
const CHAR_THRESHOLD = 10000;
const LINE_THRESHOLD = 1500;
if (fileChars > CHAR_THRESHOLD || fileLines > LINE_THRESHOLD) {
// File exceeded threshold - check if it should be culled
if (shouldCullAggressively(fileDiff.path, content)) {
// Check if it's a lockfile (special handling)
if (LOCKFILE_PATTERNS.some(p => p.test(fileDiff.path))) {
content = truncateToLines(content, 100);
truncationNotice = `\n... (lockfile truncated - showing first 100 of ${fileLines} lines)`;
} else {
// Other noise - aggressive truncation
content = truncateToLines(content, 30);
truncationNotice = `\n... (generated/noisy file truncated - showing first 30 of ${fileLines} lines, ${formatBytes(fileChars)} total)`;
}
} else {
// Legitimate large file - more generous truncation
content = truncateToLines(content, 300);
truncationNotice = `\n... (large file truncated - showing first 300 of ${fileLines} lines, ${formatBytes(fileChars)} total)`;
}
}
// Check if it fits in global budget
const contentLines = countLines(content);
if (totalLines + contentLines <= maxLines - 10) { // Reserve space for summary
includedDiffs.push(content + truncationNotice);
totalLines += contentLines;
} else {
omittedFiles.push({
file: fileDiff.path,
reason: 'global line budget exceeded'
});
}
}
let result = includedDiffs.join('\n\n');
@@ -231,7 +376,7 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
if (omittedFiles.length > 0) {
result += '\n\n---\n';
result += `**Note:** ${omittedFiles.length} file(s) omitted due to output size limit:\n`;
result += omittedFiles.map(f => ` - ${f}`).join('\n');
result += omittedFiles.map(f => ` - ${f.file} (${f.reason})`).join('\n');
result += '\n\n_Full changes visible in git status/stat output above._';
}
@@ -239,7 +384,7 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
}
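The per-file split relies on the `diff --git` header line; here is a minimal check of the path-extraction regex used above, with a made-up header:

```ts
// Editorial example: pulling the file path out of a unified-diff header.
const header = 'diff --git a/src/utils/format.ts b/src/utils/format.ts';
const match = header.match(/diff --git a\/(.*?) b\//);
console.log(match?.[1]); // 'src/utils/format.ts'
```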
/**
* Get preview of new files being added
* Get preview of new files being added (with per-file and total limits)
*/
async function getNewFilesPreviews(maxFiles: number = 5, maxLinesPerFile: number = 50): Promise<string> {
try {
@@ -253,34 +398,60 @@ async function getNewFilesPreviews(maxFiles: number = 5, maxLinesPerFile: number
const previews: string[] = [];
const filesToShow = files.slice(0, maxFiles);
let totalChars = 0;
const MAX_TOTAL_CHARS = 30000;
const MAX_CHARS_PER_FILE = 10000;
for (const file of filesToShow) {
// Skip binary files
if (BINARY_EXTENSIONS.some(ext => file.endsWith(ext))) {
if (isBinaryFile(file)) {
previews.push(`=== ${file} ===\n(binary file)`);
continue;
}
try {
const content = await Bun.file(file).text();
const lines = content.split('\n').slice(0, maxLinesPerFile);
const truncated = lines.length < content.split('\n').length
? `\n... (${content.split('\n').length - lines.length} more lines)`
// Apply per-file char limit FIRST (prevents single-line disasters)
if (content.length > MAX_CHARS_PER_FILE) {
if (shouldCullAggressively(file, content)) {
previews.push(`=== ${file} ===\n(generated/noisy file - preview omitted)\nSize: ${formatBytes(content.length)}`);
} else {
const truncated = content.slice(0, MAX_CHARS_PER_FILE);
previews.push(`=== ${file} ===\n${truncated}\n... (truncated from ${formatBytes(content.length)})`);
}
continue;
}
// Apply line limit
const lines = content.split('\n');
const truncatedLines = lines.slice(0, maxLinesPerFile);
const truncated = truncatedLines.join('\n');
const notice = lines.length > maxLinesPerFile
? `\n... (${lines.length - maxLinesPerFile} more lines)`
: '';
previews.push(`=== ${file} ===\n${lines.join('\n')}${truncated}`);
const preview = `=== ${file} ===\n${truncated}${notice}`;
// Check total budget
if (totalChars + preview.length > MAX_TOTAL_CHARS) {
const remaining = files.length - previews.length;
previews.push(`\n... (${remaining} more file(s) omitted - preview size limit reached)`);
break;
}
previews.push(preview);
totalChars += preview.length;
} catch {
previews.push(`=== ${file} ===\n(unreadable)`);
}
}
let result = previews.join('\n\n');
if (files.length > maxFiles) {
result += `\n\n_... and ${files.length - maxFiles} more new file(s)_`;
if (files.length > maxFiles && previews[previews.length - 1]?.includes('omitted') === false) {
previews.push(`\n... (${files.length - maxFiles} more new file(s) not shown)`);
}
return result;
return previews.join('\n\n');
} catch {
return '';
}
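Why the character cap runs before the line cap in the preview path: a minified or data-heavy file can be a single enormous line, so a line-based limit alone would not shrink it. A tiny illustration with made-up content:

```ts
// Editorial example: one giant line defeats a line limit but not a character limit.
const oneLine = 'export default ' + JSON.stringify({ data: 'x'.repeat(50_000) });
console.log(oneLine.split('\n').length); // 1    - a 50-line cap would keep all of it
console.log(oneLine.length > 10_000);    // true - the 10K per-file character cap catches it
```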
@@ -345,6 +516,15 @@ async function stagedContext(maxLines: number): Promise<string> {
output += '## Recent Commit Style\n```\n' + recentCommits.trim() + '\n```\n';
// Final safety: ensure total output doesn't exceed safe limit
const MAX_TOTAL_OUTPUT = 150000; // 150K chars, leaves 50K headroom
if (output.length > MAX_TOTAL_OUTPUT) {
const lastNewline = output.slice(0, MAX_TOTAL_OUTPUT).lastIndexOf('\n');
output = output.slice(0, lastNewline) +
'\n\n[OUTPUT TRUNCATED - Exceeds safe character limit for AI context]\n' +
`(Shown ${lastNewline.toLocaleString()} of ${output.length.toLocaleString()} chars)`;
}
return output;
}
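Cutting at the last newline before the 150K cap keeps the truncated output on a line boundary, so no diff line is split mid-way. A small illustration with made-up sizes:

```ts
// Editorial example: the cap lands on a newline boundary, never mid-line.
const MAX_TOTAL_OUTPUT = 150_000;
const big = Array.from({ length: 5_000 }, (_, i) => `+ line ${i}: ${'x'.repeat(40)}`).join('\n');
const head = big.slice(0, MAX_TOTAL_OUTPUT);
const safe = head.slice(0, head.lastIndexOf('\n'));
console.log(safe.split('\n').every(line => line.endsWith('x'))); // true - only complete lines remain
console.log(safe.length < MAX_TOTAL_OUTPUT);                     // true
```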