mirror of
https://github.com/Xevion/dotfiles.git
synced 2026-01-31 08:24:11 -06:00
improve commit-helper: add smart truncation for large/generated files and binary detection
- Add per-file character/line limits to prevent massive diffs
- Detect and aggressively truncate lockfiles, minified, and base64-heavy files
- Expand binary file detection and format binary summaries
- Add global output safety limit (150K chars) to prevent context overflow
This commit is contained in:
@@ -26,15 +26,26 @@ const IGNORE_PATTERNS = [
|
||||
/\.min\.(js|css)$/,
|
||||
/\.bundle\.(js|css)$/,
|
||||
/dist\/.*\.map$/,
|
||||
/\.svg$/, // Often generated or binary-like
|
||||
];
|
||||
|
||||
// Binary/large file extensions to skip in diffs
|
||||
const BINARY_EXTENSIONS = [
|
||||
'.png', '.jpg', '.jpeg', '.gif', '.ico', '.webp',
|
||||
'.pdf', '.zip', '.tar', '.gz', '.woff', '.woff2', '.ttf', '.eot',
|
||||
'.mp4', '.mp3', '.wav', '.avi', '.mov',
|
||||
'.so', '.dylib', '.dll', '.exe',
|
||||
'.png', '.jpg', '.jpeg', '.gif', '.ico', '.webp', '.avif', '.bmp',
|
||||
'.svg', // Often base64-encoded or huge
|
||||
'.pdf', '.zip', '.tar', '.gz', '.woff', '.woff2', '.ttf', '.eot', '.otf',
|
||||
'.mp4', '.mp3', '.wav', '.avi', '.mov', '.webm',
|
||||
'.so', '.dylib', '.dll', '.exe', '.bin',
|
||||
'.wasm', '.pyc', '.class',
|
||||
'.db', '.sqlite', '.sqlite3',
|
||||
'.lockb', // Bun binary lockfile
|
||||
];
|
||||
|
||||
// Lockfiles - always truncate to first 100 lines
|
||||
const LOCKFILE_PATTERNS = [
|
||||
/\.lock$/,
|
||||
/lock\.(json|yaml)$/,
|
||||
/^(package|pnpm|yarn|bun|composer|Cargo|Gemfile|Pipfile|poetry)[-.]lock/,
|
||||
/^go\.sum$/,
|
||||
];
|
||||
|
||||
interface ChangeStats {
|
||||
@@ -72,6 +83,84 @@ function parseNumstat(numstat: string): FileChange[] {
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if content is predominantly base64-encoded
|
||||
*/
|
||||
function hasBase64Pattern(content: string): boolean {
|
||||
const base64Chunks = content.match(/[A-Za-z0-9+/=]{100,}/g) || [];
|
||||
const base64Length = base64Chunks.reduce((sum, chunk) => sum + chunk.length, 0);
|
||||
return base64Length > content.length * 0.3; // >30% base64
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine if a file should be aggressively truncated
|
||||
*/
|
||||
function shouldCullAggressively(file: string, content: string): boolean {
|
||||
const lines = content.split('\n');
|
||||
const avgLineLength = content.length / Math.max(lines.length, 1);
|
||||
|
||||
return (
|
||||
// Known lockfiles
|
||||
LOCKFILE_PATTERNS.some(p => p.test(file)) ||
|
||||
|
||||
// Extremely long average line length (minified/generated)
|
||||
avgLineLength > 200 ||
|
||||
|
||||
// Any single line over 5000 chars
|
||||
lines.some(line => line.length > 5000) ||
|
||||
|
||||
// Predominantly base64 content
|
||||
hasBase64Pattern(content) ||
|
||||
|
||||
// Known generated patterns
|
||||
/\.generated\./i.test(file) ||
|
||||
/\.min\./i.test(file) ||
|
||||
/\.bundle\./i.test(file) ||
|
||||
content.includes('/* @generated */') ||
|
||||
content.includes('// Auto-generated') ||
|
||||
content.includes('@autogenerated')
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if file is binary based on extension
|
||||
*/
|
||||
function isBinaryFile(path: string): boolean {
|
||||
return BINARY_EXTENSIONS.some(ext => path.toLowerCase().endsWith(ext));
|
||||
}
|
||||
|
||||
/**
|
||||
* Format file size in human-readable format
|
||||
*/
|
||||
function formatBytes(bytes: number): string {
|
||||
if (bytes < 1024) return `${bytes} B`;
|
||||
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
|
||||
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Count lines in text
|
||||
*/
|
||||
function countLines(text: string): number {
|
||||
return text.split('\n').length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Truncate text to maximum number of lines
|
||||
*/
|
||||
function truncateToLines(text: string, maxLines: number): string {
|
||||
const lines = text.split('\n');
|
||||
if (lines.length <= maxLines) return text;
|
||||
return lines.slice(0, maxLines).join('\n');
|
||||
}
|
||||
|
||||
/**
|
||||
* Format binary file summary for diff
|
||||
*/
|
||||
function formatBinarySummary(filePath: string): string {
|
||||
return `Binary file: ${filePath}\n(content omitted)`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get overall change statistics
|
||||
*/
|
||||
@@ -139,16 +228,11 @@ function getFileSummary(files: FileChange[]): string {
|
||||
* Get filtered diff output (excluding ignored files)
|
||||
*/
|
||||
async function getFilteredDiff(staged: boolean): Promise<string> {
|
||||
const command = staged
|
||||
? 'git diff --staged'
|
||||
: 'git diff HEAD~1..HEAD';
|
||||
|
||||
// Get list of files to exclude
|
||||
const numstatCmd = staged
|
||||
? 'git diff --staged --numstat'
|
||||
: 'git diff HEAD~1..HEAD --numstat';
|
||||
const numstat = staged
|
||||
? await $`git diff --staged --numstat`.text()
|
||||
: await $`git diff HEAD~1..HEAD --numstat`.text();
|
||||
|
||||
const numstat = await $`sh -c ${numstatCmd}`.text();
|
||||
const files = parseNumstat(numstat);
|
||||
const filesToExclude = files
|
||||
.filter(f => f.shouldIgnore || f.isBinary)
|
||||
@@ -156,23 +240,37 @@ async function getFilteredDiff(staged: boolean): Promise<string> {
|
||||
|
||||
// Build diff command with exclusions
|
||||
if (filesToExclude.length === 0) {
|
||||
return await $`sh -c ${command}`.text();
|
||||
return staged
|
||||
? await $`git diff --staged`.text()
|
||||
: await $`git diff HEAD~1..HEAD`.text();
|
||||
}
|
||||
|
||||
// Git diff with pathspec exclusions
|
||||
const excludeArgs = filesToExclude.map(f => `:(exclude)${f}`).join(' ');
|
||||
const fullCommand = `${command} -- . ${excludeArgs}`;
|
||||
|
||||
// Git diff with pathspec exclusions - construct as array to avoid shell quoting issues
|
||||
try {
|
||||
return await $`sh -c ${fullCommand}`.text();
|
||||
const baseArgs = staged ? ['diff', '--staged'] : ['diff', 'HEAD~1..HEAD'];
|
||||
const args = [...baseArgs, '--', '.', ...filesToExclude.map(f => `:(exclude)${f}`)];
|
||||
|
||||
// Use Bun.spawn to call git with proper argument handling
|
||||
const proc = Bun.spawn(['git', ...args], {
|
||||
cwd: process.cwd(),
|
||||
stdout: 'pipe',
|
||||
stderr: 'pipe',
|
||||
});
|
||||
|
||||
const output = await new Response(proc.stdout).text();
|
||||
await proc.exited;
|
||||
|
||||
return output;
|
||||
} catch {
|
||||
// If exclusion fails, just return full diff
|
||||
return await $`sh -c ${command}`.text();
|
||||
// If exclusion fails, return diff without exclusions
|
||||
return staged
|
||||
? await $`git diff --staged`.text()
|
||||
: await $`git diff HEAD~1..HEAD`.text();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Truncate diff to fit within line budget
|
||||
* Truncate diff with per-file character/line limits and smart culling
|
||||
*/
|
||||
function truncateDiff(diff: string, maxLines: number, filesInfo: string): string {
|
||||
const lines = diff.split('\n');
|
||||
@@ -181,9 +279,14 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
|
||||
return diff;
|
||||
}
|
||||
|
||||
// Try to include complete file diffs rather than cutting mid-file
|
||||
const fileDiffs: Array<{ header: string; content: string; lineCount: number }> = [];
|
||||
let currentFile: { header: string; lines: string[] } | null = null;
|
||||
// Parse into individual file diffs
|
||||
const fileDiffs: Array<{
|
||||
header: string;
|
||||
content: string;
|
||||
lineCount: number;
|
||||
path: string;
|
||||
}> = [];
|
||||
let currentFile: { header: string; lines: string[]; path: string } | null = null;
|
||||
|
||||
for (const line of lines) {
|
||||
if (line.startsWith('diff --git')) {
|
||||
@@ -192,9 +295,13 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
|
||||
header: currentFile.header,
|
||||
content: currentFile.lines.join('\n'),
|
||||
lineCount: currentFile.lines.length,
|
||||
path: currentFile.path,
|
||||
});
|
||||
}
|
||||
currentFile = { header: line, lines: [line] };
|
||||
// Extract file path from "diff --git a/path b/path"
|
||||
const match = line.match(/diff --git a\/(.*?) b\//);
|
||||
const path = match ? match[1] : 'unknown';
|
||||
currentFile = { header: line, lines: [line], path };
|
||||
} else if (currentFile) {
|
||||
currentFile.lines.push(line);
|
||||
}
|
||||
@@ -205,25 +312,63 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
|
||||
header: currentFile.header,
|
||||
content: currentFile.lines.join('\n'),
|
||||
lineCount: currentFile.lines.length,
|
||||
path: currentFile.path,
|
||||
});
|
||||
}
|
||||
|
||||
// Include files until we hit the limit
|
||||
let includedLines = 0;
|
||||
// Process each file with per-file limits and smart culling
|
||||
let totalLines = 0;
|
||||
const includedDiffs: string[] = [];
|
||||
const omittedFiles: string[] = [];
|
||||
const omittedFiles: Array<{file: string, reason: string}> = [];
|
||||
|
||||
for (const fileDiff of fileDiffs) {
|
||||
if (includedLines + fileDiff.lineCount <= maxLines - 10) { // Reserve space for summary
|
||||
includedDiffs.push(fileDiff.content);
|
||||
includedLines += fileDiff.lineCount;
|
||||
} else {
|
||||
// Extract filename from diff header
|
||||
const match = fileDiff.header.match(/diff --git a\/(.*?) b\//);
|
||||
if (match) {
|
||||
omittedFiles.push(match[1]);
|
||||
// Check if binary file
|
||||
if (isBinaryFile(fileDiff.path)) {
|
||||
const summary = formatBinarySummary(fileDiff.path);
|
||||
includedDiffs.push(summary);
|
||||
totalLines += countLines(summary);
|
||||
continue;
|
||||
}
|
||||
|
||||
let content = fileDiff.content;
|
||||
const fileLines = fileDiff.lineCount;
|
||||
const fileChars = content.length;
|
||||
let truncationNotice = '';
|
||||
|
||||
// Apply per-file safety limits
|
||||
const CHAR_THRESHOLD = 10000;
|
||||
const LINE_THRESHOLD = 1500;
|
||||
|
||||
if (fileChars > CHAR_THRESHOLD || fileLines > LINE_THRESHOLD) {
|
||||
// File exceeded threshold - check if it should be culled
|
||||
if (shouldCullAggressively(fileDiff.path, content)) {
|
||||
// Check if it's a lockfile (special handling)
|
||||
if (LOCKFILE_PATTERNS.some(p => p.test(fileDiff.path))) {
|
||||
content = truncateToLines(content, 100);
|
||||
truncationNotice = `\n... (lockfile truncated - showing first 100 of ${fileLines} lines)`;
|
||||
} else {
|
||||
// Other noise - aggressive truncation
|
||||
content = truncateToLines(content, 30);
|
||||
truncationNotice = `\n... (generated/noisy file truncated - showing first 30 of ${fileLines} lines, ${formatBytes(fileChars)} total)`;
|
||||
}
|
||||
} else {
|
||||
// Legitimate large file - more generous truncation
|
||||
content = truncateToLines(content, 300);
|
||||
truncationNotice = `\n... (large file truncated - showing first 300 of ${fileLines} lines, ${formatBytes(fileChars)} total)`;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if it fits in global budget
|
||||
const contentLines = countLines(content);
|
||||
if (totalLines + contentLines <= maxLines - 10) { // Reserve space for summary
|
||||
includedDiffs.push(content + truncationNotice);
|
||||
totalLines += contentLines;
|
||||
} else {
|
||||
omittedFiles.push({
|
||||
file: fileDiff.path,
|
||||
reason: 'global line budget exceeded'
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let result = includedDiffs.join('\n\n');
|
||||
@@ -231,7 +376,7 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
|
||||
if (omittedFiles.length > 0) {
|
||||
result += '\n\n---\n';
|
||||
result += `**Note:** ${omittedFiles.length} file(s) omitted due to output size limit:\n`;
|
||||
result += omittedFiles.map(f => ` - ${f}`).join('\n');
|
||||
result += omittedFiles.map(f => ` - ${f.file} (${f.reason})`).join('\n');
|
||||
result += '\n\n_Full changes visible in git status/stat output above._';
|
||||
}
|
||||
|
||||
@@ -239,7 +384,7 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
|
||||
}
|
||||
|
||||
/**
|
||||
* Get preview of new files being added
|
||||
* Get preview of new files being added (with per-file and total limits)
|
||||
*/
|
||||
async function getNewFilesPreviews(maxFiles: number = 5, maxLinesPerFile: number = 50): Promise<string> {
|
||||
try {
|
||||
@@ -253,34 +398,60 @@ async function getNewFilesPreviews(maxFiles: number = 5, maxLinesPerFile: number
|
||||
|
||||
const previews: string[] = [];
|
||||
const filesToShow = files.slice(0, maxFiles);
|
||||
let totalChars = 0;
|
||||
const MAX_TOTAL_CHARS = 30000;
|
||||
const MAX_CHARS_PER_FILE = 10000;
|
||||
|
||||
for (const file of filesToShow) {
|
||||
// Skip binary files
|
||||
if (BINARY_EXTENSIONS.some(ext => file.endsWith(ext))) {
|
||||
if (isBinaryFile(file)) {
|
||||
previews.push(`=== ${file} ===\n(binary file)`);
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
const content = await Bun.file(file).text();
|
||||
const lines = content.split('\n').slice(0, maxLinesPerFile);
|
||||
const truncated = lines.length < content.split('\n').length
|
||||
? `\n... (${content.split('\n').length - lines.length} more lines)`
|
||||
|
||||
// Apply per-file char limit FIRST (prevents single-line disasters)
|
||||
if (content.length > MAX_CHARS_PER_FILE) {
|
||||
if (shouldCullAggressively(file, content)) {
|
||||
previews.push(`=== ${file} ===\n(generated/noisy file - preview omitted)\nSize: ${formatBytes(content.length)}`);
|
||||
} else {
|
||||
const truncated = content.slice(0, MAX_CHARS_PER_FILE);
|
||||
previews.push(`=== ${file} ===\n${truncated}\n... (truncated from ${formatBytes(content.length)})`);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Apply line limit
|
||||
const lines = content.split('\n');
|
||||
const truncatedLines = lines.slice(0, maxLinesPerFile);
|
||||
const truncated = truncatedLines.join('\n');
|
||||
const notice = lines.length > maxLinesPerFile
|
||||
? `\n... (${lines.length - maxLinesPerFile} more lines)`
|
||||
: '';
|
||||
|
||||
previews.push(`=== ${file} ===\n${lines.join('\n')}${truncated}`);
|
||||
const preview = `=== ${file} ===\n${truncated}${notice}`;
|
||||
|
||||
// Check total budget
|
||||
if (totalChars + preview.length > MAX_TOTAL_CHARS) {
|
||||
const remaining = files.length - previews.length;
|
||||
previews.push(`\n... (${remaining} more file(s) omitted - preview size limit reached)`);
|
||||
break;
|
||||
}
|
||||
|
||||
previews.push(preview);
|
||||
totalChars += preview.length;
|
||||
} catch {
|
||||
previews.push(`=== ${file} ===\n(unreadable)`);
|
||||
}
|
||||
}
|
||||
|
||||
let result = previews.join('\n\n');
|
||||
|
||||
if (files.length > maxFiles) {
|
||||
result += `\n\n_... and ${files.length - maxFiles} more new file(s)_`;
|
||||
if (files.length > maxFiles && previews[previews.length - 1]?.includes('omitted') === false) {
|
||||
previews.push(`\n... (${files.length - maxFiles} more new file(s) not shown)`);
|
||||
}
|
||||
|
||||
return result;
|
||||
return previews.join('\n\n');
|
||||
} catch {
|
||||
return '';
|
||||
}
|
||||
@@ -345,6 +516,15 @@ async function stagedContext(maxLines: number): Promise<string> {
|
||||
|
||||
output += '## Recent Commit Style\n```\n' + recentCommits.trim() + '\n```\n';
|
||||
|
||||
// Final safety: ensure total output doesn't exceed safe limit
|
||||
const MAX_TOTAL_OUTPUT = 150000; // 150K chars, leaves 50K headroom
|
||||
if (output.length > MAX_TOTAL_OUTPUT) {
|
||||
const lastNewline = output.slice(0, MAX_TOTAL_OUTPUT).lastIndexOf('\n');
|
||||
output = output.slice(0, lastNewline) +
|
||||
'\n\n[OUTPUT TRUNCATED - Exceeds safe character limit for AI context]\n' +
|
||||
`(Shown ${lastNewline.toLocaleString()} of ${output.length.toLocaleString()} chars)`;
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user