mirror of
https://github.com/Xevion/dotfiles.git
synced 2026-01-31 08:24:11 -06:00
improve commit-helper: add smart truncation for large/generated files and binary detection
- Add per-file character/line limits to prevent massive diffs
- Detect and aggressively truncate lockfiles, minified, and base64-heavy files
- Expand binary file detection and format binary summaries
- Add global output safety limit (150K chars) to prevent context overflow
This commit is contained in:
@@ -26,15 +26,26 @@ const IGNORE_PATTERNS = [
|
||||
/\.min\.(js|css)$/,
|
||||
/\.bundle\.(js|css)$/,
|
||||
/dist\/.*\.map$/,
|
||||
/\.svg$/, // Often generated or binary-like
|
||||
];
|
||||
|
||||
// Binary/large file extensions to skip in diffs
|
||||
const BINARY_EXTENSIONS = [
|
||||
'.png', '.jpg', '.jpeg', '.gif', '.ico', '.webp',
|
||||
'.pdf', '.zip', '.tar', '.gz', '.woff', '.woff2', '.ttf', '.eot',
|
||||
'.mp4', '.mp3', '.wav', '.avi', '.mov',
|
||||
'.so', '.dylib', '.dll', '.exe',
|
||||
'.png', '.jpg', '.jpeg', '.gif', '.ico', '.webp', '.avif', '.bmp',
|
||||
'.svg', // Often base64-encoded or huge
|
||||
'.pdf', '.zip', '.tar', '.gz', '.woff', '.woff2', '.ttf', '.eot', '.otf',
|
||||
'.mp4', '.mp3', '.wav', '.avi', '.mov', '.webm',
|
||||
'.so', '.dylib', '.dll', '.exe', '.bin',
|
||||
'.wasm', '.pyc', '.class',
|
||||
'.db', '.sqlite', '.sqlite3',
|
||||
'.lockb', // Bun binary lockfile
|
||||
];
|
||||
|
||||
// Lockfiles - always truncate to first 100 lines
|
||||
const LOCKFILE_PATTERNS = [
|
||||
/\.lock$/,
|
||||
/lock\.(json|yaml)$/,
|
||||
/^(package|pnpm|yarn|bun|composer|Cargo|Gemfile|Pipfile|poetry)[-.]lock/,
|
||||
/^go\.sum$/,
|
||||
];
|
||||
|
||||
interface ChangeStats {
|
||||
@@ -72,6 +83,84 @@ function parseNumstat(numstat: string): FileChange[] {
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if content is predominantly base64-encoded
|
||||
*/
|
||||
function hasBase64Pattern(content: string): boolean {
|
||||
const base64Chunks = content.match(/[A-Za-z0-9+/=]{100,}/g) || [];
|
||||
const base64Length = base64Chunks.reduce((sum, chunk) => sum + chunk.length, 0);
|
||||
return base64Length > content.length * 0.3; // >30% base64
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine if a file should be aggressively truncated
|
||||
*/
|
||||
function shouldCullAggressively(file: string, content: string): boolean {
|
||||
const lines = content.split('\n');
|
||||
const avgLineLength = content.length / Math.max(lines.length, 1);
|
||||
|
||||
return (
|
||||
// Known lockfiles
|
||||
LOCKFILE_PATTERNS.some(p => p.test(file)) ||
|
||||
|
||||
// Extremely long average line length (minified/generated)
|
||||
avgLineLength > 200 ||
|
||||
|
||||
// Any single line over 5000 chars
|
||||
lines.some(line => line.length > 5000) ||
|
||||
|
||||
// Predominantly base64 content
|
||||
hasBase64Pattern(content) ||
|
||||
|
||||
// Known generated patterns
|
||||
/\.generated\./i.test(file) ||
|
||||
/\.min\./i.test(file) ||
|
||||
/\.bundle\./i.test(file) ||
|
||||
content.includes('/* @generated */') ||
|
||||
content.includes('// Auto-generated') ||
|
||||
content.includes('@autogenerated')
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if file is binary based on extension
|
||||
*/
|
||||
function isBinaryFile(path: string): boolean {
|
||||
return BINARY_EXTENSIONS.some(ext => path.toLowerCase().endsWith(ext));
|
||||
}
|
||||
|
||||
/**
|
||||
* Format file size in human-readable format
|
||||
*/
|
||||
function formatBytes(bytes: number): string {
|
||||
if (bytes < 1024) return `${bytes} B`;
|
||||
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
|
||||
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Count lines in text
|
||||
*/
|
||||
function countLines(text: string): number {
|
||||
return text.split('\n').length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Truncate text to maximum number of lines
|
||||
*/
|
||||
function truncateToLines(text: string, maxLines: number): string {
|
||||
const lines = text.split('\n');
|
||||
if (lines.length <= maxLines) return text;
|
||||
return lines.slice(0, maxLines).join('\n');
|
||||
}
|
||||
|
||||
/**
|
||||
* Format binary file summary for diff
|
||||
*/
|
||||
function formatBinarySummary(filePath: string): string {
|
||||
return `Binary file: ${filePath}\n(content omitted)`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get overall change statistics
|
||||
*/
|
||||
@@ -139,16 +228,11 @@ function getFileSummary(files: FileChange[]): string {
|
||||
* Get filtered diff output (excluding ignored files)
|
||||
*/
|
||||
async function getFilteredDiff(staged: boolean): Promise<string> {
|
||||
const command = staged
|
||||
? 'git diff --staged'
|
||||
: 'git diff HEAD~1..HEAD';
|
||||
|
||||
// Get list of files to exclude
|
||||
const numstatCmd = staged
|
||||
? 'git diff --staged --numstat'
|
||||
: 'git diff HEAD~1..HEAD --numstat';
|
||||
const numstat = staged
|
||||
? await $`git diff --staged --numstat`.text()
|
||||
: await $`git diff HEAD~1..HEAD --numstat`.text();
|
||||
|
||||
const numstat = await $`sh -c ${numstatCmd}`.text();
|
||||
const files = parseNumstat(numstat);
|
||||
const filesToExclude = files
|
||||
.filter(f => f.shouldIgnore || f.isBinary)
|
||||
@@ -156,23 +240,37 @@ async function getFilteredDiff(staged: boolean): Promise<string> {
|
||||
|
||||
// Build diff command with exclusions
|
||||
if (filesToExclude.length === 0) {
|
||||
return await $`sh -c ${command}`.text();
|
||||
return staged
|
||||
? await $`git diff --staged`.text()
|
||||
: await $`git diff HEAD~1..HEAD`.text();
|
||||
}
|
||||
|
||||
// Git diff with pathspec exclusions
|
||||
const excludeArgs = filesToExclude.map(f => `:(exclude)${f}`).join(' ');
|
||||
const fullCommand = `${command} -- . ${excludeArgs}`;
|
||||
|
||||
// Git diff with pathspec exclusions - construct as array to avoid shell quoting issues
|
||||
try {
|
||||
return await $`sh -c ${fullCommand}`.text();
|
||||
const baseArgs = staged ? ['diff', '--staged'] : ['diff', 'HEAD~1..HEAD'];
|
||||
const args = [...baseArgs, '--', '.', ...filesToExclude.map(f => `:(exclude)${f}`)];
|
||||
|
||||
// Use Bun.spawn to call git with proper argument handling
|
||||
const proc = Bun.spawn(['git', ...args], {
|
||||
cwd: process.cwd(),
|
||||
stdout: 'pipe',
|
||||
stderr: 'pipe',
|
||||
});
|
||||
|
||||
const output = await new Response(proc.stdout).text();
|
||||
await proc.exited;
|
||||
|
||||
return output;
|
||||
} catch {
|
||||
// If exclusion fails, just return full diff
|
||||
return await $`sh -c ${command}`.text();
|
||||
// If exclusion fails, return diff without exclusions
|
||||
return staged
|
||||
? await $`git diff --staged`.text()
|
||||
: await $`git diff HEAD~1..HEAD`.text();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Truncate diff to fit within line budget
|
||||
* Truncate diff with per-file character/line limits and smart culling
|
||||
*/
|
||||
function truncateDiff(diff: string, maxLines: number, filesInfo: string): string {
|
||||
const lines = diff.split('\n');
|
||||
@@ -181,9 +279,14 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
|
||||
return diff;
|
||||
}
|
||||
|
||||
// Try to include complete file diffs rather than cutting mid-file
|
||||
const fileDiffs: Array<{ header: string; content: string; lineCount: number }> = [];
|
||||
let currentFile: { header: string; lines: string[] } | null = null;
|
||||
// Parse into individual file diffs
|
||||
const fileDiffs: Array<{
|
||||
header: string;
|
||||
content: string;
|
||||
lineCount: number;
|
||||
path: string;
|
||||
}> = [];
|
||||
let currentFile: { header: string; lines: string[]; path: string } | null = null;
|
||||
|
||||
for (const line of lines) {
|
||||
if (line.startsWith('diff --git')) {
|
||||
@@ -192,9 +295,13 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
|
||||
header: currentFile.header,
|
||||
content: currentFile.lines.join('\n'),
|
||||
lineCount: currentFile.lines.length,
|
||||
path: currentFile.path,
|
||||
});
|
||||
}
|
||||
currentFile = { header: line, lines: [line] };
|
||||
// Extract file path from "diff --git a/path b/path"
|
||||
const match = line.match(/diff --git a\/(.*?) b\//);
|
||||
const path = match ? match[1] : 'unknown';
|
||||
currentFile = { header: line, lines: [line], path };
|
||||
} else if (currentFile) {
|
||||
currentFile.lines.push(line);
|
||||
}
|
||||
@@ -205,25 +312,63 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
|
||||
header: currentFile.header,
|
||||
content: currentFile.lines.join('\n'),
|
||||
lineCount: currentFile.lines.length,
|
||||
path: currentFile.path,
|
||||
});
|
||||
}
|
||||
|
||||
// Include files until we hit the limit
|
||||
let includedLines = 0;
|
||||
// Process each file with per-file limits and smart culling
|
||||
let totalLines = 0;
|
||||
const includedDiffs: string[] = [];
|
||||
const omittedFiles: string[] = [];
|
||||
const omittedFiles: Array<{file: string, reason: string}> = [];
|
||||
|
||||
for (const fileDiff of fileDiffs) {
|
||||
if (includedLines + fileDiff.lineCount <= maxLines - 10) { // Reserve space for summary
|
||||
includedDiffs.push(fileDiff.content);
|
||||
includedLines += fileDiff.lineCount;
|
||||
} else {
|
||||
// Extract filename from diff header
|
||||
const match = fileDiff.header.match(/diff --git a\/(.*?) b\//);
|
||||
if (match) {
|
||||
omittedFiles.push(match[1]);
|
||||
// Check if binary file
|
||||
if (isBinaryFile(fileDiff.path)) {
|
||||
const summary = formatBinarySummary(fileDiff.path);
|
||||
includedDiffs.push(summary);
|
||||
totalLines += countLines(summary);
|
||||
continue;
|
||||
}
|
||||
|
||||
let content = fileDiff.content;
|
||||
const fileLines = fileDiff.lineCount;
|
||||
const fileChars = content.length;
|
||||
let truncationNotice = '';
|
||||
|
||||
// Apply per-file safety limits
|
||||
const CHAR_THRESHOLD = 10000;
|
||||
const LINE_THRESHOLD = 1500;
|
||||
|
||||
if (fileChars > CHAR_THRESHOLD || fileLines > LINE_THRESHOLD) {
|
||||
// File exceeded threshold - check if it should be culled
|
||||
if (shouldCullAggressively(fileDiff.path, content)) {
|
||||
// Check if it's a lockfile (special handling)
|
||||
if (LOCKFILE_PATTERNS.some(p => p.test(fileDiff.path))) {
|
||||
content = truncateToLines(content, 100);
|
||||
truncationNotice = `\n... (lockfile truncated - showing first 100 of ${fileLines} lines)`;
|
||||
} else {
|
||||
// Other noise - aggressive truncation
|
||||
content = truncateToLines(content, 30);
|
||||
truncationNotice = `\n... (generated/noisy file truncated - showing first 30 of ${fileLines} lines, ${formatBytes(fileChars)} total)`;
|
||||
}
|
||||
} else {
|
||||
// Legitimate large file - more generous truncation
|
||||
content = truncateToLines(content, 300);
|
||||
truncationNotice = `\n... (large file truncated - showing first 300 of ${fileLines} lines, ${formatBytes(fileChars)} total)`;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if it fits in global budget
|
||||
const contentLines = countLines(content);
|
||||
if (totalLines + contentLines <= maxLines - 10) { // Reserve space for summary
|
||||
includedDiffs.push(content + truncationNotice);
|
||||
totalLines += contentLines;
|
||||
} else {
|
||||
omittedFiles.push({
|
||||
file: fileDiff.path,
|
||||
reason: 'global line budget exceeded'
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let result = includedDiffs.join('\n\n');
|
||||
@@ -231,7 +376,7 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
|
||||
if (omittedFiles.length > 0) {
|
||||
result += '\n\n---\n';
|
||||
result += `**Note:** ${omittedFiles.length} file(s) omitted due to output size limit:\n`;
|
||||
result += omittedFiles.map(f => ` - ${f}`).join('\n');
|
||||
result += omittedFiles.map(f => ` - ${f.file} (${f.reason})`).join('\n');
|
||||
result += '\n\n_Full changes visible in git status/stat output above._';
|
||||
}
|
||||
|
||||
@@ -239,7 +384,7 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
|
||||
}
|
||||
|
||||
/**
|
||||
* Get preview of new files being added
|
||||
* Get preview of new files being added (with per-file and total limits)
|
||||
*/
|
||||
async function getNewFilesPreviews(maxFiles: number = 5, maxLinesPerFile: number = 50): Promise<string> {
|
||||
try {
|
||||
@@ -253,34 +398,60 @@ async function getNewFilesPreviews(maxFiles: number = 5, maxLinesPerFile: number
|
||||
|
||||
const previews: string[] = [];
|
||||
const filesToShow = files.slice(0, maxFiles);
|
||||
let totalChars = 0;
|
||||
const MAX_TOTAL_CHARS = 30000;
|
||||
const MAX_CHARS_PER_FILE = 10000;
|
||||
|
||||
for (const file of filesToShow) {
|
||||
// Skip binary files
|
||||
if (BINARY_EXTENSIONS.some(ext => file.endsWith(ext))) {
|
||||
if (isBinaryFile(file)) {
|
||||
previews.push(`=== ${file} ===\n(binary file)`);
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
const content = await Bun.file(file).text();
|
||||
const lines = content.split('\n').slice(0, maxLinesPerFile);
|
||||
const truncated = lines.length < content.split('\n').length
|
||||
? `\n... (${content.split('\n').length - lines.length} more lines)`
|
||||
|
||||
// Apply per-file char limit FIRST (prevents single-line disasters)
|
||||
if (content.length > MAX_CHARS_PER_FILE) {
|
||||
if (shouldCullAggressively(file, content)) {
|
||||
previews.push(`=== ${file} ===\n(generated/noisy file - preview omitted)\nSize: ${formatBytes(content.length)}`);
|
||||
} else {
|
||||
const truncated = content.slice(0, MAX_CHARS_PER_FILE);
|
||||
previews.push(`=== ${file} ===\n${truncated}\n... (truncated from ${formatBytes(content.length)})`);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Apply line limit
|
||||
const lines = content.split('\n');
|
||||
const truncatedLines = lines.slice(0, maxLinesPerFile);
|
||||
const truncated = truncatedLines.join('\n');
|
||||
const notice = lines.length > maxLinesPerFile
|
||||
? `\n... (${lines.length - maxLinesPerFile} more lines)`
|
||||
: '';
|
||||
|
||||
previews.push(`=== ${file} ===\n${lines.join('\n')}${truncated}`);
|
||||
const preview = `=== ${file} ===\n${truncated}${notice}`;
|
||||
|
||||
// Check total budget
|
||||
if (totalChars + preview.length > MAX_TOTAL_CHARS) {
|
||||
const remaining = files.length - previews.length;
|
||||
previews.push(`\n... (${remaining} more file(s) omitted - preview size limit reached)`);
|
||||
break;
|
||||
}
|
||||
|
||||
previews.push(preview);
|
||||
totalChars += preview.length;
|
||||
} catch {
|
||||
previews.push(`=== ${file} ===\n(unreadable)`);
|
||||
}
|
||||
}
|
||||
|
||||
let result = previews.join('\n\n');
|
||||
|
||||
if (files.length > maxFiles) {
|
||||
result += `\n\n_... and ${files.length - maxFiles} more new file(s)_`;
|
||||
if (files.length > maxFiles && previews[previews.length - 1]?.includes('omitted') === false) {
|
||||
previews.push(`\n... (${files.length - maxFiles} more new file(s) not shown)`);
|
||||
}
|
||||
|
||||
return result;
|
||||
return previews.join('\n\n');
|
||||
} catch {
|
||||
return '';
|
||||
}
|
||||
@@ -345,6 +516,15 @@ async function stagedContext(maxLines: number): Promise<string> {
|
||||
|
||||
output += '## Recent Commit Style\n```\n' + recentCommits.trim() + '\n```\n';
|
||||
|
||||
// Final safety: ensure total output doesn't exceed safe limit
|
||||
const MAX_TOTAL_OUTPUT = 150000; // 150K chars, leaves 50K headroom
|
||||
if (output.length > MAX_TOTAL_OUTPUT) {
|
||||
const lastNewline = output.slice(0, MAX_TOTAL_OUTPUT).lastIndexOf('\n');
|
||||
output = output.slice(0, lastNewline) +
|
||||
'\n\n[OUTPUT TRUNCATED - Exceeds safe character limit for AI context]\n' +
|
||||
`(Shown ${lastNewline.toLocaleString()} of ${output.length.toLocaleString()} chars)`;
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user