improve commit-helper: add smart truncation for large/generated files and binary detection

- Add per-file character/line limits to prevent massive diffs
- Detect and aggressively truncate lockfiles, minified, and base64-heavy files
- Expand binary file detection and format binary summaries
- Add global output safety limit (150K chars) to prevent context overflow
2026-01-31 08:24:11 -06:00 · 2026-01-06 12:59:16 -06:00
parent df6b36be41
commit fa4ffe8dce
1 changed files with 229 additions and 49 deletions
@@ -26,15 +26,26 @@ const IGNORE_PATTERNS = [
  /\.min\.(js|css)$/,
  /\.bundle\.(js|css)$/,
  /dist\/.*\.map$/,
  /\.svg$/,  // Often generated or binary-like
 ];
 // Binary/large file extensions to skip in diffs
 const BINARY_EXTENSIONS = [
-  '.png', '.jpg', '.jpeg', '.gif', '.ico', '.webp',
+  '.png', '.jpg', '.jpeg', '.gif', '.ico', '.webp', '.avif', '.bmp',
-  '.pdf', '.zip', '.tar', '.gz', '.woff', '.woff2', '.ttf', '.eot',
+  '.svg',  // Often base64-encoded or huge
-  '.mp4', '.mp3', '.wav', '.avi', '.mov',
+  '.pdf', '.zip', '.tar', '.gz', '.woff', '.woff2', '.ttf', '.eot', '.otf',
-  '.so', '.dylib', '.dll', '.exe',
+  '.mp4', '.mp3', '.wav', '.avi', '.mov', '.webm',
  '.so', '.dylib', '.dll', '.exe', '.bin',
  '.wasm', '.pyc', '.class',
  '.db', '.sqlite', '.sqlite3',
  '.lockb',  // Bun binary lockfile
 ];
 // Lockfiles - matched against repo-relative paths (e.g. "services/api/go.sum").
 // Diffs of matching files are cut to their first 100 lines once they exceed
 // the per-file size thresholds.
 const LOCKFILE_PATTERNS = [
  // Any "*.lock" file: yarn.lock, Cargo.lock, Gemfile.lock, poetry.lock, ...
  /\.lock$/,
  // "lock.json" / "lock.yaml" as a whole name or after a separator
  // (package-lock.json, pnpm-lock.yaml, packages.lock.json). Requiring the
  // separator keeps unrelated names such as "block.json" from matching.
  /(^|[\/._-])lock\.(json|yaml)$/,
  // Well-known lockfile basenames, at the repo root or in any subdirectory
  /(^|\/)(package|pnpm|yarn|bun|composer|Cargo|Gemfile|Pipfile|poetry)[-.]lock/,
  // Go module checksums, including nested modules in a monorepo
  /(^|\/)go\.sum$/,
 ];
 interface ChangeStats {
@@ -72,6 +83,84 @@ function parseNumstat(numstat: string): FileChange[] {
    });
 }
 /**
 * Report whether long base64-looking runs make up a large share of the text.
 *
 * A "run" is 100 or more consecutive characters from the base64 alphabet
 * (letters, digits, '+', '/', '='). Returns true when the combined length of
 * all runs is more than 30% of the total content length.
 */
 function hasBase64Pattern(content: string): boolean {
  const runs = content.match(/[A-Za-z0-9+/=]{100,}/g) ?? [];
  let encodedChars = 0;
  for (const run of runs) {
    encodedChars += run.length;
  }
  return encodedChars > content.length * 0.3; // >30% base64
 }
 /**
 * Decide whether a file looks like generated or noisy content that deserves
 * aggressive truncation.
 *
 * Returns true when any one of these signals fires:
 *  - the path matches a known lockfile pattern
 *  - the path contains ".generated.", ".min." or ".bundle." (case-insensitive)
 *  - the average line length exceeds 200 characters
 *  - any single line is longer than 5000 characters
 *  - long base64-looking runs make up a large share of the content
 *  - the content carries a generated-code marker comment
 */
 function shouldCullAggressively(file: string, content: string): boolean {
  // Path-based signals
  if (LOCKFILE_PATTERNS.some(pattern => pattern.test(file))) return true;
  const generatedNames = [/\.generated\./i, /\.min\./i, /\.bundle\./i];
  if (generatedNames.some(pattern => pattern.test(file))) return true;

  // Shape-based signals (minified or generated output tends to have huge lines)
  const rows = content.split('\n');
  const meanRowLength = content.length / Math.max(rows.length, 1);
  if (meanRowLength > 200) return true;
  if (rows.some(row => row.length > 5000)) return true;
  if (hasBase64Pattern(content)) return true;

  // Explicit generated-code markers inside the content
  const markers = ['/* @generated */', '// Auto-generated', '@autogenerated'];
  return markers.some(marker => content.includes(marker));
 }
 /**
 * Tell whether a path ends with one of the known binary extensions.
 * The comparison lower-cases the path first, so "IMAGE.PNG" is treated the
 * same as "image.png".
 */
 function isBinaryFile(path: string): boolean {
  const lowered = path.toLowerCase();
  for (const ext of BINARY_EXTENSIONS) {
    if (lowered.endsWith(ext)) return true;
  }
  return false;
 }
 /**
 * Format a size as a human-readable string using binary units.
 *
 * @param bytes - size to format
 * @returns `"<n> B"` below 1024, otherwise the value with one decimal place
 *          in KB or MB (1 KB = 1024 B, 1 MB = 1024 KB)
 */
 function formatBytes(bytes: number): string {
  if (bytes < 1024) return `${bytes} B`;
  const kilobytes = (bytes / 1024).toFixed(1);
  // Sizes just under 1 MiB round up to "1024.0" in KB; promote those to MB
  // so the result never reads "1024.0 KB".
  if (Number(kilobytes) < 1024) return `${kilobytes} KB`;
  return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
 }
 /**
 * Count the newline-separated segments in a string.
 * An empty string counts as one line, and a trailing newline adds an empty
 * final line to the count.
 */
 function countLines(text: string): number {
  let total = 1;
  let at = text.indexOf('\n');
  while (at !== -1) {
    total += 1;
    at = text.indexOf('\n', at + 1);
  }
  return total;
 }
 /**
 * Keep at most `maxLines` leading lines of `text`.
 * Text that already fits is returned unchanged; otherwise the first
 * `maxLines` newline-separated segments are re-joined with '\n'.
 */
 function truncateToLines(text: string, maxLines: number): string {
  const segments = text.split('\n');
  const fits = segments.length <= maxLines;
  return fits ? text : segments.slice(0, maxLines).join('\n');
 }
 /**
 * Build the two-line placeholder that stands in for a binary file's diff:
 * a line naming the file, then a note that its content was left out.
 */
 function formatBinarySummary(filePath: string): string {
  const heading = 'Binary file: ' + filePath;
  return [heading, '(content omitted)'].join('\n');
 }
 /**
 * Get overall change statistics
 */
@@ -139,16 +228,11 @@ function getFileSummary(files: FileChange[]): string {
 * Get filtered diff output (excluding ignored files)
 */
 async function getFilteredDiff(staged: boolean): Promise<string> {
  const command = staged 
    ? 'git diff --staged'
    : 'git diff HEAD~1..HEAD';
  // Get list of files to exclude
-  const numstatCmd = staged
+  const numstat = staged
-    ? 'git diff --staged --numstat'
+    ? await $`git diff --staged --numstat`.text()
-    : 'git diff HEAD~1..HEAD --numstat';
+    : await $`git diff HEAD~1..HEAD --numstat`.text();
  const numstat = await $`sh -c ${numstatCmd}`.text();
  const files = parseNumstat(numstat);
  const filesToExclude = files
    .filter(f => f.shouldIgnore || f.isBinary)
@@ -156,23 +240,37 @@ async function getFilteredDiff(staged: boolean): Promise<string> {
  // Build diff command with exclusions
  if (filesToExclude.length === 0) {
-    return await $`sh -c ${command}`.text();
+    return staged
      ? await $`git diff --staged`.text()
      : await $`git diff HEAD~1..HEAD`.text();
  }
-  // Git diff with pathspec exclusions
+  // Git diff with pathspec exclusions - construct as array to avoid shell quoting issues
  const excludeArgs = filesToExclude.map(f => `:(exclude)${f}`).join(' ');
  const fullCommand = `${command} -- . ${excludeArgs}`;
  try {
-    return await $`sh -c ${fullCommand}`.text();
+    const baseArgs = staged ? ['diff', '--staged'] : ['diff', 'HEAD~1..HEAD'];
    const args = [...baseArgs, '--', '.', ...filesToExclude.map(f => `:(exclude)${f}`)];
    // Use Bun.spawn to call git with proper argument handling
    const proc = Bun.spawn(['git', ...args], {
      cwd: process.cwd(),
      stdout: 'pipe',
      stderr: 'pipe',
    });
    const output = await new Response(proc.stdout).text();
    await proc.exited;
    return output;
  } catch {
-    // If exclusion fails, just return full diff
+    // If exclusion fails, return diff without exclusions
-    return await $`sh -c ${command}`.text();
+    return staged
      ? await $`git diff --staged`.text()
      : await $`git diff HEAD~1..HEAD`.text();
  }
 }
 /**
- * Truncate diff to fit within line budget
+ * Truncate diff with per-file character/line limits and smart culling
 */
 function truncateDiff(diff: string, maxLines: number, filesInfo: string): string {
  const lines = diff.split('\n');
@@ -181,9 +279,14 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
    return diff;
  }
-  // Try to include complete file diffs rather than cutting mid-file
+  // Parse into individual file diffs
-  const fileDiffs: Array<{ header: string; content: string; lineCount: number }> = [];
+  const fileDiffs: Array<{ 
-  let currentFile: { header: string; lines: string[] } | null = null;
+    header: string; 
    content: string; 
    lineCount: number;
    path: string;
  }> = [];
  let currentFile: { header: string; lines: string[]; path: string } | null = null;
  for (const line of lines) {
    if (line.startsWith('diff --git')) {
@@ -192,9 +295,13 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
          header: currentFile.header,
          content: currentFile.lines.join('\n'),
          lineCount: currentFile.lines.length,
          path: currentFile.path,
        });
      }
-      currentFile = { header: line, lines: [line] };
+      // Extract file path from "diff --git a/path b/path"
      const match = line.match(/diff --git a\/(.*?) b\//);
      const path = match ? match[1] : 'unknown';
      currentFile = { header: line, lines: [line], path };
    } else if (currentFile) {
      currentFile.lines.push(line);
    }
@@ -205,24 +312,62 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
      header: currentFile.header,
      content: currentFile.lines.join('\n'),
      lineCount: currentFile.lines.length,
      path: currentFile.path,
    });
  }
-  // Include files until we hit the limit
+  // Process each file with per-file limits and smart culling
-  let includedLines = 0;
+  let totalLines = 0;
  const includedDiffs: string[] = [];
-  const omittedFiles: string[] = [];
+  const omittedFiles: Array<{file: string, reason: string}> = [];
  for (const fileDiff of fileDiffs) {
-    if (includedLines + fileDiff.lineCount <= maxLines - 10) { // Reserve space for summary
+    // Check if binary file
-      includedDiffs.push(fileDiff.content);
+    if (isBinaryFile(fileDiff.path)) {
-      includedLines += fileDiff.lineCount;
+      const summary = formatBinarySummary(fileDiff.path);
-    } else {
+      includedDiffs.push(summary);
-      // Extract filename from diff header
+      totalLines += countLines(summary);
-      const match = fileDiff.header.match(/diff --git a\/(.*?) b\//);
+      continue;
      if (match) {
        omittedFiles.push(match[1]);
    }
    let content = fileDiff.content;
    const fileLines = fileDiff.lineCount;
    const fileChars = content.length;
    let truncationNotice = '';
    // Apply per-file safety limits
    const CHAR_THRESHOLD = 10000;
    const LINE_THRESHOLD = 1500;
    if (fileChars > CHAR_THRESHOLD || fileLines > LINE_THRESHOLD) {
      // File exceeded threshold - check if it should be culled
      if (shouldCullAggressively(fileDiff.path, content)) {
        // Check if it's a lockfile (special handling)
        if (LOCKFILE_PATTERNS.some(p => p.test(fileDiff.path))) {
          content = truncateToLines(content, 100);
          truncationNotice = `\n... (lockfile truncated - showing first 100 of ${fileLines} lines)`;
        } else {
          // Other noise - aggressive truncation
          content = truncateToLines(content, 30);
          truncationNotice = `\n... (generated/noisy file truncated - showing first 30 of ${fileLines} lines, ${formatBytes(fileChars)} total)`;
        }
      } else {
        // Legitimate large file - more generous truncation
        content = truncateToLines(content, 300);
        truncationNotice = `\n... (large file truncated - showing first 300 of ${fileLines} lines, ${formatBytes(fileChars)} total)`;
      }
    }
    // Check if it fits in global budget
    const contentLines = countLines(content);
    if (totalLines + contentLines <= maxLines - 10) { // Reserve space for summary
      includedDiffs.push(content + truncationNotice);
      totalLines += contentLines;
    } else {
      omittedFiles.push({
        file: fileDiff.path,
        reason: 'global line budget exceeded'
      });
    }
  }
@@ -231,7 +376,7 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
  if (omittedFiles.length > 0) {
    result += '\n\n---\n';
    result += `**Note:** ${omittedFiles.length} file(s) omitted due to output size limit:\n`;
-    result += omittedFiles.map(f => `  - ${f}`).join('\n');
+    result += omittedFiles.map(f => `  - ${f.file} (${f.reason})`).join('\n');
    result += '\n\n_Full changes visible in git status/stat output above._';
  }
@@ -239,7 +384,7 @@ function truncateDiff(diff: string, maxLines: number, filesInfo: string): string
 }
 /**
- * Get preview of new files being added
+ * Get preview of new files being added (with per-file and total limits)
 */
 async function getNewFilesPreviews(maxFiles: number = 5, maxLinesPerFile: number = 50): Promise<string> {
  try {
@@ -253,34 +398,60 @@ async function getNewFilesPreviews(maxFiles: number = 5, maxLinesPerFile: number
    const previews: string[] = [];
    const filesToShow = files.slice(0, maxFiles);
    let totalChars = 0;
    const MAX_TOTAL_CHARS = 30000;
    const MAX_CHARS_PER_FILE = 10000;
    for (const file of filesToShow) {
      // Skip binary files
-      if (BINARY_EXTENSIONS.some(ext => file.endsWith(ext))) {
+      if (isBinaryFile(file)) {
        previews.push(`=== ${file} ===\n(binary file)`);
        continue;
      }
      try {
        const content = await Bun.file(file).text();
-        const lines = content.split('\n').slice(0, maxLinesPerFile);
+        
-        const truncated = lines.length < content.split('\n').length 
+        // Apply per-file char limit FIRST (prevents single-line disasters)
-          ? `\n... (${content.split('\n').length - lines.length} more lines)`
+        if (content.length > MAX_CHARS_PER_FILE) {
          if (shouldCullAggressively(file, content)) {
            previews.push(`=== ${file} ===\n(generated/noisy file - preview omitted)\nSize: ${formatBytes(content.length)}`);
          } else {
            const truncated = content.slice(0, MAX_CHARS_PER_FILE);
            previews.push(`=== ${file} ===\n${truncated}\n... (truncated from ${formatBytes(content.length)})`);
          }
          continue;
        }
        // Apply line limit
        const lines = content.split('\n');
        const truncatedLines = lines.slice(0, maxLinesPerFile);
        const truncated = truncatedLines.join('\n');
        const notice = lines.length > maxLinesPerFile 
          ? `\n... (${lines.length - maxLinesPerFile} more lines)`
          : '';
-        previews.push(`=== ${file} ===\n${lines.join('\n')}${truncated}`);
+        const preview = `=== ${file} ===\n${truncated}${notice}`;
        // Check total budget
        if (totalChars + preview.length > MAX_TOTAL_CHARS) {
          const remaining = files.length - previews.length;
          previews.push(`\n... (${remaining} more file(s) omitted - preview size limit reached)`);
          break;
        }
        previews.push(preview);
        totalChars += preview.length;
      } catch {
        previews.push(`=== ${file} ===\n(unreadable)`);
      }
    }
-    let result = previews.join('\n\n');
+    if (files.length > maxFiles && previews[previews.length - 1]?.includes('omitted') === false) {
-    
+      previews.push(`\n... (${files.length - maxFiles} more new file(s) not shown)`);
    if (files.length > maxFiles) {
      result += `\n\n_... and ${files.length - maxFiles} more new file(s)_`;
    }
-    return result;
+    return previews.join('\n\n');
  } catch {
    return '';
  }
@@ -345,6 +516,15 @@ async function stagedContext(maxLines: number): Promise<string> {
  output += '## Recent Commit Style\n```\n' + recentCommits.trim() + '\n```\n';
  // Final safety: ensure total output doesn't exceed safe limit
  const MAX_TOTAL_OUTPUT = 150000; // 150K chars, leaves 50K headroom
  if (output.length > MAX_TOTAL_OUTPUT) {
    const lastNewline = output.slice(0, MAX_TOTAL_OUTPUT).lastIndexOf('\n');
    output = output.slice(0, lastNewline) + 
      '\n\n[OUTPUT TRUNCATED - Exceeds safe character limit for AI context]\n' +
      `(Shown ${lastNewline.toLocaleString()} of ${output.length.toLocaleString()} chars)`;
  }
  return output;
 }