perf: implement batch operations and optimize database indexes

Add batch upsert functionality to reduce database round-trips from N to 1 when inserting courses. Create comprehensive database indexes for common query patterns including term/subject lookups, time-series metrics, and job scheduling. Remove redundant indexes and add monitoring guidance for BRIN index effectiveness.
2026-01-30 22:23:32 -06:00 · 2025-11-03 11:18:42 -06:00
parent 51f8256e61
commit 1c6d2d4b6e
5 changed files with 240 additions and 38 deletions
@@ -0,0 +1,45 @@
+-- Performance optimization indexes
+
+-- Index for term-based queries (most common access pattern)
+CREATE INDEX IF NOT EXISTS idx_courses_term_code ON courses(term_code);
+
+-- Index for subject-based filtering
+CREATE INDEX IF NOT EXISTS idx_courses_subject ON courses(subject);
+
+-- Composite index for subject + term queries
+CREATE INDEX IF NOT EXISTS idx_courses_subject_term ON courses(subject, term_code);
+
+-- Index for course number lookups
+CREATE INDEX IF NOT EXISTS idx_courses_course_number ON courses(course_number);
+
+-- Index for last scraped timestamp (useful for finding stale data)
+CREATE INDEX IF NOT EXISTS idx_courses_last_scraped ON courses(last_scraped_at);
+
+-- Index for course metrics time-series queries
+-- BRIN index is optimal for time-series data
+CREATE INDEX IF NOT EXISTS idx_course_metrics_timestamp ON course_metrics USING BRIN(timestamp);
+
+-- B-tree index for specific course metric lookups
+CREATE INDEX IF NOT EXISTS idx_course_metrics_course_timestamp
+    ON course_metrics(course_id, timestamp DESC);
+
+-- Partial index for pending scrape jobs (only unlocked jobs)
+CREATE INDEX IF NOT EXISTS idx_scrape_jobs_pending
+    ON scrape_jobs(execute_at ASC)
+    WHERE locked_at IS NULL;
+
+-- Index for high-priority job processing
+CREATE INDEX IF NOT EXISTS idx_scrape_jobs_priority_pending
+    ON scrape_jobs(priority DESC, execute_at ASC)
+    WHERE locked_at IS NULL;
+
+-- Index for retry tracking
+CREATE INDEX IF NOT EXISTS idx_scrape_jobs_retry_count
+    ON scrape_jobs(retry_count)
+    WHERE retry_count > 0 AND locked_at IS NULL;
+
+-- Analyze tables to update statistics
+ANALYZE courses;
+ANALYZE course_metrics;
+ANALYZE course_audits;
+ANALYZE scrape_jobs;
@@ -0,0 +1,53 @@
+-- Index Optimization Follow-up Migration
+
+-- Reason: Redundant with composite index idx_courses_subject_term
+DROP INDEX IF EXISTS idx_courses_subject;
+
+-- Remove: idx_scrape_jobs_retry_count
+DROP INDEX IF EXISTS idx_scrape_jobs_retry_count;
+
+-- Purpose: Optimize the scheduler's frequent query (runs every 60 seconds)
+CREATE INDEX IF NOT EXISTS idx_scrape_jobs_scheduler_lookup
+    ON scrape_jobs(target_type, target_payload)
+    WHERE locked_at IS NULL;
+
+-- Note: We use (target_type, target_payload) instead of including locked_at
+-- in the index columns because:
+-- 1. The WHERE clause filters locked_at IS NULL (partial index optimization)
+-- 2. target_payload is JSONB and already large; keeping it as an indexed column
+--    allows PostgreSQL to use index-only scans for the SELECT target_payload query
+-- 3. This design minimizes index size while maximizing query performance
+
+
+-- Purpose: Enable efficient audit trail queries by course
+CREATE INDEX IF NOT EXISTS idx_course_audits_course_timestamp
+    ON course_audits(course_id, timestamp DESC);
+
+-- Purpose: Enable queries like "Show all changes in the last 24 hours"
+CREATE INDEX IF NOT EXISTS idx_course_audits_timestamp
+    ON course_audits(timestamp DESC);
+
+
+-- The BRIN index on course_metrics(timestamp) assumes data is inserted in
+-- chronological order. BRIN indexes are only effective when data is physically
+-- ordered on disk. If you perform:
+-- - Backfills of historical data
+-- - Out-of-order inserts
+-- - Frequent UPDATEs that move rows
+--
+-- Then the BRIN index effectiveness will degrade. Monitor with:
+--   SELECT * FROM brin_page_items(get_raw_page('idx_course_metrics_timestamp', 1));
+--
+-- If you see poor selectivity, consider:
+-- 1. REINDEX to rebuild after bulk loads
+-- 2. Switch to B-tree if inserts are not time-ordered
+-- 3. Use CLUSTER to physically reorder the table (requires downtime)
+
+COMMENT ON INDEX idx_course_metrics_timestamp IS
+    'BRIN index - requires chronologically ordered inserts for efficiency. Monitor selectivity.';
+
+-- Update statistics for query planner
+ANALYZE courses;
+ANALYZE course_metrics;
+ANALYZE course_audits;
+ANALYZE scrape_jobs;