feat: add scrape job result persistence for effectiveness tracking

2026-01-30 01:37:41 -06:00
parent 857ceabcca
commit 75a99c10ea
7 changed files with 242 additions and 33 deletions
@@ -0,0 +1,31 @@
-- Scrape job results log: one row per completed (or failed) job for effectiveness tracking.
CREATE TABLE scrape_job_results (
id BIGSERIAL PRIMARY KEY,
target_type target_type NOT NULL,
payload JSONB NOT NULL,
priority scrape_priority NOT NULL,
-- Timing
queued_at TIMESTAMPTZ NOT NULL,
started_at TIMESTAMPTZ NOT NULL,
completed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
duration_ms INT NOT NULL,
-- Outcome
success BOOLEAN NOT NULL,
error_message TEXT,
retry_count INT NOT NULL DEFAULT 0,
-- Effectiveness (NULL when success = false)
courses_fetched INT,
courses_changed INT,
courses_unchanged INT,
audits_generated INT,
metrics_generated INT
);
CREATE INDEX idx_scrape_job_results_target_time
ON scrape_job_results (target_type, completed_at);
CREATE INDEX idx_scrape_job_results_completed
ON scrape_job_results (completed_at);
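A read-side sketch of how this table could feed effectiveness reporting. The helper below is illustrative and not part of this commit; it assumes an sqlx PgPool, the columns defined above, and an arbitrary 7-day window (the completed_at filter is what idx_scrape_job_results_completed exists to serve).

use sqlx::PgPool;

/// Illustrative only: 7-day change rate per target type.
pub async fn change_rate_by_target(
    pool: &PgPool,
) -> Result<Vec<(String, i64, Option<f64>)>, sqlx::Error> {
    sqlx::query_as(
        r#"
        SELECT target_type::TEXT,
               COUNT(*),
               AVG(courses_changed::FLOAT8 / NULLIF(courses_fetched, 0))
        FROM scrape_job_results
        WHERE success
          AND completed_at > NOW() - INTERVAL '7 days'
        GROUP BY target_type
        "#,
    )
    .fetch_all(pool)
    .await
}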
+22 -7
@@ -1,11 +1,11 @@
//! Batch database operations for improved performance.
use crate::banner::Course;
use crate::data::models::DbMeetingTime;
use crate::data::models::{DbMeetingTime, UpsertCounts};
use crate::error::Result;
use sqlx::PgConnection;
use sqlx::PgPool;
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use std::time::Instant;
use tracing::info;
@@ -368,10 +368,10 @@ async fn insert_metrics(metrics: &[MetricEntry], conn: &mut PgConnection) -> Res
/// # Performance
/// - Reduces N database round-trips to 5 (old-data CTE + upsert, audits, metrics, instructors, junction)
/// - Typical usage: 50-200 courses per batch
pub async fn batch_upsert_courses(courses: &[Course], db_pool: &PgPool) -> Result<()> {
pub async fn batch_upsert_courses(courses: &[Course], db_pool: &PgPool) -> Result<UpsertCounts> {
if courses.is_empty() {
info!("No courses to upsert, skipping batch operation");
return Ok(());
return Ok(UpsertCounts::default());
}
let start = Instant::now();
@@ -388,6 +388,19 @@ pub async fn batch_upsert_courses(courses: &[Course], db_pool: &PgPool) -> Resul
// Step 3: Compute audit/metric diffs
let (audits, metrics) = compute_diffs(&diff_rows);
// Count courses that had at least one field change (existing rows only)
let changed_ids: HashSet<i32> = audits.iter().map(|a| a.course_id).collect();
let existing_count = diff_rows.iter().filter(|r| r.old_id.is_some()).count() as i32;
let courses_changed = changed_ids.len() as i32;
let counts = UpsertCounts {
courses_fetched: course_count as i32,
courses_changed,
courses_unchanged: existing_count - courses_changed,
audits_generated: audits.len() as i32,
metrics_generated: metrics.len() as i32,
};
// Step 4: Insert audits and metrics
insert_audits(&audits, &mut tx).await?;
insert_metrics(&metrics, &mut tx).await?;
@@ -403,13 +416,15 @@ pub async fn batch_upsert_courses(courses: &[Course], db_pool: &PgPool) -> Resul
let duration = start.elapsed();
info!(
courses_count = course_count,
audit_entries = audits.len(),
metric_entries = metrics.len(),
courses_changed = counts.courses_changed,
courses_unchanged = counts.courses_unchanged,
audit_entries = counts.audits_generated,
metric_entries = counts.metrics_generated,
duration_ms = duration.as_millis(),
"Batch upserted courses with instructors, audits, and metrics"
);
Ok(())
Ok(counts)
}
// ---------------------------------------------------------------------------
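For context, a minimal caller sketch (not part of this commit) showing the new return value in use; upsert_and_report is a hypothetical name, and the imports assume this crate's module layout as seen elsewhere in the diff.

use crate::banner::Course;
use crate::data::batch::batch_upsert_courses;
use crate::error::Result;
use sqlx::PgPool;

// Hypothetical caller: log the effectiveness counts after an upsert.
async fn upsert_and_report(courses: &[Course], pool: &PgPool) -> Result<()> {
    let counts = batch_upsert_courses(courses, pool).await?;
    tracing::info!(
        fetched = counts.courses_fetched,
        changed = counts.courses_changed,
        unchanged = counts.courses_unchanged,
        "batch upsert effectiveness"
    );
    Ok(())
}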
+10
@@ -159,6 +159,16 @@ pub struct CourseAudit {
pub new_value: String,
}
/// Aggregate counts returned by batch upsert, used for scrape job result logging.
#[derive(Debug, Clone, Default)]
pub struct UpsertCounts {
pub courses_fetched: i32,
pub courses_changed: i32,
pub courses_unchanged: i32,
pub audits_generated: i32,
pub metrics_generated: i32,
}
/// The priority level of a scrape job.
#[derive(sqlx::Type, Copy, Debug, Clone)]
#[sqlx(type_name = "scrape_priority", rename_all = "PascalCase")]
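One way to read UpsertCounts, inferred from the counting logic in batch_upsert_courses: courses_changed and courses_unchanged only cover rows that already existed, so the remainder of courses_fetched approximates newly inserted courses. A hypothetical helper (not part of the commit) under that assumption:

// Hypothetical helper: changed + unchanged accounts for pre-existing rows,
// so the remainder of courses_fetched estimates newly inserted courses.
pub fn estimated_new_courses(counts: &UpsertCounts) -> i32 {
    counts.courses_fetched - counts.courses_changed - counts.courses_unchanged
}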
+48 -1
@@ -1,7 +1,8 @@
//! Database operations for scrape job queue management.
use crate::data::models::{ScrapeJob, ScrapePriority, TargetType};
use crate::data::models::{ScrapeJob, ScrapePriority, TargetType, UpsertCounts};
use crate::error::Result;
use chrono::{DateTime, Utc};
use sqlx::PgPool;
use std::collections::HashSet;
@@ -166,6 +167,52 @@ pub async fn find_existing_job_payloads(
Ok(existing_payloads)
}
/// Insert a scrape job result log entry.
#[allow(clippy::too_many_arguments)]
pub async fn insert_job_result(
target_type: TargetType,
payload: serde_json::Value,
priority: ScrapePriority,
queued_at: DateTime<Utc>,
started_at: DateTime<Utc>,
duration_ms: i32,
success: bool,
error_message: Option<&str>,
retry_count: i32,
counts: Option<&UpsertCounts>,
db_pool: &PgPool,
) -> Result<()> {
sqlx::query(
r#"
INSERT INTO scrape_job_results (
target_type, payload, priority,
queued_at, started_at, duration_ms,
success, error_message, retry_count,
courses_fetched, courses_changed, courses_unchanged,
audits_generated, metrics_generated
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14)
"#,
)
.bind(target_type)
.bind(&payload)
.bind(priority)
.bind(queued_at)
.bind(started_at)
.bind(duration_ms)
.bind(success)
.bind(error_message)
.bind(retry_count)
.bind(counts.map(|c| c.courses_fetched))
.bind(counts.map(|c| c.courses_changed))
.bind(counts.map(|c| c.courses_unchanged))
.bind(counts.map(|c| c.audits_generated))
.bind(counts.map(|c| c.metrics_generated))
.execute(db_pool)
.await?;
Ok(())
}
/// Batch insert scrape jobs using UNNEST for a single round-trip.
///
/// All jobs are inserted with `execute_at` set to the current time.
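A companion read-side sketch for insert_job_result above (not part of this commit): fetching recent failures for one target type, the access pattern the (target_type, completed_at) index covers. The tuple shape, 24-hour window, and function name are illustrative.

use crate::data::models::TargetType;
use crate::error::Result;
use chrono::{DateTime, Utc};
use sqlx::PgPool;

/// Illustrative only: recent failed runs for a given target type.
pub async fn recent_failures(
    target_type: TargetType,
    db_pool: &PgPool,
) -> Result<Vec<(DateTime<Utc>, Option<String>, i32)>> {
    let rows = sqlx::query_as(
        r#"
        SELECT completed_at, error_message, retry_count
        FROM scrape_job_results
        WHERE target_type = $1
          AND success = FALSE
          AND completed_at > NOW() - INTERVAL '24 hours'
        ORDER BY completed_at DESC
        "#,
    )
    .bind(target_type)
    .fetch_all(db_pool)
    .await?;
    Ok(rows)
}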
+4 -3
@@ -1,7 +1,7 @@
pub mod subject;
use crate::banner::BannerApi;
use crate::data::models::TargetType;
use crate::data::models::{TargetType, UpsertCounts};
use crate::error::Result;
use serde::{Deserialize, Serialize};
use sqlx::PgPool;
@@ -32,8 +32,9 @@ pub trait Job: Send + Sync {
#[allow(dead_code)]
fn target_type(&self) -> TargetType;
/// Process the job with the given API client and database pool
async fn process(&self, banner_api: &BannerApi, db_pool: &PgPool) -> Result<()>;
/// Process the job with the given API client and database pool.
/// Returns upsert effectiveness counts on success.
async fn process(&self, banner_api: &BannerApi, db_pool: &PgPool) -> Result<UpsertCounts>;
/// Get a human-readable description of the job
fn description(&self) -> String;
+8 -6
@@ -1,7 +1,7 @@
use super::Job;
use crate::banner::{BannerApi, SearchQuery, Term};
use crate::data::batch::batch_upsert_courses;
use crate::data::models::TargetType;
use crate::data::models::{TargetType, UpsertCounts};
use crate::error::Result;
use serde::{Deserialize, Serialize};
use sqlx::PgPool;
@@ -26,7 +26,7 @@ impl Job for SubjectJob {
}
#[tracing::instrument(skip(self, banner_api, db_pool), fields(subject = %self.subject))]
async fn process(&self, banner_api: &BannerApi, db_pool: &PgPool) -> Result<()> {
async fn process(&self, banner_api: &BannerApi, db_pool: &PgPool) -> Result<UpsertCounts> {
let subject_code = &self.subject;
// Get the current term
@@ -37,17 +37,19 @@ impl Job for SubjectJob {
.search(&term, &query, "subjectDescription", false)
.await?;
if let Some(courses_from_api) = search_result.data {
let counts = if let Some(courses_from_api) = search_result.data {
info!(
subject = %subject_code,
count = courses_from_api.len(),
"Found courses"
);
batch_upsert_courses(&courses_from_api, db_pool).await?;
}
batch_upsert_courses(&courses_from_api, db_pool).await?
} else {
UpsertCounts::default()
};
debug!(subject = %subject_code, "Subject job completed");
Ok(())
Ok(counts)
}
fn description(&self) -> String {
+117 -14
@@ -1,10 +1,10 @@
use crate::banner::{BannerApi, BannerApiError};
use crate::data::models::{ScrapeJob, ScrapeJobStatus};
use crate::data::models::{ScrapeJob, ScrapeJobStatus, UpsertCounts};
use crate::data::scrape_jobs;
use crate::error::Result;
use crate::scraper::jobs::{JobError, JobType};
use crate::web::ws::ScrapeJobEvent;
use chrono::Utc;
use chrono::{DateTime, Utc};
use sqlx::PgPool;
use std::sync::Arc;
use std::time::Duration;
@@ -72,10 +72,15 @@ impl Worker {
let job_id = job.id;
let retry_count = job.retry_count;
let max_retries = job.max_retries;
let target_type = job.target_type;
let payload = job.target_payload.clone();
let priority = job.priority;
let queued_at = job.queued_at;
let started_at = Utc::now();
let start = std::time::Instant::now();
// Emit JobLocked event
let locked_at = Utc::now().to_rfc3339();
let locked_at = started_at.to_rfc3339();
debug!(job_id, "Emitting JobLocked event");
let _ = self.job_events_tx.send(ScrapeJobEvent::JobLocked {
id: job_id,
@@ -105,7 +110,18 @@ impl Worker {
let duration = start.elapsed();
// Handle the job processing result
self.handle_job_result(job_id, retry_count, max_retries, process_result, duration)
self.handle_job_result(
job_id,
retry_count,
max_retries,
process_result,
duration,
target_type,
payload,
priority,
queued_at,
started_at,
)
.await;
}
}
@@ -118,7 +134,7 @@ impl Worker {
scrape_jobs::fetch_and_lock_job(&self.db_pool).await
}
async fn process_job(&self, job: ScrapeJob) -> Result<(), JobError> {
async fn process_job(&self, job: ScrapeJob) -> Result<UpsertCounts, JobError> {
// Convert the database job to our job type
let job_type = JobType::from_target_type_and_payload(job.target_type, job.target_payload)
.map_err(|e| JobError::Unrecoverable(anyhow::anyhow!(e)))?; // Parse errors are unrecoverable
@@ -145,9 +161,7 @@ impl Worker {
job_impl
.process(&self.banner_api, &self.db_pool)
.await
.map_err(JobError::Recoverable)?;
Ok(())
.map_err(JobError::Recoverable)
}
.instrument(span)
.await
@@ -191,22 +205,53 @@ impl Worker {
}
/// Handle the result of job processing
#[allow(clippy::too_many_arguments)]
async fn handle_job_result(
&self,
job_id: i32,
retry_count: i32,
max_retries: i32,
result: Result<(), JobError>,
result: Result<UpsertCounts, JobError>,
duration: std::time::Duration,
target_type: crate::data::models::TargetType,
payload: serde_json::Value,
priority: crate::data::models::ScrapePriority,
queued_at: DateTime<Utc>,
started_at: DateTime<Utc>,
) {
let duration_ms = duration.as_millis() as i32;
match result {
Ok(()) => {
Ok(counts) => {
debug!(
worker_id = self.id,
job_id,
duration_ms = duration.as_millis(),
courses_fetched = counts.courses_fetched,
courses_changed = counts.courses_changed,
courses_unchanged = counts.courses_unchanged,
"Job completed successfully"
);
// Log the result
if let Err(e) = scrape_jobs::insert_job_result(
target_type,
payload,
priority,
queued_at,
started_at,
duration_ms,
true,
None,
retry_count,
Some(&counts),
&self.db_pool,
)
.await
{
error!(worker_id = self.id, job_id, error = ?e, "Failed to insert job result");
}
if let Err(e) = self.delete_job(job_id).await {
error!(worker_id = self.id, job_id, error = ?e, "Failed to delete completed job");
}
@@ -216,10 +261,41 @@ impl Worker {
.send(ScrapeJobEvent::JobCompleted { id: job_id });
}
Err(JobError::Recoverable(e)) => {
self.handle_recoverable_error(job_id, retry_count, max_retries, e, duration)
self.handle_recoverable_error(
job_id,
retry_count,
max_retries,
e,
duration,
target_type,
payload,
priority,
queued_at,
started_at,
)
.await;
}
Err(JobError::Unrecoverable(e)) => {
// Log the failed result
let err_msg = format!("{e:#}");
if let Err(log_err) = scrape_jobs::insert_job_result(
target_type,
payload,
priority,
queued_at,
started_at,
duration_ms,
false,
Some(&err_msg),
retry_count,
None,
&self.db_pool,
)
.await
{
error!(worker_id = self.id, job_id, error = ?log_err, "Failed to insert job result");
}
error!(
worker_id = self.id,
job_id,
@@ -239,6 +315,7 @@ impl Worker {
}
/// Handle recoverable errors by logging appropriately and unlocking the job
#[allow(clippy::too_many_arguments)]
async fn handle_recoverable_error(
&self,
job_id: i32,
@@ -246,6 +323,11 @@ impl Worker {
max_retries: i32,
e: anyhow::Error,
duration: std::time::Duration,
target_type: crate::data::models::TargetType,
payload: serde_json::Value,
priority: crate::data::models::ScrapePriority,
queued_at: DateTime<Utc>,
started_at: DateTime<Utc>,
) {
let next_attempt = retry_count.saturating_add(1);
let remaining_retries = max_retries.saturating_sub(next_attempt);
@@ -276,7 +358,7 @@ impl Worker {
// Atomically unlock and increment retry count, checking if retry is allowed
match self.unlock_and_increment_retry(job_id, max_retries).await {
Ok(Some(queued_at)) => {
Ok(Some(new_queued_at)) => {
debug!(
worker_id = self.id,
job_id,
@@ -288,12 +370,33 @@ impl Worker {
let _ = self.job_events_tx.send(ScrapeJobEvent::JobRetried {
id: job_id,
retry_count: next_attempt,
queued_at: queued_at.to_rfc3339(),
queued_at: new_queued_at.to_rfc3339(),
status: ScrapeJobStatus::Pending,
});
// Don't log a result yet — the job will be retried
}
Ok(None) => {
// Max retries exceeded (detected atomically)
// Max retries exceeded — log final failure result
let duration_ms = duration.as_millis() as i32;
let err_msg = format!("{e:#}");
if let Err(log_err) = scrape_jobs::insert_job_result(
target_type,
payload,
priority,
queued_at,
started_at,
duration_ms,
false,
Some(&err_msg),
next_attempt,
None,
&self.db_pool,
)
.await
{
error!(worker_id = self.id, job_id, error = ?log_err, "Failed to insert job result");
}
error!(
worker_id = self.id,
job_id,