Mirror of https://github.com/Xevion/banner.git (synced 2026-01-31 12:23:33 -06:00)
feat: scraper system
@@ -0,0 +1,85 @@
use crate::banner::{BannerApi, Term};
use crate::data::models::{ScrapePriority, TargetType};
use crate::error::Result;
use serde_json::json;
use sqlx::PgPool;
use std::sync::Arc;
use std::time::Duration;
use tokio::time;
use tracing::{error, info};

/// Periodically analyzes data and enqueues prioritized scrape jobs.
pub struct Scheduler {
    db_pool: PgPool,
    banner_api: Arc<BannerApi>,
}

impl Scheduler {
    pub fn new(db_pool: PgPool, banner_api: Arc<BannerApi>) -> Self {
        Self {
            db_pool,
            banner_api,
        }
    }

    /// Runs the scheduler's main loop.
    pub async fn run(&self) {
        info!("Scheduler service started.");
        let mut interval = time::interval(Duration::from_secs(60)); // Runs every minute

        loop {
            interval.tick().await;
            info!("Scheduler waking up to analyze and schedule jobs...");
            if let Err(e) = self.schedule_jobs().await {
                error!(error = ?e, "Failed to schedule jobs");
            }
        }
    }

    /// The core logic for deciding what jobs to create.
    async fn schedule_jobs(&self) -> Result<()> {
        // For now, we will implement a simple baseline scheduling strategy:
        // 1. Get a list of all subjects from the Banner API.
        // 2. For each subject, check if an active (not locked, not completed) job already exists.
        // 3. If no job exists, create a new, low-priority job to be executed in the near future.
        let term = Term::get_current().inner().to_string();

        info!(
            term = term,
            "[Scheduler] Enqueuing baseline subject scrape jobs..."
        );

        let subjects = self.banner_api.get_subjects("", &term, 1, 500).await?;

        for subject in subjects {
            let payload = json!({ "subject": subject.code });

            let existing_job: Option<(i32,)> = sqlx::query_as(
                "SELECT id FROM scrape_jobs WHERE target_type = $1 AND target_payload = $2 AND locked_at IS NULL"
            )
            .bind(TargetType::Subject)
            .bind(&payload)
            .fetch_optional(&self.db_pool)
            .await?;

            if existing_job.is_some() {
                continue;
            }

            sqlx::query(
                "INSERT INTO scrape_jobs (target_type, target_payload, priority, execute_at) VALUES ($1, $2, $3, $4)"
            )
            .bind(TargetType::Subject)
            .bind(&payload)
            .bind(ScrapePriority::Low)
            .bind(chrono::Utc::now())
            .execute(&self.db_pool)
            .await?;

            info!(subject = subject.code, "[Scheduler] Enqueued new job");
        }

        info!("[Scheduler] Job scheduling complete.");
        Ok(())
    }
}
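For context, a minimal sketch of how this scheduler might be wired up at service startup. The module paths, the BannerApi::new() constructor, and the DATABASE_URL environment variable are assumptions for illustration and are not taken from this commit; the idea is simply that run() is spawned as a background task so its 60-second tick loop runs alongside the rest of the application.

use std::sync::Arc;
use sqlx::PgPool;
// Assumed module paths; the crate's actual layout may differ.
use crate::banner::BannerApi;
use crate::scraper::Scheduler;

#[tokio::main]
async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
    // Emit tracing output (info!/error! from the scheduler) to stdout.
    tracing_subscriber::fmt::init();

    // Assumed: DATABASE_URL points at the Postgres database holding scrape_jobs.
    let db_pool = PgPool::connect(&std::env::var("DATABASE_URL")?).await?;

    // Assumed constructor; the real BannerApi setup may take configuration.
    let banner_api = Arc::new(BannerApi::new());

    let scheduler = Scheduler::new(db_pool, banner_api);

    // Run the scheduler loop in the background so other services
    // (e.g. a worker that consumes scrape_jobs) can run alongside it.
    tokio::spawn(async move { scheduler.run().await });

    // Keep the process alive until interrupted.
    tokio::signal::ctrl_c().await?;
    Ok(())
}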