pub mod jobs; pub mod scheduler; pub mod worker; use crate::banner::BannerApi; use crate::data::scrape_jobs; use crate::services::Service; use crate::state::ReferenceCache; use crate::status::{ServiceStatus, ServiceStatusRegistry}; use crate::web::ws::ScrapeJobEvent; use sqlx::PgPool; use std::sync::Arc; use tokio::sync::{RwLock, broadcast}; use tokio::task::JoinHandle; use tracing::{info, warn}; use self::scheduler::Scheduler; use self::worker::Worker; /// The main service that will be managed by the application's `ServiceManager`. /// /// It holds the shared resources (database pool, API client) and manages the /// lifecycle of the Scheduler and Worker tasks. pub struct ScraperService { db_pool: PgPool, banner_api: Arc, reference_cache: Arc>, service_statuses: ServiceStatusRegistry, job_events_tx: broadcast::Sender, scheduler_handle: Option>, worker_handles: Vec>, shutdown_tx: Option>, } impl ScraperService { /// Creates a new `ScraperService`. pub fn new( db_pool: PgPool, banner_api: Arc, reference_cache: Arc>, service_statuses: ServiceStatusRegistry, job_events_tx: broadcast::Sender, ) -> Self { Self { db_pool, banner_api, reference_cache, service_statuses, job_events_tx, scheduler_handle: None, worker_handles: Vec::new(), shutdown_tx: None, } } /// Starts the scheduler and a pool of workers. /// /// Force-unlocks any jobs left locked by a previous unclean shutdown before /// spawning workers, so those jobs re-enter the queue immediately. pub async fn start(&mut self) { // Recover jobs left locked by a previous crash/unclean shutdown match scrape_jobs::force_unlock_all(&self.db_pool).await { Ok(0) => {} Ok(count) => warn!(count, "Force-unlocked stale jobs from previous run"), Err(e) => warn!(error = ?e, "Failed to force-unlock stale jobs"), } info!("ScraperService starting"); // Create shutdown channel let (shutdown_tx, _) = broadcast::channel(1); self.shutdown_tx = Some(shutdown_tx.clone()); let scheduler = Scheduler::new( self.db_pool.clone(), self.banner_api.clone(), self.reference_cache.clone(), self.job_events_tx.clone(), ); let shutdown_rx = shutdown_tx.subscribe(); let scheduler_handle = tokio::spawn(async move { scheduler.run(shutdown_rx).await; }); self.scheduler_handle = Some(scheduler_handle); info!("Scheduler task spawned"); let worker_count = 4; // This could be configurable for i in 0..worker_count { let worker = Worker::new( i, self.db_pool.clone(), self.banner_api.clone(), self.job_events_tx.clone(), ); let shutdown_rx = shutdown_tx.subscribe(); let worker_handle = tokio::spawn(async move { worker.run(shutdown_rx).await; }); self.worker_handles.push(worker_handle); } info!( worker_count = self.worker_handles.len(), "Spawned worker tasks" ); self.service_statuses.set("scraper", ServiceStatus::Active); } } #[async_trait::async_trait] impl Service for ScraperService { fn name(&self) -> &'static str { "scraper" } async fn run(&mut self) -> Result<(), anyhow::Error> { self.start().await; std::future::pending::<()>().await; Ok(()) } async fn shutdown(&mut self) -> Result<(), anyhow::Error> { self.service_statuses .set("scraper", ServiceStatus::Disabled); info!("Shutting down scraper service"); // Send shutdown signal to all tasks if let Some(shutdown_tx) = self.shutdown_tx.take() { let _ = shutdown_tx.send(()); } else { warn!("No shutdown channel found for scraper service"); return Err(anyhow::anyhow!("No shutdown channel available")); } // Collect all handles let mut all_handles = Vec::new(); if let Some(handle) = self.scheduler_handle.take() { all_handles.push(handle); } all_handles.append(&mut self.worker_handles); // Wait for all tasks to complete (no internal timeout - let ServiceManager handle it) let results = futures::future::join_all(all_handles).await; let failed = results.iter().filter(|r| r.is_err()).count(); if failed > 0 { warn!( failed_count = failed, "Some scraper tasks panicked during shutdown" ); return Err(anyhow::anyhow!("{} task(s) panicked", failed)); } info!("All scraper tasks shutdown gracefully"); Ok(()) } }