Files
banner/src/scraper/mod.rs

156 lines
5.1 KiB
Rust

pub mod jobs;
pub mod scheduler;
pub mod worker;
use crate::banner::BannerApi;
use crate::data::scrape_jobs;
use crate::services::Service;
use crate::state::ReferenceCache;
use crate::status::{ServiceStatus, ServiceStatusRegistry};
use crate::web::ws::ScrapeJobEvent;
use sqlx::PgPool;
use std::sync::Arc;
use tokio::sync::{RwLock, broadcast};
use tokio::task::JoinHandle;
use tracing::{info, warn};
use self::scheduler::Scheduler;
use self::worker::Worker;
/// The main service that will be managed by the application's `ServiceManager`.
///
/// It holds the shared resources (database pool, API client) and manages the
/// lifecycle of the Scheduler and Worker tasks.
pub struct ScraperService {
db_pool: PgPool,
banner_api: Arc<BannerApi>,
reference_cache: Arc<RwLock<ReferenceCache>>,
service_statuses: ServiceStatusRegistry,
job_events_tx: broadcast::Sender<ScrapeJobEvent>,
scheduler_handle: Option<JoinHandle<()>>,
worker_handles: Vec<JoinHandle<()>>,
shutdown_tx: Option<broadcast::Sender<()>>,
}
impl ScraperService {
/// Creates a new `ScraperService`.
pub fn new(
db_pool: PgPool,
banner_api: Arc<BannerApi>,
reference_cache: Arc<RwLock<ReferenceCache>>,
service_statuses: ServiceStatusRegistry,
job_events_tx: broadcast::Sender<ScrapeJobEvent>,
) -> Self {
Self {
db_pool,
banner_api,
reference_cache,
service_statuses,
job_events_tx,
scheduler_handle: None,
worker_handles: Vec::new(),
shutdown_tx: None,
}
}
/// Starts the scheduler and a pool of workers.
///
/// Force-unlocks any jobs left locked by a previous unclean shutdown before
/// spawning workers, so those jobs re-enter the queue immediately.
pub async fn start(&mut self) {
// Recover jobs left locked by a previous crash/unclean shutdown
match scrape_jobs::force_unlock_all(&self.db_pool).await {
Ok(0) => {}
Ok(count) => warn!(count, "Force-unlocked stale jobs from previous run"),
Err(e) => warn!(error = ?e, "Failed to force-unlock stale jobs"),
}
info!("ScraperService starting");
// Create shutdown channel
let (shutdown_tx, _) = broadcast::channel(1);
self.shutdown_tx = Some(shutdown_tx.clone());
let scheduler = Scheduler::new(
self.db_pool.clone(),
self.banner_api.clone(),
self.reference_cache.clone(),
self.job_events_tx.clone(),
);
let shutdown_rx = shutdown_tx.subscribe();
let scheduler_handle = tokio::spawn(async move {
scheduler.run(shutdown_rx).await;
});
self.scheduler_handle = Some(scheduler_handle);
info!("Scheduler task spawned");
let worker_count = 4; // This could be configurable
for i in 0..worker_count {
let worker = Worker::new(
i,
self.db_pool.clone(),
self.banner_api.clone(),
self.job_events_tx.clone(),
);
let shutdown_rx = shutdown_tx.subscribe();
let worker_handle = tokio::spawn(async move {
worker.run(shutdown_rx).await;
});
self.worker_handles.push(worker_handle);
}
info!(
worker_count = self.worker_handles.len(),
"Spawned worker tasks"
);
self.service_statuses.set("scraper", ServiceStatus::Active);
}
}
#[async_trait::async_trait]
impl Service for ScraperService {
fn name(&self) -> &'static str {
"scraper"
}
async fn run(&mut self) -> Result<(), anyhow::Error> {
self.start().await;
std::future::pending::<()>().await;
Ok(())
}
async fn shutdown(&mut self) -> Result<(), anyhow::Error> {
self.service_statuses
.set("scraper", ServiceStatus::Disabled);
info!("Shutting down scraper service");
// Send shutdown signal to all tasks
if let Some(shutdown_tx) = self.shutdown_tx.take() {
let _ = shutdown_tx.send(());
} else {
warn!("No shutdown channel found for scraper service");
return Err(anyhow::anyhow!("No shutdown channel available"));
}
// Collect all handles
let mut all_handles = Vec::new();
if let Some(handle) = self.scheduler_handle.take() {
all_handles.push(handle);
}
all_handles.append(&mut self.worker_handles);
// Wait for all tasks to complete (no internal timeout - let ServiceManager handle it)
let results = futures::future::join_all(all_handles).await;
let failed = results.iter().filter(|r| r.is_err()).count();
if failed > 0 {
warn!(
failed_count = failed,
"Some scraper tasks panicked during shutdown"
);
return Err(anyhow::anyhow!("{} task(s) panicked", failed));
}
info!("All scraper tasks shutdown gracefully");
Ok(())
}
}