mirror of
https://github.com/Xevion/banner.git
synced 2026-01-31 02:23:34 -06:00
refactor(scraper): implement graceful shutdown with broadcast channels
Replace task abortion with broadcast-based graceful shutdown for scheduler and workers. Implement cancellation tokens for in-progress work with 5s timeout. Add tokio-util dependency for CancellationToken support. Update ServiceManager to use completion channels and abort handles for better service lifecycle control.
This commit is contained in:
+54
-15
@@ -5,8 +5,10 @@ pub mod worker;
|
||||
use crate::banner::BannerApi;
|
||||
use sqlx::PgPool;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::broadcast;
|
||||
use tokio::task::JoinHandle;
|
||||
use tracing::info;
|
||||
use tracing::{info, warn};
|
||||
|
||||
use self::scheduler::Scheduler;
|
||||
use self::worker::Worker;
|
||||
@@ -21,6 +23,7 @@ pub struct ScraperService {
|
||||
banner_api: Arc<BannerApi>,
|
||||
scheduler_handle: Option<JoinHandle<()>>,
|
||||
worker_handles: Vec<JoinHandle<()>>,
|
||||
shutdown_tx: Option<broadcast::Sender<()>>,
|
||||
}
|
||||
|
||||
impl ScraperService {
|
||||
@@ -31,6 +34,7 @@ impl ScraperService {
|
||||
banner_api,
|
||||
scheduler_handle: None,
|
||||
worker_handles: Vec::new(),
|
||||
shutdown_tx: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -38,9 +42,14 @@ impl ScraperService {
|
||||
pub fn start(&mut self) {
|
||||
info!("ScraperService starting");
|
||||
|
||||
// Create shutdown channel
|
||||
let (shutdown_tx, _) = broadcast::channel(1);
|
||||
self.shutdown_tx = Some(shutdown_tx.clone());
|
||||
|
||||
let scheduler = Scheduler::new(self.db_pool.clone(), self.banner_api.clone());
|
||||
let shutdown_rx = shutdown_tx.subscribe();
|
||||
let scheduler_handle = tokio::spawn(async move {
|
||||
scheduler.run().await;
|
||||
scheduler.run(shutdown_rx).await;
|
||||
});
|
||||
self.scheduler_handle = Some(scheduler_handle);
|
||||
info!("Scheduler task spawned");
|
||||
@@ -48,8 +57,9 @@ impl ScraperService {
|
||||
let worker_count = 4; // This could be configurable
|
||||
for i in 0..worker_count {
|
||||
let worker = Worker::new(i, self.db_pool.clone(), self.banner_api.clone());
|
||||
let shutdown_rx = shutdown_tx.subscribe();
|
||||
let worker_handle = tokio::spawn(async move {
|
||||
worker.run().await;
|
||||
worker.run(shutdown_rx).await;
|
||||
});
|
||||
self.worker_handles.push(worker_handle);
|
||||
}
|
||||
@@ -59,17 +69,6 @@ impl ScraperService {
|
||||
);
|
||||
}
|
||||
|
||||
/// Signals all child tasks to gracefully shut down.
|
||||
pub async fn shutdown(&mut self) {
|
||||
info!("Shutting down scraper service");
|
||||
if let Some(handle) = self.scheduler_handle.take() {
|
||||
handle.abort();
|
||||
}
|
||||
for handle in self.worker_handles.drain(..) {
|
||||
handle.abort();
|
||||
}
|
||||
info!("Scraper service shutdown");
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
@@ -85,7 +84,47 @@ impl Service for ScraperService {
|
||||
}
|
||||
|
||||
async fn shutdown(&mut self) -> Result<(), anyhow::Error> {
|
||||
self.shutdown().await;
|
||||
info!("Shutting down scraper service");
|
||||
|
||||
// Send shutdown signal to all tasks
|
||||
if let Some(shutdown_tx) = self.shutdown_tx.take() {
|
||||
let _ = shutdown_tx.send(());
|
||||
} else {
|
||||
warn!("No shutdown channel found for scraper service");
|
||||
}
|
||||
|
||||
// Collect all handles
|
||||
let mut all_handles = Vec::new();
|
||||
if let Some(handle) = self.scheduler_handle.take() {
|
||||
all_handles.push(handle);
|
||||
}
|
||||
all_handles.append(&mut self.worker_handles);
|
||||
|
||||
// Wait for all tasks to complete with a timeout
|
||||
let timeout_duration = Duration::from_secs(5);
|
||||
|
||||
match tokio::time::timeout(
|
||||
timeout_duration,
|
||||
futures::future::join_all(all_handles),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(results) => {
|
||||
let failed = results.iter().filter(|r| r.is_err()).count();
|
||||
if failed > 0 {
|
||||
warn!(failed_count = failed, "Some scraper tasks failed during shutdown");
|
||||
} else {
|
||||
info!("All scraper tasks shutdown gracefully");
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
warn!(
|
||||
timeout = format!("{:.2?}", timeout_duration),
|
||||
"Scraper service shutdown timed out"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
+65
-12
@@ -6,8 +6,10 @@ use serde_json::json;
|
||||
use sqlx::PgPool;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::broadcast;
|
||||
use tokio::time;
|
||||
use tracing::{debug, error, info, trace};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
|
||||
/// Periodically analyzes data and enqueues prioritized scrape jobs.
|
||||
pub struct Scheduler {
|
||||
@@ -24,21 +26,72 @@ impl Scheduler {
|
||||
}
|
||||
|
||||
/// Runs the scheduler's main loop.
|
||||
pub async fn run(&self) {
|
||||
pub async fn run(&self, mut shutdown_rx: broadcast::Receiver<()>) {
|
||||
info!("Scheduler service started");
|
||||
let mut interval = time::interval(Duration::from_secs(60)); // Runs every minute
|
||||
|
||||
let work_interval = Duration::from_secs(60);
|
||||
let mut next_run = time::Instant::now();
|
||||
let mut current_work: Option<(tokio::task::JoinHandle<()>, CancellationToken)> = None;
|
||||
|
||||
loop {
|
||||
interval.tick().await;
|
||||
// Scheduler analyzing data...
|
||||
if let Err(e) = self.schedule_jobs().await {
|
||||
error!(error = ?e, "Failed to schedule jobs");
|
||||
tokio::select! {
|
||||
// Sleep until next scheduled run - instantly cancellable
|
||||
_ = time::sleep_until(next_run) => {
|
||||
// Create cancellation token for graceful task cancellation
|
||||
let cancel_token = CancellationToken::new();
|
||||
|
||||
// Spawn scheduling work in a separate task for cancellability
|
||||
let work_handle = tokio::spawn({
|
||||
let db_pool = self.db_pool.clone();
|
||||
let banner_api = self.banner_api.clone();
|
||||
let cancel_token = cancel_token.clone();
|
||||
|
||||
async move {
|
||||
// Check for cancellation while running
|
||||
tokio::select! {
|
||||
result = Self::schedule_jobs_impl(&db_pool, &banner_api) => {
|
||||
if let Err(e) = result {
|
||||
error!(error = ?e, "Failed to schedule jobs");
|
||||
}
|
||||
}
|
||||
_ = cancel_token.cancelled() => {
|
||||
debug!("Scheduling work cancelled gracefully");
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
current_work = Some((work_handle, cancel_token));
|
||||
next_run = time::Instant::now() + work_interval;
|
||||
}
|
||||
_ = shutdown_rx.recv() => {
|
||||
info!("Scheduler received shutdown signal");
|
||||
|
||||
// Gracefully cancel any in-progress work
|
||||
if let Some((handle, cancel_token)) = current_work.take() {
|
||||
// Signal cancellation
|
||||
cancel_token.cancel();
|
||||
|
||||
// Wait for graceful completion with timeout
|
||||
match time::timeout(Duration::from_secs(5), handle).await {
|
||||
Ok(_) => {
|
||||
debug!("Scheduling work completed gracefully");
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Scheduling work did not complete within 5s timeout, may have been aborted");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!("Scheduler exiting gracefully");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The core logic for deciding what jobs to create.
|
||||
async fn schedule_jobs(&self) -> Result<()> {
|
||||
async fn schedule_jobs_impl(db_pool: &PgPool, banner_api: &BannerApi) -> Result<()> {
|
||||
// For now, we will implement a simple baseline scheduling strategy:
|
||||
// 1. Get a list of all subjects from the Banner API.
|
||||
// 2. Query existing jobs for all subjects in a single query.
|
||||
@@ -47,7 +100,7 @@ impl Scheduler {
|
||||
|
||||
debug!(term = term, "Enqueuing subject jobs");
|
||||
|
||||
let subjects = self.banner_api.get_subjects("", &term, 1, 500).await?;
|
||||
let subjects = banner_api.get_subjects("", &term, 1, 500).await?;
|
||||
debug!(
|
||||
subject_count = subjects.len(),
|
||||
"Retrieved subjects from API"
|
||||
@@ -61,12 +114,12 @@ impl Scheduler {
|
||||
|
||||
// Query existing jobs for all subjects in a single query
|
||||
let existing_jobs: Vec<(serde_json::Value,)> = sqlx::query_as(
|
||||
"SELECT target_payload FROM scrape_jobs
|
||||
"SELECT target_payload FROM scrape_jobs
|
||||
WHERE target_type = $1 AND target_payload = ANY($2) AND locked_at IS NULL",
|
||||
)
|
||||
.bind(TargetType::Subject)
|
||||
.bind(&subject_payloads)
|
||||
.fetch_all(&self.db_pool)
|
||||
.fetch_all(db_pool)
|
||||
.await?;
|
||||
|
||||
// Convert to a HashSet for efficient lookup
|
||||
@@ -95,7 +148,7 @@ impl Scheduler {
|
||||
// Insert all new jobs in a single batch
|
||||
if !new_jobs.is_empty() {
|
||||
let now = chrono::Utc::now();
|
||||
let mut tx = self.db_pool.begin().await?;
|
||||
let mut tx = db_pool.begin().await?;
|
||||
|
||||
for (payload, subject_code) in new_jobs {
|
||||
sqlx::query(
|
||||
|
||||
+106
-64
@@ -5,6 +5,7 @@ use crate::scraper::jobs::{JobError, JobType};
|
||||
use sqlx::PgPool;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::broadcast;
|
||||
use tokio::time;
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
|
||||
@@ -28,77 +29,97 @@ impl Worker {
|
||||
}
|
||||
|
||||
/// Runs the worker's main loop.
|
||||
pub async fn run(&self) {
|
||||
pub async fn run(&self, mut shutdown_rx: broadcast::Receiver<()>) {
|
||||
info!(worker_id = self.id, "Worker started.");
|
||||
loop {
|
||||
match self.fetch_and_lock_job().await {
|
||||
Ok(Some(job)) => {
|
||||
let job_id = job.id;
|
||||
debug!(worker_id = self.id, job_id = job.id, "Processing job");
|
||||
match self.process_job(job).await {
|
||||
Ok(()) => {
|
||||
debug!(worker_id = self.id, job_id, "Job completed");
|
||||
// If successful, delete the job.
|
||||
if let Err(delete_err) = self.delete_job(job_id).await {
|
||||
error!(
|
||||
worker_id = self.id,
|
||||
job_id,
|
||||
?delete_err,
|
||||
"Failed to delete job"
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(JobError::Recoverable(e)) => {
|
||||
// Check if the error is due to an invalid session
|
||||
if let Some(BannerApiError::InvalidSession(_)) =
|
||||
e.downcast_ref::<BannerApiError>()
|
||||
{
|
||||
warn!(
|
||||
worker_id = self.id,
|
||||
job_id, "Invalid session detected. Forcing session refresh."
|
||||
);
|
||||
} else {
|
||||
error!(worker_id = self.id, job_id, error = ?e, "Failed to process job");
|
||||
}
|
||||
|
||||
// Unlock the job so it can be retried
|
||||
if let Err(unlock_err) = self.unlock_job(job_id).await {
|
||||
error!(
|
||||
worker_id = self.id,
|
||||
job_id,
|
||||
?unlock_err,
|
||||
"Failed to unlock job"
|
||||
);
|
||||
}
|
||||
loop {
|
||||
// Fetch and lock a job, racing against shutdown signal
|
||||
let job = tokio::select! {
|
||||
_ = shutdown_rx.recv() => {
|
||||
info!(worker_id = self.id, "Worker received shutdown signal");
|
||||
info!(worker_id = self.id, "Worker exiting gracefully");
|
||||
break;
|
||||
}
|
||||
result = self.fetch_and_lock_job() => {
|
||||
match result {
|
||||
Ok(Some(job)) => job,
|
||||
Ok(None) => {
|
||||
// No job found, wait for a bit before polling again
|
||||
trace!(worker_id = self.id, "No jobs available, waiting");
|
||||
time::sleep(Duration::from_secs(5)).await;
|
||||
continue;
|
||||
}
|
||||
Err(JobError::Unrecoverable(e)) => {
|
||||
error!(
|
||||
worker_id = self.id,
|
||||
job_id,
|
||||
error = ?e,
|
||||
"Job corrupted, deleting"
|
||||
);
|
||||
// Parse errors are unrecoverable - delete the job
|
||||
if let Err(delete_err) = self.delete_job(job_id).await {
|
||||
error!(
|
||||
worker_id = self.id,
|
||||
job_id,
|
||||
?delete_err,
|
||||
"Failed to delete corrupted job"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(worker_id = self.id, error = ?e, "Failed to fetch job");
|
||||
// Wait before retrying to avoid spamming errors
|
||||
time::sleep(Duration::from_secs(10)).await;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(None) => {
|
||||
// No job found, wait for a bit before polling again.
|
||||
trace!(worker_id = self.id, "No jobs available, waiting");
|
||||
time::sleep(Duration::from_secs(5)).await;
|
||||
};
|
||||
|
||||
let job_id = job.id;
|
||||
debug!(worker_id = self.id, job_id, "Processing job");
|
||||
|
||||
// Process the job, racing against shutdown signal
|
||||
let process_result = tokio::select! {
|
||||
_ = shutdown_rx.recv() => {
|
||||
info!(worker_id = self.id, job_id, "Shutdown received during job processing");
|
||||
|
||||
// Unlock the job so it can be retried
|
||||
if let Err(e) = self.unlock_job(job_id).await {
|
||||
warn!(
|
||||
worker_id = self.id,
|
||||
job_id,
|
||||
error = ?e,
|
||||
"Failed to unlock job during shutdown"
|
||||
);
|
||||
} else {
|
||||
debug!(worker_id = self.id, job_id, "Job unlocked during shutdown");
|
||||
}
|
||||
|
||||
info!(worker_id = self.id, "Worker exiting gracefully");
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(worker_id = self.id, error = ?e, "Failed to fetch job");
|
||||
// Wait before retrying to avoid spamming errors.
|
||||
time::sleep(Duration::from_secs(10)).await;
|
||||
result = self.process_job(job) => {
|
||||
result
|
||||
}
|
||||
};
|
||||
|
||||
// Handle the job processing result
|
||||
match process_result {
|
||||
Ok(()) => {
|
||||
debug!(worker_id = self.id, job_id, "Job completed");
|
||||
// If successful, delete the job
|
||||
if let Err(delete_err) = self.delete_job(job_id).await {
|
||||
error!(
|
||||
worker_id = self.id,
|
||||
job_id,
|
||||
?delete_err,
|
||||
"Failed to delete job"
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(JobError::Recoverable(e)) => {
|
||||
self.handle_recoverable_error(job_id, e).await;
|
||||
}
|
||||
Err(JobError::Unrecoverable(e)) => {
|
||||
error!(
|
||||
worker_id = self.id,
|
||||
job_id,
|
||||
error = ?e,
|
||||
"Job corrupted, deleting"
|
||||
);
|
||||
// Parse errors are unrecoverable - delete the job
|
||||
if let Err(delete_err) = self.delete_job(job_id).await {
|
||||
error!(
|
||||
worker_id = self.id,
|
||||
job_id,
|
||||
?delete_err,
|
||||
"Failed to delete corrupted job"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -169,4 +190,25 @@ impl Worker {
|
||||
info!(worker_id = self.id, job_id, "Job unlocked for retry");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handle recoverable errors by logging appropriately and unlocking the job
|
||||
async fn handle_recoverable_error(&self, job_id: i32, e: anyhow::Error) {
|
||||
if let Some(BannerApiError::InvalidSession(_)) = e.downcast_ref::<BannerApiError>() {
|
||||
warn!(
|
||||
worker_id = self.id,
|
||||
job_id, "Invalid session detected. Forcing session refresh."
|
||||
);
|
||||
} else {
|
||||
error!(worker_id = self.id, job_id, error = ?e, "Failed to process job");
|
||||
}
|
||||
|
||||
if let Err(unlock_err) = self.unlock_job(job_id).await {
|
||||
error!(
|
||||
worker_id = self.id,
|
||||
job_id,
|
||||
?unlock_err,
|
||||
"Failed to unlock job"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user