mirror of https://github.com/Xevion/banner.git

feat: add scrape job result persistence for effectiveness tracking
@@ -1,7 +1,7 @@
 pub mod subject;
 
 use crate::banner::BannerApi;
-use crate::data::models::TargetType;
+use crate::data::models::{TargetType, UpsertCounts};
 use crate::error::Result;
 use serde::{Deserialize, Serialize};
 use sqlx::PgPool;
@@ -32,8 +32,9 @@ pub trait Job: Send + Sync {
     #[allow(dead_code)]
     fn target_type(&self) -> TargetType;
 
-    /// Process the job with the given API client and database pool
-    async fn process(&self, banner_api: &BannerApi, db_pool: &PgPool) -> Result<()>;
+    /// Process the job with the given API client and database pool.
+    /// Returns upsert effectiveness counts on success.
+    async fn process(&self, banner_api: &BannerApi, db_pool: &PgPool) -> Result<UpsertCounts>;
 
     /// Get a human-readable description of the job
     fn description(&self) -> String;
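The trait change threads an UpsertCounts value out of every job's process call. The struct's definition is not part of this diff; judging by the fields the worker logs and the UpsertCounts::default() fallback in the subject job, a minimal sketch could look like the following (field types and derives are assumptions):

    // Sketch only; the real definition lives in crate::data::models and is
    // not shown in this diff. Field names come from the worker's debug! call,
    // and Default is implied by the UpsertCounts::default() fallback.
    #[derive(Debug, Default, Clone, Copy)]
    pub struct UpsertCounts {
        pub courses_fetched: u64,
        pub courses_changed: u64,
        pub courses_unchanged: u64,
    }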
@@ -1,7 +1,7 @@
 use super::Job;
 use crate::banner::{BannerApi, SearchQuery, Term};
 use crate::data::batch::batch_upsert_courses;
-use crate::data::models::TargetType;
+use crate::data::models::{TargetType, UpsertCounts};
 use crate::error::Result;
 use serde::{Deserialize, Serialize};
 use sqlx::PgPool;
@@ -26,7 +26,7 @@ impl Job for SubjectJob {
     }
 
     #[tracing::instrument(skip(self, banner_api, db_pool), fields(subject = %self.subject))]
-    async fn process(&self, banner_api: &BannerApi, db_pool: &PgPool) -> Result<()> {
+    async fn process(&self, banner_api: &BannerApi, db_pool: &PgPool) -> Result<UpsertCounts> {
         let subject_code = &self.subject;
 
         // Get the current term
@@ -37,17 +37,19 @@ impl Job for SubjectJob {
             .search(&term, &query, "subjectDescription", false)
             .await?;
 
-        if let Some(courses_from_api) = search_result.data {
+        let counts = if let Some(courses_from_api) = search_result.data {
             info!(
                 subject = %subject_code,
                 count = courses_from_api.len(),
                 "Found courses"
             );
-            batch_upsert_courses(&courses_from_api, db_pool).await?;
-        }
+            batch_upsert_courses(&courses_from_api, db_pool).await?
+        } else {
+            UpsertCounts::default()
+        };
 
         debug!(subject = %subject_code, "Subject job completed");
-        Ok(())
+        Ok(counts)
     }
 
     fn description(&self) -> String {
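With the trailing semicolon dropped, the if/else now evaluates to the counts that batch_upsert_courses returns. That function's body is not in this diff; one common way a Postgres batch upsert can distinguish changed from unchanged rows is a DO UPDATE ... WHERE ... IS DISTINCT FROM guard, sketched below under assumed table and column names (courses, crn, title), which are not this repository's schema:

    // Hypothetical sketch. The WHERE guard makes Postgres skip conflicting
    // rows whose data is identical, so RETURNING yields one row per
    // insert-or-real-update; fetched minus changed is the unchanged count.
    use sqlx::PgPool;

    async fn upsert_courses_counting(
        crns: &[String],
        titles: &[String],
        pool: &PgPool,
    ) -> sqlx::Result<(i64, i64, i64)> {
        let changed: i64 = sqlx::query_scalar(
            r#"
            WITH upserted AS (
                INSERT INTO courses (crn, title)
                SELECT * FROM UNNEST($1::text[], $2::text[])
                ON CONFLICT (crn) DO UPDATE SET title = EXCLUDED.title
                WHERE courses.title IS DISTINCT FROM EXCLUDED.title
                RETURNING 1
            )
            SELECT COUNT(*) FROM upserted
            "#,
        )
        .bind(crns)
        .bind(titles)
        .fetch_one(pool)
        .await?;
        let fetched = crns.len() as i64;
        Ok((fetched, changed, fetched - changed))
    }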
+119 -16
@@ -1,10 +1,10 @@
 use crate::banner::{BannerApi, BannerApiError};
-use crate::data::models::{ScrapeJob, ScrapeJobStatus};
+use crate::data::models::{ScrapeJob, ScrapeJobStatus, UpsertCounts};
 use crate::data::scrape_jobs;
 use crate::error::Result;
 use crate::scraper::jobs::{JobError, JobType};
 use crate::web::ws::ScrapeJobEvent;
-use chrono::Utc;
+use chrono::{DateTime, Utc};
 use sqlx::PgPool;
 use std::sync::Arc;
 use std::time::Duration;
@@ -72,10 +72,15 @@ impl Worker {
         let job_id = job.id;
         let retry_count = job.retry_count;
         let max_retries = job.max_retries;
+        let target_type = job.target_type;
+        let payload = job.target_payload.clone();
+        let priority = job.priority;
+        let queued_at = job.queued_at;
+        let started_at = Utc::now();
         let start = std::time::Instant::now();
 
         // Emit JobLocked event
-        let locked_at = Utc::now().to_rfc3339();
+        let locked_at = started_at.to_rfc3339();
         debug!(job_id, "Emitting JobLocked event");
         let _ = self.job_events_tx.send(ScrapeJobEvent::JobLocked {
             id: job_id,
@@ -105,8 +110,19 @@ impl Worker {
         let duration = start.elapsed();
 
         // Handle the job processing result
-        self.handle_job_result(job_id, retry_count, max_retries, process_result, duration)
-            .await;
+        self.handle_job_result(
+            job_id,
+            retry_count,
+            max_retries,
+            process_result,
+            duration,
+            target_type,
+            payload,
+            priority,
+            queued_at,
+            started_at,
+        )
+        .await;
     }
 }
 
@@ -118,7 +134,7 @@ impl Worker {
         scrape_jobs::fetch_and_lock_job(&self.db_pool).await
     }
 
-    async fn process_job(&self, job: ScrapeJob) -> Result<(), JobError> {
+    async fn process_job(&self, job: ScrapeJob) -> Result<UpsertCounts, JobError> {
         // Convert the database job to our job type
         let job_type = JobType::from_target_type_and_payload(job.target_type, job.target_payload)
             .map_err(|e| JobError::Unrecoverable(anyhow::anyhow!(e)))?; // Parse errors are unrecoverable
@@ -145,9 +161,7 @@ impl Worker {
             job_impl
                 .process(&self.banner_api, &self.db_pool)
                 .await
-                .map_err(JobError::Recoverable)?;
-
-            Ok(())
+                .map_err(JobError::Recoverable)
         }
         .instrument(span)
         .await
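process_job distinguishes parse failures (Unrecoverable) from runtime failures (Recoverable). The enum itself is defined in crate::scraper::jobs and is not shown here; from the two map_err call sites above and the match arms below, its shape is plausibly:

    // Inferred sketch; both variants wrap anyhow::Error, matching the
    // map_err calls above and the `e: anyhow::Error` parameter below.
    pub enum JobError {
        /// Transient failure: the job is unlocked and retried until max_retries.
        Recoverable(anyhow::Error),
        /// Permanent failure (e.g. an unparseable payload): never retried.
        Unrecoverable(anyhow::Error),
    }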
@@ -191,22 +205,53 @@ impl Worker {
     }
 
     /// Handle the result of job processing
+    #[allow(clippy::too_many_arguments)]
     async fn handle_job_result(
         &self,
         job_id: i32,
         retry_count: i32,
         max_retries: i32,
-        result: Result<(), JobError>,
+        result: Result<UpsertCounts, JobError>,
         duration: std::time::Duration,
+        target_type: crate::data::models::TargetType,
+        payload: serde_json::Value,
+        priority: crate::data::models::ScrapePriority,
+        queued_at: DateTime<Utc>,
+        started_at: DateTime<Utc>,
     ) {
+        let duration_ms = duration.as_millis() as i32;
+
         match result {
-            Ok(()) => {
+            Ok(counts) => {
                 debug!(
                     worker_id = self.id,
                     job_id,
                     duration_ms = duration.as_millis(),
+                    courses_fetched = counts.courses_fetched,
+                    courses_changed = counts.courses_changed,
+                    courses_unchanged = counts.courses_unchanged,
                     "Job completed successfully"
                 );
+
+                // Log the result
+                if let Err(e) = scrape_jobs::insert_job_result(
+                    target_type,
+                    payload,
+                    priority,
+                    queued_at,
+                    started_at,
+                    duration_ms,
+                    true,
+                    None,
+                    retry_count,
+                    Some(&counts),
+                    &self.db_pool,
+                )
+                .await
+                {
+                    error!(worker_id = self.id, job_id, error = ?e, "Failed to insert job result");
+                }
+
                 if let Err(e) = self.delete_job(job_id).await {
                     error!(worker_id = self.id, job_id, error = ?e, "Failed to delete completed job");
                 }
@@ -216,10 +261,41 @@ impl Worker {
                     .send(ScrapeJobEvent::JobCompleted { id: job_id });
             }
             Err(JobError::Recoverable(e)) => {
-                self.handle_recoverable_error(job_id, retry_count, max_retries, e, duration)
-                    .await;
+                self.handle_recoverable_error(
+                    job_id,
+                    retry_count,
+                    max_retries,
+                    e,
+                    duration,
+                    target_type,
+                    payload,
+                    priority,
+                    queued_at,
+                    started_at,
+                )
+                .await;
             }
             Err(JobError::Unrecoverable(e)) => {
+                // Log the failed result
+                let err_msg = format!("{e:#}");
+                if let Err(log_err) = scrape_jobs::insert_job_result(
+                    target_type,
+                    payload,
+                    priority,
+                    queued_at,
+                    started_at,
+                    duration_ms,
+                    false,
+                    Some(&err_msg),
+                    retry_count,
+                    None,
+                    &self.db_pool,
+                )
+                .await
+                {
+                    error!(worker_id = self.id, job_id, error = ?log_err, "Failed to insert job result");
+                }
+
                 error!(
                     worker_id = self.id,
                     job_id,
@@ -239,6 +315,7 @@ impl Worker {
     }
 
     /// Handle recoverable errors by logging appropriately and unlocking the job
+    #[allow(clippy::too_many_arguments)]
     async fn handle_recoverable_error(
         &self,
         job_id: i32,
@@ -246,6 +323,11 @@ impl Worker {
         max_retries: i32,
         e: anyhow::Error,
         duration: std::time::Duration,
+        target_type: crate::data::models::TargetType,
+        payload: serde_json::Value,
+        priority: crate::data::models::ScrapePriority,
+        queued_at: DateTime<Utc>,
+        started_at: DateTime<Utc>,
     ) {
         let next_attempt = retry_count.saturating_add(1);
         let remaining_retries = max_retries.saturating_sub(next_attempt);
@@ -276,7 +358,7 @@ impl Worker {
 
         // Atomically unlock and increment retry count, checking if retry is allowed
         match self.unlock_and_increment_retry(job_id, max_retries).await {
-            Ok(Some(queued_at)) => {
+            Ok(Some(new_queued_at)) => {
                 debug!(
                     worker_id = self.id,
                     job_id,
@@ -288,12 +370,33 @@ impl Worker {
                 let _ = self.job_events_tx.send(ScrapeJobEvent::JobRetried {
                     id: job_id,
                     retry_count: next_attempt,
-                    queued_at: queued_at.to_rfc3339(),
+                    queued_at: new_queued_at.to_rfc3339(),
                     status: ScrapeJobStatus::Pending,
                 });
+                // Don't log a result yet — the job will be retried
             }
             Ok(None) => {
-                // Max retries exceeded (detected atomically)
+                // Max retries exceeded — log final failure result
+                let duration_ms = duration.as_millis() as i32;
+                let err_msg = format!("{e:#}");
+                if let Err(log_err) = scrape_jobs::insert_job_result(
+                    target_type,
+                    payload,
+                    priority,
+                    queued_at,
+                    started_at,
+                    duration_ms,
+                    false,
+                    Some(&err_msg),
+                    next_attempt,
+                    None,
+                    &self.db_pool,
+                )
+                .await
+                {
+                    error!(worker_id = self.id, job_id, error = ?log_err, "Failed to insert job result");
+                }
+
                 error!(
                     worker_id = self.id,
                     job_id,
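All three terminal outcomes (success, unrecoverable failure, retries exhausted) now funnel into scrape_jobs::insert_job_result; only a retry that is still pending skips it. The function's definition is not part of this diff; reading the argument lists off the call sites, its signature is plausibly something like the following, with parameter names guessed:

    // Inferred from the call sites above; the real function in
    // crate::data::scrape_jobs may differ. Successes pass Some(&counts) and
    // no error message; failures pass the formatted error and None for counts.
    pub async fn insert_job_result(
        target_type: TargetType,
        target_payload: serde_json::Value,
        priority: ScrapePriority,
        queued_at: DateTime<Utc>,
        started_at: DateTime<Utc>,
        duration_ms: i32,
        success: bool,
        error_message: Option<&str>,
        retry_count: i32,
        counts: Option<&UpsertCounts>,
        db_pool: &PgPool,
    ) -> Result<()> {
        // INSERT INTO a scrape-job results table; implementation not shown.
        unimplemented!()
    }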