Mirror of https://github.com/Xevion/banner.git, synced 2025-12-12 07:09:20 -06:00
feat: implement common job trait & better interface for scheduler & workers

src/scraper/jobs/mod.rs (new file, +52)

@@ -0,0 +1,52 @@
+pub mod subject;
+
+use crate::data::models::TargetType;
+use crate::error::Result;
+use crate::{banner::BannerApi, scraper::jobs::subject::SubjectJob};
+use serde::{Deserialize, Serialize};
+use sqlx::PgPool;
+
+/// Common trait interface for all job types
+#[async_trait::async_trait]
+pub trait Job: Send + Sync {
+    /// The target type this job handles
+    fn target_type(&self) -> TargetType;
+
+    /// Process the job with the given API client and database pool
+    async fn process(&self, banner_api: &BannerApi, db_pool: &PgPool) -> Result<()>;
+
+    /// Get a human-readable description of the job
+    fn description(&self) -> String;
+}
+
+/// Main job enum that dispatches to specific job implementations
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum JobType {
+    Subject(SubjectJob),
+}
+
+impl JobType {
+    /// Create a job from the target type and payload
+    pub fn from_target_type_and_payload(
+        target_type: TargetType,
+        payload: serde_json::Value,
+    ) -> Result<Self> {
+        match target_type {
+            TargetType::Subject => {
+                let subject_payload: SubjectJob = serde_json::from_value(payload)?;
+                Ok(JobType::Subject(subject_payload))
+            }
+            _ => Err(anyhow::anyhow!(
+                "Unsupported target type: {:?}",
+                target_type
+            )),
+        }
+    }
+
+    /// Convert to a Job trait object
+    pub fn as_job(self) -> Box<dyn Job> {
+        match self {
+            JobType::Subject(payload) => Box::new(payload),
+        }
+    }
+}
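
Note: the trait-plus-enum pairing above is what lets the worker dispatch without matching on target types itself. A minimal sketch of the intended round trip, assuming the `TargetType` enum from `crate::data::models`; the `"CS"` subject code and the test wrapper are illustrative, not part of this commit:

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn subject_payload_round_trip() -> Result<()> {
        // The same JSON shape the scheduler persists (see scheduler.rs below).
        let payload = serde_json::json!({ "subject": "CS" });

        // Rebuild the job from the (target_type, payload) pair a worker
        // reads out of the scrape_jobs table, then erase it to a trait object.
        let job = JobType::from_target_type_and_payload(TargetType::Subject, payload)?.as_job();

        assert!(matches!(job.target_type(), TargetType::Subject));
        assert_eq!(job.description(), "Scrape subject: CS");
        Ok(())
    }
}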

src/scraper/jobs/subject.rs (new file, +93)

@@ -0,0 +1,93 @@
+use super::Job;
+use crate::banner::{BannerApi, Course, SearchQuery, Term};
+use crate::data::models::TargetType;
+use crate::error::Result;
+use serde::{Deserialize, Serialize};
+use sqlx::PgPool;
+use tracing::{debug, info, trace};
+
+/// Job implementation for scraping subject data
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SubjectJob {
+    pub subject: String,
+}
+
+impl SubjectJob {
+    pub fn new(subject: String) -> Self {
+        Self { subject }
+    }
+}
+
+#[async_trait::async_trait]
+impl Job for SubjectJob {
+    fn target_type(&self) -> TargetType {
+        TargetType::Subject
+    }
+
+    async fn process(&self, banner_api: &BannerApi, db_pool: &PgPool) -> Result<()> {
+        let subject_code = &self.subject;
+        debug!(subject = subject_code, "Processing subject job");
+
+        // Get the current term
+        let term = Term::get_current().inner().to_string();
+        let query = SearchQuery::new().subject(subject_code).max_results(500);
+
+        let search_result = banner_api
+            .search(&term, &query, "subjectDescription", false)
+            .await?;
+
+        if let Some(courses_from_api) = search_result.data {
+            info!(
+                subject = subject_code,
+                count = courses_from_api.len(),
+                "Found courses"
+            );
+            for course in courses_from_api {
+                self.upsert_course(&course, db_pool).await?;
+            }
+        }
+
+        debug!(subject = subject_code, "Subject job completed");
+        Ok(())
+    }
+
+    fn description(&self) -> String {
+        format!("Scrape subject: {}", self.subject)
+    }
+}
+
+impl SubjectJob {
+    async fn upsert_course(&self, course: &Course, db_pool: &PgPool) -> Result<()> {
+        sqlx::query(
+            r#"
+            INSERT INTO courses (crn, subject, course_number, title, term_code, enrollment, max_enrollment, wait_count, wait_capacity, last_scraped_at)
+            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
+            ON CONFLICT (crn, term_code) DO UPDATE SET
+                subject = EXCLUDED.subject,
+                course_number = EXCLUDED.course_number,
+                title = EXCLUDED.title,
+                enrollment = EXCLUDED.enrollment,
+                max_enrollment = EXCLUDED.max_enrollment,
+                wait_count = EXCLUDED.wait_count,
+                wait_capacity = EXCLUDED.wait_capacity,
+                last_scraped_at = EXCLUDED.last_scraped_at
+            "#,
+        )
+        .bind(&course.course_reference_number)
+        .bind(&course.subject)
+        .bind(&course.course_number)
+        .bind(&course.course_title)
+        .bind(&course.term)
+        .bind(course.enrollment)
+        .bind(course.maximum_enrollment)
+        .bind(course.wait_count)
+        .bind(course.wait_capacity)
+        .bind(chrono::Utc::now())
+        .execute(db_pool)
+        .await
+        .map(|result| {
+            trace!(result = ?result, "Course upserted");
+        })
+        .map_err(|e| anyhow::anyhow!("Failed to upsert course: {e}"))
+    }
+}
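
Note: a second scrape target would follow the same shape. The sketch below is hypothetical — there is no `TermJob` or `TargetType::Term` variant in this commit — but it shows what the trait asks of a new job type:

/// Hypothetical job for scraping term-level data (illustrative only).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TermJob {
    pub term: String,
}

#[async_trait::async_trait]
impl Job for TermJob {
    fn target_type(&self) -> TargetType {
        TargetType::Term // hypothetical variant
    }

    async fn process(&self, _banner_api: &BannerApi, _db_pool: &PgPool) -> Result<()> {
        // Fetch and persist term-level data here.
        Ok(())
    }

    fn description(&self) -> String {
        format!("Scrape term: {}", self.term)
    }
}

Wiring it up would also take a `JobType::Term(TermJob)` variant and matching arms in `from_target_type_and_payload` and `as_job`.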

src/scraper/mod.rs

@@ -1,3 +1,4 @@
+pub mod jobs;
 pub mod scheduler;
 pub mod worker;

src/scraper/scheduler.rs

@@ -1,6 +1,7 @@
 use crate::banner::{BannerApi, Term};
 use crate::data::models::{ScrapePriority, TargetType};
 use crate::error::Result;
+use crate::scraper::jobs::subject::SubjectJob;
 use serde_json::json;
 use sqlx::PgPool;
 use std::sync::Arc;

@@ -78,7 +79,8 @@ impl Scheduler {
         let new_jobs: Vec<_> = subjects
             .into_iter()
             .filter_map(|subject| {
-                let payload = json!({ "subject": subject.code });
+                let job = SubjectJob::new(subject.code.clone());
+                let payload = serde_json::to_value(&job).unwrap();
                 let payload_str = payload.to_string();

                 if existing_payloads.contains(&payload_str) {
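
Note: because `SubjectJob` derives `Serialize` and has the single field `subject`, `serde_json::to_value(&job)` produces the same `{"subject": "..."}` object the old `json!` literal did, so `payload_str` comparisons against rows already queued in `scrape_jobs` keep working. An illustrative check:

let job = SubjectJob::new("CS".to_string());
assert_eq!(
    serde_json::to_value(&job).unwrap(),
    serde_json::json!({ "subject": "CS" })
);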

src/scraper/worker.rs

@@ -1,7 +1,7 @@
-use crate::banner::{BannerApi, BannerApiError, Course, SearchQuery, Term};
+use crate::banner::{BannerApi, BannerApiError};
 use crate::data::models::ScrapeJob;
 use crate::error::Result;
-use serde_json::Value;
+use crate::scraper::jobs::JobType;
 use sqlx::PgPool;
 use std::sync::Arc;
 use std::time::Duration;

@@ -110,81 +110,27 @@ impl Worker {
     }

     async fn process_job(&self, job: ScrapeJob) -> Result<()> {
-        match job.target_type {
-            crate::data::models::TargetType::Subject => {
-                self.process_subject_job(&job.target_payload).await
-            }
-            _ => {
-                warn!(worker_id = self.id, job_id = job.id, "unhandled job type");
-                Ok(())
-            }
-        }
-    }
-
-    async fn process_subject_job(&self, payload: &Value) -> Result<()> {
-        let subject_code = payload["subject"]
-            .as_str()
-            .ok_or_else(|| anyhow::anyhow!("Invalid subject payload"))?;
-        info!(
-            worker_id = self.id,
-            subject = subject_code,
-            "Scraping subject"
-        );
-
-        let term = Term::get_current().inner().to_string();
-        let query = SearchQuery::new().subject(subject_code).max_results(500);
-
-        let search_result = self
-            .banner_api
-            .search(&term, &query, "subjectDescription", false)
-            .await?;
-
-        if let Some(courses_from_api) = search_result.data {
-            info!(
-                worker_id = self.id,
-                subject = subject_code,
-                count = courses_from_api.len(),
-                "Found courses"
-            );
-            for course in courses_from_api {
-                self.upsert_course(&course).await?;
-            }
-        }
-
-        Ok(())
-    }
-
-    async fn upsert_course(&self, course: &Course) -> Result<()> {
-        sqlx::query(
-            r#"
-            INSERT INTO courses (crn, subject, course_number, title, term_code, enrollment, max_enrollment, wait_count, wait_capacity, last_scraped_at)
-            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
-            ON CONFLICT (crn, term_code) DO UPDATE SET
-                subject = EXCLUDED.subject,
-                course_number = EXCLUDED.course_number,
-                title = EXCLUDED.title,
-                enrollment = EXCLUDED.enrollment,
-                max_enrollment = EXCLUDED.max_enrollment,
-                wait_count = EXCLUDED.wait_count,
-                wait_capacity = EXCLUDED.wait_capacity,
-                last_scraped_at = EXCLUDED.last_scraped_at
-            "#,
-        )
-        .bind(&course.course_reference_number)
-        .bind(&course.subject)
-        .bind(&course.course_number)
-        .bind(&course.course_title)
-        .bind(&course.term)
-        .bind(course.enrollment)
-        .bind(course.maximum_enrollment)
-        .bind(course.wait_count)
-        .bind(course.wait_capacity)
-        .bind(chrono::Utc::now())
-        .execute(&self.db_pool)
-        .await?;
-
-        Ok(())
-    }
-
+        // Convert the database job to our job type
+        let job_type = JobType::from_target_type_and_payload(
+            job.target_type,
+            job.target_payload,
+        )?;
+
+        // Get the job implementation
+        let job_impl = job_type.as_job();
+
+        debug!(
+            worker_id = self.id,
+            job_id = job.id,
+            description = job_impl.description(),
+            "Processing job"
+        );
+
+        // Process the job
+        job_impl.process(&self.banner_api, &self.db_pool).await
+    }
+
     async fn delete_job(&self, job_id: i32) -> Result<()> {
         sqlx::query("DELETE FROM scrape_jobs WHERE id = $1")
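
Note: this also tightens failure semantics — an unrecognized `target_type` was previously logged with `warn!` and swallowed (`Ok(())`), but now bubbles up as an `Err` from `process_job`, since `JobType::from_target_type_and_payload` rejects unknown targets. A caller wanting the old tolerance would handle it in the run loop; a hypothetical sketch:

// Hypothetical call site inside the worker's run loop:
if let Err(e) = self.process_job(job).await {
    warn!(worker_id = self.id, error = %e, "Job failed");
}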