feat: move scraper into separate module, begin building data models

2025-08-29 11:07:46 -05:00
parent e734e40347
commit 4764d48ac9
10 changed files with 146 additions and 329 deletions

View File

@@ -11,15 +11,15 @@ use tracing::{error, info};
/// Main Banner API client.
#[derive(Debug)]
pub struct BannerApi {
-session_manager: SessionManager,
-client: Client,
+sessions: SessionManager,
+http: Client,
base_url: String,
}
impl BannerApi {
/// Creates a new Banner API client.
pub fn new(base_url: String) -> Result<Self> {
-let client = Client::builder()
+let http = Client::builder()
.cookie_store(true)
.user_agent(user_agent())
.tcp_keepalive(Some(std::time::Duration::from_secs(60 * 5)))
@@ -29,11 +29,11 @@ impl BannerApi {
.build()
.context("Failed to create HTTP client")?;
-let session_manager = SessionManager::new(base_url.clone(), client.clone());
+let session_manager = SessionManager::new(base_url.clone(), http.clone());
Ok(Self {
-session_manager,
-client,
+sessions: session_manager,
+http,
base_url,
})
}
@@ -41,7 +41,7 @@ impl BannerApi {
/// Sets up the API client by initializing session cookies.
pub async fn setup(&self) -> Result<()> {
info!(base_url = self.base_url, "setting up banner api client");
-let result = self.session_manager.setup().await;
+let result = self.sessions.setup().await;
match &result {
Ok(()) => info!("banner api client setup completed successfully"),
Err(e) => error!(error = ?e, "banner api client setup failed"),
@@ -69,7 +69,7 @@ impl BannerApi {
];
let response = self
-.client
+.http
.get(&url)
.query(&params)
.send()
@@ -96,7 +96,7 @@ impl BannerApi {
return Err(anyhow::anyhow!("Offset must be greater than 0"));
}
-let session_id = self.session_manager.ensure_session()?;
+let session_id = self.sessions.ensure_session()?;
let url = format!("{}/classSearch/get_subject", self.base_url);
let params = [
("searchTerm", search),
@@ -108,7 +108,7 @@ impl BannerApi {
];
let response = self
-.client
+.http
.get(&url)
.query(&params)
.send()
@@ -135,7 +135,7 @@ impl BannerApi {
return Err(anyhow::anyhow!("Offset must be greater than 0"));
}
-let session_id = self.session_manager.ensure_session()?;
+let session_id = self.sessions.ensure_session()?;
let url = format!("{}/classSearch/get_instructor", self.base_url);
let params = [
("searchTerm", search),
@@ -147,7 +147,7 @@ impl BannerApi {
];
let response = self
-.client
+.http
.get(&url)
.query(&params)
.send()
@@ -174,7 +174,7 @@ impl BannerApi {
return Err(anyhow::anyhow!("Offset must be greater than 0"));
}
-let session_id = self.session_manager.ensure_session()?;
+let session_id = self.sessions.ensure_session()?;
let url = format!("{}/classSearch/get_campus", self.base_url);
let params = [
("searchTerm", search),
@@ -186,7 +186,7 @@ impl BannerApi {
];
let response = self
-.client
+.http
.get(&url)
.query(&params)
.send()
@@ -211,7 +211,7 @@ impl BannerApi {
let params = [("term", term), ("courseReferenceNumber", crn)];
let response = self
-.client
+.http
.get(&url)
.query(&params)
.send()
@@ -260,9 +260,9 @@ impl BannerApi {
sort: &str,
sort_descending: bool,
) -> Result<SearchResult> {
-self.session_manager.reset_data_form().await?;
+self.sessions.reset_data_form().await?;
-let session_id = self.session_manager.ensure_session()?;
+let session_id = self.sessions.ensure_session()?;
let mut params = query.to_params();
// Add additional parameters
@@ -278,7 +278,7 @@ impl BannerApi {
let url = format!("{}/searchResults/searchResults", self.base_url);
let response = self
-.client
+.http
.get(&url)
.query(&params)
.send()
@@ -301,16 +301,16 @@ impl BannerApi {
/// Selects a term for the current session.
pub async fn select_term(&self, term: &str) -> Result<()> {
-self.session_manager.select_term(term).await
+self.sessions.select_term(term).await
}
/// Retrieves a single course by CRN by issuing a minimal search
pub async fn get_course_by_crn(&self, term: &str, crn: &str) -> Result<Option<Course>> {
-self.session_manager.reset_data_form().await?;
+self.sessions.reset_data_form().await?;
// Ensure session is configured for this term
self.select_term(term).await?;
-let session_id = self.session_manager.ensure_session()?;
+let session_id = self.sessions.ensure_session()?;
let query = SearchQuery::new()
.course_reference_number(crn)
@@ -326,7 +326,7 @@ impl BannerApi {
let url = format!("{}/searchResults/searchResults", self.base_url);
let response = self
-.client
+.http
.get(&url)
.query(&params)
.send()
@@ -366,7 +366,7 @@ impl BannerApi {
let url = format!("{}/searchResults/getClassDetails", self.base_url);
let response = self
-.client
+.http
.post(&url)
.json(&body)
.send()

View File

@@ -11,7 +11,6 @@
pub mod api;
pub mod models;
pub mod query;
-pub mod scraper;
pub mod session;
pub mod util;

View File

@@ -1,292 +0,0 @@
//! Course scraping functionality for the Banner API.
use crate::banner::{api::BannerApi, models::*, query::SearchQuery};
use anyhow::{Context, Result};
use redis::AsyncCommands;
use std::sync::Arc;
use std::time::Duration;
use tokio::time;
use tracing::{debug, error, info, warn};
/// Priority majors that should be scraped more frequently
const PRIORITY_MAJORS: &[&str] = &["CS", "CPE", "MAT", "EE", "IS"];
/// Maximum number of courses to fetch per page
const MAX_PAGE_SIZE: i32 = 500;
/// Course scraper for Banner API
pub struct CourseScraper {
api: Arc<BannerApi>,
redis_client: redis::Client,
}
impl CourseScraper {
/// Creates a new course scraper
pub fn new(api: Arc<BannerApi>, redis_url: &str) -> Result<Self> {
let redis_client =
redis::Client::open(redis_url).context("Failed to create Redis client")?;
Ok(Self { api, redis_client })
}
/// Scrapes all courses and stores them in Redis
pub async fn scrape_all(&self, term: &str) -> Result<()> {
// Get all subjects
let subjects = self
.api
.get_subjects("", term, 1, 100)
.await
.context("Failed to get subjects for scraping")?;
if subjects.is_empty() {
return Err(anyhow::anyhow!("no subjects found for term {term}"));
}
// Categorize subjects
let (priority_subjects, other_subjects): (Vec<_>, Vec<_>) = subjects
.into_iter()
.partition(|subject| PRIORITY_MAJORS.contains(&subject.code.as_str()));
// Get expired subjects that need scraping
let mut expired_subjects = Vec::new();
expired_subjects.extend(self.get_expired_subjects(&priority_subjects, term).await?);
expired_subjects.extend(self.get_expired_subjects(&other_subjects, term).await?);
if expired_subjects.is_empty() {
info!("no expired subjects found, skipping scrape");
return Ok(());
}
info!(
"scraping {count} subjects for term {term}",
count = expired_subjects.len()
);
// Scrape each expired subject
for subject in expired_subjects {
if let Err(e) = self.scrape_subject(&subject.code, term).await {
error!(
"failed to scrape subject {subject}: {e}",
subject = subject.code
);
}
// Rate limiting between subjects
time::sleep(Duration::from_secs(2)).await;
}
Ok(())
}
/// Gets subjects that have expired and need to be scraped
async fn get_expired_subjects(&self, subjects: &[Pair], term: &str) -> Result<Vec<Pair>> {
let mut conn = self
.redis_client
.get_multiplexed_async_connection()
.await
.context("Failed to get Redis connection")?;
let mut expired = Vec::new();
for subject in subjects {
let key = format!("scraped:{code}:{term}", code = subject.code);
let scraped: Option<String> = conn
.get(&key)
.await
.context("Failed to check scrape status in Redis")?;
// If not scraped or marked as expired (empty/0), add to list
if scraped.is_none() || scraped.as_deref() == Some("0") {
expired.push(subject.clone());
}
}
Ok(expired)
}
/// Scrapes all courses for a specific subject
pub async fn scrape_subject(&self, subject: &str, term: &str) -> Result<()> {
let mut offset = 0;
let mut total_courses = 0;
loop {
let query = SearchQuery::new()
.subject(subject)
.offset(offset)
.max_results(MAX_PAGE_SIZE * 2);
// Ensure session term is selected before searching
self.api.select_term(term).await?;
let result = self
.api
.search(term, &query, "subjectDescription", false)
.await
.with_context(|| {
format!("failed to search for subject {subject} at offset {offset}")
})?;
if !result.success {
return Err(anyhow::anyhow!(
"search marked unsuccessful for subject {subject}"
));
}
let course_count = result.data.as_ref().map(|v| v.len() as i32).unwrap_or(0);
total_courses += course_count;
debug!(
"retrieved {count} courses for subject {subject} at offset {offset}",
count = course_count
);
// Store each course in Redis
for course in result.data.unwrap_or_default() {
if let Err(e) = self.store_course(&course).await {
error!(
"failed to store course {crn}: {e}",
crn = course.course_reference_number
);
}
}
// Check if we got a full page and should continue
if course_count >= MAX_PAGE_SIZE {
if course_count > MAX_PAGE_SIZE {
warn!(
"course count {count} exceeds max page size {max_page_size}",
count = course_count,
max_page_size = MAX_PAGE_SIZE
);
}
offset += MAX_PAGE_SIZE;
debug!("continuing to next page for subject {subject} at offset {offset}");
// Rate limiting between pages
time::sleep(Duration::from_secs(3)).await;
continue;
}
break;
}
info!(
"scraped {count} total courses for subject {subject}",
count = total_courses
);
// Mark subject as scraped with expiry
self.mark_subject_scraped(subject, term, total_courses)
.await?;
Ok(())
}
/// Stores a course in Redis
async fn store_course(&self, course: &Course) -> Result<()> {
let mut conn = self
.redis_client
.get_multiplexed_async_connection()
.await
.context("Failed to get Redis connection")?;
let key = format!("class:{crn}", crn = course.course_reference_number);
let serialized = serde_json::to_string(course).context("Failed to serialize course")?;
let _: () = conn
.set(&key, serialized)
.await
.context("Failed to store course in Redis")?;
Ok(())
}
/// Marks a subject as scraped with appropriate expiry time
async fn mark_subject_scraped(
&self,
subject: &str,
term: &str,
course_count: i32,
) -> Result<()> {
let mut conn = self
.redis_client
.get_multiplexed_async_connection()
.await
.context("Failed to get Redis connection")?;
let key = format!("scraped:{subject}:{term}", subject = subject);
let expiry = self.calculate_expiry(subject, course_count);
let value = if course_count == 0 { -1 } else { course_count };
let _: () = conn
.set_ex(&key, value, expiry.as_secs())
.await
.context("Failed to mark subject as scraped")?;
debug!(
"marked subject {subject} as scraped with {count} courses, expiry: {expiry:?}",
subject = subject,
count = course_count,
expiry = expiry
);
Ok(())
}
/// Calculates expiry time for a scraped subject based on various factors
fn calculate_expiry(&self, subject: &str, course_count: i32) -> Duration {
// Base calculation: 1 hour per 100 courses
let mut base_expiry = Duration::from_secs(3600 * (course_count as u64 / 100).max(1));
// Special handling for subjects with few courses
if course_count < 50 {
// Linear interpolation: 1 course = 12 hours, 49 courses = 1 hour
let hours = 12.0 - ((course_count as f64 - 1.0) / 48.0) * 11.0;
base_expiry = Duration::from_secs((hours * 3600.0) as u64);
}
// Priority subjects get shorter expiry (more frequent updates)
if PRIORITY_MAJORS.contains(&subject) {
base_expiry /= 3;
}
// Add random variance (±15%)
let variance = (base_expiry.as_secs() as f64 * 0.15) as u64;
let random_offset = (rand::random::<f64>() - 0.5) * 2.0 * variance as f64;
let final_expiry = if random_offset > 0.0 {
base_expiry + Duration::from_secs(random_offset as u64)
} else {
base_expiry.saturating_sub(Duration::from_secs((-random_offset) as u64))
};
// Ensure minimum of 1 hour
final_expiry.max(Duration::from_secs(3600))
}
/// Gets a course from Redis cache
pub async fn get_course(&self, crn: &str) -> Result<Option<Course>> {
let mut conn = self
.redis_client
.get_multiplexed_async_connection()
.await
.context("Failed to get Redis connection")?;
let key = format!("class:{crn}");
let serialized: Option<String> = conn
.get(&key)
.await
.context("Failed to get course from Redis")?;
match serialized {
Some(data) => {
let course: Course = serde_json::from_str(&data)
.context("Failed to deserialize course from Redis")?;
Ok(Some(course))
}
None => Ok(None),
}
}
}
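The deleted scraper's cache policy is worth keeping in mind while it moves: courses are stored at `class:{crn}`, and a `scraped:{subject}:{term}` sentinel with a computed TTL throttles re-scrapes. A standalone sketch of `calculate_expiry`'s arithmetic with the ±15% variance step omitted (constants copied from the code above); note that the 1-hour floor swallows the priority discount for any subject under ~300 courses:

const PRIORITY_MAJORS: &[&str] = &["CS", "CPE", "MAT", "EE", "IS"];

fn expiry_secs_before_variance(subject: &str, course_count: i32) -> u64 {
    // Base: one hour per 100 courses (integer division, minimum factor 1).
    let mut secs = 3600 * (course_count as u64 / 100).max(1);
    // Small subjects interpolate from 12 h (1 course) down to 1 h (49 courses).
    if course_count < 50 {
        let hours = 12.0 - ((course_count as f64 - 1.0) / 48.0) * 11.0;
        secs = (hours * 3600.0) as u64;
    }
    // Priority majors refresh three times as often.
    if PRIORITY_MAJORS.contains(&subject) {
        secs /= 3;
    }
    // The original applies this floor after the random variance.
    secs.max(3600)
}

fn main() {
    assert_eq!(expiry_secs_before_variance("HIS", 1), 43_200); // 12 h
    assert_eq!(expiry_secs_before_variance("CS", 120), 3_600); // 1200 s, lifted to the floor
    assert_eq!(expiry_secs_before_variance("CS", 900), 10_800); // 9 h / 3
}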

View File

@@ -1,8 +1,9 @@
//! Diesel models for the database schema.
-use crate::data::schema::{course_audits, course_metrics, courses};
+use crate::data::schema::{course_audits, course_metrics, courses, scrape_jobs};
use chrono::{DateTime, Utc};
-use diesel::{Insertable, Queryable, Selectable};
+use diesel::{Insertable, Queryable, QueryableByName, Selectable};
use serde_json::Value;
#[derive(Queryable, Selectable)]
#[diesel(table_name = courses)]
@@ -78,3 +79,45 @@ pub struct NewCourseAudit<'a> {
pub old_value: &'a str,
pub new_value: &'a str,
}
+/// The priority level of a scrape job.
+#[derive(diesel_derive_enum::DbEnum, Copy, Debug, Clone)]
+pub enum ScrapePriority {
+Low,
+Medium,
+High,
+Critical,
+}
+/// The type of target for a scrape job, determining how the payload is interpreted.
+#[derive(diesel_derive_enum::DbEnum, Copy, Debug, Clone)]
+pub enum TargetType {
+Subject,
+CourseRange,
+CrnList,
+SingleCrn,
+}
+/// Represents a queryable job from the database.
+#[derive(Debug, Clone, Queryable, QueryableByName)]
+#[diesel(table_name = scrape_jobs)]
+pub struct ScrapeJob {
+pub id: i32,
+pub target_type: TargetType,
+pub target_payload: Value,
+pub priority: ScrapePriority,
+pub execute_at: DateTime<Utc>,
+pub created_at: DateTime<Utc>,
+pub locked_at: Option<DateTime<Utc>>,
+}
+/// Represents a new job to be inserted into the database.
+#[derive(Debug, Clone, Insertable)]
+#[diesel(table_name = scrape_jobs)]
+pub struct NewScrapeJob {
+pub target_type: TargetType,
+#[diesel(sql_type = diesel::sql_types::Jsonb)]
+pub target_payload: Value,
+pub priority: ScrapePriority,
+pub execute_at: DateTime<Utc>,
+}
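Nothing in this commit consumes these models yet. A construction sketch, assuming diesel 2.x and a hypothetical payload layout (the model constrains `target_payload` only to `Jsonb`, so the shape is a contract between enqueuer and worker):

use chrono::Utc;
use diesel::prelude::*;
use serde_json::json;
use crate::data::models::{NewScrapeJob, ScrapePriority, TargetType};
use crate::data::schema::scrape_jobs;

fn enqueue_subject_scrape(conn: &mut PgConnection) -> QueryResult<usize> {
    let job = NewScrapeJob {
        target_type: TargetType::Subject,
        // Hypothetical payload shape; any JSON both sides agree on works.
        target_payload: json!({ "subject": "CS", "term": "202510" }),
        priority: ScrapePriority::High,
        execute_at: Utc::now(),
    };
    diesel::insert_into(scrape_jobs::table)
        .values(&job)
        .execute(conn)
}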

View File

@@ -1,3 +1,30 @@
+pub mod sql_types {
+#[derive(diesel::sql_types::SqlType)]
+#[diesel(postgres_type(name = "scrape_priority"))]
+pub struct ScrapePriority;
+#[derive(diesel::sql_types::SqlType)]
+#[diesel(postgres_type(name = "target_type"))]
+pub struct TargetType;
+}
+use super::models::{ScrapePriorityMapping, TargetTypeMapping};
+diesel::table! {
+use diesel::sql_types::*;
+use super::{ScrapePriorityMapping, TargetTypeMapping};
+scrape_jobs (id) {
+id -> Int4,
+target_type -> TargetTypeMapping,
+target_payload -> Jsonb,
+priority -> ScrapePriorityMapping,
+execute_at -> Timestamptz,
+created_at -> Timestamptz,
+locked_at -> Nullable<Timestamptz>,
+}
+}
diesel::table! {
courses (id) {
id -> Int4,
@@ -39,4 +66,4 @@ diesel::table! {
diesel::joinable!(course_metrics -> courses (course_id));
diesel::joinable!(course_audits -> courses (course_id));
-diesel::allow_tables_to_appear_in_same_query!(courses, course_metrics, course_audits,);
+diesel::allow_tables_to_appear_in_same_query!(courses, course_metrics, course_audits, scrape_jobs,);
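`locked_at` plus the `QueryableByName` derive on `ScrapeJob` suggest a raw-SQL, claim-by-update dequeue. One common shape, offered purely as an assumption since no worker lands in this commit (Postgres orders enum values by declaration, so `priority DESC` yields `Critical` first):

use diesel::prelude::*;
use diesel::sql_query;
use crate::data::models::ScrapeJob;

// Hypothetical dequeue: atomically lock and return one due, unlocked job.
fn claim_next_job(conn: &mut PgConnection) -> QueryResult<Option<ScrapeJob>> {
    sql_query(
        "UPDATE scrape_jobs SET locked_at = now() \
         WHERE id = (SELECT id FROM scrape_jobs \
                     WHERE locked_at IS NULL AND execute_at <= now() \
                     ORDER BY priority DESC, execute_at \
                     LIMIT 1 FOR UPDATE SKIP LOCKED) \
         RETURNING *",
    )
    .get_result::<ScrapeJob>(conn)
    .optional()
}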

View File

@@ -3,5 +3,6 @@ pub mod banner;
pub mod bot;
pub mod data;
pub mod error;
+pub mod scraper;
pub mod services;
pub mod web;

View File

@@ -5,7 +5,6 @@ use tracing_subscriber::{EnvFilter, FmtSubscriber};
use crate::app_state::AppState;
use crate::banner::BannerApi;
-use crate::banner::scraper::CourseScraper;
use crate::bot::{Data, get_commands};
use crate::config::Config;
use crate::services::manager::ServiceManager;
@@ -80,14 +79,9 @@ async fn main() {
let app_state = AppState::new(banner_api_arc.clone(), &config.redis_url)
.expect("Failed to create AppState");
-// Create CourseScraper for web service
-let scraper = CourseScraper::new(banner_api_arc.clone(), &config.redis_url)
-.expect("Failed to create CourseScraper");
// Create BannerState for web service
let banner_state = BannerState {
api: banner_api_arc,
-scraper: Arc::new(scraper),
};
// Configure the client with your Discord bot token in the environment

View File

@@ -5,11 +5,12 @@ use serde_json::{Value, json};
use std::sync::Arc;
use tracing::info;
+use crate::banner::BannerApi;
/// Shared application state for web server
#[derive(Clone)]
pub struct BannerState {
-pub api: Arc<crate::banner::BannerApi>,
-pub scraper: Arc<crate::banner::scraper::CourseScraper>,
+pub api: Arc<BannerApi>,
}
/// Creates the web server router