feat: implement comprehensive course data model with reference cache and search

2026-01-31 14:23:36 -06:00 · 2026-01-28 21:06:29 -06:00
parent e3b855b956
commit 6df4303bd6
16 changed files with 1121 additions and 76 deletions
@@ -1,43 +1,73 @@
 //! Batch database operations for improved performance.

 use crate::banner::Course;
+use crate::data::models::DbMeetingTime;
 use crate::error::Result;
 use sqlx::PgPool;
+use std::collections::HashSet;
 use std::time::Instant;
 use tracing::info;

+/// Convert a Banner API course's meeting times to the DB JSONB shape.
+fn to_db_meeting_times(course: &Course) -> serde_json::Value {
+    let meetings: Vec<DbMeetingTime> = course
+        .meetings_faculty
+        .iter()
+        .map(|mf| {
+            let mt = &mf.meeting_time;
+            DbMeetingTime {
+                begin_time: mt.begin_time.clone(),
+                end_time: mt.end_time.clone(),
+                start_date: mt.start_date.clone(),
+                end_date: mt.end_date.clone(),
+                monday: mt.monday,
+                tuesday: mt.tuesday,
+                wednesday: mt.wednesday,
+                thursday: mt.thursday,
+                friday: mt.friday,
+                saturday: mt.saturday,
+                sunday: mt.sunday,
+                building: mt.building.clone(),
+                building_description: mt.building_description.clone(),
+                room: mt.room.clone(),
+                campus: mt.campus.clone(),
+                meeting_type: mt.meeting_type.clone(),
+                meeting_schedule_type: mt.meeting_schedule_type.clone(),
+            }
+        })
+        .collect();
+    serde_json::to_value(meetings).unwrap_or_default()
+}
+
+/// Convert a Banner API course's section attributes to a JSONB array of code strings.
+fn to_db_attributes(course: &Course) -> serde_json::Value {
+    let codes: Vec<&str> = course
+        .section_attributes
+        .iter()
+        .map(|a| a.code.as_str())
+        .collect();
+    serde_json::to_value(codes).unwrap_or_default()
+}
+
+/// Extract the campus code from the first meeting time (Banner doesn't put it on the course directly).
+fn extract_campus_code(course: &Course) -> Option<String> {
+    course
+        .meetings_faculty
+        .first()
+        .and_then(|mf| mf.meeting_time.campus.clone())
+}
+
 /// Batch upsert courses in a single database query.
 ///
-/// This function performs a bulk INSERT...ON CONFLICT DO UPDATE for all courses
-/// in a single round-trip to the database, significantly reducing overhead compared
-/// to individual inserts.
+/// Performs a bulk INSERT...ON CONFLICT DO UPDATE for all courses, including
+/// new fields (meeting times, attributes, instructor data). Returns the
+/// database IDs for all upserted courses (in input order) so instructors
+/// can be linked.
 ///
 /// # Performance
-/// - Reduces N database round-trips to 1
+/// - Reduces N database round-trips to 3 (courses, instructors, junction)
 /// - Typical usage: 50-200 courses per batch
-/// - PostgreSQL parameter limit: 65,535 (we use ~10 per course)
-///
-/// # Arguments
-/// * `courses` - Slice of Course structs from the Banner API
-/// * `db_pool` - PostgreSQL connection pool
-///
-/// # Returns
-/// * `Ok(())` on success
-/// * `Err(_)` if the database operation fails
-///
-/// # Example
-/// ```no_run
-/// use banner::data::batch::batch_upsert_courses;
-/// use banner::banner::Course;
-/// use sqlx::PgPool;
-///
-/// async fn example(courses: &[Course], pool: &PgPool) -> anyhow::Result<()> {
-///     batch_upsert_courses(courses, pool).await?;
-///     Ok(())
-/// }
-/// ```
 pub async fn batch_upsert_courses(courses: &[Course], db_pool: &PgPool) -> Result<()> {
-    // Early return for empty batches
    if courses.is_empty() {
        info!("No courses to upsert, skipping batch operation");
        return Ok(());
@@ -46,42 +76,108 @@ pub async fn batch_upsert_courses(courses: &[Course], db_pool: &PgPool) -> Resul
    let start = Instant::now();
    let course_count = courses.len();

-    // Extract course fields into vectors for UNNEST
+    // Step 1: Upsert courses with all fields, returning IDs
+    let course_ids = upsert_courses(courses, db_pool).await?;
+
+    // Step 2: Upsert instructors (deduplicated across batch)
+    upsert_instructors(courses, db_pool).await?;
+
+    // Step 3: Link courses to instructors via junction table
+    upsert_course_instructors(courses, &course_ids, db_pool).await?;
+
+    let duration = start.elapsed();
+    info!(
+        courses_count = course_count,
+        duration_ms = duration.as_millis(),
+        "Batch upserted courses with instructors"
+    );
+
+    Ok(())
+}
+
+/// Upsert all courses and return their database IDs in input order.
+async fn upsert_courses(courses: &[Course], db_pool: &PgPool) -> Result<Vec<i32>> {
    let crns: Vec<&str> = courses
        .iter()
        .map(|c| c.course_reference_number.as_str())
        .collect();
-
    let subjects: Vec<&str> = courses.iter().map(|c| c.subject.as_str()).collect();
-
    let course_numbers: Vec<&str> = courses.iter().map(|c| c.course_number.as_str()).collect();
-
    let titles: Vec<&str> = courses.iter().map(|c| c.course_title.as_str()).collect();
-
    let term_codes: Vec<&str> = courses.iter().map(|c| c.term.as_str()).collect();
-
    let enrollments: Vec<i32> = courses.iter().map(|c| c.enrollment).collect();
-
    let max_enrollments: Vec<i32> = courses.iter().map(|c| c.maximum_enrollment).collect();
-
    let wait_counts: Vec<i32> = courses.iter().map(|c| c.wait_count).collect();
-
    let wait_capacities: Vec<i32> = courses.iter().map(|c| c.wait_capacity).collect();

-    // Perform batch upsert using UNNEST for efficient bulk insertion
-    let result = sqlx::query(
+    // New scalar fields
+    let sequence_numbers: Vec<Option<&str>> = courses
+        .iter()
+        .map(|c| Some(c.sequence_number.as_str()))
+        .collect();
+    let parts_of_term: Vec<Option<&str>> = courses
+        .iter()
+        .map(|c| Some(c.part_of_term.as_str()))
+        .collect();
+    let instructional_methods: Vec<Option<&str>> = courses
+        .iter()
+        .map(|c| Some(c.instructional_method.as_str()))
+        .collect();
+    let campuses: Vec<Option<String>> = courses.iter().map(extract_campus_code).collect();
+    let credit_hours: Vec<Option<i32>> = courses.iter().map(|c| c.credit_hours).collect();
+    let credit_hour_lows: Vec<Option<i32>> = courses.iter().map(|c| c.credit_hour_low).collect();
+    let credit_hour_highs: Vec<Option<i32>> = courses.iter().map(|c| c.credit_hour_high).collect();
+    let cross_lists: Vec<Option<&str>> = courses.iter().map(|c| c.cross_list.as_deref()).collect();
+    let cross_list_capacities: Vec<Option<i32>> =
+        courses.iter().map(|c| c.cross_list_capacity).collect();
+    let cross_list_counts: Vec<Option<i32>> = courses.iter().map(|c| c.cross_list_count).collect();
+    let link_identifiers: Vec<Option<&str>> = courses
+        .iter()
+        .map(|c| c.link_identifier.as_deref())
+        .collect();
+    let is_section_linkeds: Vec<Option<bool>> =
+        courses.iter().map(|c| Some(c.is_section_linked)).collect();
+
+    // JSONB fields
+    let meeting_times_json: Vec<serde_json::Value> =
+        courses.iter().map(to_db_meeting_times).collect();
+    let attributes_json: Vec<serde_json::Value> = courses.iter().map(to_db_attributes).collect();
+
+    let rows = sqlx::query_scalar::<_, i32>(
        r#"
        INSERT INTO courses (
            crn, subject, course_number, title, term_code,
-            enrollment, max_enrollment, wait_count, wait_capacity, last_scraped_at
+            enrollment, max_enrollment, wait_count, wait_capacity, last_scraped_at,
+            sequence_number, part_of_term, instructional_method, campus,
+            credit_hours, credit_hour_low, credit_hour_high,
+            cross_list, cross_list_capacity, cross_list_count,
+            link_identifier, is_section_linked,
+            meeting_times, attributes
        )
-        SELECT * FROM UNNEST(
+        SELECT
+            v.crn, v.subject, v.course_number, v.title, v.term_code,
+            v.enrollment, v.max_enrollment, v.wait_count, v.wait_capacity, NOW(),
+            v.sequence_number, v.part_of_term, v.instructional_method, v.campus,
+            v.credit_hours, v.credit_hour_low, v.credit_hour_high,
+            v.cross_list, v.cross_list_capacity, v.cross_list_count,
+            v.link_identifier, v.is_section_linked,
+            v.meeting_times, v.attributes
+        FROM UNNEST(
            $1::text[], $2::text[], $3::text[], $4::text[], $5::text[],
            $6::int4[], $7::int4[], $8::int4[], $9::int4[],
-            array_fill(NOW()::timestamptz, ARRAY[$10])
-        ) AS t(
+            $10::text[], $11::text[], $12::text[], $13::text[],
+            $14::int4[], $15::int4[], $16::int4[],
+            $17::text[], $18::int4[], $19::int4[],
+            $20::text[], $21::bool[],
+            $22::jsonb[], $23::jsonb[]
+        ) AS v(
            crn, subject, course_number, title, term_code,
-            enrollment, max_enrollment, wait_count, wait_capacity, last_scraped_at
+            enrollment, max_enrollment, wait_count, wait_capacity,
+            sequence_number, part_of_term, instructional_method, campus,
+            credit_hours, credit_hour_low, credit_hour_high,
+            cross_list, cross_list_capacity, cross_list_count,
+            link_identifier, is_section_linked,
+            meeting_times, attributes
        )
        ON CONFLICT (crn, term_code)
        DO UPDATE SET
@@ -92,7 +188,22 @@ pub async fn batch_upsert_courses(courses: &[Course], db_pool: &PgPool) -> Resul
            max_enrollment = EXCLUDED.max_enrollment,
            wait_count = EXCLUDED.wait_count,
            wait_capacity = EXCLUDED.wait_capacity,
-            last_scraped_at = EXCLUDED.last_scraped_at
+            last_scraped_at = EXCLUDED.last_scraped_at,
+            sequence_number = EXCLUDED.sequence_number,
+            part_of_term = EXCLUDED.part_of_term,
+            instructional_method = EXCLUDED.instructional_method,
+            campus = EXCLUDED.campus,
+            credit_hours = EXCLUDED.credit_hours,
+            credit_hour_low = EXCLUDED.credit_hour_low,
+            credit_hour_high = EXCLUDED.credit_hour_high,
+            cross_list = EXCLUDED.cross_list,
+            cross_list_capacity = EXCLUDED.cross_list_capacity,
+            cross_list_count = EXCLUDED.cross_list_count,
+            link_identifier = EXCLUDED.link_identifier,
+            is_section_linked = EXCLUDED.is_section_linked,
+            meeting_times = EXCLUDED.meeting_times,
+            attributes = EXCLUDED.attributes
+        RETURNING id
        "#,
    )
    .bind(&crns)
@@ -104,19 +215,111 @@ pub async fn batch_upsert_courses(courses: &[Course], db_pool: &PgPool) -> Resul
    .bind(&max_enrollments)
    .bind(&wait_counts)
    .bind(&wait_capacities)
-    .bind(course_count as i32)
-    .execute(db_pool)
+    .bind(&sequence_numbers)
+    .bind(&parts_of_term)
+    .bind(&instructional_methods)
+    .bind(&campuses)
+    .bind(&credit_hours)
+    .bind(&credit_hour_lows)
+    .bind(&credit_hour_highs)
+    .bind(&cross_lists)
+    .bind(&cross_list_capacities)
+    .bind(&cross_list_counts)
+    .bind(&link_identifiers)
+    .bind(&is_section_linkeds)
+    .bind(&meeting_times_json)
+    .bind(&attributes_json)
+    .fetch_all(db_pool)
    .await
    .map_err(|e| anyhow::anyhow!("Failed to batch upsert courses: {}", e))?;

-    let duration = start.elapsed();
+    Ok(rows)
+}

-    info!(
-        courses_count = course_count,
-        rows_affected = result.rows_affected(),
-        duration_ms = duration.as_millis(),
-        "Batch upserted courses"
-    );
+/// Deduplicate and upsert all instructors from the batch.
+async fn upsert_instructors(courses: &[Course], db_pool: &PgPool) -> Result<()> {
+    let mut seen = HashSet::new();
+    let mut banner_ids = Vec::new();
+    let mut display_names = Vec::new();
+    let mut emails: Vec<Option<&str>> = Vec::new();
+
+    for course in courses {
+        for faculty in &course.faculty {
+            if seen.insert(faculty.banner_id.as_str()) {
+                banner_ids.push(faculty.banner_id.as_str());
+                display_names.push(faculty.display_name.as_str());
+                emails.push(faculty.email_address.as_deref());
+            }
+        }
+    }
+
+    if banner_ids.is_empty() {
+        return Ok(());
+    }
+
+    sqlx::query(
+        r#"
+        INSERT INTO instructors (banner_id, display_name, email)
+        SELECT * FROM UNNEST($1::text[], $2::text[], $3::text[])
+        ON CONFLICT (banner_id)
+        DO UPDATE SET
+            display_name = EXCLUDED.display_name,
+            email = COALESCE(EXCLUDED.email, instructors.email)
+        "#,
+    )
+    .bind(&banner_ids)
+    .bind(&display_names)
+    .bind(&emails)
+    .execute(db_pool)
+    .await
+    .map_err(|e| anyhow::anyhow!("Failed to batch upsert instructors: {}", e))?;
+
+    Ok(())
+}
+
+/// Link courses to their instructors via the junction table.
+async fn upsert_course_instructors(
+    courses: &[Course],
+    course_ids: &[i32],
+    db_pool: &PgPool,
+) -> Result<()> {
+    let mut cids = Vec::new();
+    let mut iids = Vec::new();
+    let mut primaries = Vec::new();
+
+    for (course, &course_id) in courses.iter().zip(course_ids) {
+        for faculty in &course.faculty {
+            cids.push(course_id);
+            iids.push(faculty.banner_id.as_str());
+            primaries.push(faculty.primary_indicator);
+        }
+    }
+
+    if cids.is_empty() {
+        return Ok(());
+    }
+
+    // Delete existing links for these courses then re-insert.
+    // This handles instructor changes cleanly.
+    sqlx::query("DELETE FROM course_instructors WHERE course_id = ANY($1)")
+        .bind(&cids)
+        .execute(db_pool)
+        .await?;
+
+    sqlx::query(
+        r#"
+        INSERT INTO course_instructors (course_id, instructor_id, is_primary)
+        SELECT * FROM UNNEST($1::int4[], $2::text[], $3::bool[])
+        ON CONFLICT (course_id, instructor_id)
+        DO UPDATE SET is_primary = EXCLUDED.is_primary
+        "#,
+    )
+    .bind(&cids)
+    .bind(&iids)
+    .bind(&primaries)
+    .execute(db_pool)
+    .await
+    .map_err(|e| anyhow::anyhow!("Failed to batch upsert course_instructors: {}", e))?;

    Ok(())
 }
@@ -0,0 +1,128 @@
+//! Database query functions for courses, used by the web API.
+
+use crate::data::models::Course;
+use crate::error::Result;
+use sqlx::PgPool;
+
+/// Search courses by term with optional filters.
+///
+/// Returns `(courses, total_count)` for pagination. Uses FTS tsvector for word
+/// search and falls back to trigram ILIKE for substring matching.
+#[allow(clippy::too_many_arguments)]
+pub async fn search_courses(
+    db_pool: &PgPool,
+    term_code: &str,
+    subject: Option<&str>,
+    title_query: Option<&str>,
+    course_number_low: Option<i32>,
+    course_number_high: Option<i32>,
+    open_only: bool,
+    instructional_method: Option<&str>,
+    campus: Option<&str>,
+    limit: i32,
+    offset: i32,
+) -> Result<(Vec<Course>, i64)> {
+    // Build WHERE clauses dynamically via parameter binding + COALESCE trick:
+    // each optional filter uses ($N IS NULL OR column = $N) so NULL means "no filter".
+    let courses = sqlx::query_as::<_, Course>(
+        r#"
+        SELECT *
+        FROM courses
+        WHERE term_code = $1
+          AND ($2::text IS NULL OR subject = $2)
+          AND ($3::text IS NULL OR title_search @@ plainto_tsquery('simple', $3) OR title ILIKE '%' || $3 || '%')
+          AND ($4::int IS NULL OR course_number::int >= $4)
+          AND ($5::int IS NULL OR course_number::int <= $5)
+          AND ($6::bool = false OR max_enrollment > enrollment)
+          AND ($7::text IS NULL OR instructional_method = $7)
+          AND ($8::text IS NULL OR campus = $8)
+        ORDER BY subject, course_number, sequence_number
+        LIMIT $9 OFFSET $10
+        "#,
+    )
+    .bind(term_code)
+    .bind(subject)
+    .bind(title_query)
+    .bind(course_number_low)
+    .bind(course_number_high)
+    .bind(open_only)
+    .bind(instructional_method)
+    .bind(campus)
+    .bind(limit)
+    .bind(offset)
+    .fetch_all(db_pool)
+    .await?;
+
+    let total: (i64,) = sqlx::query_as(
+        r#"
+        SELECT COUNT(*)
+        FROM courses
+        WHERE term_code = $1
+          AND ($2::text IS NULL OR subject = $2)
+          AND ($3::text IS NULL OR title_search @@ plainto_tsquery('simple', $3) OR title ILIKE '%' || $3 || '%')
+          AND ($4::int IS NULL OR course_number::int >= $4)
+          AND ($5::int IS NULL OR course_number::int <= $5)
+          AND ($6::bool = false OR max_enrollment > enrollment)
+          AND ($7::text IS NULL OR instructional_method = $7)
+          AND ($8::text IS NULL OR campus = $8)
+        "#,
+    )
+    .bind(term_code)
+    .bind(subject)
+    .bind(title_query)
+    .bind(course_number_low)
+    .bind(course_number_high)
+    .bind(open_only)
+    .bind(instructional_method)
+    .bind(campus)
+    .fetch_one(db_pool)
+    .await?;
+
+    Ok((courses, total.0))
+}
+
+/// Get a single course by CRN and term.
+pub async fn get_course_by_crn(
+    db_pool: &PgPool,
+    crn: &str,
+    term_code: &str,
+) -> Result<Option<Course>> {
+    let course =
+        sqlx::query_as::<_, Course>("SELECT * FROM courses WHERE crn = $1 AND term_code = $2")
+            .bind(crn)
+            .bind(term_code)
+            .fetch_optional(db_pool)
+            .await?;
+    Ok(course)
+}
+
+/// Get instructors for a course by course ID.
+///
+/// Returns `(banner_id, display_name, email, is_primary)` tuples.
+pub async fn get_course_instructors(
+    db_pool: &PgPool,
+    course_id: i32,
+) -> Result<Vec<(String, String, Option<String>, bool)>> {
+    let rows: Vec<(String, String, Option<String>, bool)> = sqlx::query_as(
+        r#"
+        SELECT i.banner_id, i.display_name, i.email, ci.is_primary
+        FROM course_instructors ci
+        JOIN instructors i ON i.banner_id = ci.instructor_id
+        WHERE ci.course_id = $1
+        ORDER BY ci.is_primary DESC, i.display_name
+        "#,
+    )
+    .bind(course_id)
+    .fetch_all(db_pool)
+    .await?;
+    Ok(rows)
+}
+
+/// Get all distinct term codes that have courses in the DB.
+pub async fn get_available_terms(db_pool: &PgPool) -> Result<Vec<String>> {
+    let rows: Vec<(String,)> =
+        sqlx::query_as("SELECT DISTINCT term_code FROM courses ORDER BY term_code DESC")
+            .fetch_all(db_pool)
+            .await?;
+    Ok(rows.into_iter().map(|(tc,)| tc).collect())
+}
@@ -1,5 +1,7 @@
 //! Database models and schema.

 pub mod batch;
+pub mod courses;
 pub mod models;
+pub mod reference;
 pub mod scrape_jobs;
@@ -1,8 +1,31 @@
 //! `sqlx` models for the database schema.

 use chrono::{DateTime, Utc};
+use serde::{Deserialize, Serialize};
 use serde_json::Value;

+/// Represents a meeting time stored as JSONB in the courses table.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DbMeetingTime {
+    pub begin_time: Option<String>,
+    pub end_time: Option<String>,
+    pub start_date: String,
+    pub end_date: String,
+    pub monday: bool,
+    pub tuesday: bool,
+    pub wednesday: bool,
+    pub thursday: bool,
+    pub friday: bool,
+    pub saturday: bool,
+    pub sunday: bool,
+    pub building: Option<String>,
+    pub building_description: Option<String>,
+    pub room: Option<String>,
+    pub campus: Option<String>,
+    pub meeting_type: String,
+    pub meeting_schedule_type: String,
+}
+
 #[allow(dead_code)]
 #[derive(sqlx::FromRow, Debug, Clone)]
 pub struct Course {
@@ -17,6 +40,46 @@ pub struct Course {
    pub wait_count: i32,
    pub wait_capacity: i32,
    pub last_scraped_at: DateTime<Utc>,
+    // New scalar fields
+    pub sequence_number: Option<String>,
+    pub part_of_term: Option<String>,
+    pub instructional_method: Option<String>,
+    pub campus: Option<String>,
+    pub credit_hours: Option<i32>,
+    pub credit_hour_low: Option<i32>,
+    pub credit_hour_high: Option<i32>,
+    pub cross_list: Option<String>,
+    pub cross_list_capacity: Option<i32>,
+    pub cross_list_count: Option<i32>,
+    pub link_identifier: Option<String>,
+    pub is_section_linked: Option<bool>,
+    // JSONB fields
+    pub meeting_times: Value,
+    pub attributes: Value,
+}
+
+#[allow(dead_code)]
+#[derive(sqlx::FromRow, Debug, Clone)]
+pub struct Instructor {
+    pub banner_id: String,
+    pub display_name: String,
+    pub email: Option<String>,
+}
+
+#[allow(dead_code)]
+#[derive(sqlx::FromRow, Debug, Clone)]
+pub struct CourseInstructor {
+    pub course_id: i32,
+    pub instructor_id: String,
+    pub is_primary: bool,
+}
+
+#[allow(dead_code)]
+#[derive(sqlx::FromRow, Debug, Clone)]
+pub struct ReferenceData {
+    pub category: String,
+    pub code: String,
+    pub description: String,
 }

 #[allow(dead_code)]
@@ -0,0 +1,53 @@
+//! Database operations for the `reference_data` table (code→description lookups).
+
+use crate::data::models::ReferenceData;
+use crate::error::Result;
+use sqlx::PgPool;
+
+/// Batch upsert reference data entries.
+pub async fn batch_upsert(entries: &[ReferenceData], db_pool: &PgPool) -> Result<()> {
+    if entries.is_empty() {
+        return Ok(());
+    }
+
+    let categories: Vec<&str> = entries.iter().map(|e| e.category.as_str()).collect();
+    let codes: Vec<&str> = entries.iter().map(|e| e.code.as_str()).collect();
+    let descriptions: Vec<&str> = entries.iter().map(|e| e.description.as_str()).collect();
+
+    sqlx::query(
+        r#"
+        INSERT INTO reference_data (category, code, description)
+        SELECT * FROM UNNEST($1::text[], $2::text[], $3::text[])
+        ON CONFLICT (category, code)
+        DO UPDATE SET description = EXCLUDED.description
+        "#,
+    )
+    .bind(&categories)
+    .bind(&codes)
+    .bind(&descriptions)
+    .execute(db_pool)
+    .await?;
+
+    Ok(())
+}
+
+/// Get all reference data entries for a category.
+pub async fn get_by_category(category: &str, db_pool: &PgPool) -> Result<Vec<ReferenceData>> {
+    let rows = sqlx::query_as::<_, ReferenceData>(
+        "SELECT category, code, description FROM reference_data WHERE category = $1 ORDER BY description",
+    )
+    .bind(category)
+    .fetch_all(db_pool)
+    .await?;
+    Ok(rows)
+}
+
+/// Get all reference data entries (for cache initialization).
+pub async fn get_all(db_pool: &PgPool) -> Result<Vec<ReferenceData>> {
+    let rows = sqlx::query_as::<_, ReferenceData>(
+        "SELECT category, code, description FROM reference_data ORDER BY category, description",
+    )
+    .fetch_all(db_pool)
+    .await?;
+    Ok(rows)
+}