feat: sync RMP professor ratings and display in course search interface

2026-01-30 22:23:32 -06:00 · 2026-01-29 00:26:40 -06:00
parent 5fab8c216a
commit d108a41f91
17 changed files with 1173 additions and 248 deletions
@@ -98,23 +98,26 @@ pub async fn get_course_by_crn(

 /// Get instructors for a course by course ID.
 ///
-/// Returns `(banner_id, display_name, email, is_primary)` tuples.
+/// Returns `(banner_id, display_name, email, is_primary, rmp_avg_rating, rmp_num_ratings)` tuples.
+///
+/// The two RMP values come from a `LEFT JOIN` on `rmp_professors`, so both are
+/// `None` for an instructor with no linked RMP professor. Rows are ordered with
+/// primary instructors first, then by display name.
 pub async fn get_course_instructors(
    db_pool: &PgPool,
    course_id: i32,
-) -> Result<Vec<(String, String, Option<String>, bool)>> {
-    let rows: Vec<(String, String, Option<String>, bool)> = sqlx::query_as(
-        r#"
-        SELECT i.banner_id, i.display_name, i.email, ci.is_primary
+) -> Result<Vec<(String, String, Option<String>, bool, Option<f32>, Option<i32>)>> {
+    let rows: Vec<(String, String, Option<String>, bool, Option<f32>, Option<i32>)> =
+        sqlx::query_as(
+            r#"
+        SELECT i.banner_id, i.display_name, i.email, ci.is_primary,
+               rp.avg_rating, rp.num_ratings
        FROM course_instructors ci
        JOIN instructors i ON i.banner_id = ci.instructor_id
+        LEFT JOIN rmp_professors rp ON rp.legacy_id = i.rmp_legacy_id
        WHERE ci.course_id = $1
        ORDER BY ci.is_primary DESC, i.display_name
        "#,
-    )
-    .bind(course_id)
-    .fetch_all(db_pool)
-    .await?;
+        )
+        .bind(course_id)
+        .fetch_all(db_pool)
+        .await?;
    Ok(rows)
 }

@@ -4,4 +4,5 @@ pub mod batch;
 pub mod courses;
 pub mod models;
 pub mod reference;
+pub mod rmp;
 pub mod scrape_jobs;
@@ -0,0 +1,311 @@
+//! Database operations for RateMyProfessors data.
+
+use crate::error::Result;
+use crate::rmp::RmpProfessor;
+use sqlx::PgPool;
+use std::collections::{HashMap, HashSet};
+use tracing::{debug, info, warn};
+
+/// Bulk upsert RMP professors in a single statement using the UNNEST pattern.
+///
+/// Rows are deduplicated by `legacy_id` first — the RMP API can return the same
+/// professor on multiple pages. When duplicates exist, the occurrence that appears
+/// last in `professors` is the one written. First and last names are stored
+/// trimmed, and `last_synced_at` is set to `NOW()` on every row inserted or updated.
+///
+/// An empty `professors` slice is a no-op.
+///
+/// # Errors
+///
+/// Returns an error if the upsert statement fails.
+pub async fn batch_upsert_rmp_professors(
+    professors: &[RmpProfessor],
+    db_pool: &PgPool,
+) -> Result<()> {
+    if professors.is_empty() {
+        return Ok(());
+    }
+
+    // Walk the slice back-to-front so the first sighting of each `legacy_id` is its
+    // latest occurrence (latest page wins). Deduplication is required, not optional:
+    // PostgreSQL rejects an `ON CONFLICT DO UPDATE` that would touch a row twice.
+    let mut seen = HashSet::with_capacity(professors.len());
+    let deduped: Vec<&RmpProfessor> = professors
+        .iter()
+        .rev()
+        .filter(|p| seen.insert(p.legacy_id))
+        .collect();
+
+    // One column vector per UNNEST argument; every vector follows `deduped`'s order.
+    let legacy_ids: Vec<i32> = deduped.iter().map(|p| p.legacy_id).collect();
+    let graphql_ids: Vec<&str> = deduped.iter().map(|p| p.graphql_id.as_str()).collect();
+    // `trim` returns a sub-slice of the existing string, so the names can be bound
+    // as borrowed `&str` without allocating a copy of each one.
+    let first_names: Vec<&str> = deduped.iter().map(|p| p.first_name.trim()).collect();
+    let last_names: Vec<&str> = deduped.iter().map(|p| p.last_name.trim()).collect();
+    let departments: Vec<Option<&str>> = deduped
+        .iter()
+        .map(|p| p.department.as_deref())
+        .collect();
+    let avg_ratings: Vec<Option<f32>> = deduped.iter().map(|p| p.avg_rating).collect();
+    let avg_difficulties: Vec<Option<f32>> = deduped.iter().map(|p| p.avg_difficulty).collect();
+    let num_ratings: Vec<i32> = deduped.iter().map(|p| p.num_ratings).collect();
+    let would_take_again_pcts: Vec<Option<f32>> = deduped
+        .iter()
+        .map(|p| p.would_take_again_pct)
+        .collect();
+
+    sqlx::query(
+        r#"
+        INSERT INTO rmp_professors (
+            legacy_id, graphql_id, first_name, last_name, department,
+            avg_rating, avg_difficulty, num_ratings, would_take_again_pct,
+            last_synced_at
+        )
+        SELECT
+            v.legacy_id, v.graphql_id, v.first_name, v.last_name, v.department,
+            v.avg_rating, v.avg_difficulty, v.num_ratings, v.would_take_again_pct,
+            NOW()
+        FROM UNNEST(
+            $1::int4[], $2::text[], $3::text[], $4::text[], $5::text[],
+            $6::real[], $7::real[], $8::int4[], $9::real[]
+        ) AS v(
+            legacy_id, graphql_id, first_name, last_name, department,
+            avg_rating, avg_difficulty, num_ratings, would_take_again_pct
+        )
+        ON CONFLICT (legacy_id)
+        DO UPDATE SET
+            graphql_id = EXCLUDED.graphql_id,
+            first_name = EXCLUDED.first_name,
+            last_name = EXCLUDED.last_name,
+            department = EXCLUDED.department,
+            avg_rating = EXCLUDED.avg_rating,
+            avg_difficulty = EXCLUDED.avg_difficulty,
+            num_ratings = EXCLUDED.num_ratings,
+            would_take_again_pct = EXCLUDED.would_take_again_pct,
+            last_synced_at = EXCLUDED.last_synced_at
+        "#,
+    )
+    .bind(&legacy_ids)
+    .bind(&graphql_ids)
+    .bind(&first_names)
+    .bind(&last_names)
+    .bind(&departments)
+    .bind(&avg_ratings)
+    .bind(&avg_difficulties)
+    .bind(&num_ratings)
+    .bind(&would_take_again_pcts)
+    .execute(db_pool)
+    .await
+    .map_err(|e| anyhow::anyhow!("Failed to batch upsert RMP professors: {}", e))?;
+
+    Ok(())
+}
+
+/// Canonical form of a name fragment used as a matching key: surrounding
+/// whitespace removed, lowercased, and any trailing periods dropped.
+fn normalize(s: &str) -> String {
+    let mut lowered = s.trim().to_lowercase();
+    // `trim_end_matches` returns a prefix of `lowered`, so its length is a valid
+    // char boundary to cut at in place.
+    let kept = lowered.trim_end_matches('.').len();
+    lowered.truncate(kept);
+    lowered
+}
+
+/// Split a Banner display name of the form "Last, First Middle" into normalized
+/// `(last, first)` tokens.
+///
+/// Everything before the first comma is the last name; of the remainder, only
+/// the first whitespace-separated token is kept, which drops middle names and
+/// initials. Yields `None` when there is no comma or either part ends up empty.
+fn parse_display_name(display_name: &str) -> Option<(String, String)> {
+    let (raw_last, raw_rest) = display_name.split_once(',')?;
+    let raw_first = raw_rest.split_whitespace().next()?;
+    let (last, first) = (normalize(raw_last), normalize(raw_first));
+    let usable = !last.is_empty() && !first.is_empty();
+    usable.then_some((last, first))
+}
+
+/// Auto-match instructors to RMP professors by normalized name.
+///
+/// Loads every instructor whose `rmp_match_status` is `'pending'` and every row of
+/// `rmp_professors`, then matches in Rust on the normalized `(last, first)` key.
+/// A match is assigned only when exactly one RMP professor shares the
+/// instructor's key; instructors with an unparseable name, no candidate, or more
+/// than one candidate are left untouched.
+///
+/// Returns the number of instructor rows updated.
+///
+/// # Errors
+///
+/// Returns an error if loading instructors, loading professors, or the final
+/// update fails.
+pub async fn auto_match_instructors(db_pool: &PgPool) -> Result<u64> {
+    // Load pending instructors
+    let instructors: Vec<(String, String)> = sqlx::query_as(
+        "SELECT banner_id, display_name FROM instructors WHERE rmp_match_status = 'pending'",
+    )
+    .fetch_all(db_pool)
+    .await?;
+
+    if instructors.is_empty() {
+        info!(matched = 0, "No pending instructors to match");
+        return Ok(0);
+    }
+
+    // Load all RMP professors
+    let professors: Vec<(i32, String, String)> = sqlx::query_as(
+        "SELECT legacy_id, first_name, last_name FROM rmp_professors",
+    )
+    .fetch_all(db_pool)
+    .await?;
+
+    // Lookup: (normalized_last, normalized_first) -> every legacy_id with that name.
+    let mut rmp_index: HashMap<(String, String), Vec<i32>> = HashMap::new();
+    for (legacy_id, first, last) in &professors {
+        rmp_index
+            .entry((normalize(last), normalize(first)))
+            .or_default()
+            .push(*legacy_id);
+    }
+
+    // Match each instructor
+    let mut matches: Vec<(i32, String)> = Vec::new(); // (legacy_id, banner_id)
+    // Counts every display name `parse_display_name` rejects (missing comma or an
+    // empty part). The name is kept because it is also the emitted log field.
+    let mut no_comma = 0u64;
+    let mut no_match = 0u64;
+    let mut ambiguous = 0u64;
+
+    for (banner_id, display_name) in &instructors {
+        let Some(key) = parse_display_name(display_name) else {
+            no_comma += 1;
+            continue;
+        };
+
+        match rmp_index.get(&key).map(Vec::as_slice) {
+            // Exactly one candidate: safe to assign.
+            Some([legacy_id]) => matches.push((*legacy_id, banner_id.clone())),
+            Some(ids) => {
+                ambiguous += 1;
+                debug!(
+                    banner_id,
+                    display_name,
+                    candidates = ids.len(),
+                    "Ambiguous RMP match, skipping"
+                );
+            }
+            None => no_match += 1,
+        }
+    }
+
+    if no_comma > 0 || ambiguous > 0 {
+        warn!(
+            total_pending = instructors.len(),
+            no_comma,
+            no_match,
+            ambiguous,
+            matched = matches.len(),
+            "RMP matching diagnostics"
+        );
+    }
+
+    // Batch update matches
+    if matches.is_empty() {
+        info!(matched = 0, "Auto-matched instructors to RMP professors");
+        return Ok(0);
+    }
+
+    let legacy_ids: Vec<i32> = matches.iter().map(|(id, _)| *id).collect();
+    let banner_ids: Vec<&str> = matches.iter().map(|(_, bid)| bid.as_str()).collect();
+
+    // Re-check the status in the UPDATE itself: a row that left 'pending' after
+    // the SELECT above (e.g. resolved by another writer) must not be overwritten
+    // with an automatic match.
+    let result = sqlx::query(
+        r#"
+        UPDATE instructors i
+        SET
+            rmp_legacy_id = m.legacy_id,
+            rmp_match_status = 'auto'
+        FROM UNNEST($1::int4[], $2::text[]) AS m(legacy_id, banner_id)
+        WHERE i.banner_id = m.banner_id
+          AND i.rmp_match_status = 'pending'
+        "#,
+    )
+    .bind(&legacy_ids)
+    .bind(&banner_ids)
+    .execute(db_pool)
+    .await
+    .map_err(|e| anyhow::anyhow!("Failed to update instructor RMP matches: {}", e))?;
+
+    let matched = result.rows_affected();
+    info!(matched, "Auto-matched instructors to RMP professors");
+    Ok(matched)
+}
+
+/// Look up the RMP rating for the instructor identified by `banner_id`.
+///
+/// Yields `Some((avg_rating, num_ratings))` only when the instructor is linked to
+/// an RMP professor whose `avg_rating` is non-NULL; otherwise `None`.
+#[allow(dead_code)]
+pub async fn get_instructor_rmp_data(
+    db_pool: &PgPool,
+    banner_id: &str,
+) -> Result<Option<(f32, i32)>> {
+    const RATING_SQL: &str = r#"
+        SELECT rp.avg_rating, rp.num_ratings
+        FROM instructors i
+        JOIN rmp_professors rp ON rp.legacy_id = i.rmp_legacy_id
+        WHERE i.banner_id = $1
+          AND rp.avg_rating IS NOT NULL
+        "#;
+
+    let rating = sqlx::query_as::<_, (f32, i32)>(RATING_SQL)
+        .bind(banner_id)
+        .fetch_optional(db_pool)
+        .await?;
+    Ok(rating)
+}
+
+// Unit tests for the pure name-handling helpers (`parse_display_name` and
+// `normalize`); nothing here touches the database.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Plain "Last, First" input is lowercased and split on the comma.
+    #[test]
+    fn parse_standard_name() {
+        assert_eq!(
+            parse_display_name("Smith, John"),
+            Some(("smith".into(), "john".into()))
+        );
+    }
+
+    // Only the first token after the comma is kept; the middle name is dropped.
+    #[test]
+    fn parse_name_with_middle() {
+        assert_eq!(
+            parse_display_name("Smith, John David"),
+            Some(("smith".into(), "john".into()))
+        );
+    }
+
+    // A middle initial is dropped the same way as a full middle name.
+    #[test]
+    fn parse_name_with_middle_initial() {
+        assert_eq!(
+            parse_display_name("Garcia, Maria L."),
+            Some(("garcia".into(), "maria".into()))
+        );
+    }
+
+    #[test]
+    fn parse_name_with_suffix_in_last() {
+        // Banner may encode "Jr." as part of the last name.
+        // normalize() strips trailing periods so "Jr." becomes "jr".
+        assert_eq!(
+            parse_display_name("Smith Jr., James"),
+            Some(("smith jr".into(), "james".into()))
+        );
+    }
+
+    // Without a comma there is no way to tell last name from first name.
+    #[test]
+    fn parse_no_comma_returns_none() {
+        assert_eq!(parse_display_name("SingleName"), None);
+    }
+
+    // Nothing after the comma means there is no first-name token.
+    #[test]
+    fn parse_empty_first_returns_none() {
+        assert_eq!(parse_display_name("Smith,"), None);
+    }
+
+    // An empty last-name part is rejected as well.
+    #[test]
+    fn parse_empty_last_returns_none() {
+        assert_eq!(parse_display_name(", John"), None);
+    }
+
+    // Padding around either part and between tokens does not affect the result.
+    #[test]
+    fn parse_extra_whitespace() {
+        assert_eq!(
+            parse_display_name("  Doe ,  Jane   Marie  "),
+            Some(("doe".into(), "jane".into()))
+        );
+    }
+
+    #[test]
+    fn normalize_trims_and_lowercases() {
+        assert_eq!(normalize("  FOO  "), "foo");
+    }
+
+    #[test]
+    fn normalize_strips_trailing_period() {
+        assert_eq!(normalize("Jr."), "jr");
+    }
+}
@@ -7,6 +7,7 @@ pub mod data;
 pub mod error;
 pub mod formatter;
 pub mod logging;
+pub mod rmp;
 pub mod scraper;
 pub mod services;
 pub mod signals;
@@ -14,6 +14,7 @@ mod data;
 mod error;
 mod formatter;
 mod logging;
+mod rmp;
 mod scraper;
 mod services;
 mod signals;
@@ -0,0 +1,161 @@
+//! RateMyProfessors GraphQL client for bulk professor data sync.
+
+use anyhow::Result;
+use serde::{Deserialize, Serialize};
+use tracing::{debug, info};
+
+/// UTSA's school ID on RateMyProfessors (base64 of "School-1516").
+const UTSA_SCHOOL_ID: &str = "U2Nob29sLTE1MTY=";
+
+/// Basic auth header value (base64 of "test:test").
+const AUTH_HEADER: &str = "Basic dGVzdDp0ZXN0";
+
+/// GraphQL endpoint.
+const GRAPHQL_URL: &str = "https://www.ratemyprofessors.com/graphql";
+
+/// Page size for paginated fetches.
+const PAGE_SIZE: u32 = 100;
+
+/// A professor record from RateMyProfessors.
+///
+/// Built from one `node` of the GraphQL `teachers` connection. Optional fields
+/// are `None` when the response value is absent or not of the expected JSON type.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RmpProfessor {
+    /// Numeric `legacyId` from the response.
+    pub legacy_id: i32,
+    /// Opaque GraphQL node `id` from the response.
+    pub graphql_id: String,
+    /// `firstName` from the response; empty string when missing.
+    pub first_name: String,
+    /// `lastName` from the response; empty string when missing.
+    pub last_name: String,
+    /// `department` from the response, if present.
+    pub department: Option<String>,
+    /// `avgRating` from the response, if numeric.
+    pub avg_rating: Option<f32>,
+    /// `avgDifficulty` from the response, if numeric.
+    pub avg_difficulty: Option<f32>,
+    /// `numRatings` from the response; `0` when missing.
+    pub num_ratings: i32,
+    /// `wouldTakeAgainPercent` from the response; negative values are mapped to
+    /// `None` (presumably RMP's "no data" sentinel — TODO confirm).
+    pub would_take_again_pct: Option<f32>,
+}
+
+/// Client for fetching professor data from RateMyProfessors.
+pub struct RmpClient {
+    /// HTTP client used to POST the GraphQL queries.
+    http: reqwest::Client,
+}
+
+impl RmpClient {
+    /// Create a client backed by a default `reqwest::Client`.
+    pub fn new() -> Self {
+        Self {
+            http: reqwest::Client::new(),
+        }
+    }
+
+    /// Fetch all professors for UTSA via paginated GraphQL queries.
+    ///
+    /// Requests pages of `PAGE_SIZE` teachers, following `pageInfo.endCursor`
+    /// until `pageInfo.hasNextPage` is false, and returns every node seen. The
+    /// same professor may appear more than once across pages; no deduplication
+    /// is done here.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when a request fails or returns a non-success status,
+    /// when the response has no `edges` array, when a node lacks `legacyId`
+    /// (or it does not fit in `i32`) or `id`, or when the server reports another
+    /// page without supplying a new cursor.
+    pub async fn fetch_all_professors(&self) -> Result<Vec<RmpProfessor>> {
+        let mut all = Vec::new();
+        let mut cursor: Option<String> = None;
+
+        loop {
+            // The cursor is interpolated verbatim into the query text.
+            let after_clause = match &cursor {
+                Some(c) => format!(r#", after: "{}""#, c),
+                None => String::new(),
+            };
+
+            let query = format!(
+                r#"query {{
+  newSearch {{
+    teachers(query: {{ text: "", schoolID: "{school_id}" }}, first: {page_size}{after}) {{
+      edges {{
+        cursor
+        node {{
+          id
+          legacyId
+          firstName
+          lastName
+          department
+          avgRating
+          avgDifficulty
+          numRatings
+          wouldTakeAgainPercent
+        }}
+      }}
+      pageInfo {{
+        hasNextPage
+        endCursor
+      }}
+    }}
+  }}
+}}"#,
+                school_id = UTSA_SCHOOL_ID,
+                page_size = PAGE_SIZE,
+                after = after_clause,
+            );
+
+            let body = serde_json::json!({ "query": query });
+
+            let resp = self
+                .http
+                .post(GRAPHQL_URL)
+                .header("Authorization", AUTH_HEADER)
+                .json(&body)
+                .send()
+                .await?;
+
+            let status = resp.status();
+            if !status.is_success() {
+                let text = resp.text().await.unwrap_or_default();
+                anyhow::bail!("RMP GraphQL request failed ({status}): {text}");
+            }
+
+            let json: serde_json::Value = resp.json().await?;
+
+            let teachers = &json["data"]["newSearch"]["teachers"];
+            let edges = teachers["edges"]
+                .as_array()
+                .ok_or_else(|| anyhow::anyhow!("Missing edges in RMP response"))?;
+
+            for edge in edges {
+                let node = &edge["node"];
+
+                // Checked narrowing: a plain `as i32` would silently wrap an
+                // out-of-range id into a different, valid-looking one.
+                let legacy_id = node["legacyId"]
+                    .as_i64()
+                    .ok_or_else(|| anyhow::anyhow!("Missing legacyId"))?;
+                let legacy_id = i32::try_from(legacy_id)
+                    .map_err(|_| anyhow::anyhow!("legacyId {legacy_id} does not fit in i32"))?;
+
+                // Missing or out-of-range counts are treated as "no ratings".
+                let num_ratings = node["numRatings"]
+                    .as_i64()
+                    .and_then(|n| i32::try_from(n).ok())
+                    .unwrap_or(0);
+
+                // Negative percentages are dropped rather than stored.
+                let wta = node["wouldTakeAgainPercent"]
+                    .as_f64()
+                    .map(|v| v as f32)
+                    .filter(|&v| v >= 0.0);
+
+                all.push(RmpProfessor {
+                    legacy_id,
+                    graphql_id: node["id"]
+                        .as_str()
+                        .ok_or_else(|| anyhow::anyhow!("Missing id"))?
+                        .to_string(),
+                    first_name: node["firstName"]
+                        .as_str()
+                        .unwrap_or_default()
+                        .to_string(),
+                    last_name: node["lastName"]
+                        .as_str()
+                        .unwrap_or_default()
+                        .to_string(),
+                    department: node["department"].as_str().map(|s| s.to_string()),
+                    avg_rating: node["avgRating"].as_f64().map(|v| v as f32),
+                    avg_difficulty: node["avgDifficulty"].as_f64().map(|v| v as f32),
+                    num_ratings,
+                    would_take_again_pct: wta,
+                });
+            }
+
+            let page_info = &teachers["pageInfo"];
+            let has_next = page_info["hasNextPage"].as_bool().unwrap_or(false);
+
+            if !has_next {
+                break;
+            }
+
+            // A missing or unchanged cursor would make the next iteration request a
+            // page we already have, looping forever while `all` grows without bound.
+            let next_cursor = page_info["endCursor"].as_str().ok_or_else(|| {
+                anyhow::anyhow!("RMP response reports a next page but has no endCursor")
+            })?;
+            if cursor.as_deref() == Some(next_cursor) {
+                anyhow::bail!("RMP pagination cursor did not advance");
+            }
+            cursor = Some(next_cursor.to_string());
+
+            debug!(
+                fetched = all.len(),
+                "RMP pagination: fetching next page"
+            );
+        }
+
+        info!(total = all.len(), "Fetched all RMP professors");
+        Ok(all)
+    }
+}
@@ -2,6 +2,7 @@ use crate::banner::{BannerApi, Term};
 use crate::data::models::{ReferenceData, ScrapePriority, TargetType};
 use crate::data::scrape_jobs;
 use crate::error::Result;
+use crate::rmp::RmpClient;
 use crate::scraper::jobs::subject::SubjectJob;
 use crate::state::ReferenceCache;
 use serde_json::json;
@@ -16,6 +17,9 @@ use tracing::{debug, error, info, warn};
 /// How often reference data is re-scraped (6 hours).
 const REFERENCE_DATA_INTERVAL: Duration = Duration::from_secs(6 * 60 * 60);

+/// How often RMP data is synced (24 hours).
+const RMP_SYNC_INTERVAL: Duration = Duration::from_secs(24 * 60 * 60);
+
 /// Periodically analyzes data and enqueues prioritized scrape jobs.
 pub struct Scheduler {
    db_pool: PgPool,
@@ -53,6 +57,8 @@ impl Scheduler {
        let mut current_work: Option<(tokio::task::JoinHandle<()>, CancellationToken)> = None;
        // Scrape reference data immediately on first cycle
        let mut last_ref_scrape = Instant::now() - REFERENCE_DATA_INTERVAL;
+        // Sync RMP data immediately on first cycle
+        let mut last_rmp_sync = Instant::now() - RMP_SYNC_INTERVAL;

        loop {
            tokio::select! {
@@ -60,6 +66,7 @@ impl Scheduler {
                    let cancel_token = CancellationToken::new();

                    let should_scrape_ref = last_ref_scrape.elapsed() >= REFERENCE_DATA_INTERVAL;
+                    let should_sync_rmp = last_rmp_sync.elapsed() >= RMP_SYNC_INTERVAL;

                    // Spawn work in separate task to allow graceful cancellation during shutdown.
                    let work_handle = tokio::spawn({
@@ -68,28 +75,47 @@ impl Scheduler {
                        let cancel_token = cancel_token.clone();
                        let reference_cache = self.reference_cache.clone();

-                        async move {
-                            tokio::select! {
-                                _ = async {
-                                    if should_scrape_ref
-                                        && let Err(e) = Self::scrape_reference_data(&db_pool, &banner_api, &reference_cache).await
-                                    {
-                                        error!(error = ?e, "Failed to scrape reference data");
+                                async move {
+                                    tokio::select! {
+                                        _ = async {
+                                            // RMP sync is independent of Banner API — run it
+                                            // concurrently with reference data scraping so it
+                                            // doesn't wait behind rate-limited Banner calls.
+                                            let rmp_fut = async {
+                                                if should_sync_rmp
+                                                    && let Err(e) = Self::sync_rmp_data(&db_pool).await
+                                                {
+                                                    error!(error = ?e, "Failed to sync RMP data");
+                                                }
+                                            };
+
+                                            let ref_fut = async {
+                                                if should_scrape_ref
+                                                    && let Err(e) = Self::scrape_reference_data(&db_pool, &banner_api, &reference_cache).await
+                                                {
+                                                    error!(error = ?e, "Failed to scrape reference data");
+                                                }
+                                            };
+
+                                            tokio::join!(rmp_fut, ref_fut);
+
+                                            if let Err(e) = Self::schedule_jobs_impl(&db_pool, &banner_api).await {
+                                                error!(error = ?e, "Failed to schedule jobs");
+                                            }
+                                        } => {}
+                                        _ = cancel_token.cancelled() => {
+                                            debug!("Scheduling work cancelled gracefully");
+                                        }
                                    }
-                                    if let Err(e) = Self::schedule_jobs_impl(&db_pool, &banner_api).await {
-                                        error!(error = ?e, "Failed to schedule jobs");
-                                    }
-                                } => {}
-                                _ = cancel_token.cancelled() => {
-                                    debug!("Scheduling work cancelled gracefully");
                                }
-                            }
-                        }
                    });

                    if should_scrape_ref {
                        last_ref_scrape = Instant::now();
                    }
+                    if should_sync_rmp {
+                        last_rmp_sync = Instant::now();
+                    }

                    current_work = Some((work_handle, cancel_token));
                    next_run = time::Instant::now() + work_interval;
@@ -194,6 +220,24 @@ impl Scheduler {
        Ok(())
    }

+    /// Run one full RMP sync: download every professor, upsert them into the
+    /// database, then auto-match pending Banner instructors against them.
+    ///
+    /// Stops at the first failing step and returns its error.
+    #[tracing::instrument(skip_all)]
+    async fn sync_rmp_data(db_pool: &PgPool) -> Result<()> {
+        use crate::data::rmp::{auto_match_instructors, batch_upsert_rmp_professors};
+
+        info!("Starting RMP data sync");
+
+        let professors = RmpClient::new().fetch_all_professors().await?;
+        let total = professors.len();
+
+        batch_upsert_rmp_professors(&professors, db_pool).await?;
+        info!(total, "RMP professors upserted");
+
+        let matched = auto_match_instructors(db_pool).await?;
+        info!(total, matched, "RMP sync complete");
+
+        Ok(())
+    }
+
    /// Scrape all reference data categories from Banner and upsert to DB, then refresh cache.
    #[tracing::instrument(skip_all)]
    async fn scrape_reference_data(
@@ -357,6 +357,8 @@ pub struct InstructorResponse {
    display_name: String,
    email: Option<String>,
    is_primary: bool,
+    rmp_rating: Option<f32>,
+    rmp_num_ratings: Option<i32>,
 }

 #[derive(Serialize, TS)]
@@ -387,11 +389,15 @@ async fn build_course_response(
        .unwrap_or_default()
        .into_iter()
        .map(
-            |(banner_id, display_name, email, is_primary)| InstructorResponse {
-                banner_id,
-                display_name,
-                email,
-                is_primary,
+            |(banner_id, display_name, email, is_primary, rmp_rating, rmp_num_ratings)| {
+                InstructorResponse {
+                    banner_id,
+                    display_name,
+                    email,
+                    is_primary,
+                    rmp_rating,
+                    rmp_num_ratings,
+                }
            },
        )
        .collect();