From 1954166db65dbf934c13a148e2897cccec5041ee Mon Sep 17 00:00:00 2001 From: Xevion Date: Fri, 30 Jan 2026 19:44:53 -0600 Subject: [PATCH] feat: add name parsing and normalization for instructor-RMP matching --- Cargo.lock | 67 +- Cargo.toml | 2 + ...0131300000_add_instructor_name_columns.sql | 5 + src/app.rs | 7 +- src/data/batch.rs | 19 +- src/data/mod.rs | 1 + src/data/models.rs | 2 + src/data/names.rs | 728 ++++++++++++++++++ src/data/rmp.rs | 147 ++-- src/data/rmp_matching.rs | 102 ++- src/web/admin_rmp.rs | 60 +- tests/admin_rmp.rs | 102 +++ 12 files changed, 1073 insertions(+), 169 deletions(-) create mode 100644 migrations/20260131300000_add_instructor_name_columns.sql create mode 100644 src/data/names.rs create mode 100644 tests/admin_rmp.rs diff --git a/Cargo.lock b/Cargo.lock index 43e2062..c3a4471 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -292,6 +292,7 @@ dependencies = [ "futures", "governor", "html-escape", + "htmlize", "http 1.3.1", "mime_guess", "num-format", @@ -315,6 +316,7 @@ dependencies = [ "tracing", "tracing-subscriber", "ts-rs", + "unicode-normalization", "url", "urlencoding", "yansi", @@ -492,7 +494,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" dependencies = [ "chrono", - "phf", + "phf 0.12.1", ] [[package]] @@ -1348,6 +1350,19 @@ dependencies = [ "utf8-width", ] +[[package]] +name = "htmlize" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d347c0de239be20ba0982e4822de3124404281e119ae3e11f5d7425a414e1935" +dependencies = [ + "memchr", + "pastey", + "phf 0.11.3", + "phf_codegen", + "serde_json", +] + [[package]] name = "http" version = "0.2.12" @@ -2100,6 +2115,12 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "pastey" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec" + [[package]] name = "pear" version = "0.2.9" @@ -2138,13 +2159,51 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_shared 0.11.3", +] + [[package]] name = "phf" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" dependencies = [ - "phf_shared", + "phf_shared 0.12.1", +] + +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared 0.11.3", + "rand 0.8.5", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", ] [[package]] @@ -3982,9 +4041,9 @@ checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" [[package]] name = "unicode-normalization" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" dependencies = [ "tinyvec", ] diff 
--git a/Cargo.toml b/Cargo.toml index a261c67..211f57b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -60,6 +60,8 @@ html-escape = "0.2.13" axum-extra = { version = "0.12.5", features = ["query"] } urlencoding = "2.1.3" chrono-tz = "0.10.4" +htmlize = { version = "1.0.6", features = ["unescape"] } +unicode-normalization = "0.1.25" [dev-dependencies] diff --git a/migrations/20260131300000_add_instructor_name_columns.sql b/migrations/20260131300000_add_instructor_name_columns.sql new file mode 100644 index 0000000..4380389 --- /dev/null +++ b/migrations/20260131300000_add_instructor_name_columns.sql @@ -0,0 +1,5 @@ +-- Add structured first/last name columns to instructors. +-- Populated by Rust-side backfill (parse_banner_name) since we need +-- HTML entity decoding and suffix extraction that SQL can't handle well. +ALTER TABLE instructors ADD COLUMN first_name VARCHAR; +ALTER TABLE instructors ADD COLUMN last_name VARCHAR; diff --git a/src/app.rs b/src/app.rs index 63723ff..f83547f 100644 --- a/src/app.rs +++ b/src/app.rs @@ -14,7 +14,7 @@ use sqlx::postgres::PgPoolOptions; use std::process::ExitCode; use std::sync::Arc; use std::time::Duration; -use tracing::{error, info}; +use tracing::{error, info, warn}; /// Main application struct containing all necessary components pub struct App { @@ -70,6 +70,11 @@ impl App { .context("Failed to run database migrations")?; info!("Database migrations completed successfully"); + // Backfill structured name columns for existing instructors + if let Err(e) = crate::data::names::backfill_instructor_names(&db_pool).await { + warn!(error = ?e, "Failed to backfill instructor names (non-fatal)"); + } + // Create BannerApi and AppState let banner_api = BannerApi::new_with_config( config.banner_base_url.clone(), diff --git a/src/data/batch.rs b/src/data/batch.rs index c3c71d8..d433471 100644 --- a/src/data/batch.rs +++ b/src/data/batch.rs @@ -2,6 +2,7 @@ use crate::banner::Course; use crate::data::models::{DbMeetingTime, UpsertCounts}; 
+use crate::data::names::parse_banner_name; use crate::error::Result; use sqlx::PgConnection; use sqlx::PgPool; @@ -628,6 +629,8 @@ async fn upsert_instructors( ) -> Result> { let mut seen = HashSet::new(); let mut display_names: Vec<&str> = Vec::new(); + let mut first_names: Vec> = Vec::new(); + let mut last_names: Vec> = Vec::new(); let mut emails_lower: Vec = Vec::new(); let mut skipped_no_email = 0u32; @@ -636,7 +639,10 @@ async fn upsert_instructors( if let Some(email) = &faculty.email_address { let email_lower = email.to_lowercase(); if seen.insert(email_lower.clone()) { + let parts = parse_banner_name(&faculty.display_name); display_names.push(faculty.display_name.as_str()); + first_names.push(parts.as_ref().map(|p| p.first.clone())); + last_names.push(parts.as_ref().map(|p| p.last.clone())); emails_lower.push(email_lower); } } else { @@ -657,18 +663,25 @@ async fn upsert_instructors( } let email_refs: Vec<&str> = emails_lower.iter().map(|s| s.as_str()).collect(); + let first_name_refs: Vec> = first_names.iter().map(|s| s.as_deref()).collect(); + let last_name_refs: Vec> = last_names.iter().map(|s| s.as_deref()).collect(); let rows: Vec<(i32, String)> = sqlx::query_as( r#" - INSERT INTO instructors (display_name, email) - SELECT * FROM UNNEST($1::text[], $2::text[]) + INSERT INTO instructors (display_name, email, first_name, last_name) + SELECT * FROM UNNEST($1::text[], $2::text[], $3::text[], $4::text[]) ON CONFLICT (email) - DO UPDATE SET display_name = EXCLUDED.display_name + DO UPDATE SET + display_name = EXCLUDED.display_name, + first_name = EXCLUDED.first_name, + last_name = EXCLUDED.last_name RETURNING id, email "#, ) .bind(&display_names) .bind(&email_refs) + .bind(&first_name_refs) + .bind(&last_name_refs) .fetch_all(&mut *conn) .await .map_err(|e| anyhow::anyhow!("Failed to batch upsert instructors: {}", e))?; diff --git a/src/data/mod.rs b/src/data/mod.rs index 8b984db..8e101c9 100644 --- a/src/data/mod.rs +++ b/src/data/mod.rs @@ -3,6 +3,7 @@ pub 
mod batch; pub mod courses; pub mod models; +pub mod names; pub mod reference; pub mod rmp; pub mod rmp_matching; diff --git a/src/data/models.rs b/src/data/models.rs index 37abd64..4184e78 100644 --- a/src/data/models.rs +++ b/src/data/models.rs @@ -103,6 +103,8 @@ pub struct Instructor { pub display_name: String, pub email: String, pub rmp_match_status: String, + pub first_name: Option, + pub last_name: Option, } #[allow(dead_code)] diff --git a/src/data/names.rs b/src/data/names.rs new file mode 100644 index 0000000..695dbdb --- /dev/null +++ b/src/data/names.rs @@ -0,0 +1,728 @@ +//! Name parsing, normalization, and matching utilities. +//! +//! Handles the mismatch between Banner's single `display_name` ("Last, First Middle") +//! and RMP's separate `first_name`/`last_name` fields, plus data quality issues +//! from both sources (HTML entities, accents, nicknames, suffixes, junk). + +use sqlx::PgPool; +use tracing::{info, warn}; +use unicode_normalization::UnicodeNormalization; + +/// Known name suffixes to extract from the last-name portion. +const SUFFIXES: &[&str] = &["iv", "iii", "ii", "jr", "sr"]; + +/// Parsed, cleaned name components. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct NameParts { + /// Cleaned display-quality first name(s): "H. Paul", "María" + pub first: String, + /// Cleaned display-quality last name: "O'Brien", "LeBlanc" + pub last: String, + /// Middle name/initial if detected: "Manuel", "L." + pub middle: Option, + /// Suffix if detected: "III", "Jr" + pub suffix: Option, + /// Nicknames extracted from parentheses: ["Ken"], ["Qian"] + pub nicknames: Vec, +} + +/// Decode common HTML entities found in Banner data. +/// +/// Handles both named entities (`&`, `ü`) and numeric references +/// (`'`, `'`). +fn decode_html_entities(s: &str) -> String { + if !s.contains('&') { + return s.to_string(); + } + htmlize::unescape(s).to_string() +} + +/// Extract parenthesized nicknames from a name string. 
+/// +/// `"William (Ken)"` → `("William", vec!["Ken"])` +/// `"Guenevere (Qian)"` → `("Guenevere", vec!["Qian"])` +/// `"John (jack) C."` → `("John C.", vec!["jack"])` +fn extract_nicknames(s: &str) -> (String, Vec) { + let mut nicknames = Vec::new(); + let mut cleaned = String::with_capacity(s.len()); + let mut chars = s.chars().peekable(); + + while let Some(ch) = chars.next() { + if ch == '(' { + let mut nick = String::new(); + for inner in chars.by_ref() { + if inner == ')' { + break; + } + nick.push(inner); + } + let nick = nick.trim().to_string(); + if !nick.is_empty() { + nicknames.push(nick); + } + } else if ch == '"' || ch == '\u{201C}' || ch == '\u{201D}' { + // Extract quoted nicknames: Thomas "Butch" → nickname "Butch" + let mut nick = String::new(); + for inner in chars.by_ref() { + if inner == '"' || inner == '\u{201C}' || inner == '\u{201D}' { + break; + } + nick.push(inner); + } + let nick = nick.trim().to_string(); + if !nick.is_empty() { + nicknames.push(nick); + } + } else { + cleaned.push(ch); + } + } + + // Collapse multiple spaces left by extraction + let cleaned = collapse_whitespace(&cleaned); + (cleaned, nicknames) +} + +/// Extract a suffix (Jr, Sr, II, III, IV) from the last-name portion. +/// +/// `"LeBlanc III"` → `("LeBlanc", Some("III"))` +/// `"Smith Jr."` → `("Smith", Some("Jr."))` +fn extract_suffix(last: &str) -> (String, Option) { + // Try to match the last token as a suffix + let tokens: Vec<&str> = last.split_whitespace().collect(); + if tokens.len() < 2 { + return (last.to_string(), None); + } + + let candidate = tokens.last().unwrap(); + let candidate_normalized = candidate.to_lowercase().trim_end_matches('.').to_string(); + + if SUFFIXES.contains(&candidate_normalized.as_str()) { + let name_part = tokens[..tokens.len() - 1].join(" "); + return (name_part, Some(candidate.to_string())); + } + + (last.to_string(), None) +} + +/// Strip junk commonly found in RMP name fields. 
+/// +/// - Trailing commas: `"Cronenberger,"` → `"Cronenberger"` +/// - Email addresses: `"Neel.Baumgardner@utsa.edu"` → `""` (returns empty) +fn strip_junk(s: &str) -> String { + let s = s.trim(); + + // If the string looks like an email, return empty + if s.contains('@') && s.contains('.') && !s.contains(' ') { + return String::new(); + } + + // Strip trailing commas + s.trim_end_matches(',').trim().to_string() +} + +/// Collapse runs of whitespace into single spaces and trim. +fn collapse_whitespace(s: &str) -> String { + s.split_whitespace().collect::>().join(" ") +} + +/// Parse a Banner `display_name` ("Last, First Middle") into structured parts. +/// +/// Handles HTML entities, suffixes, and multi-token names. +/// +/// # Examples +/// +/// ``` +/// use banner::data::names::parse_banner_name; +/// +/// let parts = parse_banner_name("O'Brien, Erin").unwrap(); +/// assert_eq!(parts.first, "Erin"); +/// assert_eq!(parts.last, "O'Brien"); +/// ``` +pub fn parse_banner_name(display_name: &str) -> Option { + // 1. Decode HTML entities + let decoded = decode_html_entities(display_name); + + // 2. Split on first comma + let (last_part, first_part) = decoded.split_once(',')?; + let last_part = last_part.trim(); + let first_part = first_part.trim(); + + if last_part.is_empty() || first_part.is_empty() { + return None; + } + + // 3. Extract suffix from last name + let (last_clean, suffix) = extract_suffix(last_part); + + // 4. Parse first-name portion: first token(s) + optional middle + // Banner format is "First Middle", so we keep all tokens as first_name + // to support "H. Paul" style names + let first_clean = collapse_whitespace(first_part); + + Some(NameParts { + first: first_clean, + last: last_clean, + middle: None, // Banner doesn't clearly delineate middle vs first + suffix, + nicknames: Vec::new(), // Banner doesn't include nicknames + }) +} + +/// Parse RMP professor name fields into structured parts. 
+/// +/// Handles junk data, nicknames in parentheses/quotes, and suffixes. +/// +/// # Examples +/// +/// ``` +/// use banner::data::names::parse_rmp_name; +/// +/// let parts = parse_rmp_name("William (Ken)", "Burchenal").unwrap(); +/// assert_eq!(parts.first, "William"); +/// assert_eq!(parts.nicknames, vec!["Ken"]); +/// ``` +pub fn parse_rmp_name(first_name: &str, last_name: &str) -> Option<NameParts> { + let first_cleaned = strip_junk(first_name); + let last_cleaned = strip_junk(last_name); + + if first_cleaned.is_empty() || last_cleaned.is_empty() { + return None; + } + + // Extract nicknames from parens/quotes in first name + let (first_no_nicks, nicknames) = extract_nicknames(&first_cleaned); + let first_final = collapse_whitespace(&first_no_nicks); + + // Extract suffix from last name + let (last_final, suffix) = extract_suffix(&last_cleaned); + + if first_final.is_empty() || last_final.is_empty() { + return None; + } + + Some(NameParts { + first: first_final, + last: last_final, + middle: None, + suffix, + nicknames, + }) +} + +/// Normalize a name string for matching index keys. +/// +/// Pipeline: lowercase → NFD decompose → strip combining marks → +/// strip ALL punctuation, hyphens, and whitespace. +/// +/// This produces a compact, space-free string so that "Aguirre Mesa" (Banner) +/// and "Aguirre-Mesa" (RMP) both become "aguirremesa".
+/// +/// # Examples +/// +/// ``` +/// use banner::data::names::normalize_for_matching; +/// +/// assert_eq!(normalize_for_matching("García"), "garcia"); +/// assert_eq!(normalize_for_matching("O'Brien"), "obrien"); +/// assert_eq!(normalize_for_matching("Aguirre-Mesa"), "aguirremesa"); +/// assert_eq!(normalize_for_matching("Aguirre Mesa"), "aguirremesa"); +/// ``` +pub fn normalize_for_matching(s: &str) -> String { + s.to_lowercase() + .nfd() + .filter(|c| { + // Keep only non-combining alphabetic characters — strip everything else + c.is_alphabetic() && !unicode_normalization::char::is_combining_mark(*c) + }) + .collect() +} + +/// Generate all matching index keys for a parsed name. +/// +/// For a name like "H. Paul" / "LeBlanc" with no nicknames, generates: +/// - `("leblanc", "h paul")` — full normalized first +/// - `("leblanc", "paul")` — individual token (if multi-token) +/// - `("leblanc", "h")` — individual token (if multi-token) +/// +/// For a name like "William" / "Burchenal" with nickname "Ken": +/// - `("burchenal", "william")` — primary +/// - `("burchenal", "ken")` — nickname variant +pub fn matching_keys(parts: &NameParts) -> Vec<(String, String)> { + let norm_last = normalize_for_matching(&parts.last); + if norm_last.is_empty() { + return Vec::new(); + } + + let mut keys = Vec::new(); + let mut seen = std::collections::HashSet::new(); + + // Primary key: full first name (all spaces stripped) + let norm_first_full = normalize_for_matching(&parts.first); + if !norm_first_full.is_empty() && seen.insert(norm_first_full.clone()) { + keys.push((norm_last.clone(), norm_first_full)); + } + + // Individual tokens from the display-form first name + // (split before full normalization so we can generate per-token keys) + let first_tokens: Vec<&str> = parts.first.split_whitespace().collect(); + if first_tokens.len() > 1 { + for token in &first_tokens { + let norm_token = normalize_for_matching(token); + if !norm_token.is_empty() && 
seen.insert(norm_token.clone()) { + keys.push((norm_last.clone(), norm_token)); + } + } + } + + // Nickname variants + for nick in &parts.nicknames { + let norm_nick = normalize_for_matching(nick); + if !norm_nick.is_empty() && seen.insert(norm_nick.clone()) { + keys.push((norm_last.clone(), norm_nick)); + } + } + + keys +} + +/// Backfill `first_name`/`last_name` columns for all instructors that have +/// a `display_name` but NULL structured name fields. +/// +/// Parses each `display_name` using [`parse_banner_name`] and updates the row. +/// Logs warnings for any names that fail to parse. +pub async fn backfill_instructor_names(db_pool: &PgPool) -> crate::error::Result<()> { + let rows: Vec<(i32, String)> = sqlx::query_as( + "SELECT id, display_name FROM instructors WHERE first_name IS NULL OR last_name IS NULL", + ) + .fetch_all(db_pool) + .await?; + + if rows.is_empty() { + return Ok(()); + } + + let total = rows.len(); + let mut ids: Vec = Vec::with_capacity(total); + let mut firsts: Vec = Vec::with_capacity(total); + let mut lasts: Vec = Vec::with_capacity(total); + let mut unparseable = 0usize; + + for (id, display_name) in &rows { + match parse_banner_name(display_name) { + Some(parts) => { + ids.push(*id); + firsts.push(parts.first); + lasts.push(parts.last); + } + None => { + warn!( + id, + display_name, "Failed to parse instructor display_name during backfill" + ); + unparseable += 1; + } + } + } + + if !ids.is_empty() { + let first_refs: Vec<&str> = firsts.iter().map(|s| s.as_str()).collect(); + let last_refs: Vec<&str> = lasts.iter().map(|s| s.as_str()).collect(); + + sqlx::query( + r#" + UPDATE instructors i + SET first_name = v.first_name, last_name = v.last_name + FROM UNNEST($1::int4[], $2::text[], $3::text[]) + AS v(id, first_name, last_name) + WHERE i.id = v.id + "#, + ) + .bind(&ids) + .bind(&first_refs) + .bind(&last_refs) + .execute(db_pool) + .await?; + } + + info!( + total, + updated = ids.len(), + unparseable, + "Instructor name backfill 
complete" + ); + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + // ----------------------------------------------------------------------- + // HTML entity decoding + // ----------------------------------------------------------------------- + + #[test] + fn decode_apostrophe_entity() { + assert_eq!(decode_html_entities("O'Brien"), "O'Brien"); + } + + #[test] + fn decode_umlaut_entity() { + assert_eq!(decode_html_entities("Bülent"), "Bülent"); + } + + #[test] + fn decode_no_entities() { + assert_eq!(decode_html_entities("Smith"), "Smith"); + } + + // ----------------------------------------------------------------------- + // Nickname extraction + // ----------------------------------------------------------------------- + + #[test] + fn extract_paren_nickname() { + let (cleaned, nicks) = extract_nicknames("William (Ken)"); + assert_eq!(cleaned, "William"); + assert_eq!(nicks, vec!["Ken"]); + } + + #[test] + fn extract_quoted_nickname() { + let (cleaned, nicks) = extract_nicknames("Thomas \"Butch\""); + assert_eq!(cleaned, "Thomas"); + assert_eq!(nicks, vec!["Butch"]); + } + + #[test] + fn extract_paren_with_extra_text() { + let (cleaned, nicks) = extract_nicknames("John (jack) C."); + assert_eq!(cleaned, "John C."); + assert_eq!(nicks, vec!["jack"]); + } + + #[test] + fn extract_no_nicknames() { + let (cleaned, nicks) = extract_nicknames("Maria Elena"); + assert_eq!(cleaned, "Maria Elena"); + assert!(nicks.is_empty()); + } + + // ----------------------------------------------------------------------- + // Suffix extraction + // ----------------------------------------------------------------------- + + #[test] + fn extract_suffix_iii() { + let (name, suffix) = extract_suffix("LeBlanc III"); + assert_eq!(name, "LeBlanc"); + assert_eq!(suffix, Some("III".to_string())); + } + + #[test] + fn extract_suffix_jr_period() { + let (name, suffix) = extract_suffix("Smith Jr."); + assert_eq!(name, "Smith"); + assert_eq!(suffix, Some("Jr.".to_string())); + } + + 
#[test] + fn extract_no_suffix() { + let (name, suffix) = extract_suffix("García"); + assert_eq!(name, "García"); + assert_eq!(suffix, None); + } + + // ----------------------------------------------------------------------- + // Junk stripping + // ----------------------------------------------------------------------- + + #[test] + fn strip_trailing_comma() { + assert_eq!(strip_junk("Cronenberger,"), "Cronenberger"); + } + + #[test] + fn strip_email_address() { + assert_eq!(strip_junk("Neel.Baumgardner@utsa.edu"), ""); + } + + #[test] + fn strip_clean_name() { + assert_eq!(strip_junk(" Maria "), "Maria"); + } + + // ----------------------------------------------------------------------- + // normalize_for_matching + // ----------------------------------------------------------------------- + + #[test] + fn normalize_strips_accents() { + assert_eq!(normalize_for_matching("García"), "garcia"); + } + + #[test] + fn normalize_strips_apostrophe() { + assert_eq!(normalize_for_matching("O'Brien"), "obrien"); + } + + #[test] + fn normalize_strips_hyphen() { + assert_eq!(normalize_for_matching("Aguirre-Mesa"), "aguirremesa"); + } + + #[test] + fn normalize_tilde_n() { + assert_eq!(normalize_for_matching("Muñoz"), "munoz"); + } + + #[test] + fn normalize_umlaut() { + assert_eq!(normalize_for_matching("Müller"), "muller"); + } + + #[test] + fn normalize_period() { + assert_eq!(normalize_for_matching("H. 
Paul"), "hpaul"); + } + + #[test] + fn normalize_strips_spaces() { + assert_eq!(normalize_for_matching("Mary Lou"), "marylou"); + } + + // ----------------------------------------------------------------------- + // parse_banner_name + // ----------------------------------------------------------------------- + + #[test] + fn banner_standard_name() { + let p = parse_banner_name("Smith, John").unwrap(); + assert_eq!(p.first, "John"); + assert_eq!(p.last, "Smith"); + assert_eq!(p.suffix, None); + } + + #[test] + fn banner_html_entity_apostrophe() { + let p = parse_banner_name("O'Brien, Erin").unwrap(); + assert_eq!(p.first, "Erin"); + assert_eq!(p.last, "O'Brien"); + } + + #[test] + fn banner_html_entity_umlaut() { + let p = parse_banner_name("Temel, Bülent").unwrap(); + assert_eq!(p.first, "Bülent"); + assert_eq!(p.last, "Temel"); + } + + #[test] + fn banner_suffix_iii() { + let p = parse_banner_name("LeBlanc III, H. Paul").unwrap(); + assert_eq!(p.first, "H. Paul"); + assert_eq!(p.last, "LeBlanc"); + assert_eq!(p.suffix, Some("III".to_string())); + } + + #[test] + fn banner_suffix_ii() { + let p = parse_banner_name("Ellis II, Ronald").unwrap(); + assert_eq!(p.first, "Ronald"); + assert_eq!(p.last, "Ellis"); + assert_eq!(p.suffix, Some("II".to_string())); + } + + #[test] + fn banner_multi_word_last() { + let p = parse_banner_name("Aguirre Mesa, Andres").unwrap(); + assert_eq!(p.first, "Andres"); + assert_eq!(p.last, "Aguirre Mesa"); + } + + #[test] + fn banner_hyphenated_last() { + let p = parse_banner_name("Abu-Lail, Nehal").unwrap(); + assert_eq!(p.first, "Nehal"); + assert_eq!(p.last, "Abu-Lail"); + } + + #[test] + fn banner_with_middle_name() { + let p = parse_banner_name("Smith, John David").unwrap(); + assert_eq!(p.first, "John David"); + assert_eq!(p.last, "Smith"); + } + + #[test] + fn banner_no_comma() { + assert!(parse_banner_name("SingleName").is_none()); + } + + #[test] + fn banner_empty_first() { + assert!(parse_banner_name("Smith,").is_none()); + } + + 
#[test] + fn banner_empty_last() { + assert!(parse_banner_name(", John").is_none()); + } + + // ----------------------------------------------------------------------- + // parse_rmp_name + // ----------------------------------------------------------------------- + + #[test] + fn rmp_standard_name() { + let p = parse_rmp_name("John", "Smith").unwrap(); + assert_eq!(p.first, "John"); + assert_eq!(p.last, "Smith"); + } + + #[test] + fn rmp_with_nickname() { + let p = parse_rmp_name("William (Ken)", "Burchenal").unwrap(); + assert_eq!(p.first, "William"); + assert_eq!(p.nicknames, vec!["Ken"]); + } + + #[test] + fn rmp_trailing_comma_last() { + let p = parse_rmp_name("J.", "Cronenberger,").unwrap(); + assert_eq!(p.last, "Cronenberger"); + } + + #[test] + fn rmp_email_in_first() { + assert!(parse_rmp_name("Neel.Baumgardner@utsa.edu", "Baumgardner").is_none()); + } + + #[test] + fn rmp_suffix_in_last() { + let p = parse_rmp_name("H. Paul", "LeBlanc III").unwrap(); + assert_eq!(p.first, "H. Paul"); + assert_eq!(p.last, "LeBlanc"); + assert_eq!(p.suffix, Some("III".to_string())); + } + + #[test] + fn rmp_quoted_nickname() { + let p = parse_rmp_name("Thomas \"Butch\"", "Matjeka").unwrap(); + assert_eq!(p.first, "Thomas"); + assert_eq!(p.nicknames, vec!["Butch"]); + } + + #[test] + fn rmp_accented_last() { + let p = parse_rmp_name("Liliana", "Saldaña").unwrap(); + assert_eq!(p.last, "Saldaña"); + } + + // ----------------------------------------------------------------------- + // matching_keys + // ----------------------------------------------------------------------- + + #[test] + fn keys_simple_name() { + let parts = NameParts { + first: "John".into(), + last: "Smith".into(), + middle: None, + suffix: None, + nicknames: vec![], + }; + let keys = matching_keys(&parts); + assert_eq!(keys, vec![("smith".into(), "john".into())]); + } + + #[test] + fn keys_multi_token_first() { + let parts = NameParts { + first: "H. 
Paul".into(), + last: "LeBlanc".into(), + middle: None, + suffix: Some("III".into()), + nicknames: vec![], + }; + let keys = matching_keys(&parts); + assert!(keys.contains(&("leblanc".into(), "hpaul".into()))); + assert!(keys.contains(&("leblanc".into(), "paul".into()))); + assert!(keys.contains(&("leblanc".into(), "h".into()))); + assert_eq!(keys.len(), 3); + } + + #[test] + fn keys_with_nickname() { + let parts = NameParts { + first: "William".into(), + last: "Burchenal".into(), + middle: None, + suffix: None, + nicknames: vec!["Ken".into()], + }; + let keys = matching_keys(&parts); + assert!(keys.contains(&("burchenal".into(), "william".into()))); + assert!(keys.contains(&("burchenal".into(), "ken".into()))); + assert_eq!(keys.len(), 2); + } + + #[test] + fn keys_hyphenated_last() { + let parts = parse_banner_name("Aguirre-Mesa, Andres").unwrap(); + let keys = matching_keys(&parts); + // Hyphen removed: "aguirremesa" + assert!(keys.contains(&("aguirremesa".into(), "andres".into()))); + } + + #[test] + fn keys_accented_name() { + let parts = parse_rmp_name("Liliana", "Saldaña").unwrap(); + let keys = matching_keys(&parts); + assert!(keys.contains(&("saldana".into(), "liliana".into()))); + } + + #[test] + fn keys_cross_source_match() { + // Banner: "Aguirre Mesa, Andres" → last="Aguirre Mesa" + let banner = parse_banner_name("Aguirre Mesa, Andres").unwrap(); + let banner_keys = matching_keys(&banner); + + // RMP: "Andres" / "Aguirre-Mesa" → last="Aguirre-Mesa" + let rmp = parse_rmp_name("Andres", "Aguirre-Mesa").unwrap(); + let rmp_keys = matching_keys(&rmp); + + // Both should normalize to ("aguirremesa", "andres") + assert!(banner_keys.iter().any(|k| rmp_keys.contains(k))); + } + + #[test] + fn keys_accent_cross_match() { + // Banner: "García, José" (if Banner ever has accents) + let banner = parse_banner_name("Garcia, Jose").unwrap(); + let banner_keys = matching_keys(&banner); + + // RMP: "José" / "García" + let rmp = parse_rmp_name("José", "García").unwrap(); 
+ let rmp_keys = matching_keys(&rmp); + + // Both normalize to ("garcia", "jose") + assert!(banner_keys.iter().any(|k| rmp_keys.contains(k))); + } +} diff --git a/src/data/rmp.rs b/src/data/rmp.rs index 7d14c25..20f5ad8 100644 --- a/src/data/rmp.rs +++ b/src/data/rmp.rs @@ -91,25 +91,6 @@ pub async fn batch_upsert_rmp_professors( Ok(()) } -/// Normalize a name for matching: lowercase, trim, strip trailing periods. -pub(crate) fn normalize(s: &str) -> String { - s.trim().to_lowercase().trim_end_matches('.').to_string() -} - -/// Parse Banner's "Last, First Middle" display name into (last, first) tokens. -/// -/// Returns `None` if the format is unparseable (no comma, empty parts). -pub(crate) fn parse_display_name(display_name: &str) -> Option<(String, String)> { - let (last_part, first_part) = display_name.split_once(',')?; - let last = normalize(last_part); - // Take only the first token of the first-name portion to drop middle names/initials. - let first = normalize(first_part.split_whitespace().next()?); - if last.is_empty() || first.is_empty() { - return None; - } - Some((last, first)) -} - /// Retrieve RMP rating data for an instructor by instructor id. /// /// Returns `(avg_rating, num_ratings)` for the best linked RMP profile @@ -136,74 +117,76 @@ pub async fn get_instructor_rmp_data( Ok(row) } -#[cfg(test)] -mod tests { - use super::*; +/// Unmatch an instructor from an RMP profile. +/// +/// Removes the link from `instructor_rmp_links` and updates the instructor's +/// `rmp_match_status` to 'unmatched' if no links remain. +/// +/// If `rmp_legacy_id` is `Some`, removes only that specific link. +/// If `None`, removes all links for the instructor. 
+pub async fn unmatch_instructor( + db_pool: &PgPool, + instructor_id: i32, + rmp_legacy_id: Option, +) -> Result<()> { + let mut tx = db_pool.begin().await?; - #[test] - fn parse_standard_name() { - assert_eq!( - parse_display_name("Smith, John"), - Some(("smith".into(), "john".into())) - ); + // Delete specific link or all links + if let Some(legacy_id) = rmp_legacy_id { + sqlx::query( + "DELETE FROM instructor_rmp_links WHERE instructor_id = $1 AND rmp_legacy_id = $2", + ) + .bind(instructor_id) + .bind(legacy_id) + .execute(&mut *tx) + .await?; + } else { + sqlx::query("DELETE FROM instructor_rmp_links WHERE instructor_id = $1") + .bind(instructor_id) + .execute(&mut *tx) + .await?; } - #[test] - fn parse_name_with_middle() { - assert_eq!( - parse_display_name("Smith, John David"), - Some(("smith".into(), "john".into())) - ); + // Check if any links remain + let (remaining,): (i64,) = + sqlx::query_as("SELECT COUNT(*) FROM instructor_rmp_links WHERE instructor_id = $1") + .bind(instructor_id) + .fetch_one(&mut *tx) + .await?; + + // Update instructor status if no links remain + if remaining == 0 { + sqlx::query("UPDATE instructors SET rmp_match_status = 'unmatched' WHERE id = $1") + .bind(instructor_id) + .execute(&mut *tx) + .await?; } - #[test] - fn parse_name_with_middle_initial() { - assert_eq!( - parse_display_name("Garcia, Maria L."), - Some(("garcia".into(), "maria".into())) - ); + // Reset accepted candidates back to pending when unmatching + // This allows the candidates to be re-matched later + if let Some(legacy_id) = rmp_legacy_id { + // Reset only the specific candidate + sqlx::query( + "UPDATE rmp_match_candidates + SET status = 'pending', resolved_at = NULL, resolved_by = NULL + WHERE instructor_id = $1 AND rmp_legacy_id = $2 AND status = 'accepted'", + ) + .bind(instructor_id) + .bind(legacy_id) + .execute(&mut *tx) + .await?; + } else { + // Reset all accepted candidates for this instructor + sqlx::query( + "UPDATE rmp_match_candidates + SET 
status = 'pending', resolved_at = NULL, resolved_by = NULL + WHERE instructor_id = $1 AND status = 'accepted'", + ) + .bind(instructor_id) + .execute(&mut *tx) + .await?; } - #[test] - fn parse_name_with_suffix_in_last() { - // Banner may encode "Jr." as part of the last name. - // normalize() strips trailing periods so "Jr." becomes "jr". - assert_eq!( - parse_display_name("Smith Jr., James"), - Some(("smith jr".into(), "james".into())) - ); - } - - #[test] - fn parse_no_comma_returns_none() { - assert_eq!(parse_display_name("SingleName"), None); - } - - #[test] - fn parse_empty_first_returns_none() { - assert_eq!(parse_display_name("Smith,"), None); - } - - #[test] - fn parse_empty_last_returns_none() { - assert_eq!(parse_display_name(", John"), None); - } - - #[test] - fn parse_extra_whitespace() { - assert_eq!( - parse_display_name(" Doe , Jane Marie "), - Some(("doe".into(), "jane".into())) - ); - } - - #[test] - fn normalize_trims_and_lowercases() { - assert_eq!(normalize(" FOO "), "foo"); - } - - #[test] - fn normalize_strips_trailing_period() { - assert_eq!(normalize("Jr."), "jr"); - } + tx.commit().await?; + Ok(()) } diff --git a/src/data/rmp_matching.rs b/src/data/rmp_matching.rs index a4db74d..f1773a7 100644 --- a/src/data/rmp_matching.rs +++ b/src/data/rmp_matching.rs @@ -1,6 +1,6 @@ //! Confidence scoring and candidate generation for RMP instructor matching. -use crate::data::rmp::{normalize, parse_display_name}; +use crate::data::names::{matching_keys, parse_banner_name, parse_rmp_name}; use crate::error::Result; use serde::{Deserialize, Serialize}; use sqlx::PgPool; @@ -14,6 +14,7 @@ use tracing::{debug, info}; /// Breakdown of individual scoring signals. 
#[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct ScoreBreakdown { + pub name: f32, pub department: f32, pub uniqueness: f32, pub volume: f32, @@ -37,12 +38,13 @@ const MIN_CANDIDATE_THRESHOLD: f32 = 0.40; const AUTO_ACCEPT_THRESHOLD: f32 = 0.85; // --------------------------------------------------------------------------- -// Weights +// Weights (must sum to 1.0) // --------------------------------------------------------------------------- -const WEIGHT_DEPARTMENT: f32 = 0.50; -const WEIGHT_UNIQUENESS: f32 = 0.30; -const WEIGHT_VOLUME: f32 = 0.20; +const WEIGHT_NAME: f32 = 0.50; +const WEIGHT_DEPARTMENT: f32 = 0.25; +const WEIGHT_UNIQUENESS: f32 = 0.15; +const WEIGHT_VOLUME: f32 = 0.10; // --------------------------------------------------------------------------- // Pure scoring functions @@ -199,35 +201,39 @@ fn matches_known_abbreviation(subject: &str, department: &str) -> bool { /// Compute match confidence score (0.0–1.0) for an instructor–RMP pair. /// -/// Name matching is handled by the caller via pre-filtering on exact -/// normalized `(last, first)`, so only department, uniqueness, and volume -/// signals are scored here. +/// The name signal is always 1.0 since candidates are only generated for +/// exact normalized name matches. The effective score range is 0.50–1.0. 
pub fn compute_match_score( instructor_subjects: &[String], rmp_department: Option<&str>, candidate_count: usize, rmp_num_ratings: i32, ) -> MatchScore { - // --- Department (0.50) --- + // --- Name (0.50) — always 1.0, candidates only exist for exact matches --- + let name_score = 1.0; + + // --- Department (0.25) --- let dept_score = department_similarity(instructor_subjects, rmp_department); - // --- Uniqueness (0.30) --- + // --- Uniqueness (0.15) --- let uniqueness_score = match candidate_count { 0 | 1 => 1.0, 2 => 0.5, _ => 0.2, }; - // --- Volume (0.20) --- + // --- Volume (0.10) --- let volume_score = ((rmp_num_ratings as f32).ln_1p() / 5.0_f32.ln_1p()).clamp(0.0, 1.0); - let composite = dept_score * WEIGHT_DEPARTMENT + let composite = name_score * WEIGHT_NAME + + dept_score * WEIGHT_DEPARTMENT + uniqueness_score * WEIGHT_UNIQUENESS + volume_score * WEIGHT_VOLUME; MatchScore { score: composite, breakdown: ScoreBreakdown { + name: name_score, department: dept_score, uniqueness: uniqueness_score, volume: volume_score, @@ -260,8 +266,8 @@ struct RmpProfForMatching { /// Generate match candidates for all unmatched instructors. /// /// For each unmatched instructor: -/// 1. Parse `display_name` into (last, first). -/// 2. Find RMP professors with matching normalized name. +/// 1. Parse `display_name` into [`NameParts`] and generate matching keys. +/// 2. Find RMP professors with matching normalized name keys. /// 3. Score each candidate. /// 4. Store candidates scoring above [`MIN_CANDIDATE_THRESHOLD`]. /// 5. Auto-accept if the top candidate scores ≥ [`AUTO_ACCEPT_THRESHOLD`] @@ -309,7 +315,7 @@ pub async fn generate_candidates(db_pool: &PgPool) -> Result { subject_map.entry(iid).or_default().push(subject); } - // 3. Load all RMP professors + // 3. 
Load all RMP professors and build multi-key name index let prof_rows: Vec<(i32, String, String, Option, i32)> = sqlx::query_as( "SELECT legacy_id, first_name, last_name, department, num_ratings FROM rmp_professors", ) @@ -317,14 +323,36 @@ pub async fn generate_candidates(db_pool: &PgPool) -> Result { .await?; // Build name index: (normalized_last, normalized_first) -> Vec + // Each professor may appear under multiple keys (nicknames, token variants). let mut name_index: HashMap<(String, String), Vec> = HashMap::new(); - for (legacy_id, first_name, last_name, department, num_ratings) in prof_rows { - let key = (normalize(&last_name), normalize(&first_name)); - name_index.entry(key).or_default().push(RmpProfForMatching { - legacy_id, - department, - num_ratings, - }); + let mut rmp_parse_failures = 0usize; + for (legacy_id, first_name, last_name, department, num_ratings) in &prof_rows { + match parse_rmp_name(first_name, last_name) { + Some(parts) => { + let keys = matching_keys(&parts); + for key in keys { + name_index.entry(key).or_default().push(RmpProfForMatching { + legacy_id: *legacy_id, + department: department.clone(), + num_ratings: *num_ratings, + }); + } + } + None => { + rmp_parse_failures += 1; + debug!( + legacy_id, + first_name, last_name, "Unparseable RMP professor name, skipping" + ); + } + } + } + + if rmp_parse_failures > 0 { + debug!( + count = rmp_parse_failures, + "RMP professors with unparseable names" + ); } // 4. Load existing candidate pairs — only skip resolved (accepted/rejected) pairs. 
@@ -360,7 +388,7 @@ pub async fn generate_candidates(db_pool: &PgPool) -> Result { let mut skipped_no_candidates = 0usize; for (instructor_id, display_name) in &instructors { - let Some((norm_last, norm_first)) = parse_display_name(display_name) else { + let Some(instructor_parts) = parse_banner_name(display_name) else { skipped_unparseable += 1; debug!( instructor_id, @@ -371,16 +399,31 @@ pub async fn generate_candidates(db_pool: &PgPool) -> Result { let subjects = subject_map.get(instructor_id).unwrap_or(&empty_subjects); - let key = (norm_last.clone(), norm_first.clone()); - let Some(rmp_candidates) = name_index.get(&key) else { + // Generate all matching keys for this instructor and collect candidate + // RMP professors across all key variants (deduplicated by legacy_id). + let instructor_keys = matching_keys(&instructor_parts); + let mut seen_profs: HashSet = HashSet::new(); + let mut matched_profs: Vec<&RmpProfForMatching> = Vec::new(); + + for key in &instructor_keys { + if let Some(profs) = name_index.get(key) { + for prof in profs { + if seen_profs.insert(prof.legacy_id) { + matched_profs.push(prof); + } + } + } + } + + if matched_profs.is_empty() { skipped_no_candidates += 1; continue; - }; + } - let candidate_count = rmp_candidates.len(); + let candidate_count = matched_profs.len(); let mut best: Option<(f32, i32)> = None; - for prof in rmp_candidates { + for prof in &matched_profs { let pair = (*instructor_id, prof.legacy_id); if resolved_pairs.contains(&pair) { continue; @@ -582,8 +625,9 @@ mod tests { 1, // unique candidate 50, // decent ratings ); - // dept 1.0*0.50 + unique 1.0*0.30 + volume ~0.97*0.20 ≈ 0.99 + // name 1.0*0.50 + dept 1.0*0.25 + unique 1.0*0.15 + volume ~0.97*0.10 ≈ 0.997 assert!(ms.score >= 0.85, "Expected score >= 0.85, got {}", ms.score); + assert_eq!(ms.breakdown.name, 1.0); assert_eq!(ms.breakdown.uniqueness, 1.0); assert_eq!(ms.breakdown.department, 1.0); } diff --git a/src/web/admin_rmp.rs b/src/web/admin_rmp.rs index 
68db567..40149cf 100644 --- a/src/web/admin_rmp.rs +++ b/src/web/admin_rmp.rs @@ -769,16 +769,10 @@ pub async fn unmatch_instructor( ) -> Result, (StatusCode, Json)> { let rmp_legacy_id = body.and_then(|b| b.rmp_legacy_id); - let mut tx = state - .db_pool - .begin() - .await - .map_err(|e| db_error("failed to begin transaction", e))?; - // Verify instructor exists let exists: Option<(i32,)> = sqlx::query_as("SELECT id FROM instructors WHERE id = $1") .bind(id) - .fetch_optional(&mut *tx) + .fetch_optional(&state.db_pool) .await .map_err(|e| db_error("failed to check instructor", e))?; @@ -789,50 +783,16 @@ pub async fn unmatch_instructor( )); } - // Delete specific link or all links - if let Some(legacy_id) = rmp_legacy_id { - let result = sqlx::query( - "DELETE FROM instructor_rmp_links WHERE instructor_id = $1 AND rmp_legacy_id = $2", - ) - .bind(id) - .bind(legacy_id) - .execute(&mut *tx) + // Use the data layer function to perform the unmatch + crate::data::rmp::unmatch_instructor(&state.db_pool, id, rmp_legacy_id) .await - .map_err(|e| db_error("failed to remove rmp link", e))?; - - if result.rows_affected() == 0 { - return Err(( - StatusCode::NOT_FOUND, - Json(json!({"error": "link not found for this instructor"})), - )); - } - } else { - sqlx::query("DELETE FROM instructor_rmp_links WHERE instructor_id = $1") - .bind(id) - .execute(&mut *tx) - .await - .map_err(|e| db_error("failed to remove rmp links", e))?; - } - - // Check if any links remain; update status accordingly - let (remaining,): (i64,) = - sqlx::query_as("SELECT COUNT(*) FROM instructor_rmp_links WHERE instructor_id = $1") - .bind(id) - .fetch_one(&mut *tx) - .await - .map_err(|e| db_error("failed to count remaining links", e))?; - - if remaining == 0 { - sqlx::query("UPDATE instructors SET rmp_match_status = 'unmatched' WHERE id = $1") - .bind(id) - .execute(&mut *tx) - .await - .map_err(|e| db_error("failed to update instructor status", e))?; - } - - tx.commit() - .await - .map_err(|e| 
db_error("failed to commit transaction", e))?; + .map_err(|e| { + tracing::error!(error = %e, "failed to unmatch instructor"); + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(json!({"error": "failed to unmatch instructor"})), + ) + })?; Ok(Json(OkResponse { ok: true })) } diff --git a/tests/admin_rmp.rs b/tests/admin_rmp.rs new file mode 100644 index 0000000..5d7631a --- /dev/null +++ b/tests/admin_rmp.rs @@ -0,0 +1,102 @@ +mod helpers; + +use banner::data::rmp::unmatch_instructor; +use sqlx::PgPool; + +/// Test that unmatching an instructor resets accepted candidates back to pending. +/// +/// When a user unmatches an instructor, accepted candidates should be reset to +/// 'pending' so they can be re-matched later. This prevents the bug where +/// candidates remain 'accepted' but have no corresponding link. +#[sqlx::test] +async fn unmatch_resets_accepted_candidates_to_pending(pool: PgPool) { + // ARRANGE: Create an instructor + let (instructor_id,): (i32,) = sqlx::query_as( + "INSERT INTO instructors (display_name, email) + VALUES ('Test, Instructor', 'test@utsa.edu') + RETURNING id", + ) + .fetch_one(&pool) + .await + .expect("failed to create instructor"); + + // ARRANGE: Create an RMP professor + let (rmp_legacy_id,): (i32,) = sqlx::query_as( + "INSERT INTO rmp_professors (legacy_id, graphql_id, first_name, last_name, num_ratings) + VALUES (9999999, 'test-graphql-id', 'Test', 'Professor', 10) + RETURNING legacy_id", + ) + .fetch_one(&pool) + .await + .expect("failed to create rmp professor"); + + // ARRANGE: Create a match candidate with 'accepted' status + sqlx::query( + "INSERT INTO rmp_match_candidates (instructor_id, rmp_legacy_id, score, status) + VALUES ($1, $2, 0.85, 'accepted')", + ) + .bind(instructor_id) + .bind(rmp_legacy_id) + .execute(&pool) + .await + .expect("failed to create candidate"); + + // ARRANGE: Create a link in instructor_rmp_links + sqlx::query( + "INSERT INTO instructor_rmp_links (instructor_id, rmp_legacy_id, source) + VALUES ($1, 
$2, 'manual')", + ) + .bind(instructor_id) + .bind(rmp_legacy_id) + .execute(&pool) + .await + .expect("failed to create link"); + + // ARRANGE: Update instructor status to 'confirmed' + sqlx::query("UPDATE instructors SET rmp_match_status = 'confirmed' WHERE id = $1") + .bind(instructor_id) + .execute(&pool) + .await + .expect("failed to update instructor status"); + + // ACT: Unmatch the specific RMP profile + unmatch_instructor(&pool, instructor_id, Some(rmp_legacy_id)) + .await + .expect("unmatch should succeed"); + + // ASSERT: Candidate should be reset to pending + let (candidate_status,): (String,) = sqlx::query_as( + "SELECT status FROM rmp_match_candidates + WHERE instructor_id = $1 AND rmp_legacy_id = $2", + ) + .bind(instructor_id) + .bind(rmp_legacy_id) + .fetch_one(&pool) + .await + .expect("failed to fetch candidate status"); + assert_eq!( + candidate_status, "pending", + "candidate should be reset to pending after unmatch" + ); + + // ASSERT: Link should be deleted + let (link_count,): (i64,) = + sqlx::query_as("SELECT COUNT(*) FROM instructor_rmp_links WHERE instructor_id = $1") + .bind(instructor_id) + .fetch_one(&pool) + .await + .expect("failed to count links"); + assert_eq!(link_count, 0, "link should be deleted"); + + // ASSERT: Instructor status should be unmatched + let (instructor_status,): (String,) = + sqlx::query_as("SELECT rmp_match_status FROM instructors WHERE id = $1") + .bind(instructor_id) + .fetch_one(&pool) + .await + .expect("failed to fetch instructor status"); + assert_eq!( + instructor_status, "unmatched", + "instructor should be unmatched" + ); +}