mirror of
https://github.com/Xevion/banner.git
synced 2026-01-30 22:23:32 -06:00
feat: add name parsing and normalization for instructor-RMP matching
This commit is contained in:
+6
-1
@@ -14,7 +14,7 @@ use sqlx::postgres::PgPoolOptions;
|
||||
use std::process::ExitCode;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tracing::{error, info};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
/// Main application struct containing all necessary components
|
||||
pub struct App {
|
||||
@@ -70,6 +70,11 @@ impl App {
|
||||
.context("Failed to run database migrations")?;
|
||||
info!("Database migrations completed successfully");
|
||||
|
||||
// Backfill structured name columns for existing instructors
|
||||
if let Err(e) = crate::data::names::backfill_instructor_names(&db_pool).await {
|
||||
warn!(error = ?e, "Failed to backfill instructor names (non-fatal)");
|
||||
}
|
||||
|
||||
// Create BannerApi and AppState
|
||||
let banner_api = BannerApi::new_with_config(
|
||||
config.banner_base_url.clone(),
|
||||
|
||||
+16
-3
@@ -2,6 +2,7 @@
|
||||
|
||||
use crate::banner::Course;
|
||||
use crate::data::models::{DbMeetingTime, UpsertCounts};
|
||||
use crate::data::names::parse_banner_name;
|
||||
use crate::error::Result;
|
||||
use sqlx::PgConnection;
|
||||
use sqlx::PgPool;
|
||||
@@ -628,6 +629,8 @@ async fn upsert_instructors(
|
||||
) -> Result<HashMap<String, i32>> {
|
||||
let mut seen = HashSet::new();
|
||||
let mut display_names: Vec<&str> = Vec::new();
|
||||
let mut first_names: Vec<Option<String>> = Vec::new();
|
||||
let mut last_names: Vec<Option<String>> = Vec::new();
|
||||
let mut emails_lower: Vec<String> = Vec::new();
|
||||
let mut skipped_no_email = 0u32;
|
||||
|
||||
@@ -636,7 +639,10 @@ async fn upsert_instructors(
|
||||
if let Some(email) = &faculty.email_address {
|
||||
let email_lower = email.to_lowercase();
|
||||
if seen.insert(email_lower.clone()) {
|
||||
let parts = parse_banner_name(&faculty.display_name);
|
||||
display_names.push(faculty.display_name.as_str());
|
||||
first_names.push(parts.as_ref().map(|p| p.first.clone()));
|
||||
last_names.push(parts.as_ref().map(|p| p.last.clone()));
|
||||
emails_lower.push(email_lower);
|
||||
}
|
||||
} else {
|
||||
@@ -657,18 +663,25 @@ async fn upsert_instructors(
|
||||
}
|
||||
|
||||
let email_refs: Vec<&str> = emails_lower.iter().map(|s| s.as_str()).collect();
|
||||
let first_name_refs: Vec<Option<&str>> = first_names.iter().map(|s| s.as_deref()).collect();
|
||||
let last_name_refs: Vec<Option<&str>> = last_names.iter().map(|s| s.as_deref()).collect();
|
||||
|
||||
let rows: Vec<(i32, String)> = sqlx::query_as(
|
||||
r#"
|
||||
INSERT INTO instructors (display_name, email)
|
||||
SELECT * FROM UNNEST($1::text[], $2::text[])
|
||||
INSERT INTO instructors (display_name, email, first_name, last_name)
|
||||
SELECT * FROM UNNEST($1::text[], $2::text[], $3::text[], $4::text[])
|
||||
ON CONFLICT (email)
|
||||
DO UPDATE SET display_name = EXCLUDED.display_name
|
||||
DO UPDATE SET
|
||||
display_name = EXCLUDED.display_name,
|
||||
first_name = EXCLUDED.first_name,
|
||||
last_name = EXCLUDED.last_name
|
||||
RETURNING id, email
|
||||
"#,
|
||||
)
|
||||
.bind(&display_names)
|
||||
.bind(&email_refs)
|
||||
.bind(&first_name_refs)
|
||||
.bind(&last_name_refs)
|
||||
.fetch_all(&mut *conn)
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("Failed to batch upsert instructors: {}", e))?;
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
pub mod batch;
|
||||
pub mod courses;
|
||||
pub mod models;
|
||||
pub mod names;
|
||||
pub mod reference;
|
||||
pub mod rmp;
|
||||
pub mod rmp_matching;
|
||||
|
||||
@@ -103,6 +103,8 @@ pub struct Instructor {
|
||||
pub display_name: String,
|
||||
pub email: String,
|
||||
pub rmp_match_status: String,
|
||||
pub first_name: Option<String>,
|
||||
pub last_name: Option<String>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
|
||||
@@ -0,0 +1,728 @@
|
||||
//! Name parsing, normalization, and matching utilities.
|
||||
//!
|
||||
//! Handles the mismatch between Banner's single `display_name` ("Last, First Middle")
|
||||
//! and RMP's separate `first_name`/`last_name` fields, plus data quality issues
|
||||
//! from both sources (HTML entities, accents, nicknames, suffixes, junk).
|
||||
|
||||
use sqlx::PgPool;
|
||||
use tracing::{info, warn};
|
||||
use unicode_normalization::UnicodeNormalization;
|
||||
|
||||
/// Generational suffixes recognised at the end of a last-name string.
///
/// Entries are lower-case and carry no trailing period; `extract_suffix`
/// lower-cases a candidate token and strips its trailing periods before
/// looking it up here.
const SUFFIXES: &[&str] = &["iv", "iii", "ii", "jr", "sr"];
|
||||
|
||||
/// The structured pieces of a person's name after parsing and cleanup.
///
/// Values are kept in display form (original case, accents and punctuation);
/// use `normalize_for_matching` / `matching_keys` to derive comparison keys.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct NameParts {
    /// Given name(s) in display form, e.g. "H. Paul" or "María".
    pub first: String,
    /// Family name in display form with any suffix removed, e.g. "O'Brien" or "LeBlanc".
    pub last: String,
    /// Middle name or initial, e.g. "Manuel" or "L.".
    ///
    /// Both `parse_banner_name` and `parse_rmp_name` currently leave this as `None`.
    pub middle: Option<String>,
    /// Suffix exactly as it appeared in the input, e.g. "III" or "Jr.".
    pub suffix: Option<String>,
    /// Nicknames pulled out of parentheses or quotes, e.g. `["Ken"]`.
    pub nicknames: Vec<String>,
}
|
||||
|
||||
/// Decode common HTML entities found in Banner data.
|
||||
///
|
||||
/// Handles both named entities (`&`, `ü`) and numeric references
|
||||
/// (`'`, `'`).
|
||||
fn decode_html_entities(s: &str) -> String {
|
||||
if !s.contains('&') {
|
||||
return s.to_string();
|
||||
}
|
||||
htmlize::unescape(s).to_string()
|
||||
}
|
||||
|
||||
/// Pull nicknames written in parentheses or double quotes out of a name.
///
/// Returns the name with those segments removed (whitespace collapsed to
/// single spaces and trimmed) together with the nicknames in the order they
/// were found. Straight (`"`) and curly (`“` / `”`) quotes are all accepted,
/// and any of them closes a quoted segment. A segment that is never closed
/// runs to the end of the input. Empty or whitespace-only segments are
/// dropped.
///
/// `"William (Ken)"` → `("William", vec!["Ken"])`
/// `"Thomas \"Butch\""` → `("Thomas", vec!["Butch"])`
/// `"John (jack) C."` → `("John C.", vec!["jack"])`
fn extract_nicknames(s: &str) -> (String, Vec<String>) {
    let is_quote = |c: char| matches!(c, '"' | '\u{201C}' | '\u{201D}');

    let mut nicknames = Vec::new();
    let mut remainder = String::with_capacity(s.len());
    let mut chars = s.chars();

    // `while let` rather than `for`: the body advances the same iterator to
    // swallow a whole delimited segment.
    while let Some(ch) = chars.next() {
        // `take_while` also consumes the closing delimiter, so scanning
        // resumes right after it.
        let segment: String = if ch == '(' {
            chars.by_ref().take_while(|&c| c != ')').collect()
        } else if is_quote(ch) {
            chars.by_ref().take_while(|&c| !is_quote(c)).collect()
        } else {
            remainder.push(ch);
            continue;
        };

        let nick = segment.trim();
        if !nick.is_empty() {
            nicknames.push(nick.to_string());
        }
    }

    // Removing a segment can leave doubled or trailing spaces behind.
    let cleaned = remainder.split_whitespace().collect::<Vec<_>>().join(" ");
    (cleaned, nicknames)
}
|
||||
|
||||
/// Extract a suffix (Jr, Sr, II, III, IV) from the last-name portion.
|
||||
///
|
||||
/// `"LeBlanc III"` → `("LeBlanc", Some("III"))`
|
||||
/// `"Smith Jr."` → `("Smith", Some("Jr."))`
|
||||
fn extract_suffix(last: &str) -> (String, Option<String>) {
|
||||
// Try to match the last token as a suffix
|
||||
let tokens: Vec<&str> = last.split_whitespace().collect();
|
||||
if tokens.len() < 2 {
|
||||
return (last.to_string(), None);
|
||||
}
|
||||
|
||||
let candidate = tokens.last().unwrap();
|
||||
let candidate_normalized = candidate.to_lowercase().trim_end_matches('.').to_string();
|
||||
|
||||
if SUFFIXES.contains(&candidate_normalized.as_str()) {
|
||||
let name_part = tokens[..tokens.len() - 1].join(" ");
|
||||
return (name_part, Some(candidate.to_string()));
|
||||
}
|
||||
|
||||
(last.to_string(), None)
|
||||
}
|
||||
|
||||
/// Strip junk commonly found in RMP name fields.
///
/// - Surrounding whitespace is trimmed.
/// - Email addresses: `"Neel.Baumgardner@utsa.edu"` → `""` (returns empty).
///   A value counts as an email when it contains `@` and `.` and no space.
/// - Trailing commas, including ones separated by whitespace:
///   `"Cronenberger,"` → `"Cronenberger"`, `"Smith, ,"` → `"Smith"`.
///
/// Callers treat an empty result as "no usable name".
fn strip_junk(s: &str) -> String {
    let s = s.trim();

    // If the string looks like an email, return empty
    if s.contains('@') && s.contains('.') && !s.contains(' ') {
        return String::new();
    }

    // Trim commas and whitespace together: stripping commas first and
    // whitespace second would leave a comma behind in inputs like "Smith, ,".
    s.trim_end_matches(|c: char| c == ',' || c.is_whitespace())
        .to_string()
}
|
||||
|
||||
/// Reduce every run of whitespace to a single space and drop leading and
/// trailing whitespace.
fn collapse_whitespace(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        // Words yielded by `split_whitespace` are never empty, so an empty
        // buffer means this is the first word.
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
|
||||
|
||||
/// Parse a Banner `display_name` ("Last, First Middle") into structured parts.
|
||||
///
|
||||
/// Handles HTML entities, suffixes, and multi-token names.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// use banner::data::names::parse_banner_name;
|
||||
///
|
||||
/// let parts = parse_banner_name("O'Brien, Erin").unwrap();
|
||||
/// assert_eq!(parts.first, "Erin");
|
||||
/// assert_eq!(parts.last, "O'Brien");
|
||||
/// ```
|
||||
pub fn parse_banner_name(display_name: &str) -> Option<NameParts> {
|
||||
// 1. Decode HTML entities
|
||||
let decoded = decode_html_entities(display_name);
|
||||
|
||||
// 2. Split on first comma
|
||||
let (last_part, first_part) = decoded.split_once(',')?;
|
||||
let last_part = last_part.trim();
|
||||
let first_part = first_part.trim();
|
||||
|
||||
if last_part.is_empty() || first_part.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// 3. Extract suffix from last name
|
||||
let (last_clean, suffix) = extract_suffix(last_part);
|
||||
|
||||
// 4. Parse first-name portion: first token(s) + optional middle
|
||||
// Banner format is "First Middle", so we keep all tokens as first_name
|
||||
// to support "H. Paul" style names
|
||||
let first_clean = collapse_whitespace(first_part);
|
||||
|
||||
Some(NameParts {
|
||||
first: first_clean,
|
||||
last: last_clean,
|
||||
middle: None, // Banner doesn't clearly delineate middle vs first
|
||||
suffix,
|
||||
nicknames: Vec::new(), // Banner doesn't include nicknames
|
||||
})
|
||||
}
|
||||
|
||||
/// Parse RMP professor name fields into structured parts.
|
||||
///
|
||||
/// Handles junk data, nicknames in parentheses/quotes, and suffixes.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// use banner::data::names::parse_rmp_name;
|
||||
///
|
||||
/// let parts = parse_rmp_name("William (Ken)", "Burchenal").unwrap();
|
||||
/// assert_eq!(parts.first, "William");
|
||||
/// assert_eq!(parts.nicknames, vec!["Ken"]);
|
||||
/// ```
|
||||
pub fn parse_rmp_name(first_name: &str, last_name: &str) -> Option<NameParts> {
|
||||
let first_cleaned = strip_junk(first_name);
|
||||
let last_cleaned = strip_junk(last_name);
|
||||
|
||||
if first_cleaned.is_empty() || last_cleaned.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Extract nicknames from parens/quotes in first name
|
||||
let (first_no_nicks, nicknames) = extract_nicknames(&first_cleaned);
|
||||
let first_final = collapse_whitespace(&first_no_nicks);
|
||||
|
||||
// Extract suffix from last name
|
||||
let (last_final, suffix) = extract_suffix(&last_cleaned);
|
||||
|
||||
if first_final.is_empty() || last_final.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(NameParts {
|
||||
first: first_final,
|
||||
last: last_final,
|
||||
middle: None,
|
||||
suffix,
|
||||
nicknames,
|
||||
})
|
||||
}
|
||||
|
||||
/// Normalize a name string for matching comparison.
|
||||
///
|
||||
/// Pipeline: lowercase → NFD decompose → strip combining marks →
|
||||
/// strip punctuation/hyphens → collapse whitespace → trim.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// use banner::data::names::normalize_for_matching;
|
||||
///
|
||||
/// assert_eq!(normalize_for_matching("García"), "garcia");
|
||||
/// assert_eq!(normalize_for_matching("O'Brien"), "obrien");
|
||||
/// assert_eq!(normalize_for_matching("Aguirre-Mesa"), "aguirremesa");
|
||||
/// ```
|
||||
/// Normalize a name string for matching index keys.
|
||||
///
|
||||
/// Pipeline: lowercase → NFD decompose → strip combining marks →
|
||||
/// strip ALL punctuation, hyphens, and whitespace.
|
||||
///
|
||||
/// This produces a compact, space-free string so that "Aguirre Mesa" (Banner)
|
||||
/// and "Aguirre-Mesa" (RMP) both become "aguirremesa".
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// use banner::data::names::normalize_for_matching;
|
||||
///
|
||||
/// assert_eq!(normalize_for_matching("García"), "garcia");
|
||||
/// assert_eq!(normalize_for_matching("O'Brien"), "obrien");
|
||||
/// assert_eq!(normalize_for_matching("Aguirre-Mesa"), "aguirremesa");
|
||||
/// assert_eq!(normalize_for_matching("Aguirre Mesa"), "aguirremesa");
|
||||
/// ```
|
||||
pub fn normalize_for_matching(s: &str) -> String {
|
||||
s.to_lowercase()
|
||||
.nfd()
|
||||
.filter(|c| {
|
||||
// Keep only non-combining alphabetic characters — strip everything else
|
||||
c.is_alphabetic() && !unicode_normalization::char::is_combining_mark(*c)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Generate all matching index keys for a parsed name.
|
||||
///
|
||||
/// For a name like "H. Paul" / "LeBlanc" with no nicknames, generates:
|
||||
/// - `("leblanc", "h paul")` — full normalized first
|
||||
/// - `("leblanc", "paul")` — individual token (if multi-token)
|
||||
/// - `("leblanc", "h")` — individual token (if multi-token)
|
||||
///
|
||||
/// For a name like "William" / "Burchenal" with nickname "Ken":
|
||||
/// - `("burchenal", "william")` — primary
|
||||
/// - `("burchenal", "ken")` — nickname variant
|
||||
pub fn matching_keys(parts: &NameParts) -> Vec<(String, String)> {
|
||||
let norm_last = normalize_for_matching(&parts.last);
|
||||
if norm_last.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let mut keys = Vec::new();
|
||||
let mut seen = std::collections::HashSet::new();
|
||||
|
||||
// Primary key: full first name (all spaces stripped)
|
||||
let norm_first_full = normalize_for_matching(&parts.first);
|
||||
if !norm_first_full.is_empty() && seen.insert(norm_first_full.clone()) {
|
||||
keys.push((norm_last.clone(), norm_first_full));
|
||||
}
|
||||
|
||||
// Individual tokens from the display-form first name
|
||||
// (split before full normalization so we can generate per-token keys)
|
||||
let first_tokens: Vec<&str> = parts.first.split_whitespace().collect();
|
||||
if first_tokens.len() > 1 {
|
||||
for token in &first_tokens {
|
||||
let norm_token = normalize_for_matching(token);
|
||||
if !norm_token.is_empty() && seen.insert(norm_token.clone()) {
|
||||
keys.push((norm_last.clone(), norm_token));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Nickname variants
|
||||
for nick in &parts.nicknames {
|
||||
let norm_nick = normalize_for_matching(nick);
|
||||
if !norm_nick.is_empty() && seen.insert(norm_nick.clone()) {
|
||||
keys.push((norm_last.clone(), norm_nick));
|
||||
}
|
||||
}
|
||||
|
||||
keys
|
||||
}
|
||||
|
||||
/// Backfill `first_name`/`last_name` columns for all instructors that have
|
||||
/// a `display_name` but NULL structured name fields.
|
||||
///
|
||||
/// Parses each `display_name` using [`parse_banner_name`] and updates the row.
|
||||
/// Logs warnings for any names that fail to parse.
|
||||
pub async fn backfill_instructor_names(db_pool: &PgPool) -> crate::error::Result<()> {
|
||||
let rows: Vec<(i32, String)> = sqlx::query_as(
|
||||
"SELECT id, display_name FROM instructors WHERE first_name IS NULL OR last_name IS NULL",
|
||||
)
|
||||
.fetch_all(db_pool)
|
||||
.await?;
|
||||
|
||||
if rows.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let total = rows.len();
|
||||
let mut ids: Vec<i32> = Vec::with_capacity(total);
|
||||
let mut firsts: Vec<String> = Vec::with_capacity(total);
|
||||
let mut lasts: Vec<String> = Vec::with_capacity(total);
|
||||
let mut unparseable = 0usize;
|
||||
|
||||
for (id, display_name) in &rows {
|
||||
match parse_banner_name(display_name) {
|
||||
Some(parts) => {
|
||||
ids.push(*id);
|
||||
firsts.push(parts.first);
|
||||
lasts.push(parts.last);
|
||||
}
|
||||
None => {
|
||||
warn!(
|
||||
id,
|
||||
display_name, "Failed to parse instructor display_name during backfill"
|
||||
);
|
||||
unparseable += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !ids.is_empty() {
|
||||
let first_refs: Vec<&str> = firsts.iter().map(|s| s.as_str()).collect();
|
||||
let last_refs: Vec<&str> = lasts.iter().map(|s| s.as_str()).collect();
|
||||
|
||||
sqlx::query(
|
||||
r#"
|
||||
UPDATE instructors i
|
||||
SET first_name = v.first_name, last_name = v.last_name
|
||||
FROM UNNEST($1::int4[], $2::text[], $3::text[])
|
||||
AS v(id, first_name, last_name)
|
||||
WHERE i.id = v.id
|
||||
"#,
|
||||
)
|
||||
.bind(&ids)
|
||||
.bind(&first_refs)
|
||||
.bind(&last_refs)
|
||||
.execute(db_pool)
|
||||
.await?;
|
||||
}
|
||||
|
||||
info!(
|
||||
total,
|
||||
updated = ids.len(),
|
||||
unparseable,
|
||||
"Instructor name backfill complete"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Unit tests for the parsing, normalization and key-generation helpers.
#[cfg(test)]
mod tests {
    use super::*;

    // -----------------------------------------------------------------------
    // HTML entity decoding
    // -----------------------------------------------------------------------
    // Inputs carry the *encoded* entity so the decoder actually has work to do.

    #[test]
    fn decode_apostrophe_entity() {
        assert_eq!(decode_html_entities("O&#39;Brien"), "O'Brien");
    }

    #[test]
    fn decode_umlaut_entity() {
        assert_eq!(decode_html_entities("B&uuml;lent"), "Bülent");
    }

    #[test]
    fn decode_no_entities() {
        assert_eq!(decode_html_entities("Smith"), "Smith");
    }

    // -----------------------------------------------------------------------
    // Nickname extraction
    // -----------------------------------------------------------------------

    #[test]
    fn extract_paren_nickname() {
        let (cleaned, nicks) = extract_nicknames("William (Ken)");
        assert_eq!(cleaned, "William");
        assert_eq!(nicks, vec!["Ken"]);
    }

    #[test]
    fn extract_quoted_nickname() {
        let (cleaned, nicks) = extract_nicknames("Thomas \"Butch\"");
        assert_eq!(cleaned, "Thomas");
        assert_eq!(nicks, vec!["Butch"]);
    }

    #[test]
    fn extract_paren_with_extra_text() {
        let (cleaned, nicks) = extract_nicknames("John (jack) C.");
        assert_eq!(cleaned, "John C.");
        assert_eq!(nicks, vec!["jack"]);
    }

    #[test]
    fn extract_no_nicknames() {
        let (cleaned, nicks) = extract_nicknames("Maria Elena");
        assert_eq!(cleaned, "Maria Elena");
        assert!(nicks.is_empty());
    }

    // -----------------------------------------------------------------------
    // Suffix extraction
    // -----------------------------------------------------------------------

    #[test]
    fn extract_suffix_iii() {
        let (name, suffix) = extract_suffix("LeBlanc III");
        assert_eq!(name, "LeBlanc");
        assert_eq!(suffix, Some("III".to_string()));
    }

    #[test]
    fn extract_suffix_jr_period() {
        let (name, suffix) = extract_suffix("Smith Jr.");
        assert_eq!(name, "Smith");
        assert_eq!(suffix, Some("Jr.".to_string()));
    }

    #[test]
    fn extract_no_suffix() {
        let (name, suffix) = extract_suffix("García");
        assert_eq!(name, "García");
        assert_eq!(suffix, None);
    }

    // -----------------------------------------------------------------------
    // Junk stripping
    // -----------------------------------------------------------------------

    #[test]
    fn strip_trailing_comma() {
        assert_eq!(strip_junk("Cronenberger,"), "Cronenberger");
    }

    #[test]
    fn strip_email_address() {
        assert_eq!(strip_junk("Neel.Baumgardner@utsa.edu"), "");
    }

    #[test]
    fn strip_clean_name() {
        assert_eq!(strip_junk("  Maria  "), "Maria");
    }

    // -----------------------------------------------------------------------
    // normalize_for_matching
    // -----------------------------------------------------------------------

    #[test]
    fn normalize_strips_accents() {
        assert_eq!(normalize_for_matching("García"), "garcia");
    }

    #[test]
    fn normalize_strips_apostrophe() {
        assert_eq!(normalize_for_matching("O'Brien"), "obrien");
    }

    #[test]
    fn normalize_strips_hyphen() {
        assert_eq!(normalize_for_matching("Aguirre-Mesa"), "aguirremesa");
    }

    #[test]
    fn normalize_tilde_n() {
        assert_eq!(normalize_for_matching("Muñoz"), "munoz");
    }

    #[test]
    fn normalize_umlaut() {
        assert_eq!(normalize_for_matching("Müller"), "muller");
    }

    #[test]
    fn normalize_period() {
        assert_eq!(normalize_for_matching("H. Paul"), "hpaul");
    }

    #[test]
    fn normalize_strips_spaces() {
        assert_eq!(normalize_for_matching("Mary Lou"), "marylou");
    }

    // -----------------------------------------------------------------------
    // parse_banner_name
    // -----------------------------------------------------------------------

    #[test]
    fn banner_standard_name() {
        let p = parse_banner_name("Smith, John").unwrap();
        assert_eq!(p.first, "John");
        assert_eq!(p.last, "Smith");
        assert_eq!(p.suffix, None);
    }

    #[test]
    fn banner_html_entity_apostrophe() {
        let p = parse_banner_name("O&#39;Brien, Erin").unwrap();
        assert_eq!(p.first, "Erin");
        assert_eq!(p.last, "O'Brien");
    }

    #[test]
    fn banner_html_entity_umlaut() {
        let p = parse_banner_name("Temel, B&uuml;lent").unwrap();
        assert_eq!(p.first, "Bülent");
        assert_eq!(p.last, "Temel");
    }

    #[test]
    fn banner_suffix_iii() {
        let p = parse_banner_name("LeBlanc III, H. Paul").unwrap();
        assert_eq!(p.first, "H. Paul");
        assert_eq!(p.last, "LeBlanc");
        assert_eq!(p.suffix, Some("III".to_string()));
    }

    #[test]
    fn banner_suffix_ii() {
        let p = parse_banner_name("Ellis II, Ronald").unwrap();
        assert_eq!(p.first, "Ronald");
        assert_eq!(p.last, "Ellis");
        assert_eq!(p.suffix, Some("II".to_string()));
    }

    #[test]
    fn banner_multi_word_last() {
        let p = parse_banner_name("Aguirre Mesa, Andres").unwrap();
        assert_eq!(p.first, "Andres");
        assert_eq!(p.last, "Aguirre Mesa");
    }

    #[test]
    fn banner_hyphenated_last() {
        let p = parse_banner_name("Abu-Lail, Nehal").unwrap();
        assert_eq!(p.first, "Nehal");
        assert_eq!(p.last, "Abu-Lail");
    }

    #[test]
    fn banner_with_middle_name() {
        let p = parse_banner_name("Smith, John David").unwrap();
        assert_eq!(p.first, "John David");
        assert_eq!(p.last, "Smith");
    }

    #[test]
    fn banner_no_comma() {
        assert!(parse_banner_name("SingleName").is_none());
    }

    #[test]
    fn banner_empty_first() {
        assert!(parse_banner_name("Smith,").is_none());
    }

    #[test]
    fn banner_empty_last() {
        assert!(parse_banner_name(", John").is_none());
    }

    // -----------------------------------------------------------------------
    // parse_rmp_name
    // -----------------------------------------------------------------------

    #[test]
    fn rmp_standard_name() {
        let p = parse_rmp_name("John", "Smith").unwrap();
        assert_eq!(p.first, "John");
        assert_eq!(p.last, "Smith");
    }

    #[test]
    fn rmp_with_nickname() {
        let p = parse_rmp_name("William (Ken)", "Burchenal").unwrap();
        assert_eq!(p.first, "William");
        assert_eq!(p.nicknames, vec!["Ken"]);
    }

    #[test]
    fn rmp_trailing_comma_last() {
        let p = parse_rmp_name("J.", "Cronenberger,").unwrap();
        assert_eq!(p.last, "Cronenberger");
    }

    #[test]
    fn rmp_email_in_first() {
        assert!(parse_rmp_name("Neel.Baumgardner@utsa.edu", "Baumgardner").is_none());
    }

    #[test]
    fn rmp_suffix_in_last() {
        let p = parse_rmp_name("H. Paul", "LeBlanc III").unwrap();
        assert_eq!(p.first, "H. Paul");
        assert_eq!(p.last, "LeBlanc");
        assert_eq!(p.suffix, Some("III".to_string()));
    }

    #[test]
    fn rmp_quoted_nickname() {
        let p = parse_rmp_name("Thomas \"Butch\"", "Matjeka").unwrap();
        assert_eq!(p.first, "Thomas");
        assert_eq!(p.nicknames, vec!["Butch"]);
    }

    #[test]
    fn rmp_accented_last() {
        let p = parse_rmp_name("Liliana", "Saldaña").unwrap();
        assert_eq!(p.last, "Saldaña");
    }

    // -----------------------------------------------------------------------
    // matching_keys
    // -----------------------------------------------------------------------

    #[test]
    fn keys_simple_name() {
        let parts = NameParts {
            first: "John".into(),
            last: "Smith".into(),
            middle: None,
            suffix: None,
            nicknames: vec![],
        };
        let keys = matching_keys(&parts);
        assert_eq!(keys, vec![("smith".into(), "john".into())]);
    }

    #[test]
    fn keys_multi_token_first() {
        let parts = NameParts {
            first: "H. Paul".into(),
            last: "LeBlanc".into(),
            middle: None,
            suffix: Some("III".into()),
            nicknames: vec![],
        };
        let keys = matching_keys(&parts);
        assert!(keys.contains(&("leblanc".into(), "hpaul".into())));
        assert!(keys.contains(&("leblanc".into(), "paul".into())));
        assert!(keys.contains(&("leblanc".into(), "h".into())));
        assert_eq!(keys.len(), 3);
    }

    #[test]
    fn keys_with_nickname() {
        let parts = NameParts {
            first: "William".into(),
            last: "Burchenal".into(),
            middle: None,
            suffix: None,
            nicknames: vec!["Ken".into()],
        };
        let keys = matching_keys(&parts);
        assert!(keys.contains(&("burchenal".into(), "william".into())));
        assert!(keys.contains(&("burchenal".into(), "ken".into())));
        assert_eq!(keys.len(), 2);
    }

    #[test]
    fn keys_hyphenated_last() {
        let parts = parse_banner_name("Aguirre-Mesa, Andres").unwrap();
        let keys = matching_keys(&parts);
        // Hyphen removed: "aguirremesa"
        assert!(keys.contains(&("aguirremesa".into(), "andres".into())));
    }

    #[test]
    fn keys_accented_name() {
        let parts = parse_rmp_name("Liliana", "Saldaña").unwrap();
        let keys = matching_keys(&parts);
        assert!(keys.contains(&("saldana".into(), "liliana".into())));
    }

    #[test]
    fn keys_cross_source_match() {
        // Banner: "Aguirre Mesa, Andres" → last="Aguirre Mesa"
        let banner = parse_banner_name("Aguirre Mesa, Andres").unwrap();
        let banner_keys = matching_keys(&banner);

        // RMP: "Andres" / "Aguirre-Mesa" → last="Aguirre-Mesa"
        let rmp = parse_rmp_name("Andres", "Aguirre-Mesa").unwrap();
        let rmp_keys = matching_keys(&rmp);

        // Both should normalize to ("aguirremesa", "andres")
        assert!(banner_keys.iter().any(|k| rmp_keys.contains(k)));
    }

    #[test]
    fn keys_accent_cross_match() {
        // Banner side spelled without accents
        let banner = parse_banner_name("Garcia, Jose").unwrap();
        let banner_keys = matching_keys(&banner);

        // RMP side spelled with accents: "José" / "García"
        let rmp = parse_rmp_name("José", "García").unwrap();
        let rmp_keys = matching_keys(&rmp);

        // Both normalize to ("garcia", "jose")
        assert!(banner_keys.iter().any(|k| rmp_keys.contains(k)));
    }
}
|
||||
+65
-82
@@ -91,25 +91,6 @@ pub async fn batch_upsert_rmp_professors(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Normalize a name for matching.
///
/// Surrounding whitespace is trimmed first, the text is lower-cased, and
/// finally any trailing periods are removed (so "Jr." becomes "jr").
/// Periods elsewhere in the string are kept.
pub(crate) fn normalize(s: &str) -> String {
    let lowered = s.trim().to_lowercase();
    lowered.trim_end_matches('.').to_string()
}
|
||||
|
||||
/// Parse Banner's "Last, First Middle" display name into (last, first) tokens.
|
||||
///
|
||||
/// Returns `None` if the format is unparseable (no comma, empty parts).
|
||||
pub(crate) fn parse_display_name(display_name: &str) -> Option<(String, String)> {
|
||||
let (last_part, first_part) = display_name.split_once(',')?;
|
||||
let last = normalize(last_part);
|
||||
// Take only the first token of the first-name portion to drop middle names/initials.
|
||||
let first = normalize(first_part.split_whitespace().next()?);
|
||||
if last.is_empty() || first.is_empty() {
|
||||
return None;
|
||||
}
|
||||
Some((last, first))
|
||||
}
|
||||
|
||||
/// Retrieve RMP rating data for an instructor by instructor id.
|
||||
///
|
||||
/// Returns `(avg_rating, num_ratings)` for the best linked RMP profile
|
||||
@@ -136,74 +117,76 @@ pub async fn get_instructor_rmp_data(
|
||||
Ok(row)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
/// Unmatch an instructor from an RMP profile.
|
||||
///
|
||||
/// Removes the link from `instructor_rmp_links` and updates the instructor's
|
||||
/// `rmp_match_status` to 'unmatched' if no links remain.
|
||||
///
|
||||
/// If `rmp_legacy_id` is `Some`, removes only that specific link.
|
||||
/// If `None`, removes all links for the instructor.
|
||||
pub async fn unmatch_instructor(
|
||||
db_pool: &PgPool,
|
||||
instructor_id: i32,
|
||||
rmp_legacy_id: Option<i32>,
|
||||
) -> Result<()> {
|
||||
let mut tx = db_pool.begin().await?;
|
||||
|
||||
#[test]
|
||||
fn parse_standard_name() {
|
||||
assert_eq!(
|
||||
parse_display_name("Smith, John"),
|
||||
Some(("smith".into(), "john".into()))
|
||||
);
|
||||
// Delete specific link or all links
|
||||
if let Some(legacy_id) = rmp_legacy_id {
|
||||
sqlx::query(
|
||||
"DELETE FROM instructor_rmp_links WHERE instructor_id = $1 AND rmp_legacy_id = $2",
|
||||
)
|
||||
.bind(instructor_id)
|
||||
.bind(legacy_id)
|
||||
.execute(&mut *tx)
|
||||
.await?;
|
||||
} else {
|
||||
sqlx::query("DELETE FROM instructor_rmp_links WHERE instructor_id = $1")
|
||||
.bind(instructor_id)
|
||||
.execute(&mut *tx)
|
||||
.await?;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_name_with_middle() {
|
||||
assert_eq!(
|
||||
parse_display_name("Smith, John David"),
|
||||
Some(("smith".into(), "john".into()))
|
||||
);
|
||||
// Check if any links remain
|
||||
let (remaining,): (i64,) =
|
||||
sqlx::query_as("SELECT COUNT(*) FROM instructor_rmp_links WHERE instructor_id = $1")
|
||||
.bind(instructor_id)
|
||||
.fetch_one(&mut *tx)
|
||||
.await?;
|
||||
|
||||
// Update instructor status if no links remain
|
||||
if remaining == 0 {
|
||||
sqlx::query("UPDATE instructors SET rmp_match_status = 'unmatched' WHERE id = $1")
|
||||
.bind(instructor_id)
|
||||
.execute(&mut *tx)
|
||||
.await?;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_name_with_middle_initial() {
|
||||
assert_eq!(
|
||||
parse_display_name("Garcia, Maria L."),
|
||||
Some(("garcia".into(), "maria".into()))
|
||||
);
|
||||
// Reset accepted candidates back to pending when unmatching
|
||||
// This allows the candidates to be re-matched later
|
||||
if let Some(legacy_id) = rmp_legacy_id {
|
||||
// Reset only the specific candidate
|
||||
sqlx::query(
|
||||
"UPDATE rmp_match_candidates
|
||||
SET status = 'pending', resolved_at = NULL, resolved_by = NULL
|
||||
WHERE instructor_id = $1 AND rmp_legacy_id = $2 AND status = 'accepted'",
|
||||
)
|
||||
.bind(instructor_id)
|
||||
.bind(legacy_id)
|
||||
.execute(&mut *tx)
|
||||
.await?;
|
||||
} else {
|
||||
// Reset all accepted candidates for this instructor
|
||||
sqlx::query(
|
||||
"UPDATE rmp_match_candidates
|
||||
SET status = 'pending', resolved_at = NULL, resolved_by = NULL
|
||||
WHERE instructor_id = $1 AND status = 'accepted'",
|
||||
)
|
||||
.bind(instructor_id)
|
||||
.execute(&mut *tx)
|
||||
.await?;
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_name_with_suffix_in_last() {
|
||||
// Banner may encode "Jr." as part of the last name.
|
||||
// normalize() strips trailing periods so "Jr." becomes "jr".
|
||||
assert_eq!(
|
||||
parse_display_name("Smith Jr., James"),
|
||||
Some(("smith jr".into(), "james".into()))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_no_comma_returns_none() {
|
||||
assert_eq!(parse_display_name("SingleName"), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_empty_first_returns_none() {
|
||||
assert_eq!(parse_display_name("Smith,"), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_empty_last_returns_none() {
|
||||
assert_eq!(parse_display_name(", John"), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_extra_whitespace() {
|
||||
assert_eq!(
|
||||
parse_display_name(" Doe , Jane Marie "),
|
||||
Some(("doe".into(), "jane".into()))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalize_trims_and_lowercases() {
|
||||
assert_eq!(normalize(" FOO "), "foo");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalize_strips_trailing_period() {
|
||||
assert_eq!(normalize("Jr."), "jr");
|
||||
}
|
||||
tx.commit().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
+73
-29
@@ -1,6 +1,6 @@
|
||||
//! Confidence scoring and candidate generation for RMP instructor matching.
|
||||
|
||||
use crate::data::rmp::{normalize, parse_display_name};
|
||||
use crate::data::names::{matching_keys, parse_banner_name, parse_rmp_name};
|
||||
use crate::error::Result;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sqlx::PgPool;
|
||||
@@ -14,6 +14,7 @@ use tracing::{debug, info};
|
||||
/// Breakdown of individual scoring signals.
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
pub struct ScoreBreakdown {
|
||||
pub name: f32,
|
||||
pub department: f32,
|
||||
pub uniqueness: f32,
|
||||
pub volume: f32,
|
||||
@@ -37,12 +38,13 @@ const MIN_CANDIDATE_THRESHOLD: f32 = 0.40;
|
||||
const AUTO_ACCEPT_THRESHOLD: f32 = 0.85;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Weights
|
||||
// Weights (must sum to 1.0)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const WEIGHT_DEPARTMENT: f32 = 0.50;
|
||||
const WEIGHT_UNIQUENESS: f32 = 0.30;
|
||||
const WEIGHT_VOLUME: f32 = 0.20;
|
||||
const WEIGHT_NAME: f32 = 0.50;
|
||||
const WEIGHT_DEPARTMENT: f32 = 0.25;
|
||||
const WEIGHT_UNIQUENESS: f32 = 0.15;
|
||||
const WEIGHT_VOLUME: f32 = 0.10;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Pure scoring functions
|
||||
@@ -199,35 +201,39 @@ fn matches_known_abbreviation(subject: &str, department: &str) -> bool {
|
||||
|
||||
/// Compute match confidence score (0.0–1.0) for an instructor–RMP pair.
|
||||
///
|
||||
/// Name matching is handled by the caller via pre-filtering on exact
|
||||
/// normalized `(last, first)`, so only department, uniqueness, and volume
|
||||
/// signals are scored here.
|
||||
/// The name signal is always 1.0 since candidates are only generated when a
|
||||
/// normalized name key matches (keys may be nickname or token variants, not only the exact name). The effective score range is 0.50–1.0.
|
||||
pub fn compute_match_score(
|
||||
instructor_subjects: &[String],
|
||||
rmp_department: Option<&str>,
|
||||
candidate_count: usize,
|
||||
rmp_num_ratings: i32,
|
||||
) -> MatchScore {
|
||||
// --- Department (0.50) ---
|
||||
// --- Name (0.50) — always 1.0, candidates only exist for normalized name-key matches ---
|
||||
let name_score = 1.0;
|
||||
|
||||
// --- Department (0.25) ---
|
||||
let dept_score = department_similarity(instructor_subjects, rmp_department);
|
||||
|
||||
// --- Uniqueness (0.30) ---
|
||||
// --- Uniqueness (0.15) ---
|
||||
let uniqueness_score = match candidate_count {
|
||||
0 | 1 => 1.0,
|
||||
2 => 0.5,
|
||||
_ => 0.2,
|
||||
};
|
||||
|
||||
// --- Volume (0.20) ---
|
||||
// --- Volume (0.10) ---
|
||||
let volume_score = ((rmp_num_ratings as f32).ln_1p() / 5.0_f32.ln_1p()).clamp(0.0, 1.0);
|
||||
|
||||
let composite = dept_score * WEIGHT_DEPARTMENT
|
||||
let composite = name_score * WEIGHT_NAME
|
||||
+ dept_score * WEIGHT_DEPARTMENT
|
||||
+ uniqueness_score * WEIGHT_UNIQUENESS
|
||||
+ volume_score * WEIGHT_VOLUME;
|
||||
|
||||
MatchScore {
|
||||
score: composite,
|
||||
breakdown: ScoreBreakdown {
|
||||
name: name_score,
|
||||
department: dept_score,
|
||||
uniqueness: uniqueness_score,
|
||||
volume: volume_score,
|
||||
@@ -260,8 +266,8 @@ struct RmpProfForMatching {
|
||||
/// Generate match candidates for all unmatched instructors.
|
||||
///
|
||||
/// For each unmatched instructor:
|
||||
/// 1. Parse `display_name` into (last, first).
|
||||
/// 2. Find RMP professors with matching normalized name.
|
||||
/// 1. Parse `display_name` into [`NameParts`] and generate matching keys.
|
||||
/// 2. Find RMP professors with matching normalized name keys.
|
||||
/// 3. Score each candidate.
|
||||
/// 4. Store candidates scoring above [`MIN_CANDIDATE_THRESHOLD`].
|
||||
/// 5. Auto-accept if the top candidate scores ≥ [`AUTO_ACCEPT_THRESHOLD`]
|
||||
@@ -309,7 +315,7 @@ pub async fn generate_candidates(db_pool: &PgPool) -> Result<MatchingStats> {
|
||||
subject_map.entry(iid).or_default().push(subject);
|
||||
}
|
||||
|
||||
// 3. Load all RMP professors
|
||||
// 3. Load all RMP professors and build multi-key name index
|
||||
let prof_rows: Vec<(i32, String, String, Option<String>, i32)> = sqlx::query_as(
|
||||
"SELECT legacy_id, first_name, last_name, department, num_ratings FROM rmp_professors",
|
||||
)
|
||||
@@ -317,14 +323,36 @@ pub async fn generate_candidates(db_pool: &PgPool) -> Result<MatchingStats> {
|
||||
.await?;
|
||||
|
||||
// Build name index: (normalized_last, normalized_first) -> Vec<RmpProfForMatching>
|
||||
// Each professor may appear under multiple keys (nicknames, token variants).
|
||||
let mut name_index: HashMap<(String, String), Vec<RmpProfForMatching>> = HashMap::new();
|
||||
for (legacy_id, first_name, last_name, department, num_ratings) in prof_rows {
|
||||
let key = (normalize(&last_name), normalize(&first_name));
|
||||
name_index.entry(key).or_default().push(RmpProfForMatching {
|
||||
legacy_id,
|
||||
department,
|
||||
num_ratings,
|
||||
});
|
||||
let mut rmp_parse_failures = 0usize;
|
||||
for (legacy_id, first_name, last_name, department, num_ratings) in &prof_rows {
|
||||
match parse_rmp_name(first_name, last_name) {
|
||||
Some(parts) => {
|
||||
let keys = matching_keys(&parts);
|
||||
for key in keys {
|
||||
name_index.entry(key).or_default().push(RmpProfForMatching {
|
||||
legacy_id: *legacy_id,
|
||||
department: department.clone(),
|
||||
num_ratings: *num_ratings,
|
||||
});
|
||||
}
|
||||
}
|
||||
None => {
|
||||
rmp_parse_failures += 1;
|
||||
debug!(
|
||||
legacy_id,
|
||||
first_name, last_name, "Unparseable RMP professor name, skipping"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if rmp_parse_failures > 0 {
|
||||
debug!(
|
||||
count = rmp_parse_failures,
|
||||
"RMP professors with unparseable names"
|
||||
);
|
||||
}
|
||||
|
||||
// 4. Load existing candidate pairs — only skip resolved (accepted/rejected) pairs.
|
||||
@@ -360,7 +388,7 @@ pub async fn generate_candidates(db_pool: &PgPool) -> Result<MatchingStats> {
|
||||
let mut skipped_no_candidates = 0usize;
|
||||
|
||||
for (instructor_id, display_name) in &instructors {
|
||||
let Some((norm_last, norm_first)) = parse_display_name(display_name) else {
|
||||
let Some(instructor_parts) = parse_banner_name(display_name) else {
|
||||
skipped_unparseable += 1;
|
||||
debug!(
|
||||
instructor_id,
|
||||
@@ -371,16 +399,31 @@ pub async fn generate_candidates(db_pool: &PgPool) -> Result<MatchingStats> {
|
||||
|
||||
let subjects = subject_map.get(instructor_id).unwrap_or(&empty_subjects);
|
||||
|
||||
let key = (norm_last.clone(), norm_first.clone());
|
||||
let Some(rmp_candidates) = name_index.get(&key) else {
|
||||
// Generate all matching keys for this instructor and collect candidate
|
||||
// RMP professors across all key variants (deduplicated by legacy_id).
|
||||
let instructor_keys = matching_keys(&instructor_parts);
|
||||
let mut seen_profs: HashSet<i32> = HashSet::new();
|
||||
let mut matched_profs: Vec<&RmpProfForMatching> = Vec::new();
|
||||
|
||||
for key in &instructor_keys {
|
||||
if let Some(profs) = name_index.get(key) {
|
||||
for prof in profs {
|
||||
if seen_profs.insert(prof.legacy_id) {
|
||||
matched_profs.push(prof);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if matched_profs.is_empty() {
|
||||
skipped_no_candidates += 1;
|
||||
continue;
|
||||
};
|
||||
}
|
||||
|
||||
let candidate_count = rmp_candidates.len();
|
||||
let candidate_count = matched_profs.len();
|
||||
let mut best: Option<(f32, i32)> = None;
|
||||
|
||||
for prof in rmp_candidates {
|
||||
for prof in &matched_profs {
|
||||
let pair = (*instructor_id, prof.legacy_id);
|
||||
if resolved_pairs.contains(&pair) {
|
||||
continue;
|
||||
@@ -582,8 +625,9 @@ mod tests {
|
||||
1, // unique candidate
|
||||
50, // decent ratings
|
||||
);
|
||||
// dept 1.0*0.50 + unique 1.0*0.30 + volume ~0.97*0.20 ≈ 0.99
|
||||
// name 1.0*0.50 + dept 1.0*0.25 + unique 1.0*0.15 + volume 1.0*0.10 = 1.0 (volume clamps to 1.0 at 5+ ratings)
|
||||
assert!(ms.score >= 0.85, "Expected score >= 0.85, got {}", ms.score);
|
||||
assert_eq!(ms.breakdown.name, 1.0);
|
||||
assert_eq!(ms.breakdown.uniqueness, 1.0);
|
||||
assert_eq!(ms.breakdown.department, 1.0);
|
||||
}
|
||||
|
||||
+10
-50
@@ -769,16 +769,10 @@ pub async fn unmatch_instructor(
|
||||
) -> Result<Json<OkResponse>, (StatusCode, Json<Value>)> {
|
||||
let rmp_legacy_id = body.and_then(|b| b.rmp_legacy_id);
|
||||
|
||||
let mut tx = state
|
||||
.db_pool
|
||||
.begin()
|
||||
.await
|
||||
.map_err(|e| db_error("failed to begin transaction", e))?;
|
||||
|
||||
// Verify instructor exists
|
||||
let exists: Option<(i32,)> = sqlx::query_as("SELECT id FROM instructors WHERE id = $1")
|
||||
.bind(id)
|
||||
.fetch_optional(&mut *tx)
|
||||
.fetch_optional(&state.db_pool)
|
||||
.await
|
||||
.map_err(|e| db_error("failed to check instructor", e))?;
|
||||
|
||||
@@ -789,50 +783,16 @@ pub async fn unmatch_instructor(
|
||||
));
|
||||
}
|
||||
|
||||
// Delete specific link or all links
|
||||
if let Some(legacy_id) = rmp_legacy_id {
|
||||
let result = sqlx::query(
|
||||
"DELETE FROM instructor_rmp_links WHERE instructor_id = $1 AND rmp_legacy_id = $2",
|
||||
)
|
||||
.bind(id)
|
||||
.bind(legacy_id)
|
||||
.execute(&mut *tx)
|
||||
// Use the data layer function to perform the unmatch
|
||||
crate::data::rmp::unmatch_instructor(&state.db_pool, id, rmp_legacy_id)
|
||||
.await
|
||||
.map_err(|e| db_error("failed to remove rmp link", e))?;
|
||||
|
||||
if result.rows_affected() == 0 {
|
||||
return Err((
|
||||
StatusCode::NOT_FOUND,
|
||||
Json(json!({"error": "link not found for this instructor"})),
|
||||
));
|
||||
}
|
||||
} else {
|
||||
sqlx::query("DELETE FROM instructor_rmp_links WHERE instructor_id = $1")
|
||||
.bind(id)
|
||||
.execute(&mut *tx)
|
||||
.await
|
||||
.map_err(|e| db_error("failed to remove rmp links", e))?;
|
||||
}
|
||||
|
||||
// Check if any links remain; update status accordingly
|
||||
let (remaining,): (i64,) =
|
||||
sqlx::query_as("SELECT COUNT(*) FROM instructor_rmp_links WHERE instructor_id = $1")
|
||||
.bind(id)
|
||||
.fetch_one(&mut *tx)
|
||||
.await
|
||||
.map_err(|e| db_error("failed to count remaining links", e))?;
|
||||
|
||||
if remaining == 0 {
|
||||
sqlx::query("UPDATE instructors SET rmp_match_status = 'unmatched' WHERE id = $1")
|
||||
.bind(id)
|
||||
.execute(&mut *tx)
|
||||
.await
|
||||
.map_err(|e| db_error("failed to update instructor status", e))?;
|
||||
}
|
||||
|
||||
tx.commit()
|
||||
.await
|
||||
.map_err(|e| db_error("failed to commit transaction", e))?;
|
||||
.map_err(|e| {
|
||||
tracing::error!(error = %e, "failed to unmatch instructor");
|
||||
(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({"error": "failed to unmatch instructor"})),
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(Json(OkResponse { ok: true }))
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user