From 4e1e3836c20f1423c1df56b17be38ea42961867d Mon Sep 17 00:00:00 2001 From: Xevion Date: Fri, 11 Aug 2023 17:39:34 -0500 Subject: [PATCH] Add html entity decoding, class name pattern parsing --- index.ts | 26 ++++++++++++++++++++++++-- package.json | 1 + yarn.lock | 5 +++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/index.ts b/index.ts index caecb63..8ef3738 100644 --- a/index.ts +++ b/index.ts @@ -2,17 +2,22 @@ import {Cheerio, Element, load} from "cheerio"; import {readFileSync} from "fs"; import {parse} from "date-fns"; import {inspect} from "util"; +import {decode} from "html-entities"; import fetch from "node-fetch"; import {z} from "zod"; const $ = load(readFileSync('list.html')); const classes = $('#scheduleListView').children('.listViewWrapper'); +const name_pattern = /(.+) (\d{4}) Section (\d+)/; + const subject_schema = z.object({ code: z.string(), - description: z.string(), + description: z.string().transform((v) => decode(v, {level: 'all'})), }); +type Subject = z.infer<typeof subject_schema>; + const subjects = await z.array(subject_schema).parseAsync( await fetch( @@ -27,6 +32,8 @@ const subjects = await z.array(subject_schema).parseAsync( }) ); +const subject_by_name = new Map(subjects.map((subject) => [subject.description, subject])); + function getOffset(period: string) { switch (period) { @@ -61,9 +68,24 @@ function extractDetails(source: Cheerio) { const [start_date, end_date] = raw_date.split("--").map((date) => parse(date.trim(), "MM/dd/yyyy", new Date())); const identifier = source.find('span.list-view-subj-course-section').text(); + const name_match = name_pattern.exec(identifier); + if (name_match === null) { + throw new Error(`Could not parse identifier: ${identifier}`); + } + + const [subject_name, code, section] = name_match.slice(1); + const subject = subject_by_name.get(subject_name); + if (subject === undefined) { + throw new Error(`Unknown subject: ${subject_name}`); + } return { - identifier, + identifier: { + subject, + code: code, 
section: section, + crn: null, + }, date: { start: start_date, end: end_date, diff --git a/package.json b/package.json index 1b4d0c9..94a947b 100644 --- a/package.json +++ b/package.json @@ -17,6 +17,7 @@ "@types/node": "^20.4.9", "cheerio": "^1.0.0-rc.12", "date-fns": "^2.30.0", + "html-entities": "^2.4.0", "ics": "^3.2.0", "node-fetch": "^3.3.2", "ts-node": "^10.9.1", diff --git a/yarn.lock b/yarn.lock index 52dc023..66ba9ac 100644 --- a/yarn.lock +++ b/yarn.lock @@ -192,6 +192,11 @@ formdata-polyfill@^4.0.10: dependencies: fetch-blob "^3.1.2" +html-entities@^2.4.0: + version "2.4.0" + resolved "https://registry.yarnpkg.com/html-entities/-/html-entities-2.4.0.tgz#edd0cee70402584c8c76cc2c0556db09d1f45061" + integrity sha512-igBTJcNNNhvZFRtm8uA6xMY6xYleeDwn3PeBCkDz7tHttv4F2hsDI2aPgNERWzvRcNYHNT3ymRaQzllmXj4YsQ== + htmlparser2@^8.0.1: version "8.0.2" resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-8.0.2.tgz#f002151705b383e62433b5cf466f5b716edaec21"