mirror of
https://github.com/Xevion/schedule-extract.git
synced 2025-12-06 15:16:23 -06:00
Add html entity decoding, class name pattern parsing
This commit is contained in:
26
index.ts
26
index.ts
@@ -2,17 +2,22 @@ import {Cheerio, Element, load} from "cheerio";
|
|||||||
import {readFileSync} from "fs";
|
import {readFileSync} from "fs";
|
||||||
import {parse} from "date-fns";
|
import {parse} from "date-fns";
|
||||||
import {inspect} from "util";
|
import {inspect} from "util";
|
||||||
|
import {decode} from "html-entities";
|
||||||
import fetch from "node-fetch";
|
import fetch from "node-fetch";
|
||||||
import {z} from "zod";
|
import {z} from "zod";
|
||||||
|
|
||||||
const $ = load(readFileSync('list.html'));
|
const $ = load(readFileSync('list.html'));
|
||||||
const classes = $('#scheduleListView').children('.listViewWrapper');
|
const classes = $('#scheduleListView').children('.listViewWrapper');
|
||||||
|
|
||||||
|
const name_pattern = /(.+) (\d{4}) Section (\d+)/;
|
||||||
|
|
||||||
const subject_schema = z.object({
|
const subject_schema = z.object({
|
||||||
code: z.string(),
|
code: z.string(),
|
||||||
description: z.string(),
|
description: z.string().transform((v) => decode(v, {level: 'all'})),
|
||||||
});
|
});
|
||||||
|
|
||||||
|
type Subject = z.infer<typeof subject_schema>;
|
||||||
|
|
||||||
|
|
||||||
const subjects = await z.array(subject_schema).parseAsync(
|
const subjects = await z.array(subject_schema).parseAsync(
|
||||||
await fetch(
|
await fetch(
|
||||||
@@ -27,6 +32,8 @@ const subjects = await z.array(subject_schema).parseAsync(
|
|||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
|
||||||
|
const subject_by_name = new Map(subjects.map((subject) => [subject.description, subject]));
|
||||||
|
|
||||||
|
|
||||||
function getOffset(period: string) {
|
function getOffset(period: string) {
|
||||||
switch (period) {
|
switch (period) {
|
||||||
@@ -61,9 +68,24 @@ function extractDetails(source: Cheerio<Element>) {
|
|||||||
const [start_date, end_date] = raw_date.split("--").map((date) => parse(date.trim(), "MM/dd/yyyy", new Date()));
|
const [start_date, end_date] = raw_date.split("--").map((date) => parse(date.trim(), "MM/dd/yyyy", new Date()));
|
||||||
|
|
||||||
const identifier = source.find('span.list-view-subj-course-section').text();
|
const identifier = source.find('span.list-view-subj-course-section').text();
|
||||||
|
const name_match = name_pattern.exec(identifier);
|
||||||
|
if (name_match === null) {
|
||||||
|
throw new Error(`Could not parse identifier: ${identifier}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const [subject_name, code, section] = name_match.slice(1);
|
||||||
|
const subject = subject_by_name.get(subject_name);
|
||||||
|
if (subject === undefined) {
|
||||||
|
throw new Error(`Unknown subject: ${subject_name}`);
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
identifier,
|
identifier: {
|
||||||
|
subject,
|
||||||
|
code: code,
|
||||||
|
section: section,
|
||||||
|
crn: null,
|
||||||
|
},
|
||||||
date: {
|
date: {
|
||||||
start: start_date,
|
start: start_date,
|
||||||
end: end_date,
|
end: end_date,
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
"@types/node": "^20.4.9",
|
"@types/node": "^20.4.9",
|
||||||
"cheerio": "^1.0.0-rc.12",
|
"cheerio": "^1.0.0-rc.12",
|
||||||
"date-fns": "^2.30.0",
|
"date-fns": "^2.30.0",
|
||||||
|
"html-entities": "^2.4.0",
|
||||||
"ics": "^3.2.0",
|
"ics": "^3.2.0",
|
||||||
"node-fetch": "^3.3.2",
|
"node-fetch": "^3.3.2",
|
||||||
"ts-node": "^10.9.1",
|
"ts-node": "^10.9.1",
|
||||||
|
|||||||
@@ -192,6 +192,11 @@ formdata-polyfill@^4.0.10:
|
|||||||
dependencies:
|
dependencies:
|
||||||
fetch-blob "^3.1.2"
|
fetch-blob "^3.1.2"
|
||||||
|
|
||||||
|
html-entities@^2.4.0:
|
||||||
|
version "2.4.0"
|
||||||
|
resolved "https://registry.yarnpkg.com/html-entities/-/html-entities-2.4.0.tgz#edd0cee70402584c8c76cc2c0556db09d1f45061"
|
||||||
|
integrity sha512-igBTJcNNNhvZFRtm8uA6xMY6xYleeDwn3PeBCkDz7tHttv4F2hsDI2aPgNERWzvRcNYHNT3ymRaQzllmXj4YsQ==
|
||||||
|
|
||||||
htmlparser2@^8.0.1:
|
htmlparser2@^8.0.1:
|
||||||
version "8.0.2"
|
version "8.0.2"
|
||||||
resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-8.0.2.tgz#f002151705b383e62433b5cf466f5b716edaec21"
|
resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-8.0.2.tgz#f002151705b383e62433b5cf466f5b716edaec21"
|
||||||
|
|||||||
Reference in New Issue
Block a user