mirror of
https://github.com/Xevion/banner.git
synced 2025-12-06 07:14:21 -06:00
I realized that archived classes would have unusually low expiry times without the multiplier in place. Also changed the IsViewOnlyTerm func name; the new term "Archive" is used for this.
220 lines
6.9 KiB
Go
220 lines
6.9 KiB
Go
package main
|
|
|
|
import (
|
|
"fmt"
|
|
"math/rand"
|
|
"time"
|
|
|
|
"github.com/rs/zerolog/log"
|
|
"github.com/samber/lo"
|
|
)
|
|
|
|
const (
	// MaxPageSize is the page-size step used when paginating Banner search
	// results (see ScrapeMajor: the offset advances by this amount per page).
	MaxPageSize = 500
)
|
|
|
|
var (
	// PriorityMajors is a list of majors that are considered to be high priority for scraping.
	// This list is used to determine which majors to scrape first/most often.
	PriorityMajors = []string{"CS", "CPE", "MAT", "EE", "IS"}

	// AncillaryMajors is a list of majors that are considered to be low priority for scraping.
	// It will not contain any majors that are in PriorityMajors.
	// Populated lazily by Scrape on its first invocation.
	AncillaryMajors []string

	// AllMajors is a list of all majors that are available in the Banner system.
	// Ordered with PriorityMajors first, then AncillaryMajors; populated lazily by Scrape.
	AllMajors []string
)
|
|
|
|
// Scrape is the general scraping invocation (best called within/as a goroutine) that should be called regularly to initiate scraping of the Banner system.
|
|
func Scrape() error {
|
|
// Populate AllMajors if it is empty
|
|
if len(AncillaryMajors) == 0 {
|
|
term := Default(time.Now()).ToString()
|
|
subjects, err := GetSubjects("", term, 1, 99)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get subjects: %w", err)
|
|
}
|
|
|
|
// Ensure subjects were found
|
|
if len(subjects) == 0 {
|
|
return fmt.Errorf("no subjects found")
|
|
}
|
|
|
|
// Extract major code name
|
|
for _, subject := range subjects {
|
|
// Add to AncillaryMajors if not in PriorityMajors
|
|
if !lo.Contains(PriorityMajors, subject.Code) {
|
|
AncillaryMajors = append(AncillaryMajors, subject.Code)
|
|
}
|
|
}
|
|
|
|
AllMajors = lo.Flatten([][]string{PriorityMajors, AncillaryMajors})
|
|
}
|
|
|
|
expiredSubjects, err := GetExpiredSubjects()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get scrapable majors: %w", err)
|
|
}
|
|
|
|
log.Info().Strs("majors", expiredSubjects).Msg("Scraping majors")
|
|
for _, subject := range expiredSubjects {
|
|
err := ScrapeMajor(subject)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to scrape major %s: %w", subject, err)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// GetExpiredSubjects returns a list of subjects that are expired and should be scraped.
|
|
func GetExpiredSubjects() ([]string, error) {
|
|
term := Default(time.Now()).ToString()
|
|
subjects := make([]string, 0)
|
|
|
|
// Get all subjects
|
|
values, err := kv.MGet(ctx, lo.Map(AllMajors, func(major string, _ int) string {
|
|
return fmt.Sprintf("scraped:%s:%s", major, term)
|
|
})...).Result()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get all subjects: %w", err)
|
|
}
|
|
|
|
// Extract expired subjects
|
|
for i, value := range values {
|
|
subject := AllMajors[i]
|
|
|
|
// If the value is nil or "0", then the subject is expired
|
|
if value == nil || value == "0" {
|
|
subjects = append(subjects, subject)
|
|
}
|
|
}
|
|
|
|
log.Debug().Strs("majors", subjects).Msg("Expired Subjects")
|
|
|
|
return subjects, nil
|
|
}
|
|
|
|
// ScrapeMajor is the scraping invocation for a specific major.
// This function does not check whether scraping is required at this time, it is assumed that the caller has already done so.
//
// It pages through Banner search results for the subject, stores every returned
// course in Redis via IntakeCourse, then writes a "scraped:<subject>:<term>"
// marker whose TTL determines when the subject next shows up in GetExpiredSubjects.
func ScrapeMajor(subject string) error {
	offset := 0
	totalClassCount := 0

	for {
		// Build & execute the query.
		// NOTE(review): MaxResults is 2*MaxPageSize while offset advances by MaxPageSize
		// below, so a page larger than MaxPageSize is partially re-fetched — confirm this
		// overlap is intentional (IntakeCourse writes are keyed by CRN, so it is harmless).
		query := NewQuery().Offset(offset).MaxResults(MaxPageSize * 2).Subject(subject)
		result, err := Search(query, "subjectDescription", false)
		if err != nil {
			return fmt.Errorf("search failed: %w (%s)", err, query.String())
		}

		// The API only flags success/failure; it provides no 'reason' field, so the
		// query string is included for diagnosis.
		if !result.Success {
			return fmt.Errorf("result marked unsuccessful when searching for classes (%s)", query.String())
		}

		classCount := len(result.Data)
		totalClassCount += classCount
		log.Debug().Str("subject", subject).Int("count", classCount).Int("offset", offset).Msg("Placing classes in Redis")

		// Process each class and store it in Redis.
		for _, course := range result.Data {
			// A single storage failure is logged but does not abort the scrape.
			err := IntakeCourse(course)
			if err != nil {
				log.Error().Err(err).Msg("failed to store class in Redis")
			}
		}

		// Increment and continue if the results are full (a full page implies more pages).
		if classCount >= MaxPageSize {
			// This is unlikely to happen, but log it just in case
			if classCount > MaxPageSize {
				log.Warn().Int("page", offset).Int("count", classCount).Msg("Results exceed MaxPageSize")
			}

			offset += MaxPageSize

			// TODO: Replace sleep with smarter rate limiting
			log.Debug().Str("subject", subject).Int("nextOffset", offset).Msg("Sleeping before next page")
			time.Sleep(time.Second * 3)
			continue
		} else {
			// Log the number of classes scraped
			log.Info().Str("subject", subject).Int("total", totalClassCount).Msgf("Subject %s Scraped", subject)
			break
		}
	}

	term := Default(time.Now()).ToString()

	// Determine how long until this subject should be scraped again (see
	// CalculateExpiry); empty scrapes simply back off for half a day.
	var scrapeExpiry time.Duration
	if totalClassCount == 0 {
		scrapeExpiry = time.Hour * 12
	} else {
		scrapeExpiry = CalculateExpiry(term, totalClassCount, lo.Contains(PriorityMajors, subject))
	}

	// Mark the major as scraped. A stored value of "0" would read as expired in
	// GetExpiredSubjects, so an empty scrape is recorded as -1 instead.
	if totalClassCount == 0 {
		totalClassCount = -1
	}
	err := kv.Set(ctx, fmt.Sprintf("scraped:%s:%s", subject, term), totalClassCount, scrapeExpiry).Err()
	if err != nil {
		log.Error().Err(err).Msg("failed to mark major as scraped")
	}

	return nil
}
|
|
|
|
// CalculateExpiry calculates the expiry time until the next scrape for a major.
|
|
// term is the term for which the relevant course is occurring within.
|
|
// count is the number of courses that were scraped.
|
|
// priority is a boolean indicating whether the major is a priority major.
|
|
func CalculateExpiry(term string, count int, priority bool) time.Duration {
|
|
// An hour for every 100 classes
|
|
baseExpiry := time.Hour * time.Duration(count/100)
|
|
|
|
// Subjects with less than 50 classes have a reversed expiry (less classes, longer interval)
|
|
// 1 class => 12 hours, 49 classes => 1 hour
|
|
if count < 50 {
|
|
hours := Slope(Point{1, 12}, Point{49, 1}, float64(count)).Y
|
|
baseExpiry = time.Duration(hours * float64(time.Hour))
|
|
}
|
|
|
|
// If the subject is a priority, then the expiry is halved without variance
|
|
if priority {
|
|
return baseExpiry / 3
|
|
}
|
|
|
|
// If the term is considered "view only" or "archived", then the expiry is multiplied by 5
|
|
var expiry = baseExpiry
|
|
if IsTermArchived(term) {
|
|
expiry *= 5
|
|
}
|
|
|
|
// Add minor variance to the expiry
|
|
expiryVariance := baseExpiry.Seconds() * (rand.Float64() * 0.15) // Between 0 and 15% of the total
|
|
if rand.Intn(2) == 0 {
|
|
expiry -= time.Duration(expiryVariance) * time.Second
|
|
} else {
|
|
expiry += time.Duration(expiryVariance) * time.Second
|
|
}
|
|
|
|
// Ensure the expiry is at least 1 hour with up to 15 extra minutes
|
|
if expiry < time.Hour {
|
|
baseExpiry = time.Hour + time.Duration(rand.Intn(60*15))*time.Second
|
|
}
|
|
|
|
return baseExpiry
|
|
}
|
|
|
|
// IntakeCourse stores a course in Redis.
|
|
// This function is mostly a stub for now, but will be used to handle change identification, notifications, and SQLite upserts in the future.
|
|
func IntakeCourse(course Course) error {
|
|
err := kv.Set(ctx, fmt.Sprintf("class:%s", course.CourseReferenceNumber), course, 0).Err()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to store class in Redis: %w", err)
|
|
}
|
|
return nil
|
|
}
|