diff --git a/scrape.go b/scrape.go index a2c962b..02a69a6 100644 --- a/scrape.go +++ b/scrape.go @@ -4,9 +4,15 @@ import ( "fmt" "time" + "github.com/redis/go-redis/v9" + "github.com/rs/zerolog/log" "github.com/samber/lo" ) +const ( + MaxPageSize = 500 +) + var ( // PriorityMajors is a list of majors that are considered to be high priority for scraping. This list is used to determine which majors to scrape first/most often. PriorityMajors = []string{"CS", "CPE", "MAT", "EE", "IS"} @@ -18,7 +24,8 @@ var ( func Scrape() error { // Populate AllMajors if it is empty if len(AncillaryMajors) == 0 { - subjects, err := GetSubjects("", Default(time.Now()).ToString(), 1, 99) + term := Default(time.Now()).ToString() + subjects, err := GetSubjects("", term, 1, 99) if err != nil { return fmt.Errorf("failed to get subjects: %w", err) } @@ -38,6 +45,10 @@ func Scrape() error { } for _, subject := range PriorityMajors { + if !CanScrape(subject) { + continue + } + err := ScrapeMajor(subject) if err != nil { return fmt.Errorf("failed to scrape priority major %s: %w", subject, err) @@ -45,6 +56,10 @@ func Scrape() error { } for _, subject := range AncillaryMajors { + if !CanScrape(subject) { + continue + } + err := ScrapeMajor(subject) if err != nil { return fmt.Errorf("failed to scrape ancillary major %s: %w", subject, err) @@ -53,3 +68,80 @@ func Scrape() error { return nil } + +// CanScrape returns true if scraping is suggested for a given major at this time. +func CanScrape(subject string) bool { + term := Default(time.Now()).ToString() + scraped, err := kv.Get(ctx, fmt.Sprintf("scraped:%s:%s", subject, term)).Result() + + if err != nil { + // If the key does not exist, then it was never scraped or the scrape needs to be redone (it expired) + if err == redis.Nil { + return true + } + + log.Error().Err(err).Msg("failed to check if scraping is required") + return false + } + + return scraped == "1" +} + +// ScrapeMajor is the scraping invocation for a specific major. +// This function does not check whether scraping is required at this time, it is assumed that the caller has already done so. +func ScrapeMajor(subject string) error { + offset := 1 + totalClassCount := 0 + + for { + query := NewQuery().Offset(offset).MaxResults(MaxPageSize) + result, err := Search(query, "", false) + if err != nil { + return fmt.Errorf("failed to search for classes on page %d: %w", offset, err) + } + + if !result.Success { + // TODO: Improve error log details + return fmt.Errorf("search for classes on page %d was not successful", offset) + } + + // Process each class and store it in Redis + for _, class := range result.Data { + // Store class in Redis + err := kv.Set(ctx, fmt.Sprintf("class:%s", class.CourseReferenceNumber), class, 0).Err() + if err != nil { + log.Error().Err(err).Msg("failed to store class in Redis") + } + } + + classCount := len(result.Data) + totalClassCount += classCount + + // Increment and continue if the results are full + if classCount >= MaxPageSize { + // This is unlikely to happen, but log it just in case + if classCount > MaxPageSize { + log.Warn().Int("page", offset).Int("count", classCount).Msg("Results exceed MaxPageSize") + } + + offset += MaxPageSize + + // TODO: Replace sleep with smarter rate limiting + time.Sleep(time.Second * 7) + continue + } else { + // Log the number of classes scraped + log.Info().Str("subject", subject).Int("count", totalClassCount).Int("offset", offset).Int("finalOffset", offset+classCount).Msg("Scraped classes") + break + } + } + + // Mark the major as scraped + term := Default(time.Now()).ToString() + err := kv.Set(ctx, fmt.Sprintf("scraped:%s:%s", subject, term), "1", 0).Err() + if err != nil { + log.Error().Err(err).Msg("failed to mark major as scraped") + } + + return nil +}