Files
uptime/internal/scheduler/scheduler.go
2026-03-22 11:30:31 -05:00

199 lines
4.9 KiB
Go

package scheduler
import (
"context"
"log/slog"
"slices"
"sync"
"time"
"arclineit/arcline-uptime/internal/alert"
"arclineit/arcline-uptime/internal/config"
"arclineit/arcline-uptime/internal/monitor"
"arclineit/arcline-uptime/internal/store"
)
// Scheduler runs each monitor on its configured interval and handles alerting.
//
// Start launches the background goroutines and Wait blocks until they have
// all returned; the goroutines stop when the context given to Start is done.
type Scheduler struct {
// checkers are the monitors to probe; Start launches one goroutine per entry.
checkers []monitor.Checker
// alerters are the named alert destinations that sendRouted iterates over.
alerters []alert.NamedAlerter
// monitorCfgs holds per-monitor configuration keyed by monitor name; probe
// looks entries up by the checker's Name().
monitorCfgs map[string]config.MonitorConfig
// store is used to save check results, to read and record alert
// timestamps, and to prune old records.
store *store.Store
// cfg carries the global settings read here: CheckInterval, AlertCooldown
// (both converted from seconds) and RetentionDays.
cfg config.GlobalConfig
// wg tracks every goroutine launched by Start so that Wait can block on them.
wg sync.WaitGroup
}
// New builds a Scheduler for the given checkers and alerters.
//
// monitorCfgs is indexed by each entry's Name so per-monitor settings can be
// looked up at probe time; if two entries share a name, the later one wins.
// No goroutines are started until Start is called.
func New(
	checkers []monitor.Checker,
	alerters []alert.NamedAlerter,
	monitorCfgs []config.MonitorConfig,
	s *store.Store,
	cfg config.GlobalConfig,
) *Scheduler {
	sched := &Scheduler{
		checkers:    checkers,
		alerters:    alerters,
		monitorCfgs: make(map[string]config.MonitorConfig, len(monitorCfgs)),
		store:       s,
		cfg:         cfg,
	}
	for i := range monitorCfgs {
		sched.monitorCfgs[monitorCfgs[i].Name] = monitorCfgs[i]
	}
	return sched
}
// Start spawns the scheduler's background goroutines and returns immediately.
//
// One goroutine is started per checker; each probes right away and then again
// on every tick of its interval. When cfg.RetentionDays is positive an extra
// goroutine is started to prune old records. Every goroutine is registered
// with the wait group, so Wait can be used to block until they have exited,
// which happens once ctx is done.
func (s *Scheduler) Start(ctx context.Context) {
	// launch registers the goroutine with the wait group before starting it.
	launch := func(run func()) {
		s.wg.Add(1)
		go func() {
			defer s.wg.Done()
			run()
		}()
	}

	for i := range s.checkers {
		// Declared inside the loop body so each closure captures its own
		// checker regardless of the Go version's loop-variable semantics.
		checker := s.checkers[i]
		launch(func() { s.runChecker(ctx, checker) })
	}

	if s.cfg.RetentionDays > 0 {
		launch(func() { s.runPruner(ctx) })
	}
}
// Wait blocks until every goroutine launched by Start has returned.
func (s *Scheduler) Wait() {
	s.wg.Wait()
}
// runChecker probes c immediately and then once per interval until ctx is done.
//
// The interval is cfg.CheckInterval (in seconds) unless the checker reports a
// positive Interval of its own, which takes precedence. If neither source
// yields a positive duration, a warning is logged and a one-minute fallback is
// used, because time.NewTicker panics on a non-positive duration and that
// panic would take the whole process down.
func (s *Scheduler) runChecker(ctx context.Context, c monitor.Checker) {
	// fallbackInterval is only used when the configured interval is unusable.
	const fallbackInterval = time.Minute

	interval := time.Duration(s.cfg.CheckInterval) * time.Second
	if ci := c.Interval(); ci > 0 {
		interval = ci
	}
	if interval <= 0 {
		slog.Warn("invalid check interval, using fallback",
			"monitor", c.Name(), "interval", interval, "fallback", fallbackInterval)
		interval = fallbackInterval
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		// select chooses at random when both a tick and cancellation are
		// ready, so check cancellation explicitly to avoid probing after
		// shutdown has been requested.
		if ctx.Err() != nil {
			return
		}
		s.probe(c)
		select {
		case <-ticker.C:
		case <-ctx.Done():
			return
		}
	}
}
// runPruner prunes once right away and then once every 24 hours, returning
// when ctx is done.
func (s *Scheduler) runPruner(ctx context.Context) {
	const pruneEvery = 24 * time.Hour

	s.prune()

	// The ticker is created after the initial prune, so the first tick is
	// measured from the end of that prune.
	tick := time.NewTicker(pruneEvery)
	defer tick.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-tick.C:
		}
		s.prune()
	}
}
// prune asks the store to prune using cfg.RetentionDays and logs the outcome:
// an error on failure, an info line when at least one record was deleted, and
// nothing when there was nothing to delete.
func (s *Scheduler) prune() {
	deleted, err := s.store.Prune(s.cfg.RetentionDays)
	switch {
	case err != nil:
		slog.Error("prune failed", "error", err)
	case deleted > 0:
		slog.Info("pruned old records", "deleted", deleted, "retention_days", s.cfg.RetentionDays)
	}
}
// probe runs a single check for c: it saves the result, evaluates alerting,
// and logs the outcome.
//
// The probe is skipped entirely (no check, no stored result, no alerts) while
// any of the monitor's maintenance windows is active. A failure to save the
// result is logged but does not prevent alert handling.
func (s *Scheduler) probe(c monitor.Checker) {
	// A checker with no matching entry gets the zero-value config.
	mcfg := s.monitorCfgs[c.Name()]
	if inMaintenanceWindow(mcfg.Maintenance) {
		slog.Debug("skipping probe: maintenance window", "monitor", c.Name())
		return
	}

	res := c.Check()
	if saveErr := s.store.SaveResult(res); saveErr != nil {
		slog.Error("save result", "monitor", res.MonitorName, "error", saveErr)
	}
	s.handleAlerts(res, mcfg)

	if !res.Up {
		slog.Warn("check failed", "monitor", res.MonitorName, "error", res.Error)
		return
	}
	slog.Info("check ok", "monitor", res.MonitorName, "ms", res.ResponseTime.Milliseconds())
}
// handleAlerts decides whether the check result r should trigger a "down" or
// an "up" (recovery) alert and, if so, sends it via sendRouted.
//
// While the monitor is down, a "down" alert goes out when none has been
// recorded yet or when at least cfg.AlertCooldown seconds have passed since
// the last one. While the monitor is up, a recovery alert goes out only if a
// "down" alert has been recorded and the last "up" alert is older than it (or
// absent). Store lookup errors are logged and abort the decision for this
// result.
func (s *Scheduler) handleAlerts(r monitor.Result, mcfg config.MonitorConfig) {
	// Both the down and the up path start from the time of the last "down" alert.
	downAt, err := s.store.LastAlertSent(r.MonitorName, "down")
	if err != nil {
		slog.Error("query last-down alert", "monitor", r.MonitorName, "error", err)
		return
	}

	if !r.Up {
		cooldown := time.Duration(s.cfg.AlertCooldown) * time.Second
		if downAt.IsZero() || time.Since(downAt) >= cooldown {
			subject, body := alert.FormatDownMessage(r)
			s.sendRouted(mcfg, "down", subject, body)
		}
		return
	}

	// Monitor is up: with no recorded "down" alert there is nothing to recover from.
	if downAt.IsZero() {
		return
	}

	upAt, upErr := s.store.LastAlertSent(r.MonitorName, "up")
	if upErr != nil {
		slog.Error("query last-up alert", "monitor", r.MonitorName, "error", upErr)
		return
	}
	// A recovery alert is already on record for this outage.
	if !upAt.Before(downAt) {
		return
	}

	subject, body := alert.FormatUpMessage(r, downAt)
	s.sendRouted(mcfg, "up", subject, body)
	slog.Info("recovery alert sent", "monitor", r.MonitorName,
		"down_duration", alert.FormatDuration(time.Since(downAt)))
}
// sendRouted delivers subject/body to the alerters selected for the monitor
// and then records the alert under mcfg.Name and kind in the store.
//
// An alerter is selected when mcfg.AlertNames is empty (everything is routed)
// or when its name appears in mcfg.AlertNames. Send failures are logged per
// alerter and do not stop delivery to the remaining ones.
//
// NOTE(review): the alert is recorded even if every Send failed or no alerter
// matched — confirm this is intended, since the record is what the cooldown
// and recovery checks in handleAlerts read.
func (s *Scheduler) sendRouted(mcfg config.MonitorConfig, kind, subject, body string) {
	routeAll := len(mcfg.AlertNames) == 0

	for i := range s.alerters {
		target := s.alerters[i]
		if routeAll || slices.Contains(mcfg.AlertNames, target.Name) {
			if sendErr := target.Alerter.Send(subject, body); sendErr != nil {
				slog.Error("send alert", "kind", kind, "monitor", mcfg.Name, "alerter", target.Name, "error", sendErr)
			}
		}
	}

	if recErr := s.store.RecordAlertSent(mcfg.Name, kind); recErr != nil {
		slog.Error("record alert", "monitor", mcfg.Name, "error", recErr)
	}
}
// inMaintenanceWindow reports whether at least one of the given windows is
// active at the current time. An empty or nil list yields false. All windows
// are evaluated against the same instant, and evaluation stops at the first
// active window.
func inMaintenanceWindow(windows []config.MaintenanceWindow) bool {
	at := time.Now()
	active := false
	for i := 0; i < len(windows) && !active; i++ {
		active = windows[i].Active(at)
	}
	return active
}