diff --git a/README.md b/README.md
index 8cb5870..76d848a 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,102 @@
 # molecule-ai-uptime-probe
-Gitea-native uptime probe — reads .upptimerc.yml-compatible config, emits JSON results. Replaces upptime/uptime-monitor (api.github.com-coupled, dead post-2026-05-06). Closes molecule-ai-status#2.
\ No newline at end of file
+Gitea-native uptime monitor for Molecules AI services. Replaces `upptime/uptime-monitor`, which died post-2026-05-06 because every code path hits `api.github.com` and our org tokens no longer authenticate there.
+
+## Why
+
+`upptime/uptime-monitor` is structurally GitHub-coupled:
+
+1. Calls `api.github.com/repos/upptime/uptime-monitor/releases` to look up its own version
+2. Posts probe results as commits to the host repo via the GitHub API
+3. Manages incidents as GitHub Issues
+4. Generates a static site assuming GitHub Pages
+
+After the 2026-05-06 GitHub org suspension, none of those calls authenticate. Per `feedback_no_single_source_of_truth`: the replacement is vendor-neutral by design and runs on our own infra.
+
+## Design
+
+```
+                   ┌───────────────────────────────────────────────┐
+                   │                  This binary                  │
+.upptimerc.yml ──▶ │ 1. parse config (upptime-compatible)          │ ──▶ stdout: JSON
+                   │ 2. probe each URL in parallel                 │
+                   │ 3. emit Result{timestamp,name,url,latency,    │ ──▶ history/<slug>.jsonl
+                   │    status_code,success,error}                 │     (one line per check)
+                   └───────────────────────────────────────────────┘
+                        ▲                               ▲
+                        │                               │
+   Gitea Actions cron (every 5m)               molecule-ai-status repo's
+   runs `uptime-probe -config .upptimerc.yml`  history/ directory; commits
+                                               appended on each cron run
+                                                        │
+                                                        ▼
+                                           Vercel-deployed status page
+                                           @ status.moleculesai.app
+                                           reads history/ JSONL files
+```
+
+Three small pieces, each with a single concern:
+1. **This binary** — read config, probe, emit results. No commit logic, no rendering, no alerting.
+2. **Gitea Actions cron** (lives in the `molecule-ai-status` repo) — schedule + commit + Vercel rebuild trigger.
+3. **Status page** (a Next.js app on Vercel) — reads JSONL, renders charts.
+
+Loose coupling means each piece can be replaced without touching the others. Probe binary becomes a Vercel cron? No commit history? Different SCM? — only the orchestration changes.
+
+## Usage
+
+```bash
+# Build
+go build -o uptime-probe ./cmd/probe
+
+# Run with default config
+./uptime-probe -config .upptimerc.yml
+
+# Run + append per-site history files
+./uptime-probe -config .upptimerc.yml -history-dir ./history
+
+# Custom probe timeout / concurrency
+./uptime-probe -config .upptimerc.yml -timeout 10s -concurrency 16
+```
+
+Exit codes:
+- `0` — every probe succeeded
+- `1` — one or more sites failed (status-code mismatch, latency cap exceeded, or connection failure)
+- `2` — config error or unrecoverable I/O
+
+## Config compatibility
+
+The probe consumes the existing `.upptimerc.yml` shape, so no migration is needed:
+
+```yaml
+sites:
+  - name: Customer app
+    url: https://app.moleculesai.app
+    # optional fields:
+    expectedStatusCodes: [200, 201]  # default: 200..208, 226
+    method: GET                      # default: GET
+    maxResponseTime: 3000            # ms, default: no cap
+    headers:
+      - "Origin: https://moleculesai.app"
+```
+
+Top-level upptime keys (`owner`, `repo`, `status-website`, `theme`, etc.) are ignored and stay benign — the probe doesn't care.
+
+## What this binary deliberately doesn't do
+
+- Talk to `api.github.com`. That's the whole point.
+- Manage issues / commits / status badges. Those are out-of-band orchestration concerns.
+- Render the status page. The Vercel-deployed Next.js app does that.
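+
+## Output
+
+stdout gets a single pretty-printed JSON array, one element per site, sorted by site name so successive runs diff cleanly. Each element is one `Result`; the struct below is copied from `cmd/probe/main.go` for reference:
+
+```go
+// Result is one probe's outcome (the JSON tags are the wire format).
+type Result struct {
+	Timestamp  string `json:"timestamp"` // RFC3339, UTC
+	Name       string `json:"name"`
+	URL        string `json:"url"`
+	Method     string `json:"method"`
+	StatusCode int    `json:"status_code"` // 0 on connection failure
+	LatencyMs  int64  `json:"latency_ms"`
+	Success    bool   `json:"success"`
+	Error      string `json:"error,omitempty"` // only on non-success
+}
+```
+
+With `-history-dir` set, the same records are also appended to `history/<slug>.jsonl`, one JSON object per line.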
+
+## Install via vanity path
+
+```bash
+go install go.moleculesai.app/uptime-probe/cmd/probe@latest
+```
+
+Resolves via the `go.moleculesai.app` vanity responder (issue molecule-ai/internal#71) → this Gitea repo.
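+
+## Reading history files
+
+Each line of a `history/<slug>.jsonl` file is one `Result` object, so consumers (the status page, ad-hoc tooling) can stream-parse it line by line. A minimal Go sketch of such a reader; the path is illustrative, and the struct declares only the fields it needs, since `json.Unmarshal` ignores unknown fields:
+
+```go
+package main
+
+import (
+	"bufio"
+	"encoding/json"
+	"fmt"
+	"os"
+)
+
+// Check is the subset of the probe's Result this reader cares about.
+type Check struct {
+	Timestamp string `json:"timestamp"`
+	LatencyMs int64  `json:"latency_ms"`
+	Success   bool   `json:"success"`
+}
+
+func main() {
+	f, err := os.Open("history/customer-app.jsonl") // illustrative path
+	if err != nil {
+		panic(err)
+	}
+	defer f.Close()
+
+	var up, total int
+	sc := bufio.NewScanner(f)
+	for sc.Scan() {
+		var c Check
+		if err := json.Unmarshal(sc.Bytes(), &c); err != nil {
+			continue // tolerate a torn tail line from a partial write
+		}
+		total++
+		if c.Success {
+			up++
+		}
+	}
+	fmt.Printf("up %d of %d checks\n", up, total)
+}
+```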
+
+## Tracking
+
+- Replacement plan: `molecule-ai/molecule-ai-status#2`
+- Vanity import migration: `molecule-ai/internal#71`
+- License: same as parent (TBD by org default)
diff --git a/cmd/probe/main.go b/cmd/probe/main.go
new file mode 100644
index 0000000..a628d72
--- /dev/null
+++ b/cmd/probe/main.go
@@ -0,0 +1,317 @@
+// molecule-ai-uptime-probe — Gitea-native uptime monitor.
+//
+// Replaces upptime/uptime-monitor, which died post-2026-05-06 because
+// every code path hits api.github.com (releases lookup, issue
+// management, result commits) and our org tokens stopped
+// authenticating there. See molecule-ai/molecule-ai-status#2 for the
+// full diagnosis.
+//
+// What this probe does
+// ────────────────────
+// 1. Read .upptimerc.yml (compatible with the existing config — a
+//    `sites:` list of {name, url} plus optional expectedStatusCodes,
+//    method, maxResponseTime, headers).
+// 2. For each site, send an HTTP request (GET by default) with a
+//    bounded timeout.
+// 3. Record (timestamp, name, url, status_code, latency_ms,
+//    success, error_msg) for each.
+// 4. Emit results as JSON to stdout; with -history-dir set, also
+//    append each result to a per-site history file there.
+//
+// What this probe deliberately doesn't do
+// ───────────────────────────────────────
+// - Talk to api.github.com. The whole point.
+// - Manage issues / commits / status badges. Those concerns live in
+//   orchestration code (Gitea Actions cron) that runs *this* binary.
+//   One concern per tool.
+// - Render a status page. Static-site rendering is the Vercel
+//   deployment's job; this binary just produces JSON the page reads.
+//
+// Vanity import path
+// ──────────────────
+// `go.moleculesai.app/uptime-probe` from day 1 — no migration cost
+// later. Internal#71 set the precedent.
+//
+// Usage
+// ─────
+//
+//	uptime-probe -config .upptimerc.yml                  # JSON to stdout
+//	uptime-probe -config .upptimerc.yml -history-dir ./h # also append history
+//
+// Exit codes:
+//
+//	0  all probes succeeded
+//	1  one or more sites returned a non-success result
+//	2  config error / unrecoverable I/O

+package main
+
+import (
+	"context"
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+
+	"gopkg.in/yaml.v3"
+)
+
+// Config mirrors the subset of .upptimerc.yml this probe consumes.
+// Other top-level keys (owner, repo, status-website, theme, …) are
+// upptime-specific and ignored — the probe is config-tolerant so the
+// existing .upptimerc.yml drops in without changes.
+type Config struct {
+	Sites []Site `yaml:"sites"`
+}
+
+// Site describes a single endpoint to probe.
+type Site struct {
+	Name string `yaml:"name"`
+	URL  string `yaml:"url"`
+
+	// ExpectedStatusCodes — when set, ONLY these codes count as success.
+	// When unset, we accept the upptime-default 2xx + selected WebDAV
+	// codes (200..208, 226), which matches the upstream behaviour of
+	// `expectedStatusCodes` in upptime so existing configs migrate
+	// without semantic drift.
+	ExpectedStatusCodes []int `yaml:"expectedStatusCodes,omitempty"`
+
+	// Method defaults to GET. POST/HEAD/etc. work for non-trivial
+	// health-check endpoints. (upptime-compat: same key name.)
+	Method string `yaml:"method,omitempty"`
+
+	// MaxResponseTime — in ms. Above this, we record the probe as
+	// success=false even if the status code matches. (upptime-compat.)
+	MaxResponseTime int `yaml:"maxResponseTime,omitempty"`
+
+	// Headers — sent on the request. Useful for Origin / X-Auth probes.
+	// (upptime-compat: a list of "Key: Value" strings.)
+	Headers []string `yaml:"headers,omitempty"`
+}
+
+// defaultExpectedStatusCodes is the upptime upstream default. Mirrors
+// the set documented at https://upptime.js.org/docs/configuration#sites
+// (200..208, 226) — covers the WebDAV/HTTP-extension codes some health
+// endpoints legitimately return.
+var defaultExpectedStatusCodes = []int{200, 201, 202, 203, 204, 205, 206, 207, 208, 226}
+
+// Result is one probe's outcome.
+type Result struct {
+	Timestamp  string `json:"timestamp"` // RFC3339, UTC
+	Name       string `json:"name"`
+	URL        string `json:"url"`
+	Method     string `json:"method"`
+	StatusCode int    `json:"status_code"` // 0 on connection failure
+	LatencyMs  int64  `json:"latency_ms"`
+	Success    bool   `json:"success"`
+	Error      string `json:"error,omitempty"` // populated only on non-success
+}
+
+// probe runs a single site and returns a Result. It never returns an
+// error — every failure mode is captured in Result.Success +
+// Result.Error so the caller can assemble a complete report even when
+// some sites are down.
+func probe(ctx context.Context, client *http.Client, s Site) Result {
+	method := s.Method
+	if method == "" {
+		method = http.MethodGet
+	}
+	expected := s.ExpectedStatusCodes
+	if len(expected) == 0 {
+		expected = defaultExpectedStatusCodes
+	}
+
+	r := Result{
+		Timestamp: time.Now().UTC().Format(time.RFC3339),
+		Name:      s.Name,
+		URL:       s.URL,
+		Method:    method,
+	}
+
+	req, err := http.NewRequestWithContext(ctx, method, s.URL, nil)
+	if err != nil {
+		r.Error = "build request: " + err.Error()
+		return r
+	}
+	for _, h := range s.Headers {
+		k, v, ok := strings.Cut(h, ":")
+		if !ok {
+			continue
+		}
+		req.Header.Set(strings.TrimSpace(k), strings.TrimSpace(v))
+	}
+	// The User-Agent identifies the prober for log filtering on the
+	// origin side. It includes the repo link so anyone seeing this UA
+	// knows what's hitting them and why.
+	req.Header.Set("User-Agent", "molecule-ai-uptime-probe/1 (+https://git.moleculesai.app/molecule-ai/molecule-ai-uptime-probe)")
+
+	start := time.Now()
+	resp, err := client.Do(req)
+	r.LatencyMs = time.Since(start).Milliseconds()
+	if err != nil {
+		r.Error = "request: " + err.Error()
+		return r
+	}
+	defer resp.Body.Close()
+	// Drain the body so the connection can be reused, capping at 1 MiB
+	// — we don't care about the content, just the response code, and
+	// not capping invites OOM if some endpoint streams a huge file.
+	_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 1<<20))
+
+	r.StatusCode = resp.StatusCode
+
+	// Status-code allowlist check.
+	codeOK := false
+	for _, c := range expected {
+		if resp.StatusCode == c {
+			codeOK = true
+			break
+		}
+	}
+	if !codeOK {
+		r.Error = fmt.Sprintf("status %d not in expected %v", resp.StatusCode, expected)
+		return r
+	}
+
+	// Latency cap check.
+	if s.MaxResponseTime > 0 && r.LatencyMs > int64(s.MaxResponseTime) {
+		r.Error = fmt.Sprintf("latency %dms exceeded max %dms", r.LatencyMs, s.MaxResponseTime)
+		return r
+	}
+
+	r.Success = true
+	return r
+}
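+
+// Worked example (illustrative values): a site configured as
+//
+//	Site{Name: "Customer app", URL: "https://app.moleculesai.app", MaxResponseTime: 3000}
+//
+// that answers 200 OK after 4200ms yields
+//
+//	Result{StatusCode: 200, LatencyMs: 4200, Success: false,
+//	       Error: "latency 4200ms exceeded max 3000ms"}
+//
+// because 200 is in the default allowlist but the latency cap trips.
+// A refused connection instead yields StatusCode 0 and an Error of
+// "request: ..." carrying whatever error the transport reported.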
+
+func loadConfig(path string) (Config, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return Config{}, fmt.Errorf("read %s: %w", path, err)
+	}
+	var c Config
+	if err := yaml.Unmarshal(data, &c); err != nil {
+		return Config{}, fmt.Errorf("parse %s: %w", path, err)
+	}
+	return c, nil
+}
+
+// appendHistory writes one Result line to <history-dir>/<slug>.jsonl,
+// where <slug> is a filesystem-safe rendering of the site name.
+// JSONL (one Result per line) is the cheapest append-friendly format
+// for time-series data — concatenable, partial-write-tolerant, no
+// rewrite-the-whole-file cost as the file grows.
+func appendHistory(historyDir string, r Result) error {
+	if err := os.MkdirAll(historyDir, 0o755); err != nil {
+		return err
+	}
+	slug := slugify(r.Name)
+	path := filepath.Join(historyDir, slug+".jsonl")
+	f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+	enc := json.NewEncoder(f)
+	enc.SetEscapeHTML(false)
+	return enc.Encode(r)
+}
+
+// slugify lowercases the name, keeps [a-z0-9], collapses every other
+// run of characters into a single dash, and trims dashes at the ends.
+func slugify(s string) string {
+	out := make([]rune, 0, len(s))
+	last := '-'
+	for _, c := range strings.ToLower(s) {
+		switch {
+		case c >= 'a' && c <= 'z', c >= '0' && c <= '9':
+			out = append(out, c)
+			last = c
+		default:
+			if last != '-' {
+				out = append(out, '-')
+				last = '-'
+			}
+		}
+	}
+	return strings.Trim(string(out), "-")
+}
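+
+// Illustrative slugify outputs:
+//
+//	slugify("Customer app") == "customer-app"
+//	slugify("API / v2")     == "api-v2"   (runs of other characters collapse to one dash)
+//	slugify("  Staging  ")  == "staging"  (dashes at the ends are trimmed)
+//
+// so checks for "Customer app" land in history/customer-app.jsonl as
+// lines like (fields elided):
+//
+//	{"timestamp":"2026-05-07T12:00:00Z","name":"Customer app",...,"success":true}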
+
+func main() {
+	configPath := flag.String("config", ".upptimerc.yml", "path to config file (upptime-compatible)")
+	historyDir := flag.String("history-dir", "", "if set, append per-site JSONL files here in addition to stdout output")
+	timeout := flag.Duration("timeout", 30*time.Second, "per-probe HTTP timeout")
+	concurrency := flag.Int("concurrency", 8, "max parallel probes")
+	flag.Parse()
+
+	cfg, err := loadConfig(*configPath)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "config: %v\n", err)
+		os.Exit(2)
+	}
+	if len(cfg.Sites) == 0 {
+		fmt.Fprintf(os.Stderr, "config: no sites defined\n")
+		os.Exit(2)
+	}
+
+	client := &http.Client{Timeout: *timeout}
+
+	// Run probes in parallel up to -concurrency. A bounded-channel
+	// semaphore is enough; we don't need a worker pool for tens of
+	// sites.
+	results := make([]Result, len(cfg.Sites))
+	sem := make(chan struct{}, *concurrency)
+	done := make(chan int, len(cfg.Sites))
+	ctx := context.Background()
+
+	for i, site := range cfg.Sites {
+		sem <- struct{}{}
+		go func(i int, s Site) {
+			defer func() {
+				<-sem
+				done <- i
+			}()
+			results[i] = probe(ctx, client, s)
+		}(i, site)
+	}
+	for range cfg.Sites {
+		<-done
+	}
+
+	// Stable order: sort by site name so the JSON output is
+	// deterministic across runs (easier to diff in PRs).
+	sort.SliceStable(results, func(i, j int) bool {
+		return results[i].Name < results[j].Name
+	})
+
+	// Emit results to stdout as a single JSON array. JSONL on stdout
+	// would be marginally more streaming-friendly, but the array form
+	// is what the status-page consumer wants.
+	enc := json.NewEncoder(os.Stdout)
+	enc.SetIndent("", "  ")
+	enc.SetEscapeHTML(false)
+	if err := enc.Encode(results); err != nil {
+		fmt.Fprintf(os.Stderr, "encode: %v\n", err)
+		os.Exit(2)
+	}
+
+	// Append to per-site JSONL history if requested.
+	if *historyDir != "" {
+		for _, r := range results {
+			if err := appendHistory(*historyDir, r); err != nil {
+				fmt.Fprintf(os.Stderr, "history append: %v\n", err)
+				// Not fatal — the result is already on stdout.
+			}
+		}
+	}
+
+	// The exit code reflects aggregate success. Useful for the Gitea
+	// Actions cron: a non-zero exit can be wired to alerting if/when
+	// alert routing is added later.
+	for _, r := range results {
+		if !r.Success {
+			os.Exit(1)
+		}
+	}
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..8a40972
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,5 @@
+module go.moleculesai.app/uptime-probe
+
+go 1.23.4
+
+require gopkg.in/yaml.v3 v3.0.1
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..a62c313
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,4 @@
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=