Gitea-native uptime probe. Reads .upptimerc.yml-compatible config,
emits per-site Result{timestamp, name, url, status_code, latency_ms,
success, error} as JSON. Optional -history-dir appends per-site JSONL
files for time-series.
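Example stdout entry (illustrative values; site name/URL are placeholders,
real output is an array of these):

    {
      "timestamp": "2026-05-07T12:00:00Z",
      "name": "Canvas",
      "url": "https://canvas.example.com",
      "method": "GET",
      "status_code": 200,
      "latency_ms": 212,
      "success": true
    }

A failing probe sets "success": false and adds an "error" string.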
Why: upptime is structurally GitHub-coupled (every code path hits
api.github.com — releases lookup + issue management + result commits).
Post the 2026-05-06 GitHub org suspension, no token in our org
authenticates there. Diagnosis + replacement options in
molecule-ai-status#2.
What this replaces vs deliberately leaves out:
- IN: probe loop, parallel HTTP, status + latency cap matching, JSONL
history append, JSON stdout output
- OUT: result commits (Gitea Actions cron orchestrates), issue
management (out of scope), status-page rendering (Vercel does that)
Module path: go.moleculesai.app/uptime-probe (vanity from day 1 — no
migration cost later; matches internal#71 pattern).
Smoke-tested against the existing .upptimerc.yml in molecule-ai-status:
all 7 production endpoints (canvas, docs, CP, landing) return 200 with
latencies 148-357ms. Probe classifies correctly.
Exit codes:
0 all probes succeeded
1 one or more sites returned a non-success (status / latency /
connection failure)
2 config error / unrecoverable I/O
// molecule-ai-uptime-probe — Gitea-native uptime monitor.
//
// Replaces upptime/uptime-monitor, which died post-2026-05-06 because
// every code path hits api.github.com (releases lookup, issue
// management, result commits) and our org tokens stopped
// authenticating there. See molecule-ai/molecule-ai-status#2 for the
// full diagnosis.
//
// What this probe does
// ────────────────────
//  1. Read .upptimerc.yml (compat with the existing config — a `sites:`
//     list of {name, url, expectedStatusCodes?}).
//  2. For each site, issue an HTTP request (GET unless `method` says
//     otherwise) with a per-probe timeout.
//  3. Record (timestamp, name, url, status_code, latency_ms, success,
//     error) for each.
//  4. Emit results as JSON to stdout and, when `-history-dir` is set,
//     also append each result to a per-site JSONL history file.
//
// What this probe deliberately doesn't do
// ───────────────────────────────────────
//   - Talk to api.github.com. The whole point.
//   - Manage issues / commits / status badges. Those concerns live in
//     orchestration code (Gitea Actions cron) that runs *this* binary.
//     One concern per tool.
//   - Render a status page. Static-site rendering is the Vercel
//     deployment's job; this binary just produces JSON the page reads.
//
// Vanity import path
// ──────────────────
// `go.moleculesai.app/uptime-probe` from day 1 — no migration cost
// later. Internal#71 set the precedent.
//
// Usage
// ─────
//
//	uptime-probe -config .upptimerc.yml                  # JSON to stdout
//	uptime-probe -config .upptimerc.yml -history-dir ./h # append to history
//
// Exit codes:
//
//	0  all probes succeeded
//	1  one or more probes failed (status, latency cap, or connection)
//	2  config error / unrecoverable I/O

package main

import (
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"gopkg.in/yaml.v3"
)

// Config mirrors the subset of .upptimerc.yml this probe consumes.
// Other top-level keys (owner, repo, status-website, theme, …) are
// upptime-specific and ignored — the probe is config-tolerant so the
// existing .upptimerc.yml drops in without changes.
type Config struct {
	Sites []Site `yaml:"sites"`
}

// Site describes a single endpoint to probe.
type Site struct {
	Name string `yaml:"name"`
	URL  string `yaml:"url"`

	// ExpectedStatusCodes — when set, ONLY these codes count as success.
	// When unset, we accept the upptime-default 2xx + selected WebDAV
	// codes (200..208, 226), which matches the upstream behaviour of
	// `expectedStatusCodes` in upptime so existing configs migrate
	// without semantic drift.
	ExpectedStatusCodes []int `yaml:"expectedStatusCodes,omitempty"`

	// Method defaults to GET. POST/HEAD/etc. work for non-trivial
	// health-check endpoints. (upptime-compat: same key name.)
	Method string `yaml:"method,omitempty"`

	// MaxResponseTime — in ms. Above this, we record the probe as
	// success=false even if the status code matches. (upptime-compat.)
	MaxResponseTime int `yaml:"maxResponseTime,omitempty"`

	// Headers — sent on the request. Useful for Origin / X-Auth probes.
	// (upptime-compat: a list of "Key: Value" strings.)
	Headers []string `yaml:"headers,omitempty"`
}
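
// An example `sites:` entry exercising these fields (illustrative, not
// taken from the real .upptimerc.yml; only name and url are required):
//
//	sites:
//	  - name: Canvas
//	    url: https://canvas.example.com/healthz
//	    method: GET
//	    expectedStatusCodes: [200, 204]
//	    maxResponseTime: 1000
//	    headers:
//	      - "Authorization: Bearer <token>"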

// defaultExpectedStatusCodes is the upptime upstream default. Mirrors
// the set documented at https://upptime.js.org/docs/configuration#sites
// (200..208, 226) — covers the WebDAV/HTTP-extension codes some health
// endpoints legitimately return.
var defaultExpectedStatusCodes = []int{200, 201, 202, 203, 204, 205, 206, 207, 208, 226}

// Result is one probe's outcome.
type Result struct {
	Timestamp  string `json:"timestamp"` // RFC3339, UTC
	Name       string `json:"name"`
	URL        string `json:"url"`
	Method     string `json:"method"`
	StatusCode int    `json:"status_code"` // 0 on connection failure
	LatencyMs  int64  `json:"latency_ms"`
	Success    bool   `json:"success"`
	Error      string `json:"error,omitempty"` // populated only on non-success
}

// probe runs a single site and returns a Result. Never returns an
// error — every failure mode is captured in Result.Success +
// Result.Error so the caller can assemble a complete report even when
// some sites are down.
func probe(ctx context.Context, client *http.Client, s Site) Result {
	method := s.Method
	if method == "" {
		method = http.MethodGet
	}
	expected := s.ExpectedStatusCodes
	if len(expected) == 0 {
		expected = defaultExpectedStatusCodes
	}

	r := Result{
		Timestamp: time.Now().UTC().Format(time.RFC3339),
		Name:      s.Name,
		URL:       s.URL,
		Method:    method,
	}

	req, err := http.NewRequestWithContext(ctx, method, s.URL, nil)
	if err != nil {
		r.Error = "build request: " + err.Error()
		return r
	}
	for _, h := range s.Headers {
		k, v, ok := strings.Cut(h, ":")
		if !ok {
			continue
		}
		req.Header.Set(strings.TrimSpace(k), strings.TrimSpace(v))
	}
	// User-agent identifies the prober for log filtering on the origin
	// side. Including the repo link so anyone seeing this UA knows
	// what's hitting them and why.
	req.Header.Set("User-Agent", "molecule-ai-uptime-probe/1 (+https://git.moleculesai.app/molecule-ai/molecule-ai-uptime-probe)")

	start := time.Now()
	resp, err := client.Do(req)
	r.LatencyMs = time.Since(start).Milliseconds()
	if err != nil {
		r.Error = "request: " + err.Error()
		return r
	}
	defer resp.Body.Close()
	// Drain the body so the connection can be reused. Capping at 1 MiB —
	// we don't care about the content, just the response code, and not
	// capping invites OOM if some endpoint streams a huge file.
	_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 1<<20))

	r.StatusCode = resp.StatusCode

	// Status code allowlist check.
	codeOK := false
	for _, c := range expected {
		if resp.StatusCode == c {
			codeOK = true
			break
		}
	}
	if !codeOK {
		r.Error = fmt.Sprintf("status %d not in expected %v", resp.StatusCode, expected)
		return r
	}

	// Latency cap check.
	if s.MaxResponseTime > 0 && r.LatencyMs > int64(s.MaxResponseTime) {
		r.Error = fmt.Sprintf("latency %dms exceeded max %dms", r.LatencyMs, s.MaxResponseTime)
		return r
	}

	r.Success = true
	return r
}
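
// A minimal sketch (not part of the original file) of exercising probe
// against a local httptest server; values and the test name are
// illustrative, and it belongs in a _test.go file that imports
// "net/http/httptest" and "testing":
//
//	func TestProbeSuccess(t *testing.T) {
//		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
//			w.WriteHeader(http.StatusOK)
//		}))
//		defer srv.Close()
//		client := &http.Client{Timeout: 5 * time.Second}
//		res := probe(context.Background(), client, Site{Name: "local", URL: srv.URL})
//		if !res.Success {
//			t.Fatalf("expected success, got %q (status %d)", res.Error, res.StatusCode)
//		}
//	}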

// loadConfig reads and parses the YAML config at path.
func loadConfig(path string) (Config, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return Config{}, fmt.Errorf("read %s: %w", path, err)
	}
	var c Config
	if err := yaml.Unmarshal(data, &c); err != nil {
		return Config{}, fmt.Errorf("parse %s: %w", path, err)
	}
	return c, nil
}

// appendHistory writes one Result line to <history-dir>/<slug>.jsonl
// where <slug> is a filesystem-safe rendering of the site name.
// JSONL (one Result per line) is the cheapest append-friendly format
// for time-series data — concatenable, partial-write-tolerant, no
// rewrite-the-whole-file cost as the file grows.
func appendHistory(historyDir string, r Result) error {
	if err := os.MkdirAll(historyDir, 0o755); err != nil {
		return err
	}
	slug := slugify(r.Name)
	path := filepath.Join(historyDir, slug+".jsonl")
	f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
	if err != nil {
		return err
	}
	defer f.Close()
	enc := json.NewEncoder(f)
	enc.SetEscapeHTML(false)
	return enc.Encode(r)
}
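
// Illustrative: with -history-dir history, a site named "Canvas App"
// appends one compact line per run to history/canvas-app.jsonl, e.g.
//
//	{"timestamp":"2026-05-07T12:00:00Z","name":"Canvas App","url":"https://canvas.example.com","method":"GET","status_code":200,"latency_ms":212,"success":true}
//
// (Values are placeholders; field order follows the Result struct.)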

// slugify lowercases the name, keeps [a-z0-9], collapses every other
// run of characters into a single '-', and trims leading/trailing '-'.
func slugify(s string) string {
	out := make([]rune, 0, len(s))
	last := '-'
	for _, c := range strings.ToLower(s) {
		switch {
		case c >= 'a' && c <= 'z', c >= '0' && c <= '9':
			out = append(out, c)
			last = c
		default:
			if last != '-' {
				out = append(out, '-')
				last = '-'
			}
		}
	}
	return strings.Trim(string(out), "-")
}

func main() {
	configPath := flag.String("config", ".upptimerc.yml", "path to config file (upptime-compatible)")
	historyDir := flag.String("history-dir", "", "if set, append per-site JSONL files here in addition to stdout output")
	timeout := flag.Duration("timeout", 30*time.Second, "per-probe HTTP timeout")
	concurrency := flag.Int("concurrency", 8, "max parallel probes")
	flag.Parse()

	cfg, err := loadConfig(*configPath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "config: %v\n", err)
		os.Exit(2)
	}
	if len(cfg.Sites) == 0 {
		fmt.Fprintf(os.Stderr, "config: no sites defined\n")
		os.Exit(2)
	}

	client := &http.Client{Timeout: *timeout}

	// Run probes in parallel up to -concurrency. A bounded-channel
	// semaphore is enough; we don't need a worker pool for tens of
	// sites.
	results := make([]Result, len(cfg.Sites))
	sem := make(chan struct{}, *concurrency)
	done := make(chan int, len(cfg.Sites))
	ctx := context.Background()

	for i, site := range cfg.Sites {
		sem <- struct{}{}
		go func(i int, s Site) {
			defer func() {
				<-sem
				done <- i
			}()
			results[i] = probe(ctx, client, s)
		}(i, site)
	}
	for range cfg.Sites {
		<-done
	}

	// Stable order: sort by site name so the JSON output is
	// deterministic across runs (easier to diff in PRs).
	sort.SliceStable(results, func(i, j int) bool {
		return results[i].Name < results[j].Name
	})

	// Emit results to stdout as a single JSON array. JSONL on stdout
	// would be marginally more streaming-friendly but the array form
	// is what the status-page consumer wants.
	enc := json.NewEncoder(os.Stdout)
	enc.SetIndent("", "  ")
	enc.SetEscapeHTML(false)
	if err := enc.Encode(results); err != nil {
		fmt.Fprintf(os.Stderr, "encode: %v\n", err)
		os.Exit(2)
	}

	// Append to per-site JSONL history if requested.
	if *historyDir != "" {
		for _, r := range results {
			if err := appendHistory(*historyDir, r); err != nil {
				fmt.Fprintf(os.Stderr, "history append: %v\n", err)
				// Not fatal — the result is in stdout.
			}
		}
	}

	// Exit code reflects aggregate success. Useful for the Gitea
	// Actions cron: a non-zero exit can be wired to alerting if/when
	// alert routing is added later.
	for _, r := range results {
		if !r.Success {
			os.Exit(1)
		}
	}
}
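
The orchestration that runs this binary is out of scope here, but for
context a Gitea Actions cron wrapper could look roughly like the sketch
below. This is an assumption, not a file from the repo: the workflow
path, runner label, schedule, and build step are placeholders, and it
presumes Gitea Actions' GitHub-compatible workflow syntax plus a runner
image with a Go toolchain.

    # .gitea/workflows/uptime.yml (hypothetical)
    name: uptime
    on:
      schedule:
        - cron: "*/5 * * * *"
    jobs:
      probe:
        runs-on: ubuntu-latest
        steps:
          - uses: actions/checkout@v4
          - name: probe and record history
            run: |
              go build -o uptime-probe .
              ./uptime-probe -config .upptimerc.yml -history-dir history > status.json
          # a later step would commit history/ and status.json; a non-zero
          # probe exit fails the run, which doubles as the alert signal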