Gitea-native uptime probe. Reads .upptimerc.yml-compatible config,
emits per-site Result{timestamp, name, url, status_code, latency_ms,
success, error} as JSON. Optional -history-dir appends per-site JSONL
files for time-series.
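Example stdout entry (illustrative values; site name/URL are placeholders,
real output is an array of these):

    {
      "timestamp": "2026-05-07T12:00:00Z",
      "name": "Canvas",
      "url": "https://canvas.example.com",
      "method": "GET",
      "status_code": 200,
      "latency_ms": 212,
      "success": true
    }

A failing probe sets "success": false and adds an "error" string.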
Why: upptime is structurally GitHub-coupled (every code path hits
api.github.com — releases lookup + issue management + result commits).
Post the 2026-05-06 GitHub org suspension, no token in our org
authenticates there. Diagnosis + replacement options in
molecule-ai-status#2.
What this replaces vs deliberately leaves out:
- IN: probe loop, parallel HTTP, status + latency cap matching, JSONL
history append, JSON stdout output
- OUT: result commits (Gitea Actions cron orchestrates), issue
management (out of scope), status-page rendering (Vercel does that)
Module path: go.moleculesai.app/uptime-probe (vanity from day 1 — no
migration cost later; matches internal#71 pattern).
Smoke-tested against the existing .upptimerc.yml in molecule-ai-status:
all 7 production endpoints (canvas, docs, CP, landing) return 200 with
latencies 148-357ms. Probe classifies correctly.
Exit codes:
0 all probes succeeded
1 one or more sites returned a non-success (status / latency /
connection failure)
2 config error / unrecoverable I/O
// molecule-ai-uptime-probe — Gitea-native uptime monitor.
//
// Replaces upptime/uptime-monitor, which died post-2026-05-06 because
// every code path hits api.github.com (releases lookup, issue
// management, result commits) and our org tokens stopped
// authenticating there. See molecule-ai/molecule-ai-status#2 for the
// full diagnosis.
//
// What this probe does
// ────────────────────
//  1. Read .upptimerc.yml (compat with the existing config — a `sites:`
//     list of {name, url, expectedStatusCodes?}).
//  2. For each site, issue an HTTP request (GET unless `method` says
//     otherwise) with a per-probe timeout.
//  3. Record (timestamp, name, url, status_code, latency_ms, success,
//     error) for each.
//  4. Emit results as JSON to stdout and, when `-history-dir` is set,
//     also append each result to a per-site JSONL history file.
//
// What this probe deliberately doesn't do
// ───────────────────────────────────────
//   - Talk to api.github.com. The whole point.
//   - Manage issues / commits / status badges. Those concerns live in
//     orchestration code (Gitea Actions cron) that runs *this* binary.
//     One concern per tool.
//   - Render a status page. Static-site rendering is the Vercel
//     deployment's job; this binary just produces JSON the page reads.
//
// Vanity import path
// ──────────────────
// `go.moleculesai.app/uptime-probe` from day 1 — no migration cost
// later. Internal#71 set the precedent.
//
// Usage
// ─────
//
//	uptime-probe -config .upptimerc.yml                  # JSON to stdout
//	uptime-probe -config .upptimerc.yml -history-dir ./h # append to history
//
// Exit codes:
//
//	0  all probes succeeded
//	1  one or more probes failed (status, latency cap, or connection)
//	2  config error / unrecoverable I/O

package main

import (
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"gopkg.in/yaml.v3"
)

// Config mirrors the subset of .upptimerc.yml this probe consumes.
// Other top-level keys (owner, repo, status-website, theme, …) are
// upptime-specific and ignored — the probe is config-tolerant so the
// existing .upptimerc.yml drops in without changes.
type Config struct {
	Sites []Site `yaml:"sites"`
}

// Site describes a single endpoint to probe.
type Site struct {
	Name string `yaml:"name"`
	URL  string `yaml:"url"`

	// ExpectedStatusCodes — when set, ONLY these codes count as success.
	// When unset, we accept the upptime-default 2xx + selected WebDAV
	// codes (200..208, 226), which matches the upstream behaviour of
	// `expectedStatusCodes` in upptime so existing configs migrate
	// without semantic drift.
	ExpectedStatusCodes []int `yaml:"expectedStatusCodes,omitempty"`

	// Method defaults to GET. POST/HEAD/etc. work for non-trivial
	// health-check endpoints. (upptime-compat: same key name.)
	Method string `yaml:"method,omitempty"`

	// MaxResponseTime — in ms. Above this, we record the probe as
	// success=false even if the status code matches. (upptime-compat.)
	MaxResponseTime int `yaml:"maxResponseTime,omitempty"`

	// Headers — sent on the request. Useful for Origin / X-Auth probes.
	// (upptime-compat: a list of "Key: Value" strings.)
	Headers []string `yaml:"headers,omitempty"`
}
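
// An example `sites:` entry exercising these fields (illustrative, not
// taken from the real .upptimerc.yml; only name and url are required):
//
//	sites:
//	  - name: Canvas
//	    url: https://canvas.example.com/healthz
//	    method: GET
//	    expectedStatusCodes: [200, 204]
//	    maxResponseTime: 1000
//	    headers:
//	      - "Authorization: Bearer <token>"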

// defaultExpectedStatusCodes is the upptime upstream default. Mirrors
// the set documented at https://upptime.js.org/docs/configuration#sites
// (200..208, 226) — covers the WebDAV/HTTP-extension codes some health
// endpoints legitimately return.
var defaultExpectedStatusCodes = []int{200, 201, 202, 203, 204, 205, 206, 207, 208, 226}

// Result is one probe's outcome.
type Result struct {
	Timestamp  string `json:"timestamp"` // RFC3339, UTC
	Name       string `json:"name"`
	URL        string `json:"url"`
	Method     string `json:"method"`
	StatusCode int    `json:"status_code"` // 0 on connection failure
	LatencyMs  int64  `json:"latency_ms"`
	Success    bool   `json:"success"`
	Error      string `json:"error,omitempty"` // populated only on non-success
}

// probe runs a single site and returns a Result. Never returns an
// error — every failure mode is captured in Result.Success +
// Result.Error so the caller can assemble a complete report even when
// some sites are down.
func probe(ctx context.Context, client *http.Client, s Site) Result {
	method := s.Method
	if method == "" {
		method = http.MethodGet
	}
	expected := s.ExpectedStatusCodes
	if len(expected) == 0 {
		expected = defaultExpectedStatusCodes
	}

	r := Result{
		Timestamp: time.Now().UTC().Format(time.RFC3339),
		Name:      s.Name,
		URL:       s.URL,
		Method:    method,
	}

	req, err := http.NewRequestWithContext(ctx, method, s.URL, nil)
	if err != nil {
		r.Error = "build request: " + err.Error()
		return r
	}
	for _, h := range s.Headers {
		k, v, ok := strings.Cut(h, ":")
		if !ok {
			continue
		}
		req.Header.Set(strings.TrimSpace(k), strings.TrimSpace(v))
	}
	// User-agent identifies the prober for log filtering on the origin
	// side. Including the repo link so anyone seeing this UA knows
	// what's hitting them and why.
	req.Header.Set("User-Agent", "molecule-ai-uptime-probe/1 (+https://git.moleculesai.app/molecule-ai/molecule-ai-uptime-probe)")

	start := time.Now()
	resp, err := client.Do(req)
	r.LatencyMs = time.Since(start).Milliseconds()
	if err != nil {
		r.Error = "request: " + err.Error()
		return r
	}
	defer resp.Body.Close()
	// Drain the body so the connection can be reused. Capping at 1 MiB —
	// we don't care about the content, just the response code, and not
	// capping invites OOM if some endpoint streams a huge file.
	_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 1<<20))

	r.StatusCode = resp.StatusCode

	// Status code allowlist check.
	codeOK := false
	for _, c := range expected {
		if resp.StatusCode == c {
			codeOK = true
			break
		}
	}
	if !codeOK {
		r.Error = fmt.Sprintf("status %d not in expected %v", resp.StatusCode, expected)
		return r
	}

	// Latency cap check.
	if s.MaxResponseTime > 0 && r.LatencyMs > int64(s.MaxResponseTime) {
		r.Error = fmt.Sprintf("latency %dms exceeded max %dms", r.LatencyMs, s.MaxResponseTime)
		return r
	}

	r.Success = true
	return r
}
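
// A minimal sketch (not part of the original file) of exercising probe
// against a local httptest server; values and the test name are
// illustrative, and it belongs in a _test.go file that imports
// "net/http/httptest" and "testing":
//
//	func TestProbeSuccess(t *testing.T) {
//		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
//			w.WriteHeader(http.StatusOK)
//		}))
//		defer srv.Close()
//		client := &http.Client{Timeout: 5 * time.Second}
//		res := probe(context.Background(), client, Site{Name: "local", URL: srv.URL})
//		if !res.Success {
//			t.Fatalf("expected success, got %q (status %d)", res.Error, res.StatusCode)
//		}
//	}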

// loadConfig reads and parses the YAML config at path.
func loadConfig(path string) (Config, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return Config{}, fmt.Errorf("read %s: %w", path, err)
	}
	var c Config
	if err := yaml.Unmarshal(data, &c); err != nil {
		return Config{}, fmt.Errorf("parse %s: %w", path, err)
	}
	return c, nil
}

// appendHistory writes one Result line to <history-dir>/<slug>.jsonl
// where <slug> is a filesystem-safe rendering of the site name.
// JSONL (one Result per line) is the cheapest append-friendly format
// for time-series data — concatenable, partial-write-tolerant, no
// rewrite-the-whole-file cost as the file grows.
func appendHistory(historyDir string, r Result) error {
	if err := os.MkdirAll(historyDir, 0o755); err != nil {
		return err
	}
	slug := slugify(r.Name)
	path := filepath.Join(historyDir, slug+".jsonl")
	f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
	if err != nil {
		return err
	}
	defer f.Close()
	enc := json.NewEncoder(f)
	enc.SetEscapeHTML(false)
	return enc.Encode(r)
}
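
// Illustrative: with -history-dir history, a site named "Canvas App"
// appends one compact line per run to history/canvas-app.jsonl, e.g.
//
//	{"timestamp":"2026-05-07T12:00:00Z","name":"Canvas App","url":"https://canvas.example.com","method":"GET","status_code":200,"latency_ms":212,"success":true}
//
// (Values are placeholders; field order follows the Result struct.)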

// slugify lowercases the name, keeps [a-z0-9], collapses every other
// run of characters into a single '-', and trims leading/trailing '-'.
func slugify(s string) string {
	out := make([]rune, 0, len(s))
	last := '-'
	for _, c := range strings.ToLower(s) {
		switch {
		case c >= 'a' && c <= 'z', c >= '0' && c <= '9':
			out = append(out, c)
			last = c
		default:
			if last != '-' {
				out = append(out, '-')
				last = '-'
			}
		}
	}
	return strings.Trim(string(out), "-")
}

func main() {
	configPath := flag.String("config", ".upptimerc.yml", "path to config file (upptime-compatible)")
	historyDir := flag.String("history-dir", "", "if set, append per-site JSONL files here in addition to stdout output")
	timeout := flag.Duration("timeout", 30*time.Second, "per-probe HTTP timeout")
	concurrency := flag.Int("concurrency", 8, "max parallel probes")
	flag.Parse()

	cfg, err := loadConfig(*configPath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "config: %v\n", err)
		os.Exit(2)
	}
	if len(cfg.Sites) == 0 {
		fmt.Fprintf(os.Stderr, "config: no sites defined\n")
		os.Exit(2)
	}

	client := &http.Client{Timeout: *timeout}

	// Run probes in parallel up to -concurrency. A bounded-channel
	// semaphore is enough; we don't need a worker pool for tens of
	// sites.
	results := make([]Result, len(cfg.Sites))
	sem := make(chan struct{}, *concurrency)
	done := make(chan int, len(cfg.Sites))
	ctx := context.Background()

	for i, site := range cfg.Sites {
		sem <- struct{}{}
		go func(i int, s Site) {
			defer func() {
				<-sem
				done <- i
			}()
			results[i] = probe(ctx, client, s)
		}(i, site)
	}
	for range cfg.Sites {
		<-done
	}

	// Stable order: sort by site name so the JSON output is
	// deterministic across runs (easier to diff in PRs).
	sort.SliceStable(results, func(i, j int) bool {
		return results[i].Name < results[j].Name
	})

	// Emit results to stdout as a single JSON array. JSONL on stdout
	// would be marginally more streaming-friendly but the array form
	// is what the status-page consumer wants.
	enc := json.NewEncoder(os.Stdout)
	enc.SetIndent("", "  ")
	enc.SetEscapeHTML(false)
	if err := enc.Encode(results); err != nil {
		fmt.Fprintf(os.Stderr, "encode: %v\n", err)
		os.Exit(2)
	}

	// Append to per-site JSONL history if requested.
	if *historyDir != "" {
		for _, r := range results {
			if err := appendHistory(*historyDir, r); err != nil {
				fmt.Fprintf(os.Stderr, "history append: %v\n", err)
				// Not fatal — the result is in stdout.
			}
		}
	}

	// Exit code reflects aggregate success. Useful for the Gitea
	// Actions cron: a non-zero exit can be wired to alerting if/when
	// alert routing is added later.
	for _, r := range results {
		if !r.Success {
			os.Exit(1)
		}
	}
}
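
The orchestration that runs this binary is out of scope here, but for
context a Gitea Actions cron wrapper could look roughly like the sketch
below. This is an assumption, not a file from the repo: the workflow
path, runner label, schedule, and build step are placeholders, and it
presumes Gitea Actions' GitHub-compatible workflow syntax plus a runner
image with a Go toolchain.

    # .gitea/workflows/uptime.yml (hypothetical)
    name: uptime
    on:
      schedule:
        - cron: "*/5 * * * *"
    jobs:
      probe:
        runs-on: ubuntu-latest
        steps:
          - uses: actions/checkout@v4
          - name: probe and record history
            run: |
              go build -o uptime-probe .
              ./uptime-probe -config .upptimerc.yml -history-dir history > status.json
          # a later step would commit history/ and status.json; a non-zero
          # probe exit fails the run, which doubles as the alert signal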