diff --git a/infra/obs/WIRING_CHECKLIST.md b/infra/obs/WIRING_CHECKLIST.md new file mode 100644 index 00000000..5d6e3dff --- /dev/null +++ b/infra/obs/WIRING_CHECKLIST.md @@ -0,0 +1,87 @@ +# Wiring checklist: Railway CP → Loki + tenant SSM:SendCommand (#3214) + +**Status:** DRAFT — owner/infra must execute the steps marked `[OWNER]`. + +--- + +## 1. Railway control-plane → Loki log drain + +Goal: make the Railway-hosted control-plane observable in the existing Grafana/Loki stack (obs.moleculesai.app). + +### 1.1 Grafana Cloud credentials `[OWNER]` +- [ ] Confirm Loki tenant endpoint for ds `P8E80F9AEF21F6940`. + - Placeholder: `https://logs-prod-xxx.grafana.net/loki/api/v1/push` +- [ ] Create or reuse a Grafana Cloud service account token with `logs:write` scope. +- [ ] Store `LOKI_ENDPOINT`, `LOKI_TENANT_ID`, and `LOKI_API_KEY` in the secret manager used by the Alloy deployment (e.g. Railway variables, Infisical, or AWS Secrets Manager). + +### 1.2 Alloy deployment `[OWNER]` +- [ ] Choose a host for Alloy: + - Option A: a small Railway service inside the same project as the control-plane (simplest). + - Option B: a sidecar container on the control-plane service. + - Option C: an existing obs host/operator box that is reachable from Railway. +- [ ] Mount `infra/obs/railway-loki-drain.alloy` as the Alloy configuration. +- [ ] Set the environment variables from 1.1. +- [ ] Deploy Alloy and confirm it is healthy (`/metrics` or process logs). + +### 1.3 Railway log-drain wiring `[OWNER]` +- [ ] In Railway project settings, add a log drain pointing at the Alloy HTTP receiver. + - If Alloy is public: `https://:9080/loki/api/v1/push` (secure with TLS + drain token). + - If Alloy is co-located: use Railway internal networking / service URL. +- [ ] Configure Railway to forward **all** control-plane service logs. +- [ ] Add authentication at the edge if Alloy is exposed (use `${RAILWAY_DRAIN_TOKEN}` placeholder in the Alloy config or terminate TLS at a reverse proxy). 
+ +### 1.4 Verify `[OWNER]` +- [ ] Generate a test log line in the control-plane. +- [ ] Query Loki with `{source="railway", service="molecule-controlplane"}` and confirm the line appears within ~30s. +- [ ] Set up a Loki alert for missing Railway CP logs (heartbeat). + +--- + +## 2. Every tenant org box ships to Loki + +Goal: close the gap where some org boxes (e.g. `molecule-adk-demo`) are absent from the `tenant` label in Loki. + +### 2.1 Tenant provisioning `[OWNER / runtime team]` +- [ ] Confirm that tenant box bootstrap installs and starts the log shipper (Vector/Alloy). +- [ ] Confirm the shipper config points at the same Loki endpoint as the operator host. +- [ ] Confirm the shipper labels logs with `tenant=` and `environment=production`. + +### 2.2 Backfill / reconcile `[OWNER]` +- [ ] Audit current tenant boxes for missing log shipper. +- [ ] Re-provision or patch affected boxes (e.g. `molecule-adk-demo`). +- [ ] Verify in Loki that the `tenant` label now includes the org slug. + +--- + +## 3. Grant fleet/operator identity `ssm:SendCommand` + +Goal: enable incident-response access to box-level docker logs without per-incident token pulls. + +### 3.1 IAM policy `[OWNER / AWS admin]` +- [ ] Create or update the IAM policy for the fleet/operator identity with `infra/obs/tenant-ssm-sendcommand-policy.json`. +- [ ] Replace `*` ARNs with the actual AWS account/region if the org restricts scope. +- [ ] Confirm tenant EC2 instances carry the tag `MoleculeTenant=true` (the policy condition depends on it). + +### 3.2 SSM execution role `[OWNER / AWS admin]` +- [ ] Ensure an SSM execution role exists (`molecule-ssm-execution-role`) that the `SendCommand` can assume. +- [ ] Attach minimal read-only permissions to that role (e.g. `logs:DescribeLogGroups`, `ec2:DescribeInstances`). 
+ +### 3.3 Document access `[OWNER / SRE]` +- [ ] Add a runbook entry showing the AWS CLI command to pull docker logs via SSM: + ```bash + aws ssm send-command \ + --instance-ids "i-xxxxxxxxxxxxxxxxx" \ + --document-name "AWS-RunShellCommand" \ + --parameters 'commands=["docker logs --tail 500 "]' \ + --comment "Incident response: fetch tenant docker logs" + ``` +- [ ] Verify the command works against a production tenant box and returns output. + +--- + +## 4. Rollout / acceptance + +- [ ] Deploy Alloy + Railway drain to a non-prod Railway project first (if available). +- [ ] Confirm Loki ingestion and SSM command in non-prod. +- [ ] Schedule production rollout during low-traffic window. +- [ ] Update on-call runbooks with the new Loki query patterns and SSM access path. diff --git a/infra/obs/railway-loki-drain.alloy b/infra/obs/railway-loki-drain.alloy new file mode 100644 index 00000000..a075433d --- /dev/null +++ b/infra/obs/railway-loki-drain.alloy @@ -0,0 +1,93 @@ +// Railway control-plane → Loki log drain (Grafana Cloud). +// +// Deploy as a long-lived sidecar / service that Railway forwards logs to. +// Owner fills in: +// - ${LOKI_ENDPOINT} (e.g. https://logs-prod-xxx.grafana.net/loki/api/v1/push) +// - ${LOKI_TENANT_ID} (Grafana Cloud stack tenant / org id) +// - ${LOKI_API_KEY} (Grafana Cloud API key / service account token) +// - ${RAILWAY_DRAIN_TOKEN} (Railway log-drain shared secret, optional) +// +// References: +// - molecule-core#3214 +// - Loki ds P8E80F9AEF21F6940 on obs.moleculesai.app + +// 1. Accept logs from Railway via a small HTTP receiver. +// Railway can forward project logs to an HTTPS endpoint; use a reverse proxy +// (nginx/caddy) or Railway native log drain to terminate TLS and forward to +// this local Alloy receiver if Alloy is not exposed directly. 
+loki.source.api "railway_in" {
+  forward_to = [loki.process.enrich.receiver]
+
+  // If Alloy is exposed directly, bind to a private port and secure it with
+  // ${RAILWAY_DRAIN_TOKEN} in a `stage.match` or at the edge proxy.
+  // Only the `http` listener is declared: it is the block documented for
+  // loki.source.api, and it serves the Loki push path (/loki/api/v1/push).
+  // Bound to loopback, so only a co-located proxy/sidecar can reach it.
+  http {
+    listen_address = "127.0.0.1"
+    listen_port = 9080
+  }
+
+  // Placeholder: add a relabel rule to attach Railway project/service labels.
+  // relabel_rules = loki.relabel.<name>.rules
+}
+
+// 2. (Alternative) If Railway native drain is unavailable, tail the Railway
+//    CLI / JSON log stream from a mounted file or stdin and parse with loki.source.file.
+// loki.source.file "railway_json_logs" {
+//   targets = [
+//     {__path__ = "/var/log/railway/cp.json"},
+//   ]
+//   forward_to = [loki.process.enrich.receiver]
+// }
+
+// 3. Enrich / parse logs. Every source above forwards here before loki.write.
+loki.process "enrich" {
+  forward_to = [loki.write.loki_out.receiver]
+
+  // Add static labels expected by the obs stack (used by the Loki queries).
+  stage.static_labels {
+    values = {
+      environment = "production",
+      service = "molecule-controlplane",
+      source = "railway",
+    }
+  }
+
+  // Parse Railway JSON log lines if present (values land in the extracted map).
+  stage.json {
+    expressions = {
+      level = "level",
+      message = "message",
+      service = "service",
+    }
+  }
+
+  // Drop raw line after JSON extraction to avoid double-billing storage.
+  stage.output {
+    source = "message"
+  }
+}
+
+// 4. Ship to Grafana Cloud Loki.
+loki.write "loki_out" {
+  endpoint {
+    name = "grafana-cloud-loki"
+    url = env("LOKI_ENDPOINT")
+
+    basic_auth {
+      username = env("LOKI_TENANT_ID")
+      password = env("LOKI_API_KEY")
+    }
+
+    // Batch tuning: keep small for fast incident visibility.
+    batch_wait = "1s"
+    batch_size = "256KiB"
+    remote_timeout = "10s"
+  }
+
+  external_labels = {
+    cluster = "molecule-railway-cp",
+    job = "railway-cp",
+  }
+}
diff --git a/infra/obs/tenant-ssm-sendcommand-policy.json b/infra/obs/tenant-ssm-sendcommand-policy.json
new file mode 100644
index 00000000..56ff48d0
--- /dev/null
+++ b/infra/obs/tenant-ssm-sendcommand-policy.json
@@ -0,0 +1,57 @@
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Sid": "MoleculeFleetSendCommandToTenantInstances",
+      "Effect": "Allow",
+      "Action": [
+        "ssm:SendCommand"
+      ],
+      "Resource": [
+        "arn:aws:ec2:*:*:instance/*"
+      ],
+      "Condition": {
+        "StringEquals": {
+          "aws:ResourceTag/MoleculeTenant": "true"
+        }
+      }
+    },
+    {
+      "Sid": "MoleculeFleetUseAWSRunShellCommandDoc",
+      "Effect": "Allow",
+      "Action": [
+        "ssm:SendCommand",
+        "ssm:GetDocument"
+      ],
+      "Resource": [
+        "arn:aws:ssm:*:*:document/AWS-RunShellScript"
+      ]
+    },
+    {
+      "Sid": "MoleculeFleetReadCommandInvocations",
+      "Effect": "Allow",
+      "Action": [
+        "ssm:GetCommandInvocation",
+        "ssm:ListCommandInvocations"
+      ],
+      "Resource": [
+        "*"
+      ]
+    },
+    {
+      "Sid": "MoleculeFleetPassExecutionRole",
+      "Effect": "Allow",
+      "Action": [
+        "iam:PassRole"
+      ],
+      "Resource": [
+        "arn:aws:iam::*:role/molecule-ssm-execution-role"
+      ],
+      "Condition": {
+        "StringEquals": {
+          "iam:PassedToService": "ssm.amazonaws.com"
+        }
+      }
+    }
+  ]
+}