ci(lint): guard actions/setup-go cache on self-hosted fleet #2539

Merged
core-be merged 2 commits from ci/guard-setup-go-cache into main 2026-06-10 22:00:15 +00:00
7 changed files with 358 additions and 1 deletions
+144
View File
@@ -0,0 +1,144 @@
#!/usr/bin/env python3
"""lint_setup_go_cache — forbid actions/setup-go cache on self-hosted runners.
Forbidden shape
---------------
Any `uses: actions/setup-go@...` step that leaves actions/cache enabled:
either `cache: true` explicitly, OR no `cache:` input at all. setup-go's
`cache` input DEFAULTS to true, so omitting it is forbidden both when a
`cache-key` / `cache-dependency-path` input is present and for a bare
setup-go with neither (belt-and-braces: on our self-hosted fleet the
only safe value is explicit `cache: false`).
Why
---
The molecule self-hosted runners bind-mount a persistent, host-shared
GOCACHE/GOMODCACHE (/var/cache/ci-go-{build,mod}, see
operator-config ops/runners/config.dedicated.yaml). actions/cache
(which setup-go drives when cache:true) untars its restored archive
OVER that bind mount -> "File exists" -> "Failed to restore" ->
partial cache -> downstream linker/typecheck failures on heavy jobs
(test -race link "too many errors", go-arch-lint "without types").
The runner-level GOCACHE is the SSOT for caching; setup-go must not
also cache. Fix: add `cache: false` under the setup-go `with:`.
Empirical: 2026-06-09/10 cross-repo rollout; sweep PRs
fix/setup-go-cache-vs-bind-mount (core#2524, cli#16). This guard
PREVENTS regression after those land.
Detection is line-based (not full YAML) so it can attribute a precise
file:line and survives Gitea's ${{ }} expressions that confuse some
YAML loaders. We locate each setup-go step, then read the contiguous
`with:` block that follows it (same or deeper indent, up to the next
step `- ` at the step indent).
"""
import os
import re
import sys
# Directory whose workflow files are linted; overridable through the
# WORKFLOWS_DIR environment variable.
WORKFLOWS_DIR = os.environ.get("WORKFLOWS_DIR", ".gitea/workflows")
# A `uses: actions/setup-go@...` line, with or without a leading `- ` list
# marker, matched case-insensitively. Group 1 is the leading whitespace.
SETUP_GO = re.compile(r'^(\s*)(?:-\s+)?uses:\s*actions/setup-go@', re.I)
# Any YAML list item (`- ` followed by content); group 1 is its leading
# whitespace. scan_file treats such a line as a step boundary when its
# indent is <= the list-marker indent of the step being read.
STEP_ITEM = re.compile(r'^(\s*)-\s+\S')
# A `cache:` key with a value on the same line; group 1 is the first
# whitespace-delimited token of that value.
CACHE_LINE = re.compile(r'^\s*cache:\s*(\S+)')
# A `cache-dependency-path:` or `cache-key:` key.
CACHE_DEP = re.compile(r'^\s*cache-(dependency-path|key):')
# A `with:` line carrying nothing but optional whitespace after the colon.
WITH_LINE = re.compile(r'^\s*with:\s*$')
def step_indent(line):
    """Return how many whitespace characters *line* starts with."""
    # The pattern is anchored at the start of the line, so the end offset of
    # group 1 equals the length of the leading whitespace run.
    return re.match(r'^(\s*)', line).end(1)
def scan_file(path):
    """Return the setup-go caching violations found in one workflow file.

    The file is scanned line by line (no YAML parsing). For every line
    matching ``SETUP_GO`` the remaining lines of that step are read, up to
    the next list item or dedent at or above the step's ``- `` marker, and
    the step's ``cache:`` / ``cache-*`` lines are recorded.

    A step is reported unless its ``cache:`` value is the literal ``false``
    (surrounding quotes and letter case are ignored):

    * ``cache: true``;
    * no ``cache:`` line at all, with or without a
      ``cache-dependency-path`` / ``cache-key`` line;
    * any other value, e.g. a ``${{ ... }}`` expression, which a static
      scan cannot show to be false.

    Args:
        path: Path of the workflow file to scan; it is read as UTF-8.

    Returns:
        A list of ``(lineno, reason)`` tuples, one per offending step, where
        ``lineno`` is the 1-based line number of the step's ``uses:`` line.
    """
    # Workflow files carry non-ASCII text in comments; do not depend on the
    # locale's default encoding.
    with open(path, encoding="utf-8") as f:
        lines = f.readlines()

    viols = []
    i = 0
    n = len(lines)
    while i < n:
        if not SETUP_GO.match(lines[i]):
            i += 1
            continue

        go_line = i + 1
        uses_indent = step_indent(lines[i])
        # Indent of the step's `- ` list marker: the `uses:` line's own
        # indent for the inline `- uses:` form, otherwise two columns less
        # (`uses:` is then a sibling key under a `- ` written earlier).
        inline_dash = bool(re.match(r'^\s*-\s+uses:', lines[i]))
        marker_indent = uses_indent if inline_dash else uses_indent - 2

        cache_val = None
        has_cache_dep = False
        j = i + 1
        while j < n:
            ln = lines[j]
            # Blank lines and comment lines never end a step.
            if ln.strip() == "" or ln.lstrip().startswith("#"):
                j += 1
                continue
            indent = step_indent(ln)
            # The next list item at or above the marker starts another step.
            if STEP_ITEM.match(ln) and indent <= marker_indent:
                break
            # Any other line at or above the marker means the step is over;
            # a bare `with:` line is exempt from this check.
            if indent <= marker_indent and not WITH_LINE.match(ln):
                break
            cm = CACHE_LINE.match(ln)
            if cm:
                cache_val = cm.group(1).strip().strip('"\'').lower()
            if CACHE_DEP.match(ln):
                has_cache_dep = True
            j += 1

        if cache_val is None:
            # setup-go's `cache` input defaults to true, so a missing
            # `cache:` is a violation with or without cache-* inputs.
            if has_cache_dep:
                viols.append((go_line, "cache-dependency-path/key set with no `cache:` (defaults to true)"))
            else:
                viols.append((go_line, "no `cache:` set (defaults to true; require explicit `cache: false`)"))
        elif cache_val == "true":
            viols.append((go_line, "cache: true (must be `cache: false`)"))
        elif cache_val != "false":
            # Expressions and other non-boolean spellings cannot be proven
            # to disable caching, so they do not satisfy the rule either.
            viols.append((go_line, "`cache:` is not a literal false (must be `cache: false`)"))

        # Resume at the line that ended this step so a directly following
        # setup-go step is examined as well.
        i = j
    return viols
def main():
    """Lint every workflow file under ``WORKFLOWS_DIR`` and print a report.

    Only ``.yml`` / ``.yaml`` entries directly inside the directory are
    scanned, in sorted name order. All output goes to stdout.

    Returns:
        0 when the directory does not exist or nothing was flagged, 1 when
        at least one setup-go step was flagged.
    """
    if not os.path.isdir(WORKFLOWS_DIR):
        print(f"OK: no {WORKFLOWS_DIR} directory")
        return 0

    findings = []
    for entry in sorted(os.listdir(WORKFLOWS_DIR)):
        if not entry.endswith((".yml", ".yaml")):
            continue
        workflow = os.path.join(WORKFLOWS_DIR, entry)
        findings.extend(
            f"{workflow}:{lineno}: actions/setup-go with caching enabled — {reason}"
            for lineno, reason in scan_file(workflow)
        )

    if not findings:
        print("OK: every actions/setup-go step sets cache: false.")
        return 0

    # Header, one bullet per finding, a blank separator, then the rationale.
    report = [
        "FAIL: actions/setup-go must set `cache: false` on the self-hosted fleet:",
        *(f" - {finding}" for finding in findings),
        "",
        "Why: runners bind-mount a host-shared GOCACHE/GOMODCACHE",
        " (/var/cache/ci-go-{build,mod}, operator-config",
        " ops/runners/config.dedicated.yaml). actions/cache untars OVER",
        " the bind mount -> 'File exists' -> partial cache -> race-link",
        " / arch-lint failures. The runner-level GOCACHE is the cache SSOT.",
        " Fix: add `cache: false` under the setup-go `with:` block.",
    ]
    for row in report:
        print(row)
    return 1


if __name__ == "__main__":
    # Use the lint verdict as the process exit status.
    sys.exit(main())
+7
View File
@@ -129,6 +129,13 @@ jobs:
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
with:
go-version: 'stable'
# cache:false — the self-hosted runner bind-mounts a persistent
# GOCACHE/GOMODCACHE (/var/cache/ci-go-{build,mod}); actions/cache is
# redundant and corrupts it by untarring over the bind mount ("File
# exists" -> "Failed to restore" -> partial cache -> linker/typecheck
# errors on heavy jobs, e.g. test -race link "too many errors" and
# go-arch-lint "without types"). Fleet sweep after the cp ci.yml find.
cache: false
- name: Go build + vet (workspace-server)
working-directory: workspace-server
run: |
+7
View File
@@ -133,6 +133,13 @@ jobs:
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
with:
go-version: 'stable'
# cache:false — the self-hosted runner bind-mounts a persistent
# GOCACHE/GOMODCACHE (/var/cache/ci-go-{build,mod}); actions/cache is
# redundant and corrupts it by untarring over the bind mount ("File
# exists" -> "Failed to restore" -> partial cache -> linker/typecheck
# errors on heavy jobs, e.g. test -race link "too many errors" and
# go-arch-lint "without types"). Fleet sweep after the cp ci.yml find.
cache: false
- if: ${{ needs.changes.outputs.platform == 'true' }}
run: go mod download
- if: ${{ needs.changes.outputs.platform == 'true' }}
@@ -150,7 +150,9 @@ jobs:
# GOCACHE/GOMODCACHE (/var/cache/ci-go-{build,mod}); actions/cache is
# redundant and corrupts it by untarring over the bind mount ("File
# exists" -> "Failed to restore" -> partial cache -> linker/typecheck
# errors on heavy jobs). Fleet sweep cp#698 missed this workflow.
# errors on heavy jobs, e.g. test -race link "too many errors" and
# go-arch-lint "without types"). Fleet sweep cp#698 missed this
# workflow (found during the cp ci.yml sweep).
cache: false
- if: needs.detect-changes.outputs.handlers == 'true'
+76
View File
@@ -0,0 +1,76 @@
name: lint-setup-go-cache
# Static workflow-shape lint: forbid `actions/setup-go` caching on the
# self-hosted fleet. Every setup-go step must set `cache: false`.
#
# Forbidden shape
# ---------------
# - `cache: true` (explicit), OR
# - `cache-dependency-path` / `cache-key` set with no `cache:` (the
# input DEFAULTS to true), OR
# - a bare setup-go with no `cache:` at all (still default-true).
#
# Why this rule exists (2026-06-09/10 cross-repo rollout)
# ------------------------------------------------------
# The molecule self-hosted runners bind-mount a persistent, host-shared
# GOCACHE/GOMODCACHE (/var/cache/ci-go-{build,mod} — operator-config
# ops/runners/config.dedicated.yaml + fleet template). When setup-go
# turns on actions/cache, it untars its restored archive OVER that bind
# mount -> "File exists" -> "Failed to restore" -> partial cache ->
# downstream linker/typecheck failures on heavy jobs (test -race link
# "too many errors", go-arch-lint "without types"). The runner-level
# GOCACHE is the SSOT for caching; setup-go must not also cache.
# Fix: add `cache: false` under the setup-go `with:` block.
#
# Coordination (sweep PRs)
# ------------------------
# The fleet sweep fix/setup-go-cache-vs-bind-mount (core#2524) removes
# the remaining `cache: true` hits. Until it merges, this lint will
# loudly list those hits — so it lands at `continue-on-error: true`
# (advisory). FOLLOW-UP: after core#2524 merges and main is clean for
# 3 days, flip continue-on-error -> false to make this a hard gate.
# This PR already removes the default-true hits the sweep PR does not
# touch (ci.yml, ci-arm64-advisory.yml, handlers-postgres-integration.yml,
# weekly-platform-go.yml).
on:
pull_request:
paths:
- '.gitea/workflows/**'
- '.gitea/scripts/lint_setup_go_cache.py'
- 'tests/test_lint_setup_go_cache.py'
push:
branches: [main, staging]
paths:
- '.gitea/workflows/**'
- '.gitea/scripts/lint_setup_go_cache.py'
permissions:
contents: read
concurrency:
group: lint-setup-go-cache-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
# bp-exempt: advisory lint enforcing the cache:false convention on the
# self-hosted GOCACHE bind-mount fleet; flips to required after the
# core#2524 sweep merges (see header). Not a merge gate yet.
lint:
name: lint-setup-go-cache
runs-on: ubuntu-latest
timeout-minutes: 5
# Advisory until core#2524 sweep merges + 3 clean days, then flip false.
# internal#881 Phase 3 mask — 14d forced-renewal cadence (flip after 3 clean days)
continue-on-error: true # internal#881
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.12'
- name: Run lint-setup-go-cache
run: python3 .gitea/scripts/lint_setup_go_cache.py
- name: Run lint-setup-go-cache unit tests
run: |
python -m pip install --quiet pytest
python3 -m pytest tests/test_lint_setup_go_cache.py -q
+7
View File
@@ -47,6 +47,13 @@ jobs:
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
with:
go-version: stable
# cache:false — the self-hosted runner bind-mounts a persistent
# GOCACHE/GOMODCACHE (/var/cache/ci-go-{build,mod}); actions/cache is
# redundant and corrupts it by untarring over the bind mount ("File
# exists" -> "Failed to restore" -> partial cache -> linker/typecheck
# errors on heavy jobs, e.g. test -race link "too many errors" and
# go-arch-lint "without types"). Fleet sweep after the cp ci.yml find.
cache: false
- name: Go mod download
run: go mod download
+114
View File
@@ -0,0 +1,114 @@
"""Unit tests for lint_setup_go_cache — fixture catch + clean proofs."""
import importlib.util
import os
import textwrap
import pytest
# Locate the lint script relative to this test file and load it by file path
# as module object `mod`, so the tests can call its functions directly.
HERE = os.path.dirname(__file__)
SCRIPT = os.path.join(HERE, "..", ".gitea", "scripts", "lint_setup_go_cache.py")
spec = importlib.util.spec_from_file_location("lint_setup_go_cache", SCRIPT)
mod = importlib.util.module_from_spec(spec)
# Execute the script's top-level code so its definitions exist on `mod`.
spec.loader.exec_module(mod)
def _write(tmp_path, body):
    """Dedent *body*, save it as ``wf.yml`` in *tmp_path*, return the path as str."""
    target = tmp_path / "wf.yml"
    dedented = textwrap.dedent(body)
    target.write_text(dedented)
    return str(target)
def test_cache_true_explicit_flagged(tmp_path):
    """An explicit `cache: true` produces exactly one violation naming it."""
    workflow = _write(tmp_path, """\
        jobs:
          build:
            runs-on: docker-host
            steps:
              - uses: actions/setup-go@v5
                with:
                  go-version: 'stable'
                  cache: true
                  cache-dependency-path: go.sum
    """)
    found = mod.scan_file(workflow)
    assert len(found) == 1
    _, reason = found[0]
    assert "cache: true" in reason
def test_default_true_with_cachedep_flagged(tmp_path):
    """A cache-dependency-path with no `cache:` key is flagged as default-true."""
    workflow = _write(tmp_path, """\
        jobs:
          build:
            runs-on: docker-host
            steps:
              - uses: actions/setup-go@v5
                with:
                  go-version: 'stable'
                  cache-dependency-path: go.sum
    """)
    found = mod.scan_file(workflow)
    assert len(found) == 1
    _, reason = found[0]
    assert "defaults to true" in reason
def test_bare_setup_go_default_true_flagged(tmp_path):
    """A setup-go step with no cache inputs at all is still flagged."""
    workflow = _write(tmp_path, """\
        jobs:
          build:
            runs-on: docker-host
            steps:
              - uses: actions/setup-go@v5
                with:
                  go-version: 'stable'
              - run: go build ./...
    """)
    found = mod.scan_file(workflow)
    assert len(found) == 1
    _, reason = found[0]
    assert "defaults to true" in reason
def test_cache_false_clean(tmp_path):
    """`cache: false` is accepted even with a cache-dependency-path present."""
    workflow = _write(tmp_path, """\
        jobs:
          build:
            runs-on: docker-host
            steps:
              - uses: actions/setup-go@v5
                with:
                  go-version: 'stable'
                  cache: false
                  cache-dependency-path: go.sum
    """)
    found = mod.scan_file(workflow)
    assert found == []
def test_no_setup_go_clean(tmp_path):
    """A workflow without any setup-go step yields no violations."""
    workflow = _write(tmp_path, """\
        jobs:
          build:
            runs-on: docker-host
            steps:
              - uses: actions/checkout@v4
              - run: echo hi
    """)
    found = mod.scan_file(workflow)
    assert found == []
def test_multiple_steps_only_bad_flagged(tmp_path):
    """Of two setup-go steps, only the `cache: true` one is reported.

    Besides the reason text, the reported line number is pinned to the
    second step's `uses:` line, so a regression in file:line attribution
    (or in resuming the scan after the first step) is caught.
    """
    p = _write(tmp_path, """\
        jobs:
          build:
            runs-on: docker-host
            steps:
              - uses: actions/setup-go@v5
                with:
                  go-version: 'stable'
                  cache: false
              - uses: actions/setup-go@v6
                with:
                  go-version: '1.25'
                  cache: true
    """)
    viols = mod.scan_file(p)
    assert len(viols) == 1
    lineno, reason = viols[0]
    assert "cache: true" in reason
    # Line 9 of the dedented fixture is `- uses: actions/setup-go@v6`.
    assert lineno == 9