fix(e2e): bounded retry around external workspace-create in staging concierge user_tasks (#2743) #2746

Merged
devops-engineer merged 1 commits from fix/2743-staging-concierge-create-retry into main 2026-06-13 10:20:13 +00:00
+39 -6
View File
@@ -221,14 +221,47 @@ tenant_call() { # <method> <path> [curl args…]
}
# Create an external workspace (row only — no EC2). Echoes its id.
#
# Bounded retry around the external-row create only. The external create still
# runs a DB transaction + post-commit token/status work before returning 201,
# so under staging control-plane latency a one-shot curl could exit rc=28
# ("curl: (28) Operation timed out") and no id was parsed, hard-failing
# user_tasks before any assertion (issue #2743). That is provisioning-latency
# flake, not a user_tasks contract failure, so transient cases (rc=28 /
# connection error -> http 000, or 2xx-but-no-id) are retried with a longer
# per-call timeout and a short linear backoff. Semantic 4xx/5xx responses stay
# hard-red and report the response body.
#
# Globals:
#   CREATE_WS_ATTEMPTS  (read) maximum number of create attempts, default 5
#   CREATE_WS_MAX_TIME  (read) per-attempt curl --max-time seconds, default 90
#   TMPDIR_E2E          (read) directory that holds the per-attempt response file
# Arguments:
#   $1 - workspace name
# Outputs:
#   the created workspace id on stdout
# Returns:
#   0 as soon as an id is parsed; every other path ends in fail
#   (NOTE(review): fail is defined elsewhere -- presumably it aborts the run).
CREATE_WS_ATTEMPTS=${CREATE_WS_ATTEMPTS:-5}
CREATE_WS_MAX_TIME=${CREATE_WS_MAX_TIME:-90}
create_external_ws() { # <name>
  # code/rc are pre-seeded so the final message is well-defined even when the
  # loop body never runs (CREATE_WS_ATTEMPTS=0) under `set -u`.
  local name="$1" attempt body resp json_name code=000 rc=0
  # Same pattern the previous python3 re.search used: first "id":"<non-empty>".
  local id_re='"id"[[:space:]]*:[[:space:]]*"([^"]+)"'

  # Escape backslashes first, then double quotes, so the name cannot break out
  # of the JSON string it is embedded in.
  json_name=${name//\\/\\\\}
  json_name=${json_name//\"/\\\"}

  for (( attempt = 1; attempt <= CREATE_WS_ATTEMPTS; attempt++ )); do
    body=$(mktemp "$TMPDIR_E2E/ws_create.XXXXXX") \
      || fail "external ws create '$name': mktemp failed in '$TMPDIR_E2E'"

    # The explicit --max-time is passed after tenant_call's own curl args so the
    # longer value takes effect. `|| rc=$?` captures the curl status without
    # toggling errexit, so rc=28 is classified as transient latency instead of
    # aborting the script, and the caller's `set -e` state is left untouched.
    rc=0
    code=$(tenant_call POST /workspaces --max-time "$CREATE_WS_MAX_TIME" \
      -H "Content-Type: application/json" \
      -d "{\"name\":\"$json_name\",\"tier\":1,\"runtime\":\"external\",\"external\":true}" \
      -o "$body" -w "%{http_code}" 2>/dev/null) || rc=$?

    # Slurp the response and drop the temp file right away so no exit path
    # (success, hard failure, retry) can leak it.
    resp=$(cat -- "$body" 2>/dev/null) || resp=""
    rm -f -- "$body"

    if [[ "$resp" =~ $id_re ]]; then
      printf '%s\n' "${BASH_REMATCH[1]}"
      return 0
    fi

    # Semantic failure (got an HTTP response in 4xx/5xx) -> hard-red immediately.
    case "$code" in
      4??|5??)
        fail "external ws create '$name' failed HTTP $code: ${resp:0:500}" ;;
    esac

    # Transient: rc=28 (timeout), connection error (code 000), or 2xx-with-no-id.
    # Only announce a retry and back off when another attempt actually follows.
    if (( attempt < CREATE_WS_ATTEMPTS )); then
      log " ws create '$name' transient (attempt $attempt/$CREATE_WS_ATTEMPTS: curl rc=$rc http=$code) -- retrying"
      sleep $(( attempt * 3 ))
    fi
  done

  fail "external ws create '$name' returned no id after $CREATE_WS_ATTEMPTS attempts (last curl rc=$rc http=$code; staging control-plane latency, rc=28 class)"
}
# MCP JSON-RPC tools/call against /workspaces/:id/mcp. Echoes the result text