hearth/scripts/validate-phase1.sh
Eric Garcia e78000831e Initial commit: Port infrastructure from coherence-mcp
Hearth is the infrastructure home for the letemcook ecosystem.

Ported from coherence-mcp/infra:
- Terraform modules (VPC, EKS, IAM, NLB, S3, storage)
- Kubernetes manifests (Forgejo, ingress, cert-manager, karpenter)
- Deployment scripts (phased rollout)

Status: Not deployed. EKS cluster needs to be provisioned.

Next steps:
1. Bootstrap terraform backend
2. Deploy phase 1 (foundation)
3. Deploy phase 2 (core services including Forgejo)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 06:06:13 -05:00

245 lines
7.4 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# Phase 1 Validation: Foundation Infrastructure
#
# Validates that RFC 0039 components are deployed and healthy:
# - CockroachDB cluster (3 nodes, all live)
# - cert-manager certificates (all Ready)
# - S3 buckets (4 buckets created)
#
# Also checks Karpenter (controller pods and NodePools) when its namespace exists.
#
# Strict mode: stop on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# ANSI color sequences, kept as literal backslash escapes; they are rendered
# when printed through an escape-interpreting command such as `echo -e`.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'

# Result tallies, updated by log_pass / log_fail / log_warn.
PASSED=0 FAILED=0 WARNINGS=0
# Print an informational message to stdout, tagged with a blue [INFO] label.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text (backslash escapes in it are interpreted)
log_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} ${1}"
}
# Print a success message tagged [PASS] in green and count it.
# Globals:   GREEN, NC (read); PASSED (incremented)
# Arguments: $1 - message text (backslash escapes in it are interpreted)
# Returns:   0
log_pass() {
  echo -e "${GREEN}[PASS]${NC} $1"
  # Plain assignment instead of ((PASSED++)): the arithmetic command returns
  # status 1 when the pre-increment value is 0, which kills the script under
  # `set -e` on the very first pass.
  PASSED=$((PASSED + 1))
}
# Print a failure message tagged [FAIL] in red and count it.
# Globals:   RED, NC (read); FAILED (incremented)
# Arguments: $1 - message text (backslash escapes in it are interpreted)
# Returns:   0
log_fail() {
  echo -e "${RED}[FAIL]${NC} $1"
  # Plain assignment instead of ((FAILED++)): the arithmetic command returns
  # status 1 when the pre-increment value is 0, which kills the script under
  # `set -e` before the remaining checks and the summary can run.
  FAILED=$((FAILED + 1))
}
# Print a warning message tagged [WARN] in yellow and count it.
# Globals:   YELLOW, NC (read); WARNINGS (incremented)
# Arguments: $1 - message text (backslash escapes in it are interpreted)
# Returns:   0
log_warn() {
  echo -e "${YELLOW}[WARN]${NC} $1"
  # Plain assignment instead of ((WARNINGS++)): the arithmetic command returns
  # status 1 when the pre-increment value is 0, which kills the script under
  # `set -e` on the first warning.
  WARNINGS=$((WARNINGS + 1))
}
#######################################
# Validate the CockroachDB deployment in the `cockroachdb` namespace.
# Records one pass/fail each for: pod count (>= 3), ready pod count (>= 3)
# and live node count reported by `cockroach node status` (>= 3).
# A missing namespace records a single failure and skips the rest.
# Any kubectl/jq failure is turned into a count of 0 (i.e. a recorded
# failure) instead of aborting the whole script under set -e / pipefail.
# Globals:   none directly (counters are updated through log_* helpers)
# Arguments: none
# Returns:   0
#######################################
check_cockroachdb_cluster() {
  log_info "Checking CockroachDB cluster health..."

  # Check if namespace exists
  if ! kubectl get namespace cockroachdb &> /dev/null; then
    log_fail "cockroachdb namespace does not exist"
    return 0
  fi

  # Fetch the pod list once and reuse it for both pod checks.
  local pods_json
  pods_json=$(kubectl -n cockroachdb get pods -l app=cockroachdb -o json) \
    || pods_json='{"items":[]}'

  # Check pod count
  local pod_count
  pod_count=$(jq '.items | length' <<<"$pods_json") || pod_count=0
  if [ "$pod_count" -ge 3 ]; then
    log_pass "CockroachDB has $pod_count pods running (expected: >= 3)"
  else
    log_fail "CockroachDB has only $pod_count pods (expected: >= 3)"
  fi

  # Check pod health. any(...) yields one boolean per pod, so a pod with
  # several ready containers is counted once rather than once per container.
  local ready_pods
  ready_pods=$(jq '[.items[]
      | select(.status.phase == "Running")
      | select(any(.status.containerStatuses[]?; .ready == true))]
      | length' <<<"$pods_json") || ready_pods=0
  if [ "$ready_pods" -ge 3 ]; then
    log_pass "CockroachDB has $ready_pods ready pods"
  else
    log_fail "CockroachDB has only $ready_pods ready pods (expected: >= 3)"
  fi

  # Check node status via cockroach CLI
  local node_status
  node_status=$(kubectl -n cockroachdb exec cockroachdb-0 -- cockroach node status --certs-dir=/cockroach/cockroach-certs --format=json 2>/dev/null || echo '[]')

  # Count only nodes reported as live. Rows without an is_live field are
  # treated as live so output lacking that column keeps the old behavior.
  # NOTE(review): assumes is_live is emitted as true/"true" - confirm against
  # the deployed cockroach version.
  local live_nodes
  live_nodes=$(jq '[.[]
      | select(if has("is_live") then (.is_live | tostring) == "true" else true end)]
      | length' <<<"$node_status") || live_nodes=0
  if [ "$live_nodes" -ge 3 ]; then
    log_pass "CockroachDB cluster has $live_nodes live nodes"
  else
    log_fail "CockroachDB cluster has only $live_nodes live nodes (expected: >= 3)"
  fi
}
#######################################
# Validate cert-manager: controller pods running, every Certificate (all
# namespaces) Ready, every ClusterIssuer Ready.
# A missing namespace records a single failure and skips the rest.
# Zero Certificates or zero ClusterIssuers is recorded as a warning.
# Any kubectl/jq failure is turned into an empty list / count of 0 instead
# of aborting the whole script under set -e / pipefail.
# Globals:   none directly (counters are updated through log_* helpers)
# Arguments: none
# Outputs:   on failure, lists namespace/name of each non-ready Certificate
# Returns:   0
#######################################
check_certificates() {
  log_info "Checking cert-manager certificates..."

  # Check if cert-manager namespace exists
  if ! kubectl get namespace cert-manager &> /dev/null; then
    log_fail "cert-manager namespace does not exist"
    return 0
  fi

  # Check cert-manager pods
  local cm_pods
  cm_pods=$(kubectl -n cert-manager get pods -l app.kubernetes.io/instance=cert-manager -o json) \
    || cm_pods='{"items":[]}'
  local cm_ready
  cm_ready=$(jq '[.items[] | select(.status.phase == "Running")] | length' <<<"$cm_pods") \
    || cm_ready=0
  if [ "$cm_ready" -ge 1 ]; then
    log_pass "cert-manager is running ($cm_ready pods)"
  else
    log_fail "cert-manager is not running"
  fi

  # Check all certificates across namespaces
  local all_certs
  all_certs=$(kubectl get certificates -A -o json 2>/dev/null || echo '{"items":[]}')
  local total_certs
  total_certs=$(jq '.items | length' <<<"$all_certs") || total_certs=0
  local ready_certs
  ready_certs=$(jq '[.items[]
      | select(any(.status.conditions[]?; .type == "Ready" and .status == "True"))]
      | length' <<<"$all_certs") || ready_certs=0
  if [ "$total_certs" -eq 0 ]; then
    log_warn "No certificates found (may not be required yet)"
  elif [ "$ready_certs" -eq "$total_certs" ]; then
    log_pass "All certificates are ready ($ready_certs/$total_certs)"
  else
    log_fail "Some certificates are not ready ($ready_certs/$total_certs ready)"
    echo " Non-ready certificates:"
    # List everything that is NOT Ready=True, so certificates that have no
    # Ready condition yet (or no status at all) also show up; the list then
    # matches the total-minus-ready count above. Listing is best-effort.
    jq -r '.items[]
        | select(any(.status.conditions[]?; .type == "Ready" and .status == "True") | not)
        | " - " + .metadata.namespace + "/" + .metadata.name' <<<"$all_certs" || true
  fi

  # Check ClusterIssuers
  local issuers
  issuers=$(kubectl get clusterissuers -o json 2>/dev/null || echo '{"items":[]}')
  local ready_issuers
  ready_issuers=$(jq '[.items[]
      | select(any(.status.conditions[]?; .type == "Ready" and .status == "True"))]
      | length' <<<"$issuers") || ready_issuers=0
  local total_issuers
  total_issuers=$(jq '.items | length' <<<"$issuers") || total_issuers=0
  if [ "$total_issuers" -eq 0 ]; then
    log_warn "No ClusterIssuers found"
  elif [ "$ready_issuers" -eq "$total_issuers" ]; then
    log_pass "All ClusterIssuers are ready ($ready_issuers/$total_issuers)"
  else
    log_fail "Some ClusterIssuers are not ready ($ready_issuers/$total_issuers)"
  fi
}
#######################################
# Validate that the expected S3 buckets exist.
# A bucket counts as present when some name from `aws s3 ls` matches the
# regex "coherence.*<suffix>" for suffix in email, logs, traces, lfs.
# All found -> pass; some found -> warning; none found (including when the
# aws call fails) -> failure.
# Globals:   none directly (counters are updated through log_* helpers)
# Arguments: none
# Returns:   0
#######################################
check_s3_buckets() {
  log_info "Checking S3 buckets..."

  # Expected buckets
  local expected_buckets=("email" "logs" "traces" "lfs")
  local found=0
  local expected

  # List S3 buckets (third column of `aws s3 ls` is the bucket name).
  # A failing aws call yields an empty list, which ends in log_fail below.
  local buckets
  buckets=$(aws s3 ls 2>/dev/null | awk '{print $3}' || echo "")

  for expected in "${expected_buckets[@]}"; do
    # Here-string instead of `echo | grep -q`: grep -q may exit before the
    # writer finishes, and under pipefail the resulting SIGPIPE status would
    # turn a match into a miss.
    if grep -q -- "coherence.*${expected}" <<<"$buckets"; then
      # Not ((found++)): that returns status 1 when found is 0 and would
      # abort the script under `set -e` on the first matching bucket.
      found=$((found + 1))
    fi
  done

  if [ "$found" -eq "${#expected_buckets[@]}" ]; then
    log_pass "All expected S3 buckets found ($found/${#expected_buckets[@]})"
  elif [ "$found" -gt 0 ]; then
    log_warn "Only $found/${#expected_buckets[@]} expected S3 buckets found"
  else
    log_fail "No expected S3 buckets found"
  fi
}
#######################################
# Validate Karpenter: at least one controller pod in phase Running and at
# least one NodePool defined.
# A missing `karpenter` namespace is only a warning and skips the rest.
# Any kubectl/jq failure is turned into a count of 0 instead of aborting the
# whole script under set -e / pipefail.
# Globals:   none directly (counters are updated through log_* helpers)
# Arguments: none
# Returns:   0
#######################################
check_karpenter() {
  log_info "Checking Karpenter..."

  # Check if karpenter namespace exists
  if ! kubectl get namespace karpenter &> /dev/null; then
    log_warn "karpenter namespace does not exist (may be using Fargate only)"
    return 0
  fi

  # Check Karpenter pods. The fallback keeps a failing kubectl from killing
  # the script through pipefail; it is reported as "not running" instead.
  local kp_pods
  kp_pods=$(kubectl -n karpenter get pods -l app.kubernetes.io/name=karpenter -o json 2>/dev/null) \
    || kp_pods='{"items":[]}'
  local kp_ready
  kp_ready=$(jq '[.items[] | select(.status.phase == "Running")] | length' <<<"$kp_pods") \
    || kp_ready=0
  if [ "$kp_ready" -ge 1 ]; then
    log_pass "Karpenter is running ($kp_ready pods)"
  else
    log_fail "Karpenter is not running"
  fi

  # Check NodePools
  local nodepools
  nodepools=$(kubectl get nodepools -o json 2>/dev/null || echo '{"items":[]}')
  local np_count
  np_count=$(jq '.items | length' <<<"$nodepools") || np_count=0
  if [ "$np_count" -ge 1 ]; then
    log_pass "Karpenter NodePools configured ($np_count)"
  else
    log_warn "No Karpenter NodePools found"
  fi
}
#######################################
# Print the pass/fail/warning totals followed by the overall verdict, then
# terminate the script.
# Globals:   PASSED, FAILED, WARNINGS, RED, GREEN, YELLOW, NC (read)
# Arguments: none
# Outputs:   summary table and verdict on stdout
# Exits:     1 when at least one check failed, 0 otherwise
#######################################
print_summary() {
  local -r rule="========================================"
  local verdict advice
  local status=0

  # Totals table.
  printf '\n%s\n%s\n%s\n' "$rule" "Phase 1 Validation Summary" "$rule"
  printf '%b\n' \
    " ${GREEN}Passed:${NC} $PASSED" \
    " ${RED}Failed:${NC} $FAILED" \
    " ${YELLOW}Warnings:${NC} $WARNINGS"
  printf '%s\n' "$rule"

  # Failures outrank warnings; warnings alone still allow proceeding.
  if (( FAILED > 0 )); then
    verdict="${RED}Phase 1 validation FAILED${NC}"
    advice="Please fix the issues above before proceeding to Phase 2."
    status=1
  elif (( WARNINGS > 0 )); then
    verdict="${YELLOW}Phase 1 validation passed with warnings${NC}"
    advice="Review warnings above. You may proceed to Phase 2."
  else
    verdict="${GREEN}Phase 1 validation PASSED${NC}"
    advice="You may proceed to Phase 2."
  fi

  printf '\n%b\n%s\n' "$verdict" "$advice"
  exit "$status"
}
#######################################
# Entry point: print the banner, run each Phase 1 check in order, then hand
# over to print_summary (which terminates the script).
# Arguments: none used
#######################################
main() {
  local -r rule="========================================"
  local check

  printf '%s\n' "$rule" "Phase 1 Validation: Foundation Infrastructure" "$rule" ""

  for check in \
    check_cockroachdb_cluster \
    check_certificates \
    check_s3_buckets \
    check_karpenter; do
    "$check"
  done

  print_summary
}
# Run the validation, forwarding the script's command-line arguments to main
# (main does not currently read them).
main "$@"