hearth/scripts/validate-phase4.sh
Eric Garcia e78000831e Initial commit: Port infrastructure from coherence-mcp
Hearth is the infrastructure home for the letemcook ecosystem.

Ported from coherence-mcp/infra:
- Terraform modules (VPC, EKS, IAM, NLB, S3, storage)
- Kubernetes manifests (Forgejo, ingress, cert-manager, karpenter)
- Deployment scripts (phased rollout)

Status: Not deployed. EKS cluster needs to be provisioned.

Next steps:
1. Bootstrap terraform backend
2. Deploy phase 1 (foundation)
3. Deploy phase 2 (core services including Forgejo)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 06:06:13 -05:00

300 lines
9.1 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# Phase 4 Validation: Observability Stack
#
# Validates that RFC 0042 components are deployed and healthy:
# - Prometheus (targets UP)
# - Grafana (SSO working, datasources configured)
# - Loki (logs visible)
# - Alertmanager (cluster formed)
#
# Strict mode: stop on a failing command, an unset variable, or a failed
# pipeline stage.
set -euo pipefail

# Absolute path of the directory that holds this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Running tallies, bumped by log_pass / log_fail / log_warn and read by
# print_summary.
PASSED=0 FAILED=0 WARNINGS=0

# ANSI colour sequences, stored as literal backslash escapes; `echo -e`
# interprets them when a line is printed.
NC='\033[0m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'
# Print an informational line. Does not touch any tally.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text
# Outputs:   "[INFO] <message>" on stdout, with backslash escapes interpreted
log_info() {
  # %b expands backslash escapes in its argument the same way `echo -e` does.
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
# Report a passed check and bump the PASSED tally.
# Globals:   GREEN, NC (read); PASSED (incremented)
# Arguments: $1 - message text
# Outputs:   "[PASS] <message>" on stdout, with backslash escapes interpreted
# Returns:   0
log_pass() {
  echo -e "${GREEN}[PASS]${NC} $1"
  # Not ((PASSED++)): that expression evaluates to the pre-increment value, so
  # it returns status 1 while the counter is 0 and `set -e` would abort the
  # script on the very first passed check.
  PASSED=$((PASSED + 1))
}
# Report a failed check and bump the FAILED tally.
# Globals:   RED, NC (read); FAILED (incremented)
# Arguments: $1 - message text
# Outputs:   "[FAIL] <message>" on stdout, with backslash escapes interpreted
# Returns:   0
log_fail() {
  echo -e "${RED}[FAIL]${NC} $1"
  # Not ((FAILED++)): that expression evaluates to the pre-increment value, so
  # it returns status 1 while the counter is 0 and `set -e` would abort the
  # script before the remaining checks and the summary could run.
  FAILED=$((FAILED + 1))
}
# Report a warning and bump the WARNINGS tally.
# Globals:   YELLOW, NC (read); WARNINGS (incremented)
# Arguments: $1 - message text
# Outputs:   "[WARN] <message>" on stdout, with backslash escapes interpreted
# Returns:   0
log_warn() {
  echo -e "${YELLOW}[WARN]${NC} $1"
  # Not ((WARNINGS++)): that expression evaluates to the pre-increment value,
  # so it returns status 1 while the counter is 0 and `set -e` would abort the
  # script on the first warning.
  WARNINGS=$((WARNINGS + 1))
}
check_prometheus() {
log_info "Checking Prometheus..."
# Check if namespace exists
if ! kubectl get namespace observability &> /dev/null; then
log_fail "observability namespace does not exist"
return
fi
# Check pod health
local ready_pods
ready_pods=$(kubectl -n observability get pods -l app=prometheus -o json 2>/dev/null | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length')
if [ "$ready_pods" -ge 1 ]; then
log_pass "Prometheus has $ready_pods ready pods"
else
log_fail "Prometheus has no ready pods"
return
fi
# Check Prometheus targets via API
local prom_pod
prom_pod=$(kubectl -n observability get pods -l app=prometheus -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
if [ -n "$prom_pod" ]; then
local targets
targets=$(kubectl -n observability exec "$prom_pod" -- wget -q -O - http://localhost:9090/api/v1/targets 2>/dev/null || echo '{}')
local active_targets
active_targets=$(echo "$targets" | jq '.data.activeTargets | length' 2>/dev/null || echo "0")
local up_targets
up_targets=$(echo "$targets" | jq '[.data.activeTargets[] | select(.health == "up")] | length' 2>/dev/null || echo "0")
if [ "$active_targets" -gt 0 ]; then
log_pass "Prometheus has $active_targets active targets ($up_targets UP)"
else
log_warn "No active Prometheus targets found"
fi
# Check for down targets
local down_targets
down_targets=$(echo "$targets" | jq '[.data.activeTargets[] | select(.health == "down")] | length' 2>/dev/null || echo "0")
if [ "$down_targets" -gt 0 ]; then
log_warn "$down_targets Prometheus targets are DOWN"
fi
fi
}
# Validate Grafana: ready pods, the grafana Service, the datasources
# ConfigMap, dashboard ConfigMaps, and an Ingress whose host mentions grafana.
#
# Globals:   none modified directly; results go through log_info, log_pass,
#            log_fail and log_warn.
# Arguments: none
# Outputs:   one log line per check
# Requires:  kubectl and jq on PATH
check_grafana() {
  log_info "Checking Grafana..."

  # Check pod health. any() yields one boolean per pod, so a pod with several
  # ready containers is counted once. kubectl and jq are captured separately
  # with fallbacks so a failure reads as "no ready pods" instead of aborting
  # the script through set -e / pipefail.
  local pods_json ready_pods
  pods_json=$(kubectl -n observability get pods -l app=grafana -o json 2>/dev/null) || pods_json='{"items":[]}'
  ready_pods=$(jq '[.items[]? | select(.status.phase == "Running") | select(any(.status.containerStatuses[]?; .ready == true))] | length' <<<"$pods_json" 2>/dev/null) || ready_pods=0
  ready_pods=${ready_pods:-0}
  if [[ "$ready_pods" -ge 1 ]]; then
    log_pass "Grafana has $ready_pods ready pods"
  else
    log_fail "Grafana has no ready pods"
    return
  fi

  # Check Grafana service
  local svc_exists
  svc_exists=$(kubectl -n observability get svc grafana -o name 2>/dev/null || echo "")
  if [[ -n "$svc_exists" ]]; then
    log_pass "Grafana service exists"
  else
    log_fail "Grafana service does not exist"
  fi

  # Check datasources ConfigMap
  local datasources_cm
  datasources_cm=$(kubectl -n observability get configmap grafana-datasources -o name 2>/dev/null || echo "")
  if [[ -n "$datasources_cm" ]]; then
    log_pass "Grafana datasources ConfigMap exists"
  else
    log_warn "Grafana datasources ConfigMap not found"
  fi

  # Check dashboards ConfigMaps. grep -c exits 1 on a zero count, hence the
  # explicit fallback.
  local dashboard_list dashboard_cms
  dashboard_list=$(kubectl -n observability get configmap -l grafana_dashboard=1 -o name 2>/dev/null) || dashboard_list=""
  dashboard_cms=$(grep -c . <<<"$dashboard_list") || dashboard_cms=0
  if [[ "$dashboard_cms" -gt 0 ]]; then
    log_pass "Found $dashboard_cms Grafana dashboard ConfigMaps"
  else
    log_warn "No Grafana dashboard ConfigMaps found"
  fi

  # Check ingress. A rule without a host yields null, and jq's contains()
  # raises an error on null, so default the host to "" first. any() also keeps
  # an Ingress with several matching rules from being counted more than once.
  local ingress_json ingress_exists
  ingress_json=$(kubectl get ingress -A -o json 2>/dev/null) || ingress_json='{"items":[]}'
  ingress_exists=$(jq '[.items[]? | select(any(.spec.rules[]?; (.host // "") | contains("grafana")))] | length' <<<"$ingress_json" 2>/dev/null) || ingress_exists=0
  ingress_exists=${ingress_exists:-0}
  if [[ "$ingress_exists" -ge 1 ]]; then
    log_pass "Grafana ingress configured"
  else
    log_warn "No Grafana ingress found"
  fi
}
# Validate Loki: ready pods, the loki Service, and the /ready endpoint.
#
# Globals:   none modified directly; results go through log_info, log_pass,
#            log_fail and log_warn.
# Arguments: none
# Outputs:   one log line per check
# Requires:  kubectl and jq on PATH; wget inside the Loki container
check_loki() {
  log_info "Checking Loki..."

  # Running pods with at least one ready container. any() yields a single
  # boolean per pod, so a pod with several ready containers is counted once.
  local ready_filter='[.items[]? | select(.status.phase == "Running") | select(any(.status.containerStatuses[]?; .ready == true))]'

  # Check pod health. kubectl and jq are captured separately with fallbacks so
  # a failure reads as "no ready pods" instead of aborting the script through
  # set -e / pipefail.
  local pods_json ready_pods
  pods_json=$(kubectl -n observability get pods -l app=loki -o json 2>/dev/null) || pods_json='{"items":[]}'
  ready_pods=$(jq "$ready_filter | length" <<<"$pods_json" 2>/dev/null) || ready_pods=0
  ready_pods=${ready_pods:-0}
  if [[ "$ready_pods" -ge 1 ]]; then
    log_pass "Loki has $ready_pods ready pods"
  else
    log_fail "Loki has no ready pods"
    return
  fi

  # Check Loki service
  local svc_exists
  svc_exists=$(kubectl -n observability get svc loki -o name 2>/dev/null || echo "")
  if [[ -n "$svc_exists" ]]; then
    log_pass "Loki service exists"
  else
    log_fail "Loki service does not exist"
  fi

  # Check Loki health. Probe a pod that passed the readiness filter rather
  # than whichever pod happens to be listed first, which may not be running.
  local loki_pod
  loki_pod=$(jq -r "$ready_filter | .[0].metadata.name // empty" <<<"$pods_json" 2>/dev/null) || loki_pod=""
  if [[ -n "$loki_pod" ]]; then
    local loki_ready
    loki_ready=$(kubectl -n observability exec "$loki_pod" -- wget -q -O - http://localhost:3100/ready 2>/dev/null || echo "")
    if [[ "$loki_ready" == "ready" ]]; then
      log_pass "Loki is ready"
    else
      log_warn "Loki readiness check returned: $loki_ready"
    fi
  fi
}
# Validate Alertmanager: ready pods, cluster peers from /api/v2/status, and
# the number of alerts returned by /api/v2/alerts.
#
# Globals:   none modified directly; results go through log_info, log_pass,
#            log_fail and log_warn.
# Arguments: none
# Outputs:   one log line per check
# Requires:  kubectl and jq on PATH; wget inside the Alertmanager container
check_alertmanager() {
  log_info "Checking Alertmanager..."

  # Running pods with at least one ready container. any() yields a single
  # boolean per pod, so a pod with several ready containers is counted once.
  local ready_filter='[.items[]? | select(.status.phase == "Running") | select(any(.status.containerStatuses[]?; .ready == true))]'

  # Check pod health. kubectl and jq are captured separately with fallbacks so
  # a failure reads as "no ready pods" instead of aborting the script through
  # set -e / pipefail.
  local pods_json ready_pods
  pods_json=$(kubectl -n observability get pods -l app=alertmanager -o json 2>/dev/null) || pods_json='{"items":[]}'
  ready_pods=$(jq "$ready_filter | length" <<<"$pods_json" 2>/dev/null) || ready_pods=0
  ready_pods=${ready_pods:-0}
  if [[ "$ready_pods" -ge 1 ]]; then
    log_pass "Alertmanager has $ready_pods ready pods"
  else
    log_fail "Alertmanager has no ready pods"
    return
  fi

  # Query the API from a pod that passed the readiness filter rather than
  # whichever pod happens to be listed first.
  local am_pod
  am_pod=$(jq -r "$ready_filter | .[0].metadata.name // empty" <<<"$pods_json" 2>/dev/null) || am_pod=""
  if [[ -z "$am_pod" ]]; then
    return
  fi

  # Check Alertmanager cluster status
  local cluster_status cluster_peers
  cluster_status=$(kubectl -n observability exec "$am_pod" -- wget -q -O - http://localhost:9093/api/v2/status 2>/dev/null) || cluster_status='{}'
  cluster_peers=$(jq '[.cluster.peers[]?] | length' <<<"$cluster_status" 2>/dev/null) || cluster_peers=0
  cluster_peers=${cluster_peers:-0}
  if [[ "$cluster_peers" -ge 1 ]]; then
    log_pass "Alertmanager cluster formed ($cluster_peers peers)"
  else
    log_warn "Alertmanager cluster not formed"
  fi

  # Check for active alerts. As before, an unreachable API counts as zero
  # alerts rather than as a failure.
  local alerts_json active_alerts
  alerts_json=$(kubectl -n observability exec "$am_pod" -- wget -q -O - http://localhost:9093/api/v2/alerts 2>/dev/null) || alerts_json='[]'
  active_alerts=$(jq 'length' <<<"$alerts_json" 2>/dev/null) || active_alerts=0
  active_alerts=${active_alerts:-0}
  if [[ "$active_alerts" -gt 0 ]]; then
    log_warn "$active_alerts active alerts in Alertmanager"
  else
    log_pass "No active alerts"
  fi
}
# Report on the promtail DaemonSet in the observability namespace by comparing
# its desired and ready pod counts.
#
# Globals:   none modified directly; results go through log_info, log_pass
#            and log_warn.
# Arguments: none
# Outputs:   one log line describing the DaemonSet state
check_promtail() {
  log_info "Checking Promtail (log shipper)..."

  # Check DaemonSet; a failed lookup is treated as an empty object so both
  # counts below default to 0.
  local ds_json
  ds_json=$(kubectl -n observability get daemonset promtail -o json 2>/dev/null || echo '{}')

  local want
  want=$(jq '.status.desiredNumberScheduled // 0' <<<"$ds_json")
  local have
  have=$(jq '.status.numberReady // 0' <<<"$ds_json")

  if [ "$want" -eq 0 ]; then
    log_warn "Promtail DaemonSet not found (may use different log collector)"
    return
  fi

  if [ "$have" -eq "$want" ]; then
    log_pass "Promtail DaemonSet healthy ($have/$want ready)"
    return
  fi

  log_warn "Promtail DaemonSet partially ready ($have/$want)"
}
# Print the pass/fail/warning tallies and the overall verdict, then terminate
# the script.
#
# Globals:   PASSED, FAILED, WARNINGS, RED, GREEN, YELLOW, NC (read)
# Arguments: none
# Outputs:   summary block on stdout
# Exits:     1 when FAILED is greater than 0, otherwise 0 (never returns)
print_summary() {
  local rule="========================================"

  echo ""
  echo "$rule"
  echo "Phase 4 Validation Summary"
  echo "$rule"
  echo -e " ${GREEN}Passed:${NC} $PASSED"
  echo -e " ${RED}Failed:${NC} $FAILED"
  echo -e " ${YELLOW}Warnings:${NC} $WARNINGS"
  echo "$rule"

  # Decide the verdict first, then print it in one place.
  local verdict advice status
  if [ "$FAILED" -gt 0 ]; then
    verdict="${RED}Phase 4 validation FAILED${NC}"
    advice="Please fix the issues above before proceeding to Phase 5."
    status=1
  elif [ "$WARNINGS" -gt 0 ]; then
    verdict="${YELLOW}Phase 4 validation passed with warnings${NC}"
    advice="Review warnings above. You may proceed to Phase 5."
    status=0
  else
    verdict="${GREEN}Phase 4 validation PASSED${NC}"
    advice="You may proceed to Phase 5 (optional)."
    status=0
  fi

  echo ""
  echo -e "$verdict"
  echo "$advice"
  exit "$status"
}
# Print the banner, run every Phase 4 check in order, then hand over to
# print_summary (which ends the script).
#
# Arguments: accepted but not used
main() {
  local banner="========================================"
  echo "$banner"
  echo "Phase 4 Validation: Observability Stack"
  echo "$banner"
  echo ""

  local step
  for step in \
    check_prometheus \
    check_grafana \
    check_loki \
    check_alertmanager \
    check_promtail; do
    "$step"
  done

  print_summary
}
# Entry point: run main, forwarding the script's command-line arguments unchanged.
main "$@"