Hearth is the infrastructure home for the letemcook ecosystem.

Ported from coherence-mcp/infra:
- Terraform modules (VPC, EKS, IAM, NLB, S3, storage)
- Kubernetes manifests (Forgejo, ingress, cert-manager, karpenter)
- Deployment scripts (phased rollout)

Status: Not deployed. EKS cluster needs to be provisioned.

Next steps:
1. Bootstrap terraform backend
2. Deploy phase 1 (foundation)
3. Deploy phase 2 (core services including Forgejo)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
300 lines
9.1 KiB
Bash
Executable file
300 lines
9.1 KiB
Bash
Executable file
#!/usr/bin/env bash
#
# Phase 4 Validation: Observability Stack
#
# Validates that RFC 0042 components are deployed and healthy:
# - Prometheus (targets UP)
# - Grafana (SSO working, datasources configured)
# - Loki (logs visible)
# - Alertmanager (cluster formed)
#
# Requires: kubectl (configured against the target cluster) and jq.
# Exits 0 when validation passes (warnings allowed), 1 on any failed check.

# Fail fast: abort on errors, unset variables, or any failing pipeline stage.
set -euo pipefail

# Absolute path of the directory containing this script.
# NOTE(review): currently unused in this file — confirm whether the other
# phase scripts rely on the same preamble before removing it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# ANSI color codes for terminal output (interpreted later via `echo -e`).
# Marked readonly: they are constants and must never be reassigned.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color (reset)

# Running tally of check results, incremented by the log_* helpers below
# and reported by print_summary.
PASSED=0
FAILED=0
WARNINGS=0
log_info() {
  # Informational message with a blue tag; does not touch any counter.
  # %b expands backslash escapes (the ANSI codes), same as `echo -e`.
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
log_pass() {
  # Print a green [PASS] line and bump the pass counter.
  echo -e "${GREEN}[PASS]${NC} $1"
  # Not `((PASSED++))`: post-increment of 0 makes the arithmetic expression
  # evaluate to 0, which returns exit status 1 and aborts the whole script
  # under `set -e` on the very first passing check.
  PASSED=$((PASSED + 1))
}
log_fail() {
  # Print a red [FAIL] line and bump the failure counter.
  echo -e "${RED}[FAIL]${NC} $1"
  # Not `((FAILED++))`: that expression evaluates to 0 on the first call,
  # returning exit status 1 and killing the script under `set -e`.
  FAILED=$((FAILED + 1))
}
log_warn() {
  # Print a yellow [WARN] line and bump the warning counter.
  echo -e "${YELLOW}[WARN]${NC} $1"
  # Not `((WARNINGS++))`: that expression evaluates to 0 on the first call,
  # returning exit status 1 and killing the script under `set -e`.
  WARNINGS=$((WARNINGS + 1))
}
check_prometheus() {
  # Verify Prometheus pods are ready and its scrape targets are healthy.
  log_info "Checking Prometheus..."

  # Everything below assumes the observability namespace exists.
  if ! kubectl get namespace observability &> /dev/null; then
    log_fail "observability namespace does not exist"
    return
  fi

  # Count Running pods whose containers report ready. Fall back to 0 when
  # kubectl or jq fails (e.g. RBAC error): with `set -euo pipefail` an
  # unguarded pipeline failure would abort the script mid-report.
  # NOTE(review): a pod is counted once per ready container; harmless for
  # the >=1 check below, but revisit if exact counts ever matter.
  local ready_pods
  ready_pods=$(kubectl -n observability get pods -l app=prometheus -o json 2>/dev/null \
    | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length' 2>/dev/null \
    || echo "0")
  ready_pods=${ready_pods:-0}

  if [ "$ready_pods" -ge 1 ]; then
    log_pass "Prometheus has $ready_pods ready pods"
  else
    log_fail "Prometheus has no ready pods"
    return
  fi

  # Query the Prometheus targets API from inside the first pod.
  local prom_pod
  prom_pod=$(kubectl -n observability get pods -l app=prometheus -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")

  if [ -n "$prom_pod" ]; then
    local targets
    targets=$(kubectl -n observability exec "$prom_pod" -- wget -q -O - http://localhost:9090/api/v1/targets 2>/dev/null || echo '{}')

    local active_targets up_targets
    active_targets=$(echo "$targets" | jq '.data.activeTargets | length' 2>/dev/null || echo "0")
    active_targets=${active_targets:-0}

    up_targets=$(echo "$targets" | jq '[.data.activeTargets[] | select(.health == "up")] | length' 2>/dev/null || echo "0")
    up_targets=${up_targets:-0}

    if [ "$active_targets" -gt 0 ]; then
      log_pass "Prometheus has $active_targets active targets ($up_targets UP)"
    else
      log_warn "No active Prometheus targets found"
    fi

    # Surface (but don't fail on) targets Prometheus cannot scrape.
    local down_targets
    down_targets=$(echo "$targets" | jq '[.data.activeTargets[] | select(.health == "down")] | length' 2>/dev/null || echo "0")
    down_targets=${down_targets:-0}

    if [ "$down_targets" -gt 0 ]; then
      log_warn "$down_targets Prometheus targets are DOWN"
    fi
  fi
}
check_grafana() {
  # Verify Grafana pods, Service, provisioning ConfigMaps, and ingress.
  log_info "Checking Grafana..."

  # Count Running pods with ready containers; default to 0 on kubectl/jq
  # failure so `set -euo pipefail` doesn't abort the script mid-report.
  local ready_pods
  ready_pods=$(kubectl -n observability get pods -l app=grafana -o json 2>/dev/null \
    | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length' 2>/dev/null \
    || echo "0")
  ready_pods=${ready_pods:-0}

  if [ "$ready_pods" -ge 1 ]; then
    log_pass "Grafana has $ready_pods ready pods"
  else
    log_fail "Grafana has no ready pods"
    return
  fi

  # The Service must exist for the ingress / datasource URLs to resolve.
  local svc_exists
  svc_exists=$(kubectl -n observability get svc grafana -o name 2>/dev/null || echo "")

  if [ -n "$svc_exists" ]; then
    log_pass "Grafana service exists"
  else
    log_fail "Grafana service does not exist"
  fi

  # Datasource provisioning ships as a dedicated ConfigMap.
  local datasources_cm
  datasources_cm=$(kubectl -n observability get configmap grafana-datasources -o name 2>/dev/null || echo "")

  if [ -n "$datasources_cm" ]; then
    log_pass "Grafana datasources ConfigMap exists"
  else
    log_warn "Grafana datasources ConfigMap not found"
  fi

  # Dashboards are discovered via the grafana_dashboard=1 label.
  # `|| true` guards against a kubectl failure tripping pipefail; wc still
  # prints 0 on empty input, so the count stays well-formed.
  local dashboard_cms
  dashboard_cms=$(kubectl -n observability get configmap -l grafana_dashboard=1 -o name 2>/dev/null | wc -l | tr -d ' ' || true)
  dashboard_cms=${dashboard_cms:-0}

  if [ "$dashboard_cms" -gt 0 ]; then
    log_pass "Found $dashboard_cms Grafana dashboard ConfigMaps"
  else
    log_warn "No Grafana dashboard ConfigMaps found"
  fi

  # Look for any ingress (any namespace) whose host contains "grafana".
  # jq's `contains` errors on a null host, hence the || echo "0" fallback.
  local ingress_exists
  ingress_exists=$(kubectl get ingress -A -o json 2>/dev/null \
    | jq '[.items[] | select(.spec.rules[]?.host | contains("grafana"))] | length' 2>/dev/null \
    || echo "0")
  ingress_exists=${ingress_exists:-0}

  if [ "$ingress_exists" -ge 1 ]; then
    log_pass "Grafana ingress configured"
  else
    log_warn "No Grafana ingress found"
  fi
}
check_loki() {
  # Verify Loki pods, Service, and the /ready endpoint.
  log_info "Checking Loki..."

  # Count Running pods with ready containers; default to 0 on kubectl/jq
  # failure so `set -euo pipefail` doesn't abort the script mid-report.
  local ready_pods
  ready_pods=$(kubectl -n observability get pods -l app=loki -o json 2>/dev/null \
    | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length' 2>/dev/null \
    || echo "0")
  ready_pods=${ready_pods:-0}

  if [ "$ready_pods" -ge 1 ]; then
    log_pass "Loki has $ready_pods ready pods"
  else
    log_fail "Loki has no ready pods"
    return
  fi

  local svc_exists
  svc_exists=$(kubectl -n observability get svc loki -o name 2>/dev/null || echo "")

  if [ -n "$svc_exists" ]; then
    log_pass "Loki service exists"
  else
    log_fail "Loki service does not exist"
  fi

  # Hit Loki's readiness endpoint from inside the first pod; it answers
  # with the literal string "ready" when healthy.
  local loki_pod
  loki_pod=$(kubectl -n observability get pods -l app=loki -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")

  if [ -n "$loki_pod" ]; then
    local loki_ready
    loki_ready=$(kubectl -n observability exec "$loki_pod" -- wget -q -O - http://localhost:3100/ready 2>/dev/null || echo "")

    if [ "$loki_ready" = "ready" ]; then
      log_pass "Loki is ready"
    else
      log_warn "Loki readiness check returned: $loki_ready"
    fi
  fi
}
check_alertmanager() {
  # Verify Alertmanager pods, cluster formation, and active alerts.
  log_info "Checking Alertmanager..."

  # Count Running pods with ready containers; default to 0 on kubectl/jq
  # failure so `set -euo pipefail` doesn't abort the script mid-report.
  local ready_pods
  ready_pods=$(kubectl -n observability get pods -l app=alertmanager -o json 2>/dev/null \
    | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length' 2>/dev/null \
    || echo "0")
  ready_pods=${ready_pods:-0}

  if [ "$ready_pods" -ge 1 ]; then
    log_pass "Alertmanager has $ready_pods ready pods"
  else
    log_fail "Alertmanager has no ready pods"
    return
  fi

  local am_pod
  am_pod=$(kubectl -n observability get pods -l app=alertmanager -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")

  if [ -n "$am_pod" ]; then
    # api/v2/status lists gossip peers; >=1 peer means the cluster formed.
    local cluster_status cluster_peers
    cluster_status=$(kubectl -n observability exec "$am_pod" -- wget -q -O - http://localhost:9093/api/v2/status 2>/dev/null || echo '{}')
    cluster_peers=$(echo "$cluster_status" | jq '.cluster.peers | length' 2>/dev/null || echo "0")
    cluster_peers=${cluster_peers:-0}

    if [ "$cluster_peers" -ge 1 ]; then
      log_pass "Alertmanager cluster formed ($cluster_peers peers)"
    else
      log_warn "Alertmanager cluster not formed"
    fi

    # Active alerts are only a warning: firing alerts may be expected.
    local active_alerts
    active_alerts=$(kubectl -n observability exec "$am_pod" -- wget -q -O - http://localhost:9093/api/v2/alerts 2>/dev/null | jq 'length' 2>/dev/null || echo "0")
    active_alerts=${active_alerts:-0}

    if [ "$active_alerts" -gt 0 ]; then
      log_warn "$active_alerts active alerts in Alertmanager"
    else
      log_pass "No active alerts"
    fi
  fi
}
check_promtail() {
  # Verify the Promtail log-shipping DaemonSet is fully scheduled and ready.
  log_info "Checking Promtail (log shipper)..."

  # Fetch the DaemonSet once; substitute an empty object when it is absent
  # so the jq lookups below still succeed.
  local ds_json desired_count ready_count
  ds_json=$(kubectl -n observability get daemonset promtail -o json 2>/dev/null || echo '{}')
  desired_count=$(jq '.status.desiredNumberScheduled // 0' <<<"$ds_json")
  ready_count=$(jq '.status.numberReady // 0' <<<"$ds_json")

  if [ "$desired_count" -eq 0 ]; then
    log_warn "Promtail DaemonSet not found (may use different log collector)"
  elif [ "$ready_count" -eq "$desired_count" ]; then
    log_pass "Promtail DaemonSet healthy ($ready_count/$desired_count ready)"
  else
    log_warn "Promtail DaemonSet partially ready ($ready_count/$desired_count)"
  fi
}
print_summary() {
  # Render the final tally, then terminate the script: exit 1 when any
  # check failed, exit 0 otherwise (warnings alone do not fail the phase).
  local bar="========================================"

  echo ""
  echo "$bar"
  echo "Phase 4 Validation Summary"
  echo "$bar"
  echo -e "  ${GREEN}Passed:${NC} $PASSED"
  echo -e "  ${RED}Failed:${NC} $FAILED"
  echo -e "  ${YELLOW}Warnings:${NC} $WARNINGS"
  echo "$bar"
  echo ""

  if [ "$FAILED" -gt 0 ]; then
    echo -e "${RED}Phase 4 validation FAILED${NC}"
    echo "Please fix the issues above before proceeding to Phase 5."
    exit 1
  fi

  if [ "$WARNINGS" -gt 0 ]; then
    echo -e "${YELLOW}Phase 4 validation passed with warnings${NC}"
    echo "Review warnings above. You may proceed to Phase 5."
  else
    echo -e "${GREEN}Phase 4 validation PASSED${NC}"
    echo "You may proceed to Phase 5 (optional)."
  fi
  exit 0
}
main() {
  # Print the banner, run each component check in order, then let
  # print_summary decide the script's exit status.
  local banner="========================================"
  echo "$banner"
  echo "Phase 4 Validation: Observability Stack"
  echo "$banner"
  echo ""

  local check
  for check in \
    check_prometheus \
    check_grafana \
    check_loki \
    check_alertmanager \
    check_promtail; do
    "$check"
  done

  print_summary
}
# Script entry point: forward all CLI arguments to main().
main "$@"