#!/usr/bin/env bash
#
# Phase 4 Validation: Observability Stack
#
# Validates that RFC 0042 components are deployed and healthy:
#   - Prometheus   (ready pods, scrape targets UP/DOWN)
#   - Grafana      (ready pods, service, datasource/dashboard ConfigMaps, ingress)
#   - Loki         (ready pods, service, /ready endpoint)
#   - Alertmanager (ready pods, cluster peers, active alerts)
#   - Promtail     (DaemonSet readiness)
#
# NOTE: Grafana SSO and log visibility in Loki are NOT probed by this script;
# only the checks listed above are performed.
#
# Usage:        run without arguments.
# Environment:  OBSERVABILITY_NAMESPACE  namespace to inspect
#                                        (default: observability)
# Requires:     kubectl, jq
# Exit status:  0  all checks passed (possibly with warnings)
#               1  at least one check failed, or a required tool is missing
#

set -euo pipefail

# shellcheck disable=SC2034  # not referenced in this file; kept for compatibility
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Namespace that hosts the observability stack.
readonly NAMESPACE="${OBSERVABILITY_NAMESPACE:-observability}"

# Colors for output (expanded with printf %b).
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'

# Result counters, updated by log_pass / log_fail / log_warn.
PASSED=0
FAILED=0
WARNINGS=0

#######################################
# Logging helpers. Each prints "$1" with a colored tag.
# The counters are bumped with $(( )) assignments rather than ((x++)):
# ((x++)) returns status 1 when x is 0, which aborts the script under set -e.
#######################################
log_info() {
  printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$1"
}

log_pass() {
  printf '%b[PASS]%b %s\n' "$GREEN" "$NC" "$1"
  PASSED=$((PASSED + 1))
}

log_fail() {
  printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$1"
  FAILED=$((FAILED + 1))
}

log_warn() {
  printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$1"
  WARNINGS=$((WARNINGS + 1))
}

#######################################
# Abort with exit status 1 if a required command is not on PATH.
# Arguments: $1 - command name
#######################################
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    printf '%b[FAIL]%b Required command not found: %s\n' "$RED" "$NC" "$1" >&2
    exit 1
  fi
}

#######################################
# Evaluate a jq filter that should yield a single non-negative integer.
# Arguments: $1 - JSON text, $2 - jq filter
# Outputs:   the integer, or 0 if jq fails or prints anything else
#######################################
json_count() {
  local json=$1 filter=$2 n
  n=$(jq -r "$filter" <<<"$json" 2>/dev/null) || n=0
  [[ "$n" =~ ^[0-9]+$ ]] || n=0
  printf '%s\n' "$n"
}

#######################################
# Count pods matching a label selector that are Running with every
# container reporting ready. Each pod is counted at most once.
# Arguments: $1 - label selector (e.g. app=prometheus)
# Outputs:   pod count; 0 when kubectl or jq fails
#######################################
count_ready_pods() {
  local selector=$1 count
  # kubectl errors are deliberately silenced: an unreachable cluster or a
  # missing namespace is reported by the caller as "no ready pods".
  count=$(
    kubectl -n "$NAMESPACE" get pods -l "$selector" -o json 2>/dev/null \
      | jq '[.items[]
             | select(.status.phase == "Running")
             | select((.status.containerStatuses // [])
                      | (length > 0 and all(.[]; .ready == true)))]
            | length' 2>/dev/null
  ) || count=0
  [[ "$count" =~ ^[0-9]+$ ]] || count=0
  printf '%s\n' "$count"
}

#######################################
# Print the name of the first pod matching a label selector.
# Arguments: $1 - label selector
# Outputs:   pod name, or nothing if none is found
#######################################
first_pod_name() {
  kubectl -n "$NAMESPACE" get pods -l "$1" \
    -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true
}

#######################################
# HTTP GET from inside a pod using the wget binary in its container.
# Arguments: $1 - pod name, $2 - URL
# Outputs:   response body
# Returns:   non-zero if kubectl exec or wget fails
#######################################
pod_http_get() {
  kubectl -n "$NAMESPACE" exec "$1" -- wget -q -O - "$2" 2>/dev/null
}

#######################################
# Test whether a namespaced resource exists.
# Arguments: $1 - resource kind, $2 - resource name
#######################################
resource_exists() {
  kubectl -n "$NAMESPACE" get "$1" "$2" -o name >/dev/null 2>&1
}

check_prometheus() {
  log_info "Checking Prometheus..."

  # Check if namespace exists
  if ! kubectl get namespace "$NAMESPACE" &>/dev/null; then
    log_fail "$NAMESPACE namespace does not exist"
    return 0
  fi

  # Check pod health
  local ready_pods
  ready_pods=$(count_ready_pods app=prometheus)
  if ((ready_pods < 1)); then
    log_fail "Prometheus has no ready pods"
    return 0
  fi
  log_pass "Prometheus has $ready_pods ready pods"

  # Check Prometheus targets via API
  local prom_pod
  prom_pod=$(first_pod_name app=prometheus)
  [[ -n "$prom_pod" ]] || return 0

  local targets
  targets=$(pod_http_get "$prom_pod" http://localhost:9090/api/v1/targets) \
    || targets='{}'

  local active_targets up_targets down_targets
  active_targets=$(json_count "$targets" '.data.activeTargets | length')
  up_targets=$(json_count "$targets" \
    '[.data.activeTargets[]? | select(.health == "up")] | length')
  down_targets=$(json_count "$targets" \
    '[.data.activeTargets[]? | select(.health == "down")] | length')

  if ((active_targets > 0)); then
    log_pass "Prometheus has $active_targets active targets ($up_targets UP)"
  else
    log_warn "No active Prometheus targets found"
  fi

  # Check for down targets
  if ((down_targets > 0)); then
    log_warn "$down_targets Prometheus targets are DOWN"
  fi
}

check_grafana() {
  log_info "Checking Grafana..."

  # Check pod health
  local ready_pods
  ready_pods=$(count_ready_pods app=grafana)
  if ((ready_pods < 1)); then
    log_fail "Grafana has no ready pods"
    return 0
  fi
  log_pass "Grafana has $ready_pods ready pods"

  # Check Grafana service
  if resource_exists svc grafana; then
    log_pass "Grafana service exists"
  else
    log_fail "Grafana service does not exist"
  fi

  # Check datasources ConfigMap
  if resource_exists configmap grafana-datasources; then
    log_pass "Grafana datasources ConfigMap exists"
  else
    log_warn "Grafana datasources ConfigMap not found"
  fi

  # Check dashboards ConfigMaps
  local dashboard_cms
  dashboard_cms=$(
    kubectl -n "$NAMESPACE" get configmap -l grafana_dashboard=1 -o name \
      2>/dev/null | wc -l | tr -d ' '
  ) || dashboard_cms=0
  [[ "$dashboard_cms" =~ ^[0-9]+$ ]] || dashboard_cms=0
  if ((dashboard_cms > 0)); then
    log_pass "Found $dashboard_cms Grafana dashboard ConfigMaps"
  else
    log_warn "No Grafana dashboard ConfigMaps found"
  fi

  # Check ingress (any namespace). Rules without a host are treated as ""
  # so that jq does not error on `null | contains(...)`.
  local ingress_count
  ingress_count=$(
    kubectl get ingress -A -o json 2>/dev/null \
      | jq '[.items[]
             | select(any(.spec.rules[]?;
                          (.host // "") | contains("grafana")))]
            | length' 2>/dev/null
  ) || ingress_count=0
  [[ "$ingress_count" =~ ^[0-9]+$ ]] || ingress_count=0
  if ((ingress_count >= 1)); then
    log_pass "Grafana ingress configured"
  else
    log_warn "No Grafana ingress found"
  fi
}

check_loki() {
  log_info "Checking Loki..."

  # Check pod health
  local ready_pods
  ready_pods=$(count_ready_pods app=loki)
  if ((ready_pods < 1)); then
    log_fail "Loki has no ready pods"
    return 0
  fi
  log_pass "Loki has $ready_pods ready pods"

  # Check Loki service
  if resource_exists svc loki; then
    log_pass "Loki service exists"
  else
    log_fail "Loki service does not exist"
  fi

  # Check Loki health
  local loki_pod
  loki_pod=$(first_pod_name app=loki)
  [[ -n "$loki_pod" ]] || return 0

  local loki_ready
  loki_ready=$(pod_http_get "$loki_pod" http://localhost:3100/ready) \
    || loki_ready=""
  if [[ "$loki_ready" == "ready" ]]; then
    log_pass "Loki is ready"
  else
    log_warn "Loki readiness check returned: $loki_ready"
  fi
}

check_alertmanager() {
  log_info "Checking Alertmanager..."

  # Check pod health
  local ready_pods
  ready_pods=$(count_ready_pods app=alertmanager)
  if ((ready_pods < 1)); then
    log_fail "Alertmanager has no ready pods"
    return 0
  fi
  log_pass "Alertmanager has $ready_pods ready pods"

  local am_pod
  am_pod=$(first_pod_name app=alertmanager)
  [[ -n "$am_pod" ]] || return 0

  # Check Alertmanager cluster status
  local cluster_status cluster_peers
  cluster_status=$(pod_http_get "$am_pod" http://localhost:9093/api/v2/status) \
    || cluster_status='{}'
  cluster_peers=$(json_count "$cluster_status" '.cluster.peers | length')
  if ((cluster_peers >= 1)); then
    log_pass "Alertmanager cluster formed ($cluster_peers peers)"
  else
    log_warn "Alertmanager cluster not formed"
  fi

  # Check for active alerts. A failed API call is reported as a warning
  # instead of being counted as "no active alerts".
  local alerts_json active_alerts
  if alerts_json=$(pod_http_get "$am_pod" http://localhost:9093/api/v2/alerts); then
    active_alerts=$(json_count "$alerts_json" 'length')
    if ((active_alerts > 0)); then
      log_warn "$active_alerts active alerts in Alertmanager"
    else
      log_pass "No active alerts"
    fi
  else
    log_warn "Could not query Alertmanager alerts API"
  fi
}

check_promtail() {
  log_info "Checking Promtail (log shipper)..."

  # Check DaemonSet; a missing DaemonSet is treated as an empty object.
  local promtail_ds
  promtail_ds=$(
    kubectl -n "$NAMESPACE" get daemonset promtail -o json 2>/dev/null
  ) || promtail_ds='{}'

  local desired ready
  desired=$(json_count "$promtail_ds" '.status.desiredNumberScheduled // 0')
  ready=$(json_count "$promtail_ds" '.status.numberReady // 0')

  if ((desired == 0)); then
    log_warn "Promtail DaemonSet not found (may use different log collector)"
  elif ((ready == desired)); then
    log_pass "Promtail DaemonSet healthy ($ready/$desired ready)"
  else
    log_warn "Promtail DaemonSet partially ready ($ready/$desired)"
  fi
}

#######################################
# Print the pass/fail/warning totals and terminate the script.
# Exits 1 if any check failed, otherwise 0.
#######################################
print_summary() {
  echo ""
  echo "========================================"
  echo "Phase 4 Validation Summary"
  echo "========================================"
  printf ' %bPassed:%b %s\n' "$GREEN" "$NC" "$PASSED"
  printf ' %bFailed:%b %s\n' "$RED" "$NC" "$FAILED"
  printf ' %bWarnings:%b %s\n' "$YELLOW" "$NC" "$WARNINGS"
  echo "========================================"

  echo ""
  if ((FAILED > 0)); then
    printf '%bPhase 4 validation FAILED%b\n' "$RED" "$NC"
    echo "Please fix the issues above before proceeding to Phase 5."
    exit 1
  elif ((WARNINGS > 0)); then
    printf '%bPhase 4 validation passed with warnings%b\n' "$YELLOW" "$NC"
    echo "Review warnings above. You may proceed to Phase 5."
    exit 0
  else
    printf '%bPhase 4 validation PASSED%b\n' "$GREEN" "$NC"
    echo "You may proceed to Phase 5 (optional)."
    exit 0
  fi
}

main() {
  require_cmd kubectl
  require_cmd jq

  echo "========================================"
  echo "Phase 4 Validation: Observability Stack"
  echo "========================================"
  echo ""

  check_prometheus
  check_grafana
  check_loki
  check_alertmanager
  check_promtail

  print_summary
}

main "$@"