commit e78000831e68bd9f12d0c8b196dcba1f73441d67 Author: Eric Garcia Date: Sat Jan 24 06:06:13 2026 -0500 Initial commit: Port infrastructure from coherence-mcp Hearth is the infrastructure home for the letemcook ecosystem. Ported from coherence-mcp/infra: - Terraform modules (VPC, EKS, IAM, NLB, S3, storage) - Kubernetes manifests (Forgejo, ingress, cert-manager, karpenter) - Deployment scripts (phased rollout) Status: Not deployed. EKS cluster needs to be provisioned. Next steps: 1. Bootstrap terraform backend 2. Deploy phase 1 (foundation) 3. Deploy phase 2 (core services including Forgejo) Co-Authored-By: Claude Opus 4.5 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..afb07e1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,25 @@ +# Terraform +*.tfstate +*.tfstate.* +.terraform/ +.terraform.lock.hcl +*.tfvars +!*.tfvars.example + +# Secrets +*.pem +*.key +credentials.yaml +secrets.yaml + +# IDE +.idea/ +.vscode/ +*.swp + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..dc94dba --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,78 @@ +# Hearth - Infrastructure Home + +The warm center where infrastructure becomes real. + +## What This Is + +Hearth is the infrastructure repository for the letemcook ecosystem. It contains: + +- **Terraform modules** for AWS EKS, VPC, IAM, storage +- **Kubernetes manifests** for core services (Forgejo, cert-manager, ingress) +- **Deployment scripts** for phased rollout + +## Quick Start + +```bash +# 1. Configure AWS +aws sso login --profile muffinlabs + +# 2. Bootstrap Terraform backend +cd terraform/environments/production +terraform init +terraform apply -target=module.bootstrap + +# 3. Deploy foundation (EKS, VPC, storage) +./scripts/deploy-phase1-foundation.sh + +# 4. 
Deploy core services (Forgejo) +./scripts/deploy-phase2-core-services.sh +``` + +## Structure + +``` +hearth/ +├── terraform/ +│ ├── modules/ # Reusable infrastructure modules +│ │ ├── vpc/ # VPC with multi-AZ subnets +│ │ ├── eks/ # EKS cluster +│ │ ├── iam/ # IAM roles and IRSA +│ │ ├── nlb/ # Network Load Balancer +│ │ └── storage/ # EFS, S3 +│ ├── main.tf # Root module +│ ├── variables.tf # Input variables +│ └── outputs.tf # Output values +├── kubernetes/ +│ ├── forgejo/ # Git hosting +│ ├── ingress/ # ALB ingress +│ ├── cert-manager/ # TLS certificates +│ ├── karpenter/ # Auto-scaling +│ └── storage/ # Storage classes +├── scripts/ +│ ├── deploy-phase*.sh # Phased deployment +│ └── validate-*.sh # Validation scripts +└── docs/ + └── architecture.md # Infrastructure overview +``` + +## Principles + +From Blue's ADRs: + +- **Single Source (0005)**: Infrastructure as code, one truth +- **Evidence (0004)**: Terraform plan before apply +- **No Dead Code (0010)**: Delete unused resources +- **Never Give Up (0000)**: Deploy, fail, learn, redeploy + +## AWS Profile + +Use `muffinlabs` profile for all AWS operations: + +```bash +export AWS_PROFILE=muffinlabs +``` + +## Related Repos + +- **blue** - Philosophy and CLI tooling +- **coherence-mcp** - MCP server (source of these manifests) diff --git a/README.md b/README.md new file mode 100644 index 0000000..a727031 --- /dev/null +++ b/README.md @@ -0,0 +1,38 @@ +# Hearth + +Infrastructure for the letemcook ecosystem. + +## Overview + +Hearth deploys and manages: + +- **EKS Cluster** - Kubernetes on AWS with Karpenter auto-scaling +- **Forgejo** - Self-hosted Git (git.beyondtheuniverse.superviber.com) +- **Core Services** - Ingress, TLS, storage + +## Status + +| Component | Status | +|-----------|--------| +| Terraform modules | Ported from coherence-mcp | +| EKS cluster | Not deployed | +| Forgejo | Not deployed | + +## Getting Started + +See [CLAUDE.md](CLAUDE.md) for setup instructions. 
+ +## Cost Estimate + +| Component | Monthly | +|-----------|---------| +| EKS Control Plane | $73 | +| Spot nodes (variable) | $0-50 | +| NLB | $16 | +| EFS | $5 | +| S3 | $5 | +| **Total** | **~$100-150** | + +## License + +Private. diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..94aba6e --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,269 @@ +# Foundation Infrastructure + +RFC 0039: ADR-Compliant Foundation Infrastructure + +## Overview + +This directory contains Terraform modules and Kubernetes manifests for deploying +the Alignment foundation infrastructure on AWS EKS. + +## Architecture + +``` + Internet + | + +---------+----------+ + | Shared NLB | + | (~$16/mo) | + +--------------------+ + | :53 DNS (PowerDNS)| + | :25 SMTP | + | :587 Submission | + | :993 IMAPS | + | :443 HTTPS | + +--------+-----------+ + | + +--------------------+--------------------+ + | | | + +-----+------+ +-----+------+ +------+-----+ + | AZ-a | | AZ-b | | AZ-c | + +------------+ +------------+ +------------+ + | | | | | | + | Karpenter | | Karpenter | | Karpenter | + | Spot Nodes | | Spot Nodes | | Spot Nodes | + | | | | | | + +------------+ +------------+ +------------+ + | | | | | | + | CockroachDB| | CockroachDB| | CockroachDB| + | (m6i.large)| | (m6i.large)| | (m6i.large)| + | | | | | | + +------------+ +------------+ +------------+ +``` + +## Cost Breakdown + +| Component | Monthly Cost | +|-----------|--------------| +| EKS Control Plane | $73 | +| CockroachDB (3x m6i.large, 3yr) | $105 | +| NLB | $16 | +| EFS | $5 | +| S3 | $5 | +| Spot nodes (variable) | $0-50 | +| **Total** | **$204-254** | + +## ADR Compliance + +- **ADR 0003**: Self-hosted CockroachDB with FIPS 140-2 +- **ADR 0004**: "Set It and Forget It" auto-scaling with Karpenter +- **ADR 0005**: Full-stack self-hosting (no SaaS dependencies) + +## Prerequisites + +1. AWS CLI configured with appropriate credentials +2. Terraform >= 1.6.0 +3. kubectl +4. 
Helm 3.x + +## Quick Start + +### 1. Bootstrap Terraform Backend + +First, create the S3 bucket and DynamoDB table for Terraform state: + +```bash +cd terraform/environments/production +# Uncomment the backend.tf bootstrap code and run: +# terraform init && terraform apply +``` + +### 2. Deploy Foundation Infrastructure + +```bash +cd terraform/environments/production +terraform init +terraform plan +terraform apply +``` + +### 3. Configure kubectl + +```bash +aws eks update-kubeconfig --region us-east-1 --name alignment-production +``` + +### 4. Deploy Karpenter + +```bash +# Set environment variables +export CLUSTER_NAME=$(terraform output -raw cluster_name) +export CLUSTER_ENDPOINT=$(terraform output -raw cluster_endpoint) +export KARPENTER_ROLE_ARN=$(terraform output -raw karpenter_role_arn) +export INTERRUPTION_QUEUE_NAME=$(terraform output -raw karpenter_interruption_queue_name) + +# Install Karpenter +helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \ + --namespace karpenter --create-namespace \ + -f kubernetes/karpenter/helm-values.yaml \ + --set settings.clusterName=$CLUSTER_NAME \ + --set settings.clusterEndpoint=$CLUSTER_ENDPOINT \ + --set settings.interruptionQueue=$INTERRUPTION_QUEUE_NAME \ + --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=$KARPENTER_ROLE_ARN + +# Apply NodePool and EC2NodeClass +kubectl apply -f kubernetes/karpenter/nodepool.yaml +kubectl apply -f kubernetes/karpenter/ec2nodeclass.yaml +``` + +### 5. 
Deploy Storage Classes + +```bash +export EFS_ID=$(terraform output -raw efs_id) +envsubst < kubernetes/storage/classes.yaml | kubectl apply -f - +``` + +## Directory Structure + +``` +infra/ +├── terraform/ +│ ├── main.tf # Root module +│ ├── variables.tf # Input variables +│ ├── outputs.tf # Output values +│ ├── versions.tf # Provider versions +│ ├── modules/ +│ │ ├── vpc/ # VPC with multi-AZ subnets +│ │ ├── eks/ # EKS cluster with Fargate +│ │ ├── iam/ # IAM roles and IRSA +│ │ ├── storage/ # EFS and S3 +│ │ ├── nlb/ # Shared NLB +│ │ └── cockroachdb/ # CockroachDB (future) +│ └── environments/ +│ └── production/ # Production config +├── kubernetes/ +│ ├── karpenter/ # Karpenter manifests +│ ├── cockroachdb/ # CockroachDB StatefulSet +│ ├── storage/ # Storage classes +│ ├── ingress/ # Ingress configuration +│ └── cert-manager/ # TLS certificates +└── README.md +``` + +## Modules + +### VPC Module + +Creates a VPC with: +- 3 availability zones +- Public subnets (for NLB, NAT Gateways) +- Private subnets (for EKS nodes, workloads) +- Database subnets (isolated, for CockroachDB) +- NAT Gateway per AZ for HA +- VPC endpoints for S3, ECR, STS, EC2 + +### EKS Module + +Creates an EKS cluster with: +- Kubernetes 1.29 +- Fargate profiles for Karpenter and kube-system +- OIDC provider for IRSA +- KMS encryption for secrets +- Cluster logging enabled + +### IAM Module + +Creates IAM roles for: +- Karpenter controller +- EBS CSI driver +- EFS CSI driver +- AWS Load Balancer Controller +- cert-manager +- External DNS + +### Storage Module + +Creates storage resources: +- EFS filesystem with encryption +- S3 bucket for backups (versioned, encrypted) +- S3 bucket for blob storage +- KMS key for encryption + +### NLB Module + +Creates a shared NLB with: +- HTTPS (443) for web traffic +- DNS (53 UDP/TCP) for PowerDNS +- SMTP (25), Submission (587), IMAPS (993) for email +- Cross-zone load balancing +- Target groups for each service + +## Operations + +### Scaling + +Karpenter 
automatically scales nodes based on pending pods. No manual intervention required. + +To adjust limits: +```bash +kubectl edit nodepool default +``` + +### Monitoring + +Check Karpenter status: +```bash +kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter -f +``` + +Check node status: +```bash +kubectl get nodes -L karpenter.sh/capacity-type,node.kubernetes.io/instance-type +``` + +### Troubleshooting + +View Karpenter events: +```bash +kubectl get events -n karpenter --sort-by=.lastTimestamp +``` + +Check pending pods: +```bash +kubectl get pods --all-namespaces --field-selector=status.phase=Pending +``` + +## Security + +- All storage encrypted at rest (KMS) +- TLS required for all connections +- IMDSv2 required for all nodes +- VPC Flow Logs enabled +- Cluster audit logging enabled +- FIPS 140-2 mode for CockroachDB + +## Disaster Recovery + +### Backups + +CockroachDB backups are stored in S3 with: +- Daily full backups +- 30-day retention in Standard +- 90-day transition to Glacier +- 365-day noncurrent version retention + +### Recovery + +To restore from backup: +```bash +# Restore CockroachDB from S3 backup +cockroach restore ... FROM 's3://alignment-production-backups/...' 
+``` + +## References + +- [RFC 0039: Foundation Infrastructure](../../../.repos/alignment-mcp/docs/rfcs/0039-foundation-infrastructure.md) +- [ADR 0003: CockroachDB Self-Hosted FIPS](../../../.repos/alignment-mcp/docs/adrs/0003-cockroachdb-self-hosted-fips.md) +- [ADR 0004: Set It and Forget It](../../../.repos/alignment-mcp/docs/adrs/0004-set-it-and-forget-it-architecture.md) +- [ADR 0005: Full-Stack Self-Hosting](../../../.repos/alignment-mcp/docs/adrs/0005-full-stack-self-hosting.md) +- [Karpenter Documentation](https://karpenter.sh/) +- [EKS Best Practices](https://aws.github.io/aws-eks-best-practices/) diff --git a/kubernetes/cert-manager/cluster-issuers.yaml b/kubernetes/cert-manager/cluster-issuers.yaml new file mode 100644 index 0000000..52e93d9 --- /dev/null +++ b/kubernetes/cert-manager/cluster-issuers.yaml @@ -0,0 +1,107 @@ +# cert-manager ClusterIssuers +# RFC 0039: ADR-Compliant Foundation Infrastructure +# +# Provides: +# - Let's Encrypt production issuer +# - Let's Encrypt staging issuer (for testing) +# - Self-signed issuer (for internal services) +--- +# Self-signed ClusterIssuer for internal certificates +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: selfsigned +spec: + selfSigned: {} +--- +# Internal CA for cluster services +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: alignment-ca + namespace: cert-manager +spec: + isCA: true + commonName: alignment-internal-ca + secretName: alignment-ca-secret + duration: 87600h # 10 years + renewBefore: 8760h # 1 year + privateKey: + algorithm: ECDSA + size: 256 + issuerRef: + name: selfsigned + kind: ClusterIssuer + group: cert-manager.io +--- +# Internal CA ClusterIssuer +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: alignment-ca +spec: + ca: + secretName: alignment-ca-secret +--- +# Let's Encrypt Staging (for testing) +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-staging +spec: + acme: + # 
Staging server URL + server: https://acme-staging-v02.api.letsencrypt.org/directory + email: ${ACME_EMAIL} + privateKeySecretRef: + name: letsencrypt-staging-account-key + solvers: + # HTTP-01 challenge solver using ingress + - http01: + ingress: + ingressClassName: nginx + # DNS-01 challenge solver using Route53 + - dns01: + route53: + region: us-east-1 + # Use IRSA for authentication + # Requires IAM role with Route53 permissions + selector: + dnsZones: + - "${DOMAIN}" +--- +# Let's Encrypt Production +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-production +spec: + acme: + # Production server URL + server: https://acme-v02.api.letsencrypt.org/directory + email: ${ACME_EMAIL} + privateKeySecretRef: + name: letsencrypt-production-account-key + solvers: + # HTTP-01 challenge solver using ingress + - http01: + ingress: + ingressClassName: nginx + # DNS-01 challenge solver using Route53 + - dns01: + route53: + region: us-east-1 + selector: + dnsZones: + - "${DOMAIN}" +--- +# ConfigMap for cert-manager configuration +apiVersion: v1 +kind: ConfigMap +metadata: + name: cert-manager-config + namespace: cert-manager +data: + # Replace these values during deployment + ACME_EMAIL: "admin@example.com" + DOMAIN: "example.com" diff --git a/kubernetes/cert-manager/helm-values.yaml b/kubernetes/cert-manager/helm-values.yaml new file mode 100644 index 0000000..e85f3e4 --- /dev/null +++ b/kubernetes/cert-manager/helm-values.yaml @@ -0,0 +1,102 @@ +# cert-manager Helm Values +# RFC 0039: ADR-Compliant Foundation Infrastructure +# +# Install with: +# helm repo add jetstack https://charts.jetstack.io +# helm install cert-manager jetstack/cert-manager \ +# --namespace cert-manager \ +# --create-namespace \ +# --values helm-values.yaml +--- +# Install CRDs +installCRDs: true + +# Replica count for HA +replicaCount: 2 + +# Resource requests and limits +resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 256Mi + +# 
Webhook configuration +webhook: + replicaCount: 2 + resources: + requests: + cpu: 25m + memory: 32Mi + limits: + cpu: 100m + memory: 128Mi + +# CA Injector configuration +cainjector: + replicaCount: 2 + resources: + requests: + cpu: 25m + memory: 64Mi + limits: + cpu: 100m + memory: 256Mi + +# Pod disruption budgets +podDisruptionBudget: + enabled: true + minAvailable: 1 + +# Prometheus metrics +prometheus: + enabled: true + servicemonitor: + enabled: true + namespace: monitoring + labels: + release: prometheus + +# Pod anti-affinity for HA +affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: cert-manager + topologyKey: kubernetes.io/hostname + +# Topology spread constraints +topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/name: cert-manager + +# Security context +securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + +containerSecurityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + +# DNS configuration for Route53 +dns01RecursiveNameservers: "8.8.8.8:53,1.1.1.1:53" +dns01RecursiveNameserversOnly: true + +# Global options +global: + leaderElection: + namespace: cert-manager diff --git a/kubernetes/cert-manager/kustomization.yaml b/kubernetes/cert-manager/kustomization.yaml new file mode 100644 index 0000000..a57a1cc --- /dev/null +++ b/kubernetes/cert-manager/kustomization.yaml @@ -0,0 +1,21 @@ +# cert-manager Kustomization +# RFC 0039: ADR-Compliant Foundation Infrastructure +# +# Deploy with: +# 1. helm install cert-manager jetstack/cert-manager -f helm-values.yaml +# 2. 
kubectl apply -k infra/kubernetes/cert-manager/ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: cert-manager + +resources: + - namespace.yaml + - cluster-issuers.yaml + - route53-role.yaml + +commonLabels: + app.kubernetes.io/managed-by: kustomize + app.kubernetes.io/part-of: alignment-foundation + rfc: "0039" diff --git a/kubernetes/cert-manager/namespace.yaml b/kubernetes/cert-manager/namespace.yaml new file mode 100644 index 0000000..9801c7e --- /dev/null +++ b/kubernetes/cert-manager/namespace.yaml @@ -0,0 +1,10 @@ +# cert-manager Namespace +# RFC 0039: ADR-Compliant Foundation Infrastructure +--- +apiVersion: v1 +kind: Namespace +metadata: + name: cert-manager + labels: + app.kubernetes.io/name: cert-manager + app.kubernetes.io/component: certificate-management diff --git a/kubernetes/cert-manager/route53-role.yaml b/kubernetes/cert-manager/route53-role.yaml new file mode 100644 index 0000000..946fd57 --- /dev/null +++ b/kubernetes/cert-manager/route53-role.yaml @@ -0,0 +1,40 @@ +# cert-manager Route53 IAM Role for IRSA +# RFC 0039: ADR-Compliant Foundation Infrastructure +# +# This ServiceAccount uses IRSA to allow cert-manager to manage +# Route53 DNS records for DNS-01 ACME challenges. 
+--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: cert-manager-route53 + namespace: cert-manager + annotations: + # Replace with actual IAM role ARN from Terraform + eks.amazonaws.com/role-arn: "${CERT_MANAGER_ROUTE53_ROLE_ARN}" + labels: + app.kubernetes.io/name: cert-manager + app.kubernetes.io/component: route53-solver +--- +# ClusterRole for cert-manager to use the ServiceAccount +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cert-manager-route53 +rules: + - apiGroups: [""] + resources: ["serviceaccounts/token"] + verbs: ["create"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cert-manager-route53 +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cert-manager-route53 +subjects: + - kind: ServiceAccount + name: cert-manager-route53 + namespace: cert-manager diff --git a/kubernetes/forgejo/cockroachdb-secret.yaml b/kubernetes/forgejo/cockroachdb-secret.yaml new file mode 100644 index 0000000..18703fa --- /dev/null +++ b/kubernetes/forgejo/cockroachdb-secret.yaml @@ -0,0 +1,17 @@ +# Forgejo CockroachDB Credentials +# RFC 0049: Service Database Unification +# ADR 0003: FIPS 140-2 compliant storage +apiVersion: v1 +kind: Secret +metadata: + name: forgejo-cockroachdb + namespace: forgejo + labels: + app.kubernetes.io/name: forgejo + app.kubernetes.io/part-of: core-services + alignment.dev/rfc: "0049" +type: Opaque +stringData: + username: forgejo_user + password: "forgejo-crdb-secure-2026" + database: forgejo_db diff --git a/kubernetes/forgejo/deployment.yaml b/kubernetes/forgejo/deployment.yaml new file mode 100644 index 0000000..c2b4f0d --- /dev/null +++ b/kubernetes/forgejo/deployment.yaml @@ -0,0 +1,263 @@ +# Forgejo Deployment +# RFC 0040: Self-Hosted Core Services +# RFC 0049: Service Database Unification - CockroachDB backend +# ADR 0004: Set It and Forget It - auto-scaling via HPA +# ADR 0005: Full-stack self-hosting +# +# Spike 2026-01-19: 
Forgejo v9 works with CockroachDB after setting +# sql.defaults.serial_normalization = 'sql_sequence' to fix XORM SERIAL handling. +# Requires extended startup probes for initial 230-table migration. +# +# Integrated with Keycloak SSO +apiVersion: apps/v1 +kind: Deployment +metadata: + name: forgejo + namespace: forgejo + labels: + app.kubernetes.io/name: forgejo + app.kubernetes.io/part-of: core-services +spec: + replicas: 1 # Single replica due to ReadWriteOnce storage; HA requires EFS or Redis queue + selector: + matchLabels: + app: forgejo + strategy: + type: Recreate # Required for single replica with RWO storage (LevelDB queue lock) + template: + metadata: + labels: + app: forgejo + app.kubernetes.io/name: forgejo + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "3000" + prometheus.io/path: "/metrics" + spec: + serviceAccountName: forgejo + terminationGracePeriodSeconds: 60 + + # Spread pods across AZs for HA + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + app: forgejo + topologyKey: topology.kubernetes.io/zone + + # Forgejo needs elevated permissions for initial setup (chown, su-exec) + # After initial setup, consider using a custom image with pre-set permissions + securityContext: + fsGroup: 1000 + runAsUser: 0 + + initContainers: + # Wait for CockroachDB to be ready + - name: wait-for-db + image: postgres:16-alpine + command: + - /bin/sh + - -c + - | + until pg_isready -h cockroachdb-public.cockroachdb.svc.cluster.local -p 26257; do + echo "Waiting for CockroachDB..." + sleep 5 + done + echo "CockroachDB is ready!" 
+ # Fix ssh directory permissions for git user + - name: fix-permissions + image: alpine:3.19 + command: + - /bin/sh + - -c + - | + # Ensure ssh directory exists with correct permissions + mkdir -p /data/git/.ssh + chown -R 1000:1000 /data/git + chmod 770 /data/git/.ssh + echo "Permissions fixed" + volumeMounts: + - name: data + mountPath: /data + + containers: + - name: forgejo + image: codeberg.org/forgejo/forgejo:9 + imagePullPolicy: IfNotPresent + + env: + # Database configuration (CockroachDB) + - name: FORGEJO__database__DB_TYPE + value: "postgres" + - name: FORGEJO__database__HOST + value: "cockroachdb-public.cockroachdb.svc.cluster.local:26257" + - name: FORGEJO__database__NAME + valueFrom: + secretKeyRef: + name: forgejo-cockroachdb + key: database + - name: FORGEJO__database__USER + valueFrom: + secretKeyRef: + name: forgejo-cockroachdb + key: username + - name: FORGEJO__database__PASSWD + valueFrom: + secretKeyRef: + name: forgejo-cockroachdb + key: password + - name: FORGEJO__database__SSL_MODE + value: "require" + - name: FORGEJO__database__CHARSET + value: "utf8" + + # Server configuration + - name: FORGEJO__server__DOMAIN + value: "git.beyondtheuniverse.superviber.com" + - name: FORGEJO__server__ROOT_URL + value: "https://git.beyondtheuniverse.superviber.com" + - name: FORGEJO__server__SSH_DOMAIN + value: "git.beyondtheuniverse.superviber.com" + - name: FORGEJO__server__SSH_PORT + value: "22" + - name: FORGEJO__server__LFS_START_SERVER + value: "true" + - name: FORGEJO__server__OFFLINE_MODE + value: "false" + + # Security settings + - name: FORGEJO__security__INSTALL_LOCK + value: "true" + - name: FORGEJO__security__SECRET_KEY + valueFrom: + secretKeyRef: + name: forgejo-secrets + key: secret-key + - name: FORGEJO__security__INTERNAL_TOKEN + valueFrom: + secretKeyRef: + name: forgejo-secrets + key: internal-token + + # OAuth2 configuration (Keycloak SSO) + - name: FORGEJO__oauth2__ENABLED + value: "true" + - name: FORGEJO__oauth2__JWT_SECRET + 
valueFrom: + secretKeyRef: + name: forgejo-oauth + key: jwt-secret + + # Session configuration (for HA) + - name: FORGEJO__session__PROVIDER + value: "db" + - name: FORGEJO__session__PROVIDER_CONFIG + value: "" + + # Cache configuration + - name: FORGEJO__cache__ENABLED + value: "true" + - name: FORGEJO__cache__ADAPTER + value: "memory" + + # Queue configuration + - name: FORGEJO__queue__TYPE + value: "level" + - name: FORGEJO__queue__DATADIR + value: "/data/queues" + + # Repository settings + - name: FORGEJO__repository__ROOT + value: "/data/git/repositories" + - name: FORGEJO__repository__DEFAULT_BRANCH + value: "main" + + # LFS settings + - name: FORGEJO__lfs__PATH + value: "/data/lfs" + + # Logging + - name: FORGEJO__log__MODE + value: "console" + - name: FORGEJO__log__LEVEL + value: "Info" + + # Metrics + - name: FORGEJO__metrics__ENABLED + value: "true" + - name: FORGEJO__metrics__TOKEN + valueFrom: + secretKeyRef: + name: forgejo-secrets + key: metrics-token + + # Service settings + - name: FORGEJO__service__DISABLE_REGISTRATION + value: "true" + - name: FORGEJO__service__REQUIRE_SIGNIN_VIEW + value: "false" + + ports: + - name: http + containerPort: 3000 + protocol: TCP + - name: ssh + containerPort: 22 + protocol: TCP + + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: 1000m + memory: 1Gi + + # Forgejo needs CHOWN, SETGID, SETUID capabilities for initial setup + securityContext: + capabilities: + add: + - CHOWN + - SETGID + - SETUID + - FOWNER + + volumeMounts: + - name: data + mountPath: /data + + # Health probes - extended for CockroachDB migration time + startupProbe: + httpGet: + path: /api/healthz + port: 3000 + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 60 # Allow up to 10 minutes for initial migration + + readinessProbe: + httpGet: + path: /api/healthz + port: 3000 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + + livenessProbe: + httpGet: + path: 
/api/healthz + port: 3000 + initialDelaySeconds: 10 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 3 + + volumes: + - name: data + persistentVolumeClaim: + claimName: forgejo-data diff --git a/kubernetes/forgejo/hpa.yaml b/kubernetes/forgejo/hpa.yaml new file mode 100644 index 0000000..311e2af --- /dev/null +++ b/kubernetes/forgejo/hpa.yaml @@ -0,0 +1,47 @@ +# Forgejo Horizontal Pod Autoscaler +# RFC 0040: Self-Hosted Core Services +# ADR 0004: Set It and Forget It - auto-scaling required +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: forgejo + namespace: forgejo + labels: + app.kubernetes.io/name: forgejo + app.kubernetes.io/part-of: core-services +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: forgejo + minReplicas: 2 + maxReplicas: 10 + metrics: + # Scale on CPU utilization + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + # Scale on memory utilization + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 + behavior: + scaleUp: + stabilizationWindowSeconds: 60 + policies: + - type: Pods + value: 2 + periodSeconds: 60 + scaleDown: + # Conservative scale-down + stabilizationWindowSeconds: 300 + policies: + - type: Pods + value: 1 + periodSeconds: 120 diff --git a/kubernetes/forgejo/kustomization.yaml b/kubernetes/forgejo/kustomization.yaml new file mode 100644 index 0000000..2475f97 --- /dev/null +++ b/kubernetes/forgejo/kustomization.yaml @@ -0,0 +1,21 @@ +# Forgejo Kustomization +# RFC 0040: Self-Hosted Core Services +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: forgejo + +resources: + - namespace.yaml + - serviceaccount.yaml + - secrets.yaml + - oauth-provider-configmap.yaml + - pvc.yaml + - deployment.yaml + - service.yaml + - hpa.yaml + - pdb.yaml + +commonLabels: + app.kubernetes.io/managed-by: alignment + rfc: "0040" diff --git 
a/kubernetes/forgejo/namespace.yaml b/kubernetes/forgejo/namespace.yaml new file mode 100644 index 0000000..247b6e0 --- /dev/null +++ b/kubernetes/forgejo/namespace.yaml @@ -0,0 +1,9 @@ +# Forgejo namespace +# RFC 0040: Self-Hosted Core Services +apiVersion: v1 +kind: Namespace +metadata: + name: forgejo + labels: + app.kubernetes.io/name: forgejo + app.kubernetes.io/part-of: core-services diff --git a/kubernetes/forgejo/oauth-provider-configmap.yaml b/kubernetes/forgejo/oauth-provider-configmap.yaml new file mode 100644 index 0000000..b9d4265 --- /dev/null +++ b/kubernetes/forgejo/oauth-provider-configmap.yaml @@ -0,0 +1,77 @@ +# Forgejo OAuth2 Provider Configuration +# RFC 0040: Self-Hosted Core Services +# +# This ConfigMap contains the Keycloak OAuth2 provider configuration +# Applied via Forgejo's app.ini or admin UI +apiVersion: v1 +kind: ConfigMap +metadata: + name: forgejo-oauth-provider + namespace: forgejo + labels: + app.kubernetes.io/name: forgejo + app.kubernetes.io/part-of: core-services +data: + # Instructions for configuring Keycloak SSO in Forgejo + # This can be done via: + # 1. Admin UI: Site Administration -> Authentication Sources -> Add New Source + # 2. API: POST /api/v1/admin/auths + # 3. Database seed script + oauth-provider.md: | + # Keycloak SSO Configuration for Forgejo + + ## Via Admin UI + + 1. Navigate to Site Administration -> Authentication Sources + 2. Click "Add Authentication Source" + 3. Select "OAuth2" as the type + 4. 
Fill in the following: + - Authentication Name: keycloak + - OAuth2 Provider: OpenID Connect + - Client ID: forgejo + - Client Secret: (from Keycloak) + - OpenID Connect Auto Discovery URL: https://auth.beyondtheuniverse.superviber.com/realms/alignment/.well-known/openid-configuration + - Additional Scopes: groups + - Required Claim Name: (leave empty or set to "groups") + - Required Claim Value: (leave empty) + - Group Claim Name: groups + - Admin Group: /admins + - Restricted Group: (leave empty) + + ## Via API + + ```bash + curl -X POST "https://git.beyondtheuniverse.superviber.com/api/v1/admin/auths" \ + -H "Authorization: token ${ADMIN_TOKEN}" \ + -H "Content-Type: application/json" \ + -d '{ + "type": 6, + "name": "keycloak", + "is_active": true, + "is_sync_enabled": true, + "cfg": { + "Provider": "openidConnect", + "ClientID": "forgejo", + "ClientSecret": "${KEYCLOAK_CLIENT_SECRET}", + "OpenIDConnectAutoDiscoveryURL": "https://auth.beyondtheuniverse.superviber.com/realms/alignment/.well-known/openid-configuration", + "Scopes": "openid profile email groups", + "GroupClaimName": "groups", + "AdminGroup": "/admins" + } + }' + ``` + + # OAuth2 provider configuration (for reference/automation) + oauth-config.json: | + { + "name": "keycloak", + "provider": "openidConnect", + "clientId": "forgejo", + "openIdConnectAutoDiscoveryUrl": "https://auth.beyondtheuniverse.superviber.com/realms/alignment/.well-known/openid-configuration", + "scopes": ["openid", "profile", "email", "groups"], + "groupClaimName": "groups", + "adminGroup": "/admins", + "restrictedGroup": "", + "skipLocalTwoFA": false, + "iconUrl": "https://auth.beyondtheuniverse.superviber.com/resources/logo.png" + } diff --git a/kubernetes/forgejo/pdb.yaml b/kubernetes/forgejo/pdb.yaml new file mode 100644 index 0000000..03a52f4 --- /dev/null +++ b/kubernetes/forgejo/pdb.yaml @@ -0,0 +1,15 @@ +# Forgejo PodDisruptionBudget +# RFC 0040: Self-Hosted Core Services +apiVersion: policy/v1 +kind: 
PodDisruptionBudget +metadata: + name: forgejo + namespace: forgejo + labels: + app.kubernetes.io/name: forgejo + app.kubernetes.io/part-of: core-services +spec: + minAvailable: 1 + selector: + matchLabels: + app: forgejo diff --git a/kubernetes/forgejo/pvc.yaml b/kubernetes/forgejo/pvc.yaml new file mode 100644 index 0000000..a343a7f --- /dev/null +++ b/kubernetes/forgejo/pvc.yaml @@ -0,0 +1,37 @@ +# Forgejo Persistent Volume Claim +# RFC 0040: Self-Hosted Core Services +# +# NOTE: For HA setup with multiple replicas, consider: +# 1. Using EFS (AWS) or similar shared filesystem for /data/git +# 2. Using S3 for LFS storage +# 3. This PVC works for single-replica or ReadWriteMany storage classes +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: forgejo-data + namespace: forgejo + labels: + app.kubernetes.io/name: forgejo + app.kubernetes.io/part-of: core-services +spec: + accessModes: + - ReadWriteMany + storageClassName: efs-sc + resources: + requests: + storage: 100Gi +--- +# Alternative: Use GP3 with single replica +# Uncomment this and comment above for single-replica setup +# apiVersion: v1 +# kind: PersistentVolumeClaim +# metadata: +# name: forgejo-data +# namespace: forgejo +# spec: +# accessModes: +# - ReadWriteOnce +# storageClassName: gp3-encrypted +# resources: +# requests: +# storage: 100Gi diff --git a/kubernetes/forgejo/secrets.yaml.template b/kubernetes/forgejo/secrets.yaml.template new file mode 100644 index 0000000..cae58a8 --- /dev/null +++ b/kubernetes/forgejo/secrets.yaml.template @@ -0,0 +1,57 @@ +# Forgejo Secrets Template +# RFC 0040: Self-Hosted Core Services +# +# NOTE: This is a template. In production, secrets should be created via: +# 1. External Secrets Operator +# 2. Sealed Secrets +# 3. Manual kubectl create secret +# +# DO NOT commit actual secret values to git! 
+--- +# Database credentials for CockroachDB connection +apiVersion: v1 +kind: Secret +metadata: + name: forgejo-db + namespace: forgejo + labels: + app.kubernetes.io/name: forgejo + app.kubernetes.io/part-of: core-services +type: Opaque +stringData: + username: "forgejo" + password: "REPLACE_WITH_ACTUAL_PASSWORD" +--- +# Application secrets +apiVersion: v1 +kind: Secret +metadata: + name: forgejo-secrets + namespace: forgejo + labels: + app.kubernetes.io/name: forgejo + app.kubernetes.io/part-of: core-services +type: Opaque +stringData: + # Generate with: openssl rand -hex 32 + secret-key: "REPLACE_WITH_RANDOM_64_CHAR_HEX" + # Generate with: forgejo generate secret INTERNAL_TOKEN + internal-token: "REPLACE_WITH_INTERNAL_TOKEN" + # Token for metrics endpoint access + metrics-token: "REPLACE_WITH_METRICS_TOKEN" +--- +# OAuth2 secrets for Keycloak SSO +apiVersion: v1 +kind: Secret +metadata: + name: forgejo-oauth + namespace: forgejo + labels: + app.kubernetes.io/name: forgejo + app.kubernetes.io/part-of: core-services +type: Opaque +stringData: + # Generate with: openssl rand -hex 32 + jwt-secret: "REPLACE_WITH_RANDOM_64_CHAR_HEX" + # Keycloak client secret (from Keycloak admin console) + keycloak-client-secret: "REPLACE_WITH_KEYCLOAK_CLIENT_SECRET" diff --git a/kubernetes/forgejo/service.yaml b/kubernetes/forgejo/service.yaml new file mode 100644 index 0000000..1b5a033 --- /dev/null +++ b/kubernetes/forgejo/service.yaml @@ -0,0 +1,42 @@ +# Forgejo Services +# RFC 0040: Self-Hosted Core Services +apiVersion: v1 +kind: Service +metadata: + name: forgejo + namespace: forgejo + labels: + app.kubernetes.io/name: forgejo + app.kubernetes.io/part-of: core-services +spec: + type: ClusterIP + selector: + app: forgejo + ports: + - name: http + port: 3000 + targetPort: 3000 + protocol: TCP +--- +# SSH service (exposed via LoadBalancer or NodePort) +apiVersion: v1 +kind: Service +metadata: + name: forgejo-ssh + namespace: forgejo + labels: + app.kubernetes.io/name: forgejo + 
app.kubernetes.io/part-of: core-services + annotations: + # For AWS NLB + service.beta.kubernetes.io/aws-load-balancer-type: "nlb" + service.beta.kubernetes.io/aws-load-balancer-scheme: "internet-facing" +spec: + type: LoadBalancer + selector: + app: forgejo + ports: + - name: ssh + port: 22 + targetPort: 22 + protocol: TCP diff --git a/kubernetes/forgejo/serviceaccount.yaml b/kubernetes/forgejo/serviceaccount.yaml new file mode 100644 index 0000000..89d6bf0 --- /dev/null +++ b/kubernetes/forgejo/serviceaccount.yaml @@ -0,0 +1,10 @@ +# Forgejo ServiceAccount +# RFC 0040: Self-Hosted Core Services +apiVersion: v1 +kind: ServiceAccount +metadata: + name: forgejo + namespace: forgejo + labels: + app.kubernetes.io/name: forgejo + app.kubernetes.io/part-of: core-services diff --git a/kubernetes/ingress/core-services.yaml b/kubernetes/ingress/core-services.yaml new file mode 100644 index 0000000..3db70e3 --- /dev/null +++ b/kubernetes/ingress/core-services.yaml @@ -0,0 +1,105 @@ +# Core Services Ingress +# RFC 0040: Self-Hosted Core Services +# +# Routes traffic to Vault, Keycloak, and Forgejo via AWS ALB +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: core-services + namespace: ingress + labels: + app.kubernetes.io/name: core-services-ingress + app.kubernetes.io/part-of: core-services + annotations: + # AWS ALB Ingress Controller + kubernetes.io/ingress.class: alb + alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/target-type: ip + alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]' + alb.ingress.kubernetes.io/ssl-policy: ELBSecurityPolicy-TLS13-1-2-2021-06 + alb.ingress.kubernetes.io/certificate-arn: ${ACM_CERT_ARN} + alb.ingress.kubernetes.io/ssl-redirect: "443" + # Health check settings + alb.ingress.kubernetes.io/healthcheck-path: /health + alb.ingress.kubernetes.io/healthcheck-interval-seconds: "15" + alb.ingress.kubernetes.io/healthcheck-timeout-seconds: "5" + alb.ingress.kubernetes.io/healthy-threshold-count: 
"2" + alb.ingress.kubernetes.io/unhealthy-threshold-count: "3" + # WAF integration (optional) + # alb.ingress.kubernetes.io/wafv2-acl-arn: ${WAF_ACL_ARN} + # Access logs + alb.ingress.kubernetes.io/load-balancer-attributes: >- + access_logs.s3.enabled=true, + access_logs.s3.bucket=alignment-alb-logs, + access_logs.s3.prefix=core-services +spec: + ingressClassName: alb + rules: + # Vault + - host: vault.beyondtheuniverse.superviber.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: vault + port: + number: 8200 + # Keycloak + - host: auth.beyondtheuniverse.superviber.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: keycloak + port: + number: 8080 + # Forgejo + - host: git.beyondtheuniverse.superviber.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: forgejo + port: + number: 3000 +--- +# Cross-namespace service references +# These ExternalName services allow the ingress namespace to route to other namespaces +apiVersion: v1 +kind: Service +metadata: + name: vault + namespace: ingress +spec: + type: ExternalName + externalName: vault.vault.svc.cluster.local + ports: + - port: 8200 +--- +apiVersion: v1 +kind: Service +metadata: + name: keycloak + namespace: ingress +spec: + type: ExternalName + externalName: keycloak.keycloak.svc.cluster.local + ports: + - port: 8080 +--- +apiVersion: v1 +kind: Service +metadata: + name: forgejo + namespace: ingress +spec: + type: ExternalName + externalName: forgejo.forgejo.svc.cluster.local + ports: + - port: 3000 diff --git a/kubernetes/ingress/kustomization.yaml b/kubernetes/ingress/kustomization.yaml new file mode 100644 index 0000000..72c9616 --- /dev/null +++ b/kubernetes/ingress/kustomization.yaml @@ -0,0 +1,14 @@ +# Ingress Kustomization +# RFC 0040: Self-Hosted Core Services +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: ingress + +resources: + - namespace.yaml + - core-services.yaml + 
+commonLabels: + app.kubernetes.io/managed-by: alignment + rfc: "0040" diff --git a/kubernetes/ingress/namespace.yaml b/kubernetes/ingress/namespace.yaml new file mode 100644 index 0000000..348f3ae --- /dev/null +++ b/kubernetes/ingress/namespace.yaml @@ -0,0 +1,9 @@ +# Ingress namespace +# RFC 0040: Self-Hosted Core Services +apiVersion: v1 +kind: Namespace +metadata: + name: ingress + labels: + app.kubernetes.io/name: ingress + app.kubernetes.io/part-of: core-services diff --git a/kubernetes/karpenter/ec2nodeclass.yaml b/kubernetes/karpenter/ec2nodeclass.yaml new file mode 100644 index 0000000..be24cd2 --- /dev/null +++ b/kubernetes/karpenter/ec2nodeclass.yaml @@ -0,0 +1,81 @@ +# Karpenter EC2NodeClass Configuration +# RFC 0039: ADR-Compliant Foundation Infrastructure +# +# Defines how Karpenter provisions EC2 instances +--- +apiVersion: karpenter.k8s.aws/v1beta1 +kind: EC2NodeClass +metadata: + name: default +spec: + # Amazon Linux 2 AMI family + amiFamily: AL2 + + # Subnet selection - private subnets only + subnetSelectorTerms: + - tags: + karpenter.sh/discovery: "true" + + # Security group selection + securityGroupSelectorTerms: + - tags: + karpenter.sh/discovery: "true" + + # IAM role for nodes + role: "alignment-production-node" + + # Instance store policy for NVMe instances + instanceStorePolicy: RAID0 + + # Block device mappings + blockDeviceMappings: + - deviceName: /dev/xvda + ebs: + volumeSize: 100Gi + volumeType: gp3 + iops: 3000 + throughput: 125 + encrypted: true + deleteOnTermination: true + + # User data for node initialization + userData: | + #!/bin/bash + set -e + + # Enable FIPS mode (ADR 0003) + # Note: Full FIPS requires FIPS-validated AMI + # This is a placeholder for production FIPS configuration + + # Configure kubelet for optimal performance + cat >> /etc/kubernetes/kubelet/config.json.patch <] +# +# Prerequisites: +# - AWS CLI configured with appropriate credentials +# - Terraform >= 1.6.0 +# - kubectl +# - Helm 3.x +# +set -euo pipefail 
+ +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +# Flags +DRY_RUN=false +SKIP_VALIDATION=false +START_PHASE=1 + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --dry-run) + DRY_RUN=true + shift + ;; + --skip-validation) + SKIP_VALIDATION=true + shift + ;; + --start-phase) + START_PHASE="$2" + shift 2 + ;; + -h|--help) + echo "Usage: $0 [--dry-run] [--skip-validation] [--start-phase <1-5>]" + echo "" + echo "Options:" + echo " --dry-run Show what would be done without making changes" + echo " --skip-validation Skip validation between phases (not recommended)" + echo " --start-phase Start from phase N (for resuming failed deployments)" + exit 0 + ;; + *) + echo -e "${RED}Unknown option: $1${NC}" + exit 1 + ;; + esac +done + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +run_phase() { + local phase="$1" + local script="$2" + local validation="$3" + + echo "" + echo "========================================" + echo "Starting Phase $phase" + echo "========================================" + echo "" + + # Run deployment + local args=() + if [ "$DRY_RUN" = true ]; then + args+=("--dry-run") + fi + + if [ -x "$script" ]; then + "$script" "${args[@]}" + else + log_error "Deployment script not found or not executable: $script" + exit 1 + fi + + # Run validation + if [ "$SKIP_VALIDATION" = false ] && [ -x "$validation" ]; then + echo "" + log_info "Running validation for Phase $phase..." + if ! "$validation"; then + log_error "Phase $phase validation failed" + echo "" + echo "Options:" + echo " 1. Fix the issues and re-run: $0 --start-phase $phase" + echo " 2. Skip validation: $0 --skip-validation --start-phase $phase" + echo " 3. 
Rollback: ./rollback.sh --phase $phase" + exit 1 + fi + fi + + # Tag successful deployment + if [ "$DRY_RUN" = false ]; then + log_info "Tagging Phase $phase deployment..." + local tag_name="v0.${phase}.0-phase${phase}" + local tag_message="Phase ${phase}: $(get_phase_name "$phase")" + git tag -a "$tag_name" -m "$tag_message" 2>/dev/null || log_warn "Tag $tag_name already exists" + fi + + log_success "Phase $phase complete" +} + +get_phase_name() { + case "$1" in + 1) echo "Foundation Infrastructure" ;; + 2) echo "Core Services" ;; + 3) echo "DNS and Email" ;; + 4) echo "Observability" ;; + 5) echo "E2EE Webmail" ;; + *) echo "Unknown" ;; + esac +} + +main() { + echo "========================================" + echo "Full Infrastructure Deployment" + echo "RFC 0044: Infrastructure Rollout Guide" + echo "========================================" + echo "" + echo "This script will deploy all infrastructure phases in order:" + echo " Phase 1: Foundation Infrastructure (RFC 0039)" + echo " Phase 2: Core Services (RFC 0040)" + echo " Phase 3: DNS and Email (RFC 0041)" + echo " Phase 4: Observability (RFC 0042)" + echo " Phase 5: E2EE Webmail (RFC 0043) [optional]" + echo "" + + if [ "$DRY_RUN" = true ]; then + log_warn "Running in DRY-RUN mode - no changes will be made" + echo "" + fi + + if [ "$START_PHASE" -gt 1 ]; then + log_info "Starting from Phase $START_PHASE" + echo "" + fi + + # Track start time + local start_time + start_time=$(date +%s) + + # Phase 1: Foundation + if [ "$START_PHASE" -le 1 ]; then + run_phase 1 "$SCRIPT_DIR/deploy-phase1-foundation.sh" "$SCRIPT_DIR/validate-phase1.sh" + fi + + # Phase 2: Core Services + if [ "$START_PHASE" -le 2 ]; then + run_phase 2 "$SCRIPT_DIR/deploy-phase2-core-services.sh" "$SCRIPT_DIR/validate-phase2.sh" + fi + + # Phase 3: DNS and Email + if [ "$START_PHASE" -le 3 ]; then + run_phase 3 "$SCRIPT_DIR/deploy-phase3-dns-email.sh" "$SCRIPT_DIR/validate-phase3.sh" + fi + + # Phase 4: Observability + if [ "$START_PHASE" -le 4 
]; then + run_phase 4 "$SCRIPT_DIR/deploy-phase4-observability.sh" "$SCRIPT_DIR/validate-phase4.sh" + fi + + # Phase 5: E2EE Webmail (optional) + if [ "$START_PHASE" -le 5 ]; then + echo "" + log_info "Phase 5 (E2EE Webmail) is optional." + read -p "Deploy Phase 5? (yes/no): " deploy_phase5 + if [ "$deploy_phase5" = "yes" ]; then + run_phase 5 "$SCRIPT_DIR/deploy-phase5-e2ee-webmail.sh" "$SCRIPT_DIR/validate-phase5.sh" + else + log_info "Skipping Phase 5" + fi + fi + + # Calculate duration + local end_time + end_time=$(date +%s) + local duration=$((end_time - start_time)) + local minutes=$((duration / 60)) + local seconds=$((duration % 60)) + + # Final release tag + if [ "$DRY_RUN" = false ]; then + log_info "Creating final release tag..." + git tag -a "v1.0.0" -m "Initial Release: Full Infrastructure Stack" 2>/dev/null || log_warn "Tag v1.0.0 already exists" + fi + + echo "" + echo "========================================" + log_success "Full Infrastructure Deployment Complete!" + echo "========================================" + echo "" + echo "Deployment Summary:" + echo " Duration: ${minutes}m ${seconds}s" + echo " Phases deployed: $(($START_PHASE > 1 ? 
6 - $START_PHASE : 5))" + echo "" + echo "Post-Deployment Checklist:" + echo " [ ] All Prometheus targets UP" + echo " [ ] No firing alerts in Alertmanager" + echo " [ ] Grafana dashboards showing data" + echo " [ ] CockroachDB cluster healthy (3 nodes)" + echo " [ ] Vault unsealed and HA" + echo " [ ] Keycloak SSO working for all services" + echo " [ ] DNS resolving correctly" + echo " [ ] Email sending/receiving works" + echo " [ ] Backups running" + echo "" + echo "Documentation:" + echo " - Runbooks: $SCRIPT_DIR/../runbooks/" + echo " - RFC 0044: Infrastructure Rollout Guide" +} + +main "$@" diff --git a/scripts/deploy-phase1-foundation.sh b/scripts/deploy-phase1-foundation.sh new file mode 100755 index 0000000..597b02b --- /dev/null +++ b/scripts/deploy-phase1-foundation.sh @@ -0,0 +1,298 @@ +#!/usr/bin/env bash +# +# Phase 1: Foundation Infrastructure Deployment (RFC 0039) +# +# This script deploys the foundation infrastructure including: +# - AWS resources via Terraform (VPC, EKS, S3, IAM) +# - cert-manager for TLS certificate management +# - CockroachDB cluster for distributed database +# +# Usage: +# ./deploy-phase1-foundation.sh [--dry-run] [--skip-terraform] [--skip-cert-manager] [--skip-cockroachdb] +# +# Prerequisites: +# - AWS CLI configured with appropriate credentials +# - Terraform >= 1.6.0 +# - kubectl configured for EKS cluster +# - Helm 3.x installed +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +INFRA_DIR="$(dirname "$SCRIPT_DIR")" +TERRAFORM_DIR="$INFRA_DIR/terraform" +K8S_DIR="$INFRA_DIR/kubernetes" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Flags +DRY_RUN=false +SKIP_TERRAFORM=false +SKIP_CERT_MANAGER=false +SKIP_COCKROACHDB=false + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --dry-run) + DRY_RUN=true + shift + ;; + --skip-terraform) + SKIP_TERRAFORM=true + shift + ;; + --skip-cert-manager) + 
SKIP_CERT_MANAGER=true + shift + ;; + --skip-cockroachdb) + SKIP_COCKROACHDB=true + shift + ;; + -h|--help) + echo "Usage: $0 [--dry-run] [--skip-terraform] [--skip-cert-manager] [--skip-cockroachdb]" + exit 0 + ;; + *) + echo -e "${RED}Unknown option: $1${NC}" + exit 1 + ;; + esac +done + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +run_cmd() { + if [ "$DRY_RUN" = true ]; then + echo -e "${YELLOW}[DRY-RUN]${NC} Would run: $*" + else + "$@" + fi +} + +check_prerequisites() { + log_info "Checking prerequisites..." + + local missing=() + + if ! command -v aws &> /dev/null; then + missing+=("aws") + fi + + if ! command -v terraform &> /dev/null; then + missing+=("terraform") + fi + + if ! command -v kubectl &> /dev/null; then + missing+=("kubectl") + fi + + if ! command -v helm &> /dev/null; then + missing+=("helm") + fi + + if [ ${#missing[@]} -ne 0 ]; then + log_error "Missing required tools: ${missing[*]}" + exit 1 + fi + + # Check AWS credentials + if ! aws sts get-caller-identity &> /dev/null; then + log_error "AWS credentials not configured or expired" + exit 1 + fi + + log_success "All prerequisites met" +} + +deploy_terraform() { + if [ "$SKIP_TERRAFORM" = true ]; then + log_warn "Skipping Terraform deployment" + return + fi + + log_info "Deploying AWS resources via Terraform..." 
+ + cd "$TERRAFORM_DIR" + + run_cmd terraform init -upgrade + + if [ "$DRY_RUN" = true ]; then + run_cmd terraform plan + else + terraform plan -out=plan.tfplan + terraform apply plan.tfplan + rm -f plan.tfplan + fi + + log_success "Terraform deployment complete" + + # Export outputs for downstream use + if [ "$DRY_RUN" = false ]; then + export CLUSTER_NAME=$(terraform output -raw cluster_name 2>/dev/null || echo "coherence-production") + export CLUSTER_ENDPOINT=$(terraform output -raw cluster_endpoint 2>/dev/null || echo "") + export AWS_REGION=$(terraform output -raw aws_region 2>/dev/null || echo "us-east-1") + fi +} + +configure_kubectl() { + log_info "Configuring kubectl for EKS cluster..." + + local cluster_name="${CLUSTER_NAME:-coherence-production}" + local region="${AWS_REGION:-us-east-1}" + + run_cmd aws eks update-kubeconfig --region "$region" --name "$cluster_name" + + # Verify connectivity + if [ "$DRY_RUN" = false ]; then + if kubectl cluster-info &> /dev/null; then + log_success "kubectl configured and connected to cluster" + else + log_error "Failed to connect to EKS cluster" + exit 1 + fi + fi +} + +deploy_cert_manager() { + if [ "$SKIP_CERT_MANAGER" = true ]; then + log_warn "Skipping cert-manager deployment" + return + fi + + log_info "Deploying cert-manager..." + + # Install cert-manager CRDs + log_info "Installing cert-manager CRDs..." + run_cmd kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.0/cert-manager.crds.yaml + + # Add Jetstack Helm repository + run_cmd helm repo add jetstack https://charts.jetstack.io 2>/dev/null || true + run_cmd helm repo update + + # Install cert-manager + log_info "Installing cert-manager via Helm..." 
+ run_cmd helm upgrade --install cert-manager jetstack/cert-manager \ + --namespace cert-manager --create-namespace \ + -f "$K8S_DIR/cert-manager/helm-values.yaml" \ + --wait --timeout 300s + + # Wait for cert-manager to be ready + if [ "$DRY_RUN" = false ]; then + log_info "Waiting for cert-manager pods to be ready..." + kubectl -n cert-manager wait --for=condition=ready pod -l app.kubernetes.io/instance=cert-manager --timeout=300s + fi + + # Apply ClusterIssuers + log_info "Applying ClusterIssuers..." + run_cmd kubectl apply -k "$K8S_DIR/cert-manager/" + + log_success "cert-manager deployment complete" +} + +deploy_cockroachdb() { + if [ "$SKIP_COCKROACHDB" = true ]; then + log_warn "Skipping CockroachDB deployment" + return + fi + + log_info "Deploying CockroachDB cluster..." + + # Deploy CockroachDB StatefulSet + run_cmd kubectl apply -k "$K8S_DIR/cockroachdb/" + + # Wait for CockroachDB pods + if [ "$DRY_RUN" = false ]; then + log_info "Waiting for CockroachDB pods to be ready (this may take several minutes)..." + kubectl -n cockroachdb wait --for=condition=ready pod -l app=cockroachdb --timeout=600s + fi + + # Initialize cluster (only needed on first deployment) + log_info "Initializing CockroachDB cluster..." + run_cmd kubectl apply -f "$K8S_DIR/cockroachdb/cluster-init.yaml" + + # Wait for init job to complete + if [ "$DRY_RUN" = false ]; then + log_info "Waiting for cluster initialization..." + kubectl -n cockroachdb wait --for=condition=complete job/cockroachdb-init --timeout=120s || true + fi + + # Initialize schemas + log_info "Initializing database schemas..." + run_cmd kubectl apply -f "$K8S_DIR/cockroachdb/schema-init-job.yaml" + + # Wait for schema init + if [ "$DRY_RUN" = false ]; then + log_info "Waiting for schema initialization..." + kubectl -n cockroachdb wait --for=condition=complete job/schema-init --timeout=300s || true + fi + + log_success "CockroachDB deployment complete" +} + +validate_phase1() { + log_info "Running Phase 1 validation..." 
+ + local validation_script="$SCRIPT_DIR/validate-phase1.sh" + if [ -x "$validation_script" ]; then + if [ "$DRY_RUN" = true ]; then + log_info "Would run validation script: $validation_script" + else + "$validation_script" + fi + else + log_warn "Validation script not found or not executable: $validation_script" + fi +} + +main() { + echo "========================================" + echo "Phase 1: Foundation Infrastructure" + echo "RFC 0039 Deployment" + echo "========================================" + echo "" + + if [ "$DRY_RUN" = true ]; then + log_warn "Running in DRY-RUN mode - no changes will be made" + echo "" + fi + + check_prerequisites + deploy_terraform + configure_kubectl + deploy_cert_manager + deploy_cockroachdb + validate_phase1 + + echo "" + echo "========================================" + log_success "Phase 1 deployment complete!" + echo "========================================" + echo "" + echo "Next steps:" + echo " 1. Run validate-phase1.sh to verify deployment" + echo " 2. Tag this deployment: git tag -a v0.1.0-phase1 -m 'Phase 1: Foundation Infrastructure'" + echo " 3. 
Proceed to Phase 2: ./deploy-phase2-core-services.sh" +} + +main "$@" diff --git a/scripts/deploy-phase2-core-services.sh b/scripts/deploy-phase2-core-services.sh new file mode 100755 index 0000000..792ea06 --- /dev/null +++ b/scripts/deploy-phase2-core-services.sh @@ -0,0 +1,259 @@ +#!/usr/bin/env bash +# +# Phase 2: Core Services Deployment (RFC 0040) +# +# This script deploys core services including: +# - HashiCorp Vault for secrets management +# - Keycloak for identity and SSO +# - Forgejo for Git hosting +# +# Usage: +# ./deploy-phase2-core-services.sh [--dry-run] [--skip-vault] [--skip-keycloak] [--skip-forgejo] +# +# Prerequisites: +# - Phase 1 (Foundation) must be deployed and validated +# - kubectl configured for EKS cluster +# - Helm 3.x installed +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +INFRA_DIR="$(dirname "$SCRIPT_DIR")" +K8S_DIR="$INFRA_DIR/kubernetes" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +# Flags +DRY_RUN=false +SKIP_VAULT=false +SKIP_KEYCLOAK=false +SKIP_FORGEJO=false + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --dry-run) + DRY_RUN=true + shift + ;; + --skip-vault) + SKIP_VAULT=true + shift + ;; + --skip-keycloak) + SKIP_KEYCLOAK=true + shift + ;; + --skip-forgejo) + SKIP_FORGEJO=true + shift + ;; + -h|--help) + echo "Usage: $0 [--dry-run] [--skip-vault] [--skip-keycloak] [--skip-forgejo]" + exit 0 + ;; + *) + echo -e "${RED}Unknown option: $1${NC}" + exit 1 + ;; + esac +done + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +run_cmd() { + if [ "$DRY_RUN" = true ]; then + echo -e "${YELLOW}[DRY-RUN]${NC} Would run: $*" + else + "$@" + fi +} + +check_prerequisites() { + log_info "Checking prerequisites..." 
+ + # Check kubectl connectivity + if ! kubectl cluster-info &> /dev/null; then + log_error "kubectl not connected to cluster. Run Phase 1 first." + exit 1 + fi + + # Check cert-manager is running + if ! kubectl -n cert-manager get pods -l app.kubernetes.io/instance=cert-manager -o jsonpath='{.items[0].status.phase}' 2>/dev/null | grep -q Running; then + log_error "cert-manager not running. Ensure Phase 1 is complete." + exit 1 + fi + + # Check CockroachDB is running + if ! kubectl -n cockroachdb get pods -l app=cockroachdb -o jsonpath='{.items[0].status.phase}' 2>/dev/null | grep -q Running; then + log_error "CockroachDB not running. Ensure Phase 1 is complete." + exit 1 + fi + + log_success "All prerequisites met" +} + +deploy_vault() { + if [ "$SKIP_VAULT" = true ]; then + log_warn "Skipping Vault deployment" + return + fi + + log_info "Deploying HashiCorp Vault..." + + # Deploy Vault + run_cmd kubectl apply -k "$K8S_DIR/vault/" + + # Wait for Vault pods + if [ "$DRY_RUN" = false ]; then + log_info "Waiting for Vault pods to be ready..." + kubectl -n vault wait --for=condition=ready pod -l app=vault --timeout=300s || true + fi + + # Check if Vault needs initialization + if [ "$DRY_RUN" = false ]; then + local vault_status + vault_status=$(kubectl -n vault exec vault-0 -- vault status -format=json 2>/dev/null || echo '{}') + local initialized + initialized=$(echo "$vault_status" | jq -r '.initialized // false') + + if [ "$initialized" = "false" ]; then + log_warn "Vault needs initialization!" + echo "" + echo "Run the following commands to initialize Vault:" + echo " kubectl -n vault exec -it vault-0 -- vault operator init" + echo "" + echo "IMPORTANT: Save the unseal keys and root token securely!" 
+ echo "" + echo "Then unseal each Vault pod:" + echo " kubectl -n vault exec -it vault-0 -- vault operator unseal " + echo " kubectl -n vault exec -it vault-0 -- vault operator unseal " + echo " kubectl -n vault exec -it vault-0 -- vault operator unseal " + echo "" + read -p "Press Enter after Vault is initialized and unsealed..." + fi + + # Verify Vault is unsealed + vault_status=$(kubectl -n vault exec vault-0 -- vault status -format=json 2>/dev/null || echo '{}') + local sealed + sealed=$(echo "$vault_status" | jq -r '.sealed // true') + if [ "$sealed" = "true" ]; then + log_error "Vault is still sealed. Please unseal before continuing." + exit 1 + fi + fi + + log_success "Vault deployment complete" +} + +deploy_keycloak() { + if [ "$SKIP_KEYCLOAK" = true ]; then + log_warn "Skipping Keycloak deployment" + return + fi + + log_info "Deploying Keycloak..." + + # Deploy Keycloak + run_cmd kubectl apply -k "$K8S_DIR/keycloak/" + + # Wait for Keycloak pods + if [ "$DRY_RUN" = false ]; then + log_info "Waiting for Keycloak pods to be ready..." + kubectl -n keycloak wait --for=condition=ready pod -l app=keycloak --timeout=300s + fi + + # Import realm configuration if available + if [ -f "$K8S_DIR/keycloak/realm-config.yaml" ]; then + log_info "Applying realm configuration..." + run_cmd kubectl apply -f "$K8S_DIR/keycloak/realm-config.yaml" + fi + + log_success "Keycloak deployment complete" +} + +deploy_forgejo() { + if [ "$SKIP_FORGEJO" = true ]; then + log_warn "Skipping Forgejo deployment" + return + fi + + log_info "Deploying Forgejo..." + + # Deploy Forgejo + run_cmd kubectl apply -k "$K8S_DIR/forgejo/" + + # Wait for Forgejo pods + if [ "$DRY_RUN" = false ]; then + log_info "Waiting for Forgejo pods to be ready..." + kubectl -n forgejo wait --for=condition=ready pod -l app=forgejo --timeout=300s + fi + + log_success "Forgejo deployment complete" +} + +validate_phase2() { + log_info "Running Phase 2 validation..." 
+ + local validation_script="$SCRIPT_DIR/validate-phase2.sh" + if [ -x "$validation_script" ]; then + if [ "$DRY_RUN" = true ]; then + log_info "Would run validation script: $validation_script" + else + "$validation_script" + fi + else + log_warn "Validation script not found or not executable: $validation_script" + fi +} + +main() { + echo "========================================" + echo "Phase 2: Core Services" + echo "RFC 0040 Deployment" + echo "========================================" + echo "" + + if [ "$DRY_RUN" = true ]; then + log_warn "Running in DRY-RUN mode - no changes will be made" + echo "" + fi + + check_prerequisites + deploy_vault + deploy_keycloak + deploy_forgejo + validate_phase2 + + echo "" + echo "========================================" + log_success "Phase 2 deployment complete!" + echo "========================================" + echo "" + echo "Next steps:" + echo " 1. Run validate-phase2.sh to verify deployment" + echo " 2. Configure Keycloak realm and clients via Web UI" + echo " 3. Tag this deployment: git tag -a v0.2.0-phase2 -m 'Phase 2: Core Services'" + echo " 4. 
Proceed to Phase 3: ./deploy-phase3-dns-email.sh" +} + +main "$@" diff --git a/scripts/deploy-phase3-dns-email.sh b/scripts/deploy-phase3-dns-email.sh new file mode 100755 index 0000000..92e9871 --- /dev/null +++ b/scripts/deploy-phase3-dns-email.sh @@ -0,0 +1,282 @@ +#!/usr/bin/env bash +# +# Phase 3: DNS and Email Deployment (RFC 0041) +# +# This script deploys DNS and email services including: +# - PowerDNS for authoritative DNS +# - Stalwart Mail Server for email +# - Vault S/MIME PKI configuration +# +# Usage: +# ./deploy-phase3-dns-email.sh [--dry-run] [--skip-powerdns] [--skip-stalwart] [--skip-vault-pki] +# +# Prerequisites: +# - Phase 2 (Core Services) must be deployed and validated +# - kubectl configured for EKS cluster +# - DNS registrar access for NS record delegation +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +INFRA_DIR="$(dirname "$SCRIPT_DIR")" +TERRAFORM_DIR="$INFRA_DIR/terraform" +K8S_DIR="$INFRA_DIR/kubernetes" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +# Flags +DRY_RUN=false +SKIP_POWERDNS=false +SKIP_STALWART=false +SKIP_VAULT_PKI=false + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --dry-run) + DRY_RUN=true + shift + ;; + --skip-powerdns) + SKIP_POWERDNS=true + shift + ;; + --skip-stalwart) + SKIP_STALWART=true + shift + ;; + --skip-vault-pki) + SKIP_VAULT_PKI=true + shift + ;; + -h|--help) + echo "Usage: $0 [--dry-run] [--skip-powerdns] [--skip-stalwart] [--skip-vault-pki]" + exit 0 + ;; + *) + echo -e "${RED}Unknown option: $1${NC}" + exit 1 + ;; + esac +done + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +run_cmd() { + if [ "$DRY_RUN" = true ]; then + echo -e "${YELLOW}[DRY-RUN]${NC} Would run: $*" + else + "$@" + fi +} + 
# Verify cluster access and that Phase 2 services are healthy before deploying.
# Exits non-zero if kubectl is disconnected, Vault is sealed, or Keycloak is down.
check_prerequisites() {
    log_info "Checking prerequisites..."

    # Check kubectl connectivity
    if ! kubectl cluster-info &> /dev/null; then
        log_error "kubectl not connected to cluster."
        exit 1
    fi

    # Check Vault is running and unsealed
    # (skipped in dry-run mode, where no cluster mutation or exec is expected)
    if [ "$DRY_RUN" = false ]; then
        local vault_status
        vault_status=$(kubectl -n vault exec vault-0 -- vault status -format=json 2>/dev/null || echo '{}')
        local sealed
        # `// true` treats a missing/failed status as sealed (fail closed)
        sealed=$(echo "$vault_status" | jq -r '.sealed // true')
        if [ "$sealed" = "true" ]; then
            log_error "Vault is sealed. Ensure Phase 2 is complete."
            exit 1
        fi
    fi

    # Check Keycloak is running
    if ! kubectl -n keycloak get pods -l app=keycloak -o jsonpath='{.items[0].status.phase}' 2>/dev/null | grep -q Running; then
        log_error "Keycloak not running. Ensure Phase 2 is complete."
        exit 1
    fi

    log_success "All prerequisites met"
}

# Deploy PowerDNS (kustomize), seed DNS records, and enable DNSSEC via Jobs.
# Honors $SKIP_POWERDNS and $DRY_RUN; job waits are best-effort (`|| true`).
deploy_powerdns() {
    if [ "$SKIP_POWERDNS" = true ]; then
        log_warn "Skipping PowerDNS deployment"
        return
    fi

    log_info "Deploying PowerDNS..."

    # Deploy PowerDNS
    run_cmd kubectl apply -k "$K8S_DIR/powerdns/"

    # Wait for PowerDNS pods
    if [ "$DRY_RUN" = false ]; then
        log_info "Waiting for PowerDNS pods to be ready..."
        kubectl -n dns wait --for=condition=ready pod -l app=powerdns --timeout=300s
    fi

    # Initialize DNS records
    if [ -f "$K8S_DIR/powerdns/dns-records-job.yaml" ]; then
        log_info "Initializing DNS records..."
        run_cmd kubectl apply -f "$K8S_DIR/powerdns/dns-records-job.yaml"
        if [ "$DRY_RUN" = false ]; then
            kubectl -n dns wait --for=condition=complete job/dns-records-init --timeout=120s || true
        fi
    fi

    # Enable DNSSEC
    if [ -f "$K8S_DIR/powerdns/dnssec-setup-job.yaml" ]; then
        log_info "Enabling DNSSEC..."
        run_cmd kubectl apply -f "$K8S_DIR/powerdns/dnssec-setup-job.yaml"
        if [ "$DRY_RUN" = false ]; then
            kubectl -n dns wait --for=condition=complete job/dnssec-setup --timeout=120s || true
        fi
    fi

    log_success "PowerDNS deployment complete"

    # Display NS record information
    echo ""
    log_info "Important: Update your DNS registrar with the following NS records:"
    if [ "$DRY_RUN" = false ]; then
        local nlb_dns
        nlb_dns=$(kubectl -n dns get svc powerdns-external -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || echo "")
        echo " NS record should point to: $nlb_dns"
    fi
    echo ""
}

# Deploy Stalwart mail server and run the DKIM key-generation Job, then print
# the DKIM/SPF/DMARC records the operator must publish in DNS.
deploy_stalwart() {
    if [ "$SKIP_STALWART" = true ]; then
        log_warn "Skipping Stalwart deployment"
        return
    fi

    log_info "Deploying Stalwart Mail Server..."

    # Deploy Stalwart
    run_cmd kubectl apply -k "$K8S_DIR/stalwart/"

    # Wait for Stalwart pods
    if [ "$DRY_RUN" = false ]; then
        log_info "Waiting for Stalwart pods to be ready..."
        kubectl -n email wait --for=condition=ready pod -l app=stalwart --timeout=300s
    fi

    # Generate DKIM keys
    if [ -f "$K8S_DIR/stalwart/dkim-setup-job.yaml" ]; then
        log_info "Generating DKIM keys..."
        run_cmd kubectl apply -f "$K8S_DIR/stalwart/dkim-setup-job.yaml"
        if [ "$DRY_RUN" = false ]; then
            kubectl -n email wait --for=condition=complete job/dkim-setup --timeout=120s || true

            # Display DKIM record
            echo ""
            log_info "DKIM TXT record to add to DNS:"
            kubectl -n email logs job/dkim-setup 2>/dev/null | grep -A5 "DKIM TXT record" || echo " Check job logs for DKIM record"
        fi
    fi

    log_success "Stalwart deployment complete"

    # Display email DNS records
    echo ""
    log_info "Ensure the following DNS records are configured:"
    echo " SPF: v=spf1 mx a ~all"
    echo " DMARC: v=DMARC1; p=quarantine; rua=mailto:dmarc@coherence.dev"
    echo ""
}

# Apply the Terraform module for the Vault S/MIME PKI backend.
# NOTE(review): `cd` changes the working directory for the rest of the script
# run — later functions only use absolute paths, so this is currently benign.
deploy_vault_smime_pki() {
    if [ "$SKIP_VAULT_PKI" = true ]; then
        log_warn "Skipping Vault S/MIME PKI deployment"
        return
    fi

    log_info "Deploying Vault S/MIME PKI..."

    cd "$TERRAFORM_DIR"

    if [ "$DRY_RUN" = true ]; then
        run_cmd terraform plan -target=module.vault_smime_pki
    else
        # Plan to a file then apply that exact plan (Evidence principle, ADR 0004)
        terraform plan -target=module.vault_smime_pki -out=pki.tfplan
        terraform apply pki.tfplan
        rm -f pki.tfplan
    fi

    log_success "Vault S/MIME PKI deployment complete"
}

# Run the companion validate-phase3.sh script if present and executable.
validate_phase3() {
    log_info "Running Phase 3 validation..."

    local validation_script="$SCRIPT_DIR/validate-phase3.sh"
    if [ -x "$validation_script" ]; then
        if [ "$DRY_RUN" = true ]; then
            log_info "Would run validation script: $validation_script"
        else
            "$validation_script"
        fi
    else
        log_warn "Validation script not found or not executable: $validation_script"
    fi
}

# Entry point: orchestrate the Phase 3 (DNS + Email) rollout and print the
# manual DNS steps the operator must perform afterwards.
main() {
    echo "========================================"
    echo "Phase 3: DNS and Email"
    echo "RFC 0041 Deployment"
    echo "========================================"
    echo ""

    if [ "$DRY_RUN" = true ]; then
        log_warn "Running in DRY-RUN mode - no changes will be made"
        echo ""
    fi

    check_prerequisites
    deploy_powerdns
    deploy_stalwart
    deploy_vault_smime_pki
    validate_phase3

    echo ""
    echo "========================================"
    log_success "Phase 3 deployment complete!"
    echo "========================================"
    echo ""
    echo "IMPORTANT MANUAL STEPS:"
    echo " 1. Update DNS registrar with NS records pointing to PowerDNS NLB"
    echo " 2. Add DKIM TXT record to DNS"
    echo " 3. Add SPF and DMARC records"
    echo " 4. Wait for DNS propagation (may take up to 48 hours)"
    echo ""
    echo "Next steps:"
    echo " 1. Run validate-phase3.sh to verify deployment"
    echo " 2. Tag this deployment: git tag -a v0.3.0-phase3 -m 'Phase 3: DNS and Email'"
    echo " 3. 
Proceed to Phase 4: ./deploy-phase4-observability.sh"
}

main "$@"
diff --git a/scripts/deploy-phase4-observability.sh b/scripts/deploy-phase4-observability.sh
new file mode 100755
index 0000000..65b6364
--- /dev/null
+++ b/scripts/deploy-phase4-observability.sh
@@ -0,0 +1,190 @@
#!/usr/bin/env bash
#
# Phase 4: Observability Stack Deployment (RFC 0042)
#
# This script deploys the observability stack including:
# - Prometheus for metrics collection
# - Grafana for visualization
# - Loki for log aggregation
# - Alertmanager for alert management
#
# Usage:
#   ./deploy-phase4-observability.sh [--dry-run]
#
# Prerequisites:
# - Phase 3 (DNS and Email) should be deployed
# - kubectl configured for EKS cluster
#
set -euo pipefail

# Resolve script location so manifests can be addressed with absolute paths.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INFRA_DIR="$(dirname "$SCRIPT_DIR")"
K8S_DIR="$INFRA_DIR/kubernetes"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Flags
DRY_RUN=false

# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [--dry-run]"
            exit 0
            ;;
        *)
            echo -e "${RED}Unknown option: $1${NC}"
            exit 1
            ;;
    esac
done

log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

# Execute the given command, or just print it when --dry-run is set.
run_cmd() {
    if [ "$DRY_RUN" = true ]; then
        echo -e "${YELLOW}[DRY-RUN]${NC} Would run: $*"
    else
        "$@"
    fi
}

# Verify kubectl connectivity; Keycloak being down is only a warning here
# because the observability stack works without SSO.
check_prerequisites() {
    log_info "Checking prerequisites..."

    # Check kubectl connectivity
    if ! kubectl cluster-info &> /dev/null; then
        log_error "kubectl not connected to cluster."
        exit 1
    fi

    # Check Keycloak is running (for SSO)
    if ! kubectl -n keycloak get pods -l app=keycloak -o jsonpath='{.items[0].status.phase}' 2>/dev/null | grep -q Running; then
        log_warn "Keycloak not running. SSO integration may not work."
    fi

    log_success "All prerequisites met"
}

# Apply the observability kustomization and wait (best-effort) for each
# component's pods to become ready.
deploy_observability_stack() {
    log_info "Deploying observability stack..."

    # Deploy entire observability stack
    run_cmd kubectl apply -k "$K8S_DIR/observability/"

    # Wait for each component
    if [ "$DRY_RUN" = false ]; then
        log_info "Waiting for Prometheus pods to be ready..."
        kubectl -n observability wait --for=condition=ready pod -l app=prometheus --timeout=300s || true

        log_info "Waiting for Grafana pods to be ready..."
        kubectl -n observability wait --for=condition=ready pod -l app=grafana --timeout=300s || true

        log_info "Waiting for Loki pods to be ready..."
        kubectl -n observability wait --for=condition=ready pod -l app=loki --timeout=300s || true

        log_info "Waiting for Alertmanager pods to be ready..."
        kubectl -n observability wait --for=condition=ready pod -l app=alertmanager --timeout=300s || true
    fi

    log_success "Observability stack deployment complete"
}

# Inspect (read-only) Grafana provisioning: datasource and dashboard
# ConfigMaps are expected to have been applied with the stack above.
configure_grafana() {
    log_info "Configuring Grafana..."

    if [ "$DRY_RUN" = false ]; then
        # Get Grafana pod
        local grafana_pod
        grafana_pod=$(kubectl -n observability get pods -l app=grafana -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")

        if [ -n "$grafana_pod" ]; then
            log_info "Grafana pod: $grafana_pod"

            # Datasources and dashboards are typically provisioned via ConfigMaps
            log_info "Checking datasources..."
            kubectl -n observability get configmap grafana-datasources -o yaml 2>/dev/null | head -20 || log_warn "No datasources ConfigMap found"

            log_info "Checking dashboards..."
            kubectl -n observability get configmap -l grafana_dashboard=1 2>/dev/null || log_warn "No dashboard ConfigMaps found"
        fi
    fi

    log_success "Grafana configuration complete"
}

# Run the companion validate-phase4.sh script if present and executable.
validate_phase4() {
    log_info "Running Phase 4 validation..."

    local validation_script="$SCRIPT_DIR/validate-phase4.sh"
    if [ -x "$validation_script" ]; then
        if [ "$DRY_RUN" = true ]; then
            log_info "Would run validation script: $validation_script"
        else
            "$validation_script"
        fi
    else
        log_warn "Validation script not found or not executable: $validation_script"
    fi
}

# Entry point: deploy and check the Phase 4 observability stack.
main() {
    echo "========================================"
    echo "Phase 4: Observability Stack"
    echo "RFC 0042 Deployment"
    echo "========================================"
    echo ""

    if [ "$DRY_RUN" = true ]; then
        log_warn "Running in DRY-RUN mode - no changes will be made"
        echo ""
    fi

    check_prerequisites
    deploy_observability_stack
    configure_grafana
    validate_phase4

    echo ""
    echo "========================================"
    log_success "Phase 4 deployment complete!"
    echo "========================================"
    echo ""
    echo "Post-deployment steps:"
    echo " 1. Access Grafana via Keycloak SSO"
    echo " 2. Verify Prometheus datasource is configured"
    echo " 3. Verify Loki datasource is configured"
    echo " 4. Check that dashboards are loaded"
    echo " 5. Verify Alertmanager cluster is formed"
    echo ""
    echo "Next steps:"
    echo " 1. Run validate-phase4.sh to verify deployment"
    echo " 2. Tag this deployment: git tag -a v0.4.0-phase4 -m 'Phase 4: Observability'"
    echo " 3. 
Proceed to Phase 5: ./deploy-phase5-e2ee-webmail.sh (optional)"
}

main "$@"
diff --git a/scripts/deploy-phase5-e2ee-webmail.sh b/scripts/deploy-phase5-e2ee-webmail.sh
new file mode 100755
index 0000000..2c32e81
--- /dev/null
+++ b/scripts/deploy-phase5-e2ee-webmail.sh
@@ -0,0 +1,300 @@
#!/usr/bin/env bash
#
# Phase 5: E2EE Webmail Deployment (RFC 0043)
#
# This script deploys the E2EE webmail application including:
# - Webmail frontend (React/TypeScript)
# - Key service backend (Rust)
# - Integration with Stalwart and Vault
#
# Usage:
#   ./deploy-phase5-e2ee-webmail.sh [--dry-run] [--build-local] [--skip-frontend] [--skip-key-service]
#
# Prerequisites:
# - Phase 4 (Observability) should be deployed
# - Stalwart email server running (Phase 3)
# - Vault S/MIME PKI configured (Phase 3)
# - kubectl configured for EKS cluster
#
set -euo pipefail

# Resolve script location; application sources are expected in a sibling
# `apps/` directory next to this infra tree.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INFRA_DIR="$(dirname "$SCRIPT_DIR")"
K8S_DIR="$INFRA_DIR/kubernetes"
APPS_DIR="$(dirname "$INFRA_DIR")/apps"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Flags
DRY_RUN=false
BUILD_LOCAL=false
SKIP_FRONTEND=false
SKIP_KEY_SERVICE=false

# Configuration (overridable via environment)
REGISTRY="${REGISTRY:-ghcr.io/coherence}"
VERSION="${VERSION:-latest}"

# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --dry-run)
            DRY_RUN=true
            shift
            ;;
        --build-local)
            BUILD_LOCAL=true
            shift
            ;;
        --skip-frontend)
            SKIP_FRONTEND=true
            shift
            ;;
        --skip-key-service)
            SKIP_KEY_SERVICE=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [--dry-run] [--build-local] [--skip-frontend] [--skip-key-service]"
            exit 0
            ;;
        *)
            echo -e "${RED}Unknown option: $1${NC}"
            exit 1
            ;;
    esac
done

log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

# Execute the given command, or just print it when --dry-run is set.
run_cmd() {
    if [ "$DRY_RUN" = true ]; then
        echo -e "${YELLOW}[DRY-RUN]${NC} Would run: $*"
    else
        "$@"
    fi
}

# Verify kubectl, Phase 3 services (Stalwart + unsealed Vault), and — when
# building locally — the npm/cargo/docker toolchain.
check_prerequisites() {
    log_info "Checking prerequisites..."

    # Check kubectl connectivity
    if ! kubectl cluster-info &> /dev/null; then
        log_error "kubectl not connected to cluster."
        exit 1
    fi

    # Check Stalwart is running
    if ! kubectl -n email get pods -l app=stalwart -o jsonpath='{.items[0].status.phase}' 2>/dev/null | grep -q Running; then
        log_error "Stalwart email server not running. Ensure Phase 3 is complete."
        exit 1
    fi

    # Check Vault is running
    if [ "$DRY_RUN" = false ]; then
        local vault_status
        vault_status=$(kubectl -n vault exec vault-0 -- vault status -format=json 2>/dev/null || echo '{}')
        local sealed
        # `// true` treats a missing/failed status as sealed (fail closed)
        sealed=$(echo "$vault_status" | jq -r '.sealed // true')
        if [ "$sealed" = "true" ]; then
            log_error "Vault is sealed. Ensure it is unsealed."
            exit 1
        fi
    fi

    # Check build tools if building locally
    if [ "$BUILD_LOCAL" = true ]; then
        if ! command -v npm &> /dev/null; then
            log_error "npm not found. Required for building frontend."
            exit 1
        fi
        if ! command -v cargo &> /dev/null; then
            log_error "cargo not found. Required for building key service."
            exit 1
        fi
        if ! command -v docker &> /dev/null; then
            log_error "docker not found. Required for building images."
            exit 1
        fi
    fi

    log_success "All prerequisites met"
}

# Build (optionally) and push the webmail frontend image. Without
# --build-local this is a no-op that relies on the registry image.
build_frontend() {
    if [ "$SKIP_FRONTEND" = true ]; then
        log_warn "Skipping frontend build"
        return
    fi

    if [ "$BUILD_LOCAL" = false ]; then
        log_info "Using pre-built frontend image: $REGISTRY/webmail-frontend:$VERSION"
        return
    fi

    log_info "Building webmail frontend..."

    local webmail_dir="$APPS_DIR/webmail"

    if [ ! -d "$webmail_dir" ]; then
        log_warn "Webmail directory not found: $webmail_dir"
        log_warn "Skipping frontend build"
        return
    fi

    cd "$webmail_dir"

    # Install dependencies
    log_info "Installing npm dependencies..."
    run_cmd npm install

    # Build production bundle
    log_info "Building production bundle..."
    run_cmd npm run build

    # Build Docker image
    log_info "Building Docker image..."
    run_cmd docker build -t "$REGISTRY/webmail-frontend:$VERSION" .

    # Push to registry
    if [ "$DRY_RUN" = false ]; then
        log_info "Pushing image to registry..."
        run_cmd docker push "$REGISTRY/webmail-frontend:$VERSION"
    fi

    log_success "Frontend build complete"
}

# Build (optionally) and push the Rust key-service image; mirrors
# build_frontend's skip/pre-built/missing-directory behavior.
build_key_service() {
    if [ "$SKIP_KEY_SERVICE" = true ]; then
        log_warn "Skipping key service build"
        return
    fi

    if [ "$BUILD_LOCAL" = false ]; then
        log_info "Using pre-built key service image: $REGISTRY/key-service:$VERSION"
        return
    fi

    log_info "Building key service..."

    local key_service_dir="$APPS_DIR/key-service"

    if [ ! -d "$key_service_dir" ]; then
        log_warn "Key service directory not found: $key_service_dir"
        log_warn "Skipping key service build"
        return
    fi

    cd "$key_service_dir"

    # Build Rust binary
    log_info "Building Rust binary (release)..."
    run_cmd cargo build --release

    # Build Docker image
    log_info "Building Docker image..."
    run_cmd docker build -t "$REGISTRY/key-service:$VERSION" .

    # Push to registry
    if [ "$DRY_RUN" = false ]; then
        log_info "Pushing image to registry..."
        run_cmd docker push "$REGISTRY/key-service:$VERSION"
    fi

    log_success "Key service build complete"
}

# Apply the webmail manifests (kustomize, falling back to plain apply) and
# wait best-effort for the frontend and key-service pods.
deploy_webmail() {
    log_info "Deploying webmail components..."

    # Check if webmail manifests exist
    if [ ! -d "$K8S_DIR/webmail" ]; then
        log_warn "Webmail Kubernetes manifests not found: $K8S_DIR/webmail"
        log_warn "Creating placeholder manifests..."
        mkdir -p "$K8S_DIR/webmail"
    fi

    # Deploy webmail; tolerate a missing kustomization by falling back to -f
    run_cmd kubectl apply -k "$K8S_DIR/webmail/" || run_cmd kubectl apply -f "$K8S_DIR/webmail/" || true

    # Wait for pods
    if [ "$DRY_RUN" = false ]; then
        log_info "Waiting for webmail pods to be ready..."
        kubectl -n webmail wait --for=condition=ready pod -l app=webmail --timeout=300s 2>/dev/null || true
        kubectl -n webmail wait --for=condition=ready pod -l app=key-service --timeout=300s 2>/dev/null || true
    fi

    log_success "Webmail deployment complete"
}

# Run the companion validate-phase5.sh script if present and executable.
validate_phase5() {
    log_info "Running Phase 5 validation..."

    local validation_script="$SCRIPT_DIR/validate-phase5.sh"
    if [ -x "$validation_script" ]; then
        if [ "$DRY_RUN" = true ]; then
            log_info "Would run validation script: $validation_script"
        else
            "$validation_script"
        fi
    else
        log_warn "Validation script not found or not executable: $validation_script"
    fi
}

# Entry point: build (optionally), deploy, and validate the E2EE webmail stack.
main() {
    echo "========================================"
    echo "Phase 5: E2EE Webmail (Optional)"
    echo "RFC 0043 Deployment"
    echo "========================================"
    echo ""

    if [ "$DRY_RUN" = true ]; then
        log_warn "Running in DRY-RUN mode - no changes will be made"
        echo ""
    fi

    check_prerequisites
    build_frontend
    build_key_service
    deploy_webmail
    validate_phase5

    echo ""
    echo "========================================"
    log_success "Phase 5 deployment complete!"
    echo "========================================"
    echo ""
    echo "Post-deployment steps:"
    echo " 1. Access webmail at https://mail.coherence.dev"
    echo " 2. Login via Keycloak SSO"
    echo " 3. Verify E2EE key generation works"
    echo " 4. Send a test encrypted email"
    echo ""
    echo "Next steps:"
    echo " 1. Run validate-phase5.sh to verify deployment"
    echo " 2. Tag this deployment: git tag -a v0.5.0-phase5 -m 'Phase 5: E2EE Webmail'"
    echo " 3. 
Tag final release: git tag -a v1.0.0 -m 'Initial Release: Full Infrastructure Stack'"
}

main "$@"
diff --git a/scripts/validate-phase1.sh b/scripts/validate-phase1.sh
new file mode 100755
index 0000000..ee7340e
--- /dev/null
+++ b/scripts/validate-phase1.sh
@@ -0,0 +1,245 @@
#!/usr/bin/env bash
#
# Phase 1 Validation: Foundation Infrastructure
#
# Validates that RFC 0039 components are deployed and healthy:
# - CockroachDB cluster (3 nodes, all live)
# - cert-manager certificates (all Ready)
# - S3 buckets (4 buckets created)
#
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Result counters, summarized by print_summary.
PASSED=0
FAILED=0
WARNINGS=0

log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

# Record a passing check.
# FIX: `((PASSED++))` evaluates to 0 on the first call, returning exit status
# 1 and aborting the whole script under `set -e`. Plain assignment is safe.
log_pass() {
    echo -e "${GREEN}[PASS]${NC} $1"
    PASSED=$((PASSED + 1))
}

# Record a failing check (same set -e fix as log_pass).
log_fail() {
    echo -e "${RED}[FAIL]${NC} $1"
    FAILED=$((FAILED + 1))
}

# Record a warning (same set -e fix as log_pass).
log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
    WARNINGS=$((WARNINGS + 1))
}

# Check CockroachDB: pod count, pod readiness, and live node count reported
# by the cockroach CLI inside pod 0.
check_cockroachdb_cluster() {
    log_info "Checking CockroachDB cluster health..."

    # Check if namespace exists
    if ! kubectl get namespace cockroachdb &> /dev/null; then
        log_fail "cockroachdb namespace does not exist"
        return
    fi

    # Check pod count
    local pod_count
    pod_count=$(kubectl -n cockroachdb get pods -l app=cockroachdb -o json | jq '.items | length')

    if [ "$pod_count" -ge 3 ]; then
        log_pass "CockroachDB has $pod_count pods running (expected: >= 3)"
    else
        log_fail "CockroachDB has only $pod_count pods (expected: >= 3)"
    fi

    # Check pod health
    local ready_pods
    ready_pods=$(kubectl -n cockroachdb get pods -l app=cockroachdb -o json | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length')

    if [ "$ready_pods" -ge 3 ]; then
        log_pass "CockroachDB has $ready_pods ready pods"
    else
        log_fail "CockroachDB has only $ready_pods ready pods (expected: >= 3)"
    fi

    # Check node status via cockroach CLI (falls back to an empty list so a
    # failed exec degrades to a FAIL result instead of aborting the script)
    local node_status
    node_status=$(kubectl -n cockroachdb exec cockroachdb-0 -- cockroach node status --certs-dir=/cockroach/cockroach-certs --format=json 2>/dev/null || echo '[]')

    local live_nodes
    live_nodes=$(echo "$node_status" | jq 'length')

    if [ "$live_nodes" -ge 3 ]; then
        log_pass "CockroachDB cluster has $live_nodes live nodes"
    else
        log_fail "CockroachDB cluster has only $live_nodes live nodes (expected: >= 3)"
    fi
}

# Check cert-manager pods, every Certificate across namespaces, and
# ClusterIssuer readiness.
check_certificates() {
    log_info "Checking cert-manager certificates..."

    # Check if cert-manager namespace exists
    if ! kubectl get namespace cert-manager &> /dev/null; then
        log_fail "cert-manager namespace does not exist"
        return
    fi

    # Check cert-manager pods
    local cm_ready
    cm_ready=$(kubectl -n cert-manager get pods -l app.kubernetes.io/instance=cert-manager -o json | jq '[.items[] | select(.status.phase == "Running")] | length')

    if [ "$cm_ready" -ge 1 ]; then
        log_pass "cert-manager is running ($cm_ready pods)"
    else
        log_fail "cert-manager is not running"
    fi

    # Check all certificates across namespaces
    local all_certs
    all_certs=$(kubectl get certificates -A -o json 2>/dev/null || echo '{"items":[]}')

    local total_certs
    total_certs=$(echo "$all_certs" | jq '.items | length')

    local ready_certs
    ready_certs=$(echo "$all_certs" | jq '[.items[] | select(.status.conditions[]? | select(.type == "Ready" and .status == "True"))] | length')

    if [ "$total_certs" -eq 0 ]; then
        log_warn "No certificates found (may not be required yet)"
    elif [ "$ready_certs" -eq "$total_certs" ]; then
        log_pass "All certificates are ready ($ready_certs/$total_certs)"
    else
        log_fail "Some certificates are not ready ($ready_certs/$total_certs ready)"
        echo " Non-ready certificates:"
        echo "$all_certs" | jq -r '.items[] | select(.status.conditions[]? | select(.type == "Ready" and .status != "True")) | " - " + .metadata.namespace + "/" + .metadata.name'
    fi

    # Check ClusterIssuers
    local issuers
    issuers=$(kubectl get clusterissuers -o json 2>/dev/null || echo '{"items":[]}')

    local ready_issuers
    ready_issuers=$(echo "$issuers" | jq '[.items[] | select(.status.conditions[]? | select(.type == "Ready" and .status == "True"))] | length')

    local total_issuers
    total_issuers=$(echo "$issuers" | jq '.items | length')

    if [ "$total_issuers" -eq 0 ]; then
        log_warn "No ClusterIssuers found"
    elif [ "$ready_issuers" -eq "$total_issuers" ]; then
        log_pass "All ClusterIssuers are ready ($ready_issuers/$total_issuers)"
    else
        log_fail "Some ClusterIssuers are not ready ($ready_issuers/$total_issuers)"
    fi
}

# Check that the expected project S3 buckets exist (matched by name pattern
# "coherence.*<suffix>" against `aws s3 ls` output).
check_s3_buckets() {
    log_info "Checking S3 buckets..."

    # Expected buckets
    local expected_buckets=("email" "logs" "traces" "lfs")
    local found=0

    # List S3 buckets
    local buckets
    buckets=$(aws s3 ls 2>/dev/null | awk '{print $3}' || echo "")

    for expected in "${expected_buckets[@]}"; do
        if echo "$buckets" | grep -q "coherence.*$expected"; then
            # FIX: `((found++))` returns status 1 when found is 0 → set -e abort
            found=$((found + 1))
        fi
    done

    if [ "$found" -eq ${#expected_buckets[@]} ]; then
        log_pass "All expected S3 buckets found ($found/${#expected_buckets[@]})"
    elif [ "$found" -gt 0 ]; then
        log_warn "Only $found/${#expected_buckets[@]} expected S3 buckets found"
    else
        log_fail "No expected S3 buckets found"
    fi
}

# Check Karpenter controller pods and NodePool resources. Karpenter is
# optional (Fargate-only clusters), so a missing namespace is only a warning.
check_karpenter() {
    log_info "Checking Karpenter..."

    # Check if karpenter namespace exists
    if ! kubectl get namespace karpenter &> /dev/null; then
        log_warn "karpenter namespace does not exist (may be using Fargate only)"
        return
    fi

    # Check Karpenter pods
    local kp_ready
    kp_ready=$(kubectl -n karpenter get pods -l app.kubernetes.io/name=karpenter -o json 2>/dev/null | jq '[.items[] | select(.status.phase == "Running")] | length' 2>/dev/null || echo "0")
    # FIX: guard against an empty result (kubectl failure) which would make
    # the numeric test below error out under set -e
    kp_ready=${kp_ready:-0}

    if [ "$kp_ready" -ge 1 ]; then
        log_pass "Karpenter is running ($kp_ready pods)"
    else
        log_fail "Karpenter is not running"
    fi

    # Check NodePools
    local nodepools
    nodepools=$(kubectl get nodepools -o json 2>/dev/null || echo '{"items":[]}')
    local np_count
    np_count=$(echo "$nodepools" | jq '.items | length')

    if [ "$np_count" -ge 1 ]; then
        log_pass "Karpenter NodePools configured ($np_count)"
    else
        log_warn "No Karpenter NodePools found"
    fi
}

# Print the pass/fail/warning totals and exit 1 only when something failed.
print_summary() {
    echo ""
    echo "========================================"
    echo "Phase 1 Validation Summary"
    echo "========================================"
    echo -e " ${GREEN}Passed:${NC} $PASSED"
    echo -e " ${RED}Failed:${NC} $FAILED"
    echo -e " ${YELLOW}Warnings:${NC} $WARNINGS"
    echo "========================================"

    if [ "$FAILED" -gt 0 ]; then
        echo ""
        echo -e "${RED}Phase 1 validation FAILED${NC}"
        echo "Please fix the issues above before proceeding to Phase 2."
        exit 1
    elif [ "$WARNINGS" -gt 0 ]; then
        echo ""
        echo -e "${YELLOW}Phase 1 validation passed with warnings${NC}"
        echo "Review warnings above. You may proceed to Phase 2."
        exit 0
    else
        echo ""
        echo -e "${GREEN}Phase 1 validation PASSED${NC}"
        echo "You may proceed to Phase 2."
+ exit 0 + fi +} + +main() { + echo "========================================" + echo "Phase 1 Validation: Foundation Infrastructure" + echo "========================================" + echo "" + + check_cockroachdb_cluster + check_certificates + check_s3_buckets + check_karpenter + + print_summary +} + +main "$@" diff --git a/scripts/validate-phase2.sh b/scripts/validate-phase2.sh new file mode 100755 index 0000000..4a88cb2 --- /dev/null +++ b/scripts/validate-phase2.sh @@ -0,0 +1,257 @@ +#!/usr/bin/env bash +# +# Phase 2 Validation: Core Services +# +# Validates that RFC 0040 components are deployed and healthy: +# - Vault (unsealed, HA mode) +# - Keycloak (realm exists, healthy) +# - Forgejo (SSO working) +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +PASSED=0 +FAILED=0 +WARNINGS=0 + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_pass() { + echo -e "${GREEN}[PASS]${NC} $1" + ((PASSED++)) +} + +log_fail() { + echo -e "${RED}[FAIL]${NC} $1" + ((FAILED++)) +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" + ((WARNINGS++)) +} + +check_vault() { + log_info "Checking Vault status..." + + # Check if namespace exists + if ! 
kubectl get namespace vault &> /dev/null; then + log_fail "vault namespace does not exist" + return + fi + + # Check pod count + local pod_count + pod_count=$(kubectl -n vault get pods -l app=vault -o json | jq '[.items[] | select(.status.phase == "Running")] | length') + + if [ "$pod_count" -ge 1 ]; then + log_pass "Vault has $pod_count running pods" + else + log_fail "Vault has no running pods" + return + fi + + # Check Vault seal status + local vault_status + vault_status=$(kubectl -n vault exec vault-0 -- vault status -format=json 2>/dev/null || echo '{}') + + local initialized + initialized=$(echo "$vault_status" | jq -r '.initialized // false') + + local sealed + sealed=$(echo "$vault_status" | jq -r '.sealed // true') + + if [ "$initialized" = "true" ]; then + log_pass "Vault is initialized" + else + log_fail "Vault is not initialized" + fi + + if [ "$sealed" = "false" ]; then + log_pass "Vault is unsealed" + else + log_fail "Vault is sealed - must be unsealed before services can access secrets" + fi + + # Check HA status + local ha_enabled + ha_enabled=$(echo "$vault_status" | jq -r '.ha_enabled // false') + + if [ "$ha_enabled" = "true" ]; then + log_pass "Vault HA mode is enabled" + else + log_warn "Vault HA mode is not enabled" + fi + + # Check Vault health endpoint + local health_code + health_code=$(kubectl -n vault exec vault-0 -- wget -q -O /dev/null -S http://127.0.0.1:8200/v1/sys/health 2>&1 | grep "HTTP/" | awk '{print $2}' || echo "0") + + if [ "$health_code" = "200" ] || [ "$health_code" = "429" ]; then + log_pass "Vault health endpoint responding (HTTP $health_code)" + else + log_warn "Vault health endpoint returned HTTP $health_code" + fi +} + +check_keycloak() { + log_info "Checking Keycloak status..." + + # Check if namespace exists + if ! 
kubectl get namespace keycloak &> /dev/null; then + log_fail "keycloak namespace does not exist" + return + fi + + # Check pod count and health + local ready_pods + ready_pods=$(kubectl -n keycloak get pods -l app=keycloak -o json | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length') + + if [ "$ready_pods" -ge 1 ]; then + log_pass "Keycloak has $ready_pods ready pods" + else + log_fail "Keycloak has no ready pods" + return + fi + + # Check Keycloak service + local svc_exists + svc_exists=$(kubectl -n keycloak get svc keycloak -o name 2>/dev/null || echo "") + + if [ -n "$svc_exists" ]; then + log_pass "Keycloak service exists" + else + log_fail "Keycloak service does not exist" + fi + + # Check for coherence realm (via API if accessible) + # This requires port-forward or ingress access + log_info "Note: Verify 'coherence' realm exists via Keycloak Web UI" + + # Check ingress/route + local ingress_exists + ingress_exists=$(kubectl get ingress -A -o json 2>/dev/null | jq '[.items[] | select(.spec.rules[]?.host | contains("auth"))] | length') + + if [ "$ingress_exists" -ge 1 ]; then + log_pass "Keycloak ingress configured" + else + log_warn "No Keycloak ingress found - may be using LoadBalancer service" + fi +} + +check_forgejo() { + log_info "Checking Forgejo status..." + + # Check if namespace exists + if ! 
kubectl get namespace forgejo &> /dev/null; then + log_fail "forgejo namespace does not exist" + return + fi + + # Check pod health + local ready_pods + ready_pods=$(kubectl -n forgejo get pods -l app=forgejo -o json | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length') + + if [ "$ready_pods" -ge 1 ]; then + log_pass "Forgejo has $ready_pods ready pods" + else + log_fail "Forgejo has no ready pods" + return + fi + + # Check Forgejo service + local svc_exists + svc_exists=$(kubectl -n forgejo get svc forgejo -o name 2>/dev/null || echo "") + + if [ -n "$svc_exists" ]; then + log_pass "Forgejo service exists" + else + log_fail "Forgejo service does not exist" + fi + + # Check PVC + local pvc_bound + pvc_bound=$(kubectl -n forgejo get pvc -o json | jq '[.items[] | select(.status.phase == "Bound")] | length') + + if [ "$pvc_bound" -ge 1 ]; then + log_pass "Forgejo PVC is bound" + else + log_warn "No bound PVC found for Forgejo" + fi + + # Check OAuth configuration + local oauth_config + oauth_config=$(kubectl -n forgejo get configmap -o json 2>/dev/null | jq '[.items[] | select(.metadata.name | contains("oauth"))] | length') + + if [ "$oauth_config" -ge 1 ]; then + log_pass "Forgejo OAuth configuration found" + else + log_warn "No OAuth configuration found - SSO may not be configured" + fi +} + +check_sso_integration() { + log_info "Checking SSO integration..." + + # This is a placeholder for SSO checks + # In practice, you would test OAuth flows + log_info "Note: Manually verify SSO by logging into Forgejo with Keycloak" + log_info " 1. Navigate to Forgejo web UI" + log_info " 2. Click 'Sign in with Keycloak'" + log_info " 3. Verify redirect to Keycloak login" + log_info " 4. 
Verify successful login and redirect back"
}

# Print the pass/fail/warning totals and exit 1 only when something failed.
print_summary() {
    echo ""
    echo "========================================"
    echo "Phase 2 Validation Summary"
    echo "========================================"
    echo -e " ${GREEN}Passed:${NC} $PASSED"
    echo -e " ${RED}Failed:${NC} $FAILED"
    echo -e " ${YELLOW}Warnings:${NC} $WARNINGS"
    echo "========================================"

    if [ "$FAILED" -gt 0 ]; then
        echo ""
        echo -e "${RED}Phase 2 validation FAILED${NC}"
        echo "Please fix the issues above before proceeding to Phase 3."
        exit 1
    elif [ "$WARNINGS" -gt 0 ]; then
        echo ""
        echo -e "${YELLOW}Phase 2 validation passed with warnings${NC}"
        echo "Review warnings above. You may proceed to Phase 3."
        exit 0
    else
        echo ""
        echo -e "${GREEN}Phase 2 validation PASSED${NC}"
        echo "You may proceed to Phase 3."
        exit 0
    fi
}

# Entry point: run all Phase 2 checks, then summarize and set the exit code.
main() {
    echo "========================================"
    echo "Phase 2 Validation: Core Services"
    echo "========================================"
    echo ""

    check_vault
    check_keycloak
    check_forgejo
    check_sso_integration

    print_summary
}

main "$@"
diff --git a/scripts/validate-phase3.sh b/scripts/validate-phase3.sh
new file mode 100755
index 0000000..c84fc06
--- /dev/null
+++ b/scripts/validate-phase3.sh
@@ -0,0 +1,297 @@
#!/usr/bin/env bash
#
# Phase 3 Validation: DNS and Email
#
# Validates that RFC 0041 components are deployed and healthy:
# - PowerDNS (responding, DNSSEC enabled)
# - Stalwart (SMTP, IMAP, TLS)
# - DNS records (SPF, DKIM, DMARC)
#
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Result counters, summarized by print_summary.
PASSED=0
FAILED=0
WARNINGS=0

# Configuration (overridable via environment)
DOMAIN="${DOMAIN:-coherence.dev}"

log_info() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

# Record a passing check.
# FIX: `((PASSED++))` evaluates to 0 on the first call, returning exit status
# 1 and aborting the whole script under `set -e`. Plain assignment is safe.
log_pass() {
    echo -e "${GREEN}[PASS]${NC} $1"
    PASSED=$((PASSED + 1))
}

# Record a failing check (same set -e fix as log_pass).
log_fail() {
    echo -e "${RED}[FAIL]${NC} $1"
    FAILED=$((FAILED + 1))
}

# Record a warning (same set -e fix as log_pass).
log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
    WARNINGS=$((WARNINGS + 1))
}

# Check PowerDNS: ready pods, the internal ClusterIP Service, and whether the
# external NLB has been provisioned.
check_powerdns() {
    log_info "Checking PowerDNS status..."

    # Check if namespace exists
    if ! kubectl get namespace dns &> /dev/null; then
        log_fail "dns namespace does not exist"
        return
    fi

    # Check pod health
    local ready_pods
    ready_pods=$(kubectl -n dns get pods -l app=powerdns -o json | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length')

    if [ "$ready_pods" -ge 1 ]; then
        log_pass "PowerDNS has $ready_pods ready pods"
    else
        log_fail "PowerDNS has no ready pods"
        return
    fi

    # Check PowerDNS service
    local svc_ip
    svc_ip=$(kubectl -n dns get svc powerdns -o jsonpath='{.spec.clusterIP}' 2>/dev/null || echo "")

    if [ -n "$svc_ip" ]; then
        log_pass "PowerDNS service exists (ClusterIP: $svc_ip)"
    else
        log_fail "PowerDNS service does not exist"
    fi

    # Check external NLB
    local nlb_hostname
    nlb_hostname=$(kubectl -n dns get svc powerdns-external -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || echo "")

    if [ -n "$nlb_hostname" ]; then
        log_pass "PowerDNS NLB provisioned: $nlb_hostname"
    else
        log_warn "PowerDNS NLB not yet provisioned"
    fi
}

# Query $DOMAIN directly against the PowerDNS NLB to confirm the server
# answers authoritatively (independent of registrar NS delegation).
check_dns_resolution() {
    log_info "Checking DNS resolution..."

    # Get NLB hostname
    local nlb_hostname
    nlb_hostname=$(kubectl -n dns get svc powerdns-external -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || echo "")

    if [ -z "$nlb_hostname" ]; then
        log_warn "Cannot test DNS resolution - NLB not provisioned"
        return
    fi

    # Get NLB IP
    local nlb_ip
    nlb_ip=$(dig +short "$nlb_hostname" | head -1 || echo "")

    if [ -z "$nlb_ip" ]; then
        log_warn "Cannot resolve NLB hostname to IP"
        return
    fi

    # Test DNS resolution against PowerDNS
    local dns_response
    dns_response=$(dig @"$nlb_ip" "$DOMAIN" +short 2>/dev/null || echo "")

    if [ -n "$dns_response" ]; then
        log_pass "DNS resolution working: $DOMAIN -> $dns_response"
    else
        log_warn "DNS resolution failed for $DOMAIN (may need NS delegation)"
    fi
}

# Check DNSSEC: the AD (Authenticated Data) flag on a validated public query,
# and the presence of published DNSKEY records.
check_dnssec() {
    log_info "Checking DNSSEC..."

    # Check DNSSEC via public DNS.
    # FIX: the original `grep -c "ad"` matched the substring "ad" anywhere in
    # dig output (e.g. "additional"), a false positive; match the AD flag in
    # the header flags line instead. Also `|| echo "0"` produced "0\n0"
    # because grep -c prints 0 *and* exits 1 on no match — `|| true` keeps
    # the single "0" that grep already printed.
    local dnssec_response
    dnssec_response=$(dig +dnssec "$DOMAIN" 2>/dev/null | grep -Ec 'flags:.* ad[ ;]' || true)

    if [ "$dnssec_response" -gt 0 ]; then
        log_pass "DNSSEC is enabled and validated"
    else
        log_warn "DNSSEC not validated (may need DS record at registrar)"
    fi

    # Check for DNSKEY records
    local dnskey_count
    dnskey_count=$(dig DNSKEY "$DOMAIN" +short 2>/dev/null | wc -l | tr -d ' ')

    if [ "$dnskey_count" -gt 0 ]; then
        log_pass "DNSKEY records present ($dnskey_count keys)"
    else
        log_warn "No DNSKEY records found"
    fi
}

# Check Stalwart: ready pods and the smtp/submission/imap Services.
check_stalwart() {
    log_info "Checking Stalwart Mail Server..."

    # Check if namespace exists
    if ! kubectl get namespace email &> /dev/null; then
        log_fail "email namespace does not exist"
        return
    fi

    # Check pod health
    local ready_pods
    ready_pods=$(kubectl -n email get pods -l app=stalwart -o json | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length')

    if [ "$ready_pods" -ge 1 ]; then
        log_pass "Stalwart has $ready_pods ready pods"
    else
        log_fail "Stalwart has no ready pods"
        return
    fi

    # Check Stalwart services
    local services=("smtp" "submission" "imap")
    for svc in "${services[@]}"; do
        local svc_exists
        svc_exists=$(kubectl -n email get svc stalwart-"$svc" -o name 2>/dev/null || echo "")
        if [ -n "$svc_exists" ]; then
            log_pass "Stalwart $svc service exists"
        else
            log_warn "Stalwart $svc service not found"
        fi
    done
}

# Probe implicit-TLS SMTP (465) and IMAPS (993) on mail.$DOMAIN with openssl
# s_client and check for a successful certificate verification.
check_email_tls() {
    log_info "Checking email TLS..."

    # Get mail server hostname
    local mail_host="mail.$DOMAIN"

    # Check SMTP TLS (port 465)
    if command -v openssl &> /dev/null; then
        # FIX: `|| echo "0"` after grep -c yielded "0\n0" on no match (grep -c
        # prints the zero count itself and exits 1), making the -gt test below
        # error out; `|| true` preserves the single count.
        local smtp_tls
        smtp_tls=$(echo | timeout 5 openssl s_client -connect "$mail_host":465 2>/dev/null | grep -c "Verify return code: 0" || true)

        if [ "$smtp_tls" -gt 0 ]; then
            log_pass "SMTP TLS working on port 465"
        else
            log_warn "SMTP TLS check failed (may need DNS propagation)"
        fi

        # Check IMAP TLS (port 993) — same `|| true` fix as above
        local imap_tls
        imap_tls=$(echo | timeout 5 openssl s_client -connect "$mail_host":993 2>/dev/null | grep -c "Verify return code: 0" || true)

        if [ "$imap_tls" -gt 0 ]; then
            log_pass "IMAP TLS working on port 993"
        else
            log_warn "IMAP TLS check failed (may need DNS propagation)"
        fi
    else
        log_warn "openssl not available - skipping TLS checks"
    fi
}

# Check SPF/DKIM/DMARC DNS records for the mail domain.
check_email_dns_records() {
    log_info "Checking email DNS records..."
+ + # Check MX record + local mx_record + mx_record=$(dig MX "$DOMAIN" +short 2>/dev/null | head -1 || echo "") + + if [ -n "$mx_record" ]; then + log_pass "MX record exists: $mx_record" + else + log_warn "No MX record found for $DOMAIN" + fi + + # Check SPF record + local spf_record + spf_record=$(dig TXT "$DOMAIN" +short 2>/dev/null | grep "v=spf1" || echo "") + + if [ -n "$spf_record" ]; then + log_pass "SPF record exists" + else + log_warn "No SPF record found" + fi + + # Check DMARC record + local dmarc_record + dmarc_record=$(dig TXT "_dmarc.$DOMAIN" +short 2>/dev/null | grep "v=DMARC1" || echo "") + + if [ -n "$dmarc_record" ]; then + log_pass "DMARC record exists" + else + log_warn "No DMARC record found" + fi + + # Check DKIM record + local dkim_record + dkim_record=$(dig TXT "default._domainkey.$DOMAIN" +short 2>/dev/null | grep "v=DKIM1" || echo "") + + if [ -n "$dkim_record" ]; then + log_pass "DKIM record exists" + else + log_warn "No DKIM record found (check selector name)" + fi +} + +print_summary() { + echo "" + echo "========================================" + echo "Phase 3 Validation Summary" + echo "========================================" + echo -e " ${GREEN}Passed:${NC} $PASSED" + echo -e " ${RED}Failed:${NC} $FAILED" + echo -e " ${YELLOW}Warnings:${NC} $WARNINGS" + echo "========================================" + + if [ "$FAILED" -gt 0 ]; then + echo "" + echo -e "${RED}Phase 3 validation FAILED${NC}" + echo "Please fix the issues above before proceeding to Phase 4." + exit 1 + elif [ "$WARNINGS" -gt 0 ]; then + echo "" + echo -e "${YELLOW}Phase 3 validation passed with warnings${NC}" + echo "DNS propagation may take up to 48 hours." + echo "Review warnings above. You may proceed to Phase 4." + exit 0 + else + echo "" + echo -e "${GREEN}Phase 3 validation PASSED${NC}" + echo "You may proceed to Phase 4." 
+ exit 0 + fi +} + +main() { + echo "========================================" + echo "Phase 3 Validation: DNS and Email" + echo "========================================" + echo "" + + check_powerdns + check_dns_resolution + check_dnssec + check_stalwart + check_email_tls + check_email_dns_records + + print_summary +} + +main "$@" diff --git a/scripts/validate-phase4.sh b/scripts/validate-phase4.sh new file mode 100755 index 0000000..8bd976c --- /dev/null +++ b/scripts/validate-phase4.sh @@ -0,0 +1,300 @@ +#!/usr/bin/env bash +# +# Phase 4 Validation: Observability Stack +# +# Validates that RFC 0042 components are deployed and healthy: +# - Prometheus (targets UP) +# - Grafana (SSO working, datasources configured) +# - Loki (logs visible) +# - Alertmanager (cluster formed) +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +PASSED=0 +FAILED=0 +WARNINGS=0 + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_pass() { + echo -e "${GREEN}[PASS]${NC} $1" + ((PASSED++)) +} + +log_fail() { + echo -e "${RED}[FAIL]${NC} $1" + ((FAILED++)) +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" + ((WARNINGS++)) +} + +check_prometheus() { + log_info "Checking Prometheus..." + + # Check if namespace exists + if ! 
kubectl get namespace observability &> /dev/null; then + log_fail "observability namespace does not exist" + return + fi + + # Check pod health + local ready_pods + ready_pods=$(kubectl -n observability get pods -l app=prometheus -o json 2>/dev/null | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length') + + if [ "$ready_pods" -ge 1 ]; then + log_pass "Prometheus has $ready_pods ready pods" + else + log_fail "Prometheus has no ready pods" + return + fi + + # Check Prometheus targets via API + local prom_pod + prom_pod=$(kubectl -n observability get pods -l app=prometheus -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + + if [ -n "$prom_pod" ]; then + local targets + targets=$(kubectl -n observability exec "$prom_pod" -- wget -q -O - http://localhost:9090/api/v1/targets 2>/dev/null || echo '{}') + + local active_targets + active_targets=$(echo "$targets" | jq '.data.activeTargets | length' 2>/dev/null || echo "0") + + local up_targets + up_targets=$(echo "$targets" | jq '[.data.activeTargets[] | select(.health == "up")] | length' 2>/dev/null || echo "0") + + if [ "$active_targets" -gt 0 ]; then + log_pass "Prometheus has $active_targets active targets ($up_targets UP)" + else + log_warn "No active Prometheus targets found" + fi + + # Check for down targets + local down_targets + down_targets=$(echo "$targets" | jq '[.data.activeTargets[] | select(.health == "down")] | length' 2>/dev/null || echo "0") + + if [ "$down_targets" -gt 0 ]; then + log_warn "$down_targets Prometheus targets are DOWN" + fi + fi +} + +check_grafana() { + log_info "Checking Grafana..." 
+ + # Check pod health + local ready_pods + ready_pods=$(kubectl -n observability get pods -l app=grafana -o json 2>/dev/null | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length') + + if [ "$ready_pods" -ge 1 ]; then + log_pass "Grafana has $ready_pods ready pods" + else + log_fail "Grafana has no ready pods" + return + fi + + # Check Grafana service + local svc_exists + svc_exists=$(kubectl -n observability get svc grafana -o name 2>/dev/null || echo "") + + if [ -n "$svc_exists" ]; then + log_pass "Grafana service exists" + else + log_fail "Grafana service does not exist" + fi + + # Check datasources ConfigMap + local datasources_cm + datasources_cm=$(kubectl -n observability get configmap grafana-datasources -o name 2>/dev/null || echo "") + + if [ -n "$datasources_cm" ]; then + log_pass "Grafana datasources ConfigMap exists" + else + log_warn "Grafana datasources ConfigMap not found" + fi + + # Check dashboards ConfigMaps + local dashboard_cms + dashboard_cms=$(kubectl -n observability get configmap -l grafana_dashboard=1 -o name 2>/dev/null | wc -l | tr -d ' ') + + if [ "$dashboard_cms" -gt 0 ]; then + log_pass "Found $dashboard_cms Grafana dashboard ConfigMaps" + else + log_warn "No Grafana dashboard ConfigMaps found" + fi + + # Check ingress + local ingress_exists + ingress_exists=$(kubectl get ingress -A -o json 2>/dev/null | jq '[.items[] | select(.spec.rules[]?.host | contains("grafana"))] | length') + + if [ "$ingress_exists" -ge 1 ]; then + log_pass "Grafana ingress configured" + else + log_warn "No Grafana ingress found" + fi +} + +check_loki() { + log_info "Checking Loki..." 
+ + # Check pod health + local ready_pods + ready_pods=$(kubectl -n observability get pods -l app=loki -o json 2>/dev/null | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length') + + if [ "$ready_pods" -ge 1 ]; then + log_pass "Loki has $ready_pods ready pods" + else + log_fail "Loki has no ready pods" + return + fi + + # Check Loki service + local svc_exists + svc_exists=$(kubectl -n observability get svc loki -o name 2>/dev/null || echo "") + + if [ -n "$svc_exists" ]; then + log_pass "Loki service exists" + else + log_fail "Loki service does not exist" + fi + + # Check Loki health + local loki_pod + loki_pod=$(kubectl -n observability get pods -l app=loki -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + + if [ -n "$loki_pod" ]; then + local loki_ready + loki_ready=$(kubectl -n observability exec "$loki_pod" -- wget -q -O - http://localhost:3100/ready 2>/dev/null || echo "") + + if [ "$loki_ready" = "ready" ]; then + log_pass "Loki is ready" + else + log_warn "Loki readiness check returned: $loki_ready" + fi + fi +} + +check_alertmanager() { + log_info "Checking Alertmanager..." 
+ + # Check pod health + local ready_pods + ready_pods=$(kubectl -n observability get pods -l app=alertmanager -o json 2>/dev/null | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length') + + if [ "$ready_pods" -ge 1 ]; then + log_pass "Alertmanager has $ready_pods ready pods" + else + log_fail "Alertmanager has no ready pods" + return + fi + + # Check Alertmanager cluster status + local am_pod + am_pod=$(kubectl -n observability get pods -l app=alertmanager -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + + if [ -n "$am_pod" ]; then + local cluster_status + cluster_status=$(kubectl -n observability exec "$am_pod" -- wget -q -O - http://localhost:9093/api/v2/status 2>/dev/null || echo '{}') + + local cluster_peers + cluster_peers=$(echo "$cluster_status" | jq '.cluster.peers | length' 2>/dev/null || echo "0") + + if [ "$cluster_peers" -ge 1 ]; then + log_pass "Alertmanager cluster formed ($cluster_peers peers)" + else + log_warn "Alertmanager cluster not formed" + fi + fi + + # Check for active alerts + if [ -n "$am_pod" ]; then + local active_alerts + active_alerts=$(kubectl -n observability exec "$am_pod" -- wget -q -O - http://localhost:9093/api/v2/alerts 2>/dev/null | jq 'length' 2>/dev/null || echo "0") + + if [ "$active_alerts" -gt 0 ]; then + log_warn "$active_alerts active alerts in Alertmanager" + else + log_pass "No active alerts" + fi + fi +} + +check_promtail() { + log_info "Checking Promtail (log shipper)..." 
+ + # Check DaemonSet + local promtail_ds + promtail_ds=$(kubectl -n observability get daemonset promtail -o json 2>/dev/null || echo '{}') + + local desired + desired=$(echo "$promtail_ds" | jq '.status.desiredNumberScheduled // 0') + + local ready + ready=$(echo "$promtail_ds" | jq '.status.numberReady // 0') + + if [ "$desired" -eq 0 ]; then + log_warn "Promtail DaemonSet not found (may use different log collector)" + elif [ "$ready" -eq "$desired" ]; then + log_pass "Promtail DaemonSet healthy ($ready/$desired ready)" + else + log_warn "Promtail DaemonSet partially ready ($ready/$desired)" + fi +} + +print_summary() { + echo "" + echo "========================================" + echo "Phase 4 Validation Summary" + echo "========================================" + echo -e " ${GREEN}Passed:${NC} $PASSED" + echo -e " ${RED}Failed:${NC} $FAILED" + echo -e " ${YELLOW}Warnings:${NC} $WARNINGS" + echo "========================================" + + if [ "$FAILED" -gt 0 ]; then + echo "" + echo -e "${RED}Phase 4 validation FAILED${NC}" + echo "Please fix the issues above before proceeding to Phase 5." + exit 1 + elif [ "$WARNINGS" -gt 0 ]; then + echo "" + echo -e "${YELLOW}Phase 4 validation passed with warnings${NC}" + echo "Review warnings above. You may proceed to Phase 5." + exit 0 + else + echo "" + echo -e "${GREEN}Phase 4 validation PASSED${NC}" + echo "You may proceed to Phase 5 (optional)." 
+ exit 0 + fi +} + +main() { + echo "========================================" + echo "Phase 4 Validation: Observability Stack" + echo "========================================" + echo "" + + check_prometheus + check_grafana + check_loki + check_alertmanager + check_promtail + + print_summary +} + +main "$@" diff --git a/scripts/validate-phase5.sh b/scripts/validate-phase5.sh new file mode 100755 index 0000000..e43b1de --- /dev/null +++ b/scripts/validate-phase5.sh @@ -0,0 +1,224 @@ +#!/usr/bin/env bash +# +# Phase 5 Validation: E2EE Webmail +# +# Validates that RFC 0043 components are deployed and healthy: +# - Webmail frontend (loads, login works) +# - Key service (health endpoint, key generation) +# - E2EE functionality (encryption/decryption) +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +PASSED=0 +FAILED=0 +WARNINGS=0 + +# Configuration +DOMAIN="${DOMAIN:-coherence.dev}" + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_pass() { + echo -e "${GREEN}[PASS]${NC} $1" + ((PASSED++)) +} + +log_fail() { + echo -e "${RED}[FAIL]${NC} $1" + ((FAILED++)) +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" + ((WARNINGS++)) +} + +check_webmail_frontend() { + log_info "Checking webmail frontend..." + + # Check if namespace exists + if ! 
kubectl get namespace webmail &> /dev/null; then + log_warn "webmail namespace does not exist (Phase 5 may be optional)" + return + fi + + # Check pod health + local ready_pods + ready_pods=$(kubectl -n webmail get pods -l app=webmail -o json 2>/dev/null | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length') + + if [ "$ready_pods" -ge 1 ]; then + log_pass "Webmail frontend has $ready_pods ready pods" + else + log_fail "Webmail frontend has no ready pods" + return + fi + + # Check service + local svc_exists + svc_exists=$(kubectl -n webmail get svc webmail -o name 2>/dev/null || echo "") + + if [ -n "$svc_exists" ]; then + log_pass "Webmail service exists" + else + log_fail "Webmail service does not exist" + fi + + # Check ingress + local ingress_exists + ingress_exists=$(kubectl get ingress -A -o json 2>/dev/null | jq '[.items[] | select(.spec.rules[]?.host | contains("mail"))] | length') + + if [ "$ingress_exists" -ge 1 ]; then + log_pass "Webmail ingress configured" + else + log_warn "No webmail ingress found" + fi +} + +check_key_service() { + log_info "Checking key service..." + + # Check if namespace exists + if ! 
kubectl get namespace webmail &> /dev/null; then + log_warn "webmail namespace does not exist" + return + fi + + # Check pod health + local ready_pods + ready_pods=$(kubectl -n webmail get pods -l app=key-service -o json 2>/dev/null | jq '[.items[] | select(.status.phase == "Running") | select(.status.containerStatuses[]?.ready == true)] | length') + + if [ "$ready_pods" -ge 1 ]; then + log_pass "Key service has $ready_pods ready pods" + else + log_fail "Key service has no ready pods" + return + fi + + # Check health endpoint + local ks_pod + ks_pod=$(kubectl -n webmail get pods -l app=key-service -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + + if [ -n "$ks_pod" ]; then + local health_status + health_status=$(kubectl -n webmail exec "$ks_pod" -- wget -q -O - http://localhost:8080/health 2>/dev/null || echo "") + + if echo "$health_status" | grep -q "ok\|healthy\|200"; then + log_pass "Key service health endpoint responding" + else + log_warn "Key service health check returned: $health_status" + fi + fi + + # Check Vault integration + log_info "Checking Vault PKI integration..." + local vault_pki_exists + vault_pki_exists=$(kubectl -n vault exec vault-0 -- vault secrets list -format=json 2>/dev/null | jq 'has("pki_smime/")' || echo "false") + + if [ "$vault_pki_exists" = "true" ]; then + log_pass "Vault S/MIME PKI is configured" + else + log_warn "Vault S/MIME PKI not found" + fi +} + +check_e2ee_functionality() { + log_info "Checking E2EE functionality..." + + log_info "Manual verification required:" + echo " 1. Navigate to https://mail.$DOMAIN" + echo " 2. Login via Keycloak SSO" + echo " 3. Verify key pair is generated on first login" + echo " 4. Compose an email to another user" + echo " 5. Verify email is encrypted in transit" + echo " 6. Verify recipient can decrypt and read email" + echo "" + + log_warn "E2EE functionality requires manual testing" +} + +check_stalwart_integration() { + log_info "Checking Stalwart integration..." 
+ + # Check if webmail can reach Stalwart + local webmail_pod + webmail_pod=$(kubectl -n webmail get pods -l app=webmail -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + + if [ -n "$webmail_pod" ]; then + # Check IMAP connectivity + local imap_check + imap_check=$(kubectl -n webmail exec "$webmail_pod" -- nc -z stalwart.email.svc.cluster.local 993 2>/dev/null && echo "ok" || echo "fail") + + if [ "$imap_check" = "ok" ]; then + log_pass "Webmail can reach Stalwart IMAP" + else + log_warn "Webmail cannot reach Stalwart IMAP" + fi + + # Check SMTP connectivity + local smtp_check + smtp_check=$(kubectl -n webmail exec "$webmail_pod" -- nc -z stalwart.email.svc.cluster.local 587 2>/dev/null && echo "ok" || echo "fail") + + if [ "$smtp_check" = "ok" ]; then + log_pass "Webmail can reach Stalwart SMTP" + else + log_warn "Webmail cannot reach Stalwart SMTP" + fi + else + log_warn "Cannot check Stalwart integration - no webmail pod" + fi +} + +print_summary() { + echo "" + echo "========================================" + echo "Phase 5 Validation Summary" + echo "========================================" + echo -e " ${GREEN}Passed:${NC} $PASSED" + echo -e " ${RED}Failed:${NC} $FAILED" + echo -e " ${YELLOW}Warnings:${NC} $WARNINGS" + echo "========================================" + + if [ "$FAILED" -gt 0 ]; then + echo "" + echo -e "${RED}Phase 5 validation FAILED${NC}" + echo "Please fix the issues above." + exit 1 + elif [ "$WARNINGS" -gt 0 ]; then + echo "" + echo -e "${YELLOW}Phase 5 validation passed with warnings${NC}" + echo "Review warnings above. Perform manual E2EE testing." + exit 0 + else + echo "" + echo -e "${GREEN}Phase 5 validation PASSED${NC}" + echo "Full infrastructure stack deployment complete!" 
+ exit 0 + fi +} + +main() { + echo "========================================" + echo "Phase 5 Validation: E2EE Webmail" + echo "========================================" + echo "" + + check_webmail_frontend + check_key_service + check_stalwart_integration + check_e2ee_functionality + + print_summary +} + +main "$@" diff --git a/terraform/dns-elastic-ips.tf b/terraform/dns-elastic-ips.tf new file mode 100644 index 0000000..3fcb582 --- /dev/null +++ b/terraform/dns-elastic-ips.tf @@ -0,0 +1,84 @@ +# DNS Elastic IPs for Stable Glue Records +# RFC 0046: Domain Email Migration - Phase 1 +# +# Purpose: Allocate static Elastic IPs for the DNS NLB to enable +# stable glue records at GoDaddy for NS delegation. +# +# Architecture: +# - 3 EIPs (one per AZ) for high availability +# - Attached to NLB via subnet_mapping +# - Used as glue records: ns1.domain.com, ns2.domain.com, ns3.domain.com +# +# Cost: ~$0/mo (EIPs attached to running resources are free) + +locals { + # Enable static IPs for DNS delegation + enable_dns_static_ips = var.enable_dns_static_ips + + # Domains to be delegated from GoDaddy to PowerDNS + managed_domains = [ + "superviber.com", + "muffinlabs.ai", + "letemcook.com", + "appbasecamp.com", + "thanksforborrowing.com", + "alignment.coop" + ] +} + +# Allocate Elastic IPs for DNS NLB (one per AZ) +resource "aws_eip" "dns" { + count = local.enable_dns_static_ips ? 
length(local.azs) : 0 + domain = "vpc" + + tags = merge(local.common_tags, { + Name = "${local.name}-dns-${count.index + 1}" + Purpose = "dns-nlb" + RFC = "0046" + Description = "Stable IP for DNS glue records - AZ ${count.index + 1}" + }) + + lifecycle { + # Prevent accidental deletion - changing these breaks glue records + prevent_destroy = true + } +} + +# Output the EIP public IPs for glue record configuration +output "dns_elastic_ips" { + description = "Elastic IP addresses for DNS NLB (use for glue records at GoDaddy)" + value = aws_eip.dns[*].public_ip +} + +output "dns_elastic_ip_allocation_ids" { + description = "Elastic IP allocation IDs for NLB subnet_mapping" + value = aws_eip.dns[*].id +} + +output "glue_record_instructions" { + description = "Instructions for configuring glue records at GoDaddy" + value = local.enable_dns_static_ips ? join("\n", [ + "================================================================================", + "GoDaddy Glue Record Configuration", + "RFC 0046: Domain Email Migration - DNS Delegation", + "================================================================================", + "", + "Domains: ${join(", ", local.managed_domains)}", + "", + "1. Custom Nameservers (set for each domain):", + " - ns1.", + " - ns2.", + " - ns3.", + "", + "2. Glue Records (Host Records):", + " ns1 -> ${try(aws_eip.dns[0].public_ip, "PENDING")}", + " ns2 -> ${try(aws_eip.dns[1].public_ip, "PENDING")}", + " ns3 -> ${try(aws_eip.dns[2].public_ip, "PENDING")}", + "", + "3. Verification Commands:", + " dig @${try(aws_eip.dns[0].public_ip, "PENDING")} superviber.com NS", + " dig @8.8.8.8 superviber.com NS", + "", + "================================================================================" + ]) : "DNS static IPs not enabled. Set enable_dns_static_ips = true to allocate EIPs." 
+} diff --git a/terraform/main.tf b/terraform/main.tf new file mode 100644 index 0000000..29f4d9d --- /dev/null +++ b/terraform/main.tf @@ -0,0 +1,128 @@ +# Foundation Infrastructure - Main Configuration +# RFC 0039: ADR-Compliant Foundation Infrastructure +# +# Architecture: +# - VPC with 3 AZs for high availability +# - EKS cluster with Karpenter for auto-scaling compute +# - CockroachDB 3-node cluster with FIPS 140-2 +# - Shared NLB for all services +# - EBS gp3 and EFS storage +# - S3 for blob storage and backups + +locals { + name = "${var.project_name}-${var.environment}" + + common_tags = merge(var.tags, { + Project = var.project_name + Environment = var.environment + ManagedBy = "terraform" + RFC = "0039" + ADR = "0003,0004,0005" + }) + + # Get available AZs in the region + azs = slice(data.aws_availability_zones.available.names, 0, 3) +} + +data "aws_availability_zones" "available" { + state = "available" + filter { + name = "opt-in-status" + values = ["opt-in-not-required"] + } +} + +data "aws_caller_identity" "current" {} + +# VPC Module - Multi-AZ networking +module "vpc" { + source = "./modules/vpc" + + name = local.name + cidr = var.vpc_cidr + availability_zones = local.azs + enable_nat_gateway = true + single_nat_gateway = false # HA: one NAT per AZ + + tags = local.common_tags +} + +# EKS Module - Kubernetes with Karpenter +module "eks" { + source = "./modules/eks" + + cluster_name = local.name + cluster_version = var.kubernetes_version + vpc_id = module.vpc.vpc_id + private_subnet_ids = module.vpc.private_subnet_ids + public_subnet_ids = module.vpc.public_subnet_ids + + # Karpenter configuration + enable_karpenter = true + + # FIPS compliance + enable_fips = var.enable_fips + + tags = local.common_tags + + depends_on = [module.vpc] +} + +# IAM Module - Roles and IRSA +module "iam" { + source = "./modules/iam" + + cluster_name = module.eks.cluster_name + cluster_oidc_issuer_url = module.eks.cluster_oidc_issuer_url + cluster_oidc_provider_arn = 
module.eks.oidc_provider_arn + + tags = local.common_tags + + depends_on = [module.eks] +} + +# Storage Module - EBS, EFS, S3 +module "storage" { + source = "./modules/storage" + + name = local.name + vpc_id = module.vpc.vpc_id + private_subnet_ids = module.vpc.private_subnet_ids + availability_zones = local.azs + + # Enable encryption for FIPS compliance + enable_encryption = var.enable_fips + + tags = local.common_tags + + depends_on = [module.vpc] +} + +# NLB Module - Shared Network Load Balancer +# RFC 0046: Updated to support Elastic IPs for DNS glue records +module "nlb" { + source = "./modules/nlb" + + name = local.name + vpc_id = module.vpc.vpc_id + public_subnet_ids = module.vpc.public_subnet_ids + + # RFC 0046: Enable static IPs for stable DNS glue records + enable_static_ips = var.enable_dns_static_ips + elastic_ip_ids = var.enable_dns_static_ips ? aws_eip.dns[*].id : [] + + tags = local.common_tags + + depends_on = [module.vpc] +} + +# S3 Module - Additional blob storage buckets +module "s3" { + source = "./modules/s3" + + name = local.name + log_retention_days = var.log_retention_days + trace_retention_days = var.trace_retention_days + + tags = local.common_tags +} diff --git a/terraform/modules/eks/main.tf b/terraform/modules/eks/main.tf new file mode 100644 index 0000000..ad72dd2 --- /dev/null +++ b/terraform/modules/eks/main.tf @@ -0,0 +1,398 @@ +# EKS Module - Kubernetes Cluster with Karpenter +# RFC 0039: ADR-Compliant Foundation Infrastructure +# ADR 0004: "Set It and Forget It" Auto-Scaling Architecture +# +# Architecture: +# - EKS control plane (~$73/mo) +# - Fargate profile for Karpenter (chicken-egg problem) +# - Karpenter for auto-scaling nodes +# - No managed node groups (Karpenter handles all compute) +# - FIPS-compliant node configuration + +locals { + cluster_name = var.cluster_name +} + +# EKS Cluster +resource "aws_eks_cluster" "main" { + name = local.cluster_name + version = var.cluster_version + role_arn = aws_iam_role.cluster.arn + 
+ vpc_config { + subnet_ids = concat(var.private_subnet_ids, var.public_subnet_ids) + endpoint_private_access = true + endpoint_public_access = var.enable_public_access + public_access_cidrs = var.public_access_cidrs + security_group_ids = [aws_security_group.cluster.id] + } + + # Enable all log types for security compliance + enabled_cluster_log_types = [ + "api", + "audit", + "authenticator", + "controllerManager", + "scheduler" + ] + + # Encryption configuration for secrets + encryption_config { + provider { + key_arn = aws_kms_key.eks.arn + } + resources = ["secrets"] + } + + tags = merge(var.tags, { + Name = local.cluster_name + }) + + depends_on = [ + aws_iam_role_policy_attachment.cluster_AmazonEKSClusterPolicy, + aws_iam_role_policy_attachment.cluster_AmazonEKSVPCResourceController, + aws_cloudwatch_log_group.cluster, + ] +} + +# CloudWatch Log Group for EKS +resource "aws_cloudwatch_log_group" "cluster" { + name = "/aws/eks/${local.cluster_name}/cluster" + retention_in_days = 30 + + tags = var.tags +} + +# KMS Key for EKS secrets encryption +resource "aws_kms_key" "eks" { + description = "EKS cluster secrets encryption key" + deletion_window_in_days = 7 + enable_key_rotation = true + + tags = merge(var.tags, { + Name = "${local.cluster_name}-eks-secrets" + }) +} + +resource "aws_kms_alias" "eks" { + name = "alias/${local.cluster_name}-eks-secrets" + target_key_id = aws_kms_key.eks.key_id +} + +# EKS Cluster IAM Role +resource "aws_iam_role" "cluster" { + name = "${local.cluster_name}-cluster" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "eks.amazonaws.com" + } + } + ] + }) + + tags = var.tags +} + +resource "aws_iam_role_policy_attachment" "cluster_AmazonEKSClusterPolicy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy" + role = aws_iam_role.cluster.name +} + +resource "aws_iam_role_policy_attachment" 
"cluster_AmazonEKSVPCResourceController" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSVPCResourceController" + role = aws_iam_role.cluster.name +} + +# Cluster Security Group +resource "aws_security_group" "cluster" { + name = "${local.cluster_name}-cluster" + description = "EKS cluster security group" + vpc_id = var.vpc_id + + tags = merge(var.tags, { + Name = "${local.cluster_name}-cluster" + "kubernetes.io/cluster/${local.cluster_name}" = "owned" + }) +} + +resource "aws_security_group_rule" "cluster_egress" { + type = "egress" + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + security_group_id = aws_security_group.cluster.id + description = "Allow all outbound traffic" +} + +# Node Security Group (for Karpenter-managed nodes) +resource "aws_security_group" "node" { + name = "${local.cluster_name}-node" + description = "Security group for EKS nodes" + vpc_id = var.vpc_id + + tags = merge(var.tags, { + Name = "${local.cluster_name}-node" + "kubernetes.io/cluster/${local.cluster_name}" = "owned" + "karpenter.sh/discovery" = local.cluster_name + }) +} + +resource "aws_security_group_rule" "node_egress" { + type = "egress" + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + security_group_id = aws_security_group.node.id + description = "Allow all outbound traffic" +} + +resource "aws_security_group_rule" "node_ingress_self" { + type = "ingress" + from_port = 0 + to_port = 65535 + protocol = "-1" + source_security_group_id = aws_security_group.node.id + security_group_id = aws_security_group.node.id + description = "Allow nodes to communicate with each other" +} + +resource "aws_security_group_rule" "node_ingress_cluster" { + type = "ingress" + from_port = 443 + to_port = 443 + protocol = "tcp" + source_security_group_id = aws_security_group.cluster.id + security_group_id = aws_security_group.node.id + description = "Allow cluster API to communicate with nodes" +} + +resource "aws_security_group_rule" 
"node_ingress_cluster_kubelet" { + type = "ingress" + from_port = 10250 + to_port = 10250 + protocol = "tcp" + source_security_group_id = aws_security_group.cluster.id + security_group_id = aws_security_group.node.id + description = "Allow cluster API to communicate with kubelet" +} + +resource "aws_security_group_rule" "cluster_ingress_node" { + type = "ingress" + from_port = 443 + to_port = 443 + protocol = "tcp" + source_security_group_id = aws_security_group.node.id + security_group_id = aws_security_group.cluster.id + description = "Allow nodes to communicate with cluster API" +} + +# Fargate Profile for Karpenter +# Karpenter needs to run on Fargate to avoid chicken-egg problem +resource "aws_eks_fargate_profile" "karpenter" { + count = var.enable_karpenter ? 1 : 0 + + cluster_name = aws_eks_cluster.main.name + fargate_profile_name = "karpenter" + pod_execution_role_arn = aws_iam_role.fargate.arn + + subnet_ids = var.private_subnet_ids + + selector { + namespace = "karpenter" + } + + tags = var.tags +} + +# Fargate Profile for kube-system (CoreDNS, etc) +resource "aws_eks_fargate_profile" "kube_system" { + cluster_name = aws_eks_cluster.main.name + fargate_profile_name = "kube-system" + pod_execution_role_arn = aws_iam_role.fargate.arn + + subnet_ids = var.private_subnet_ids + + selector { + namespace = "kube-system" + } + + tags = var.tags +} + +# Fargate IAM Role +resource "aws_iam_role" "fargate" { + name = "${local.cluster_name}-fargate" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "eks-fargate-pods.amazonaws.com" + } + } + ] + }) + + tags = var.tags +} + +resource "aws_iam_role_policy_attachment" "fargate_AmazonEKSFargatePodExecutionRolePolicy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSFargatePodExecutionRolePolicy" + role = aws_iam_role.fargate.name +} + +# OIDC Provider for IRSA +data "tls_certificate" "cluster" { + url = 
aws_eks_cluster.main.identity[0].oidc[0].issuer +} + +resource "aws_iam_openid_connect_provider" "cluster" { + client_id_list = ["sts.amazonaws.com"] + thumbprint_list = [data.tls_certificate.cluster.certificates[0].sha1_fingerprint] + url = aws_eks_cluster.main.identity[0].oidc[0].issuer + + tags = var.tags +} + +# Node IAM Role (for Karpenter-managed nodes) +resource "aws_iam_role" "node" { + name = "${local.cluster_name}-node" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "ec2.amazonaws.com" + } + } + ] + }) + + tags = var.tags +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEKSWorkerNodePolicy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" + role = aws_iam_role.node.name +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEKS_CNI_Policy" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" + role = aws_iam_role.node.name +} + +resource "aws_iam_role_policy_attachment" "node_AmazonEC2ContainerRegistryReadOnly" { + policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" + role = aws_iam_role.node.name +} + +resource "aws_iam_role_policy_attachment" "node_AmazonSSMManagedInstanceCore" { + policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + role = aws_iam_role.node.name +} + +resource "aws_iam_instance_profile" "node" { + name = "${local.cluster_name}-node" + role = aws_iam_role.node.name + + tags = var.tags +} + +# EKS Addons +resource "aws_eks_addon" "vpc_cni" { + cluster_name = aws_eks_cluster.main.name + addon_name = "vpc-cni" + addon_version = var.vpc_cni_version + resolve_conflicts_on_create = "OVERWRITE" + resolve_conflicts_on_update = "OVERWRITE" + + tags = var.tags +} + +resource "aws_eks_addon" "coredns" { + cluster_name = aws_eks_cluster.main.name + addon_name = "coredns" + addon_version = var.coredns_version + resolve_conflicts_on_create = "OVERWRITE" + 
resolve_conflicts_on_update = "OVERWRITE" + + # CoreDNS needs compute to run + depends_on = [aws_eks_fargate_profile.kube_system] + + tags = var.tags +} + +resource "aws_eks_addon" "kube_proxy" { + cluster_name = aws_eks_cluster.main.name + addon_name = "kube-proxy" + addon_version = var.kube_proxy_version + resolve_conflicts_on_create = "OVERWRITE" + resolve_conflicts_on_update = "OVERWRITE" + + tags = var.tags +} + +resource "aws_eks_addon" "ebs_csi_driver" { + cluster_name = aws_eks_cluster.main.name + addon_name = "aws-ebs-csi-driver" + addon_version = var.ebs_csi_version + service_account_role_arn = var.ebs_csi_role_arn + resolve_conflicts_on_create = "OVERWRITE" + resolve_conflicts_on_update = "OVERWRITE" + + tags = var.tags +} + +# aws-auth ConfigMap for Karpenter nodes +resource "kubernetes_config_map_v1_data" "aws_auth" { + metadata { + name = "aws-auth" + namespace = "kube-system" + } + + data = { + mapRoles = yamlencode([ + { + rolearn = aws_iam_role.node.arn + username = "system:node:{{EC2PrivateDNSName}}" + groups = [ + "system:bootstrappers", + "system:nodes", + ] + }, + { + rolearn = aws_iam_role.fargate.arn + username = "system:node:{{SessionName}}" + groups = [ + "system:bootstrappers", + "system:nodes", + "system:node-proxier", + ] + } + ]) + } + + force = true + + depends_on = [aws_eks_cluster.main] +} + +# Data source for current region +data "aws_region" "current" {} + +# Data source for current account +data "aws_caller_identity" "current" {} diff --git a/terraform/modules/eks/outputs.tf b/terraform/modules/eks/outputs.tf new file mode 100644 index 0000000..ee3391f --- /dev/null +++ b/terraform/modules/eks/outputs.tf @@ -0,0 +1,88 @@ +# EKS Module - Outputs +# RFC 0039: ADR-Compliant Foundation Infrastructure + +output "cluster_name" { + description = "EKS cluster name" + value = aws_eks_cluster.main.name +} + +output "cluster_id" { + description = "EKS cluster ID" + value = aws_eks_cluster.main.id +} + +output "cluster_arn" { + description = 
"EKS cluster ARN" + value = aws_eks_cluster.main.arn +} + +output "cluster_endpoint" { + description = "EKS cluster API endpoint" + value = aws_eks_cluster.main.endpoint +} + +output "cluster_version" { + description = "EKS cluster Kubernetes version" + value = aws_eks_cluster.main.version +} + +output "cluster_certificate_authority_data" { + description = "EKS cluster CA certificate (base64)" + value = aws_eks_cluster.main.certificate_authority[0].data + sensitive = true +} + +output "cluster_oidc_issuer_url" { + description = "OIDC issuer URL for IRSA" + value = aws_eks_cluster.main.identity[0].oidc[0].issuer +} + +output "oidc_provider_arn" { + description = "OIDC provider ARN for IRSA" + value = aws_iam_openid_connect_provider.cluster.arn +} + +output "cluster_security_group_id" { + description = "Cluster security group ID" + value = aws_security_group.cluster.id +} + +output "node_security_group_id" { + description = "Node security group ID" + value = aws_security_group.node.id +} + +output "node_iam_role_arn" { + description = "Node IAM role ARN" + value = aws_iam_role.node.arn +} + +output "node_iam_role_name" { + description = "Node IAM role name" + value = aws_iam_role.node.name +} + +output "node_instance_profile_name" { + description = "Node instance profile name" + value = aws_iam_instance_profile.node.name +} + +output "fargate_role_arn" { + description = "Fargate pod execution role ARN" + value = aws_iam_role.fargate.arn +} + +output "cluster_primary_security_group_id" { + description = "EKS cluster primary security group ID" + value = aws_eks_cluster.main.vpc_config[0].cluster_security_group_id +} + +output "kms_key_arn" { + description = "KMS key ARN for EKS secrets encryption" + value = aws_kms_key.eks.arn +} + +output "kms_key_id" { + description = "KMS key ID for EKS secrets encryption" + value = aws_kms_key.eks.key_id +} diff --git a/terraform/modules/eks/variables.tf b/terraform/modules/eks/variables.tf new file mode 100644 index 
0000000..30f539b --- /dev/null +++ b/terraform/modules/eks/variables.tf @@ -0,0 +1,89 @@ +# EKS Module - Variables +# RFC 0039: ADR-Compliant Foundation Infrastructure + +variable "cluster_name" { + description = "Name of the EKS cluster" + type = string +} + +variable "cluster_version" { + description = "Kubernetes version for EKS" + type = string + default = "1.29" +} + +variable "vpc_id" { + description = "VPC ID" + type = string +} + +variable "private_subnet_ids" { + description = "List of private subnet IDs" + type = list(string) +} + +variable "public_subnet_ids" { + description = "List of public subnet IDs" + type = list(string) +} + +variable "enable_karpenter" { + description = "Enable Karpenter for auto-scaling nodes" + type = bool + default = true +} + +variable "enable_fips" { + description = "Enable FIPS 140-2 compliance mode" + type = bool + default = true +} + +variable "enable_public_access" { + description = "Enable public access to EKS API endpoint" + type = bool + default = true +} + +variable "public_access_cidrs" { + description = "CIDR blocks allowed to access EKS API endpoint" + type = list(string) + default = ["0.0.0.0/0"] +} + +# Addon versions - update these as new versions are released +variable "vpc_cni_version" { + description = "VPC CNI addon version" + type = string + default = "v1.16.0-eksbuild.1" +} + +variable "coredns_version" { + description = "CoreDNS addon version" + type = string + default = "v1.11.1-eksbuild.6" +} + +variable "kube_proxy_version" { + description = "kube-proxy addon version" + type = string + default = "v1.29.0-eksbuild.1" +} + +variable "ebs_csi_version" { + description = "EBS CSI driver addon version" + type = string + default = "v1.26.1-eksbuild.1" +} + +variable "ebs_csi_role_arn" { + description = "IAM role ARN for EBS CSI driver (IRSA)" + type = string + default = null +} + +variable "tags" { + description = "Tags to apply to all resources" + type = map(string) + default = {} +} diff --git 
a/terraform/modules/iam/main.tf b/terraform/modules/iam/main.tf new file mode 100644 index 0000000..76efd88 --- /dev/null +++ b/terraform/modules/iam/main.tf @@ -0,0 +1,426 @@ +# IAM Module - Roles and IRSA +# RFC 0039: ADR-Compliant Foundation Infrastructure +# +# Creates IAM roles for: +# - Karpenter node provisioning +# - EBS CSI Driver +# - EFS CSI Driver +# - AWS Load Balancer Controller +# - External DNS +# - cert-manager + +locals { + oidc_issuer = replace(var.cluster_oidc_issuer_url, "https://", "") +} + +# Karpenter Controller IAM Role +resource "aws_iam_role" "karpenter_controller" { + name = "${var.cluster_name}-karpenter-controller" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Federated = var.cluster_oidc_provider_arn + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "${local.oidc_issuer}:aud" = "sts.amazonaws.com" + "${local.oidc_issuer}:sub" = "system:serviceaccount:karpenter:karpenter" + } + } + } + ] + }) + + tags = var.tags +} + +resource "aws_iam_role_policy" "karpenter_controller" { + name = "${var.cluster_name}-karpenter-controller" + role = aws_iam_role.karpenter_controller.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "Karpenter" + Effect = "Allow" + Action = [ + "ec2:CreateLaunchTemplate", + "ec2:CreateFleet", + "ec2:CreateTags", + "ec2:DescribeLaunchTemplates", + "ec2:DescribeImages", + "ec2:DescribeInstances", + "ec2:DescribeSecurityGroups", + "ec2:DescribeSubnets", + "ec2:DescribeInstanceTypes", + "ec2:DescribeInstanceTypeOfferings", + "ec2:DescribeAvailabilityZones", + "ec2:DescribeSpotPriceHistory", + "ec2:DeleteLaunchTemplate", + "ec2:RunInstances", + "ec2:TerminateInstances", + ] + Resource = "*" + }, + { + Sid = "PassNodeRole" + Effect = "Allow" + Action = "iam:PassRole" + Resource = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/${var.cluster_name}-node" + }, + { + Sid = 
"EKSClusterEndpointLookup" + Effect = "Allow" + Action = "eks:DescribeCluster" + Resource = "arn:aws:eks:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:cluster/${var.cluster_name}" + }, + { + Sid = "KarpenterInterruptionQueue" + Effect = "Allow" + Action = [ + "sqs:DeleteMessage", + "sqs:GetQueueUrl", + "sqs:GetQueueAttributes", + "sqs:ReceiveMessage" + ] + Resource = aws_sqs_queue.karpenter_interruption.arn + }, + { + Sid = "PricingAPI" + Effect = "Allow" + Action = "pricing:GetProducts" + Resource = "*" + }, + { + Sid = "SSM" + Effect = "Allow" + Action = "ssm:GetParameter" + Resource = "arn:aws:ssm:${data.aws_region.current.name}::parameter/aws/service/*" + } + ] + }) +} + +# Karpenter Interruption Queue (for Spot instance termination notices) +resource "aws_sqs_queue" "karpenter_interruption" { + name = "${var.cluster_name}-karpenter-interruption" + message_retention_seconds = 300 + + tags = var.tags +} + +resource "aws_sqs_queue_policy" "karpenter_interruption" { + queue_url = aws_sqs_queue.karpenter_interruption.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Sid = "EC2InterruptionPolicy" + Effect = "Allow" + Principal = { + Service = [ + "events.amazonaws.com", + "sqs.amazonaws.com" + ] + } + Action = "sqs:SendMessage" + Resource = aws_sqs_queue.karpenter_interruption.arn + } + ] + }) +} + +# EventBridge rules for Karpenter interruption handling +resource "aws_cloudwatch_event_rule" "spot_interruption" { + name = "${var.cluster_name}-spot-interruption" + description = "Spot instance interruption handler for Karpenter" + + event_pattern = jsonencode({ + source = ["aws.ec2"] + detail-type = ["EC2 Spot Instance Interruption Warning"] + }) + + tags = var.tags +} + +resource "aws_cloudwatch_event_target" "spot_interruption" { + rule = aws_cloudwatch_event_rule.spot_interruption.name + target_id = "KarpenterInterruptionQueue" + arn = aws_sqs_queue.karpenter_interruption.arn +} + +resource 
"aws_cloudwatch_event_rule" "instance_rebalance" { + name = "${var.cluster_name}-instance-rebalance" + description = "Instance rebalance recommendation for Karpenter" + + event_pattern = jsonencode({ + source = ["aws.ec2"] + detail-type = ["EC2 Instance Rebalance Recommendation"] + }) + + tags = var.tags +} + +resource "aws_cloudwatch_event_target" "instance_rebalance" { + rule = aws_cloudwatch_event_rule.instance_rebalance.name + target_id = "KarpenterInterruptionQueue" + arn = aws_sqs_queue.karpenter_interruption.arn +} + +resource "aws_cloudwatch_event_rule" "instance_state_change" { + name = "${var.cluster_name}-instance-state-change" + description = "Instance state change handler for Karpenter" + + event_pattern = jsonencode({ + source = ["aws.ec2"] + detail-type = ["EC2 Instance State-change Notification"] + }) + + tags = var.tags +} + +resource "aws_cloudwatch_event_target" "instance_state_change" { + rule = aws_cloudwatch_event_rule.instance_state_change.name + target_id = "KarpenterInterruptionQueue" + arn = aws_sqs_queue.karpenter_interruption.arn +} + +# EBS CSI Driver IAM Role +resource "aws_iam_role" "ebs_csi" { + name = "${var.cluster_name}-ebs-csi" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Federated = var.cluster_oidc_provider_arn + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "${local.oidc_issuer}:aud" = "sts.amazonaws.com" + "${local.oidc_issuer}:sub" = "system:serviceaccount:kube-system:ebs-csi-controller-sa" + } + } + } + ] + }) + + tags = var.tags +} + +resource "aws_iam_role_policy_attachment" "ebs_csi" { + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy" + role = aws_iam_role.ebs_csi.name +} + +# Additional EBS CSI policy for encryption +resource "aws_iam_role_policy" "ebs_csi_kms" { + name = "${var.cluster_name}-ebs-csi-kms" + role = aws_iam_role.ebs_csi.id + + policy = jsonencode({ + Version = 
"2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "kms:CreateGrant", + "kms:ListGrants", + "kms:RevokeGrant", + "kms:Encrypt", + "kms:Decrypt", + "kms:ReEncrypt*", + "kms:GenerateDataKey*", + "kms:DescribeKey" + ] + Resource = "*" + } + ] + }) +} + +# EFS CSI Driver IAM Role +resource "aws_iam_role" "efs_csi" { + name = "${var.cluster_name}-efs-csi" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Federated = var.cluster_oidc_provider_arn + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "${local.oidc_issuer}:aud" = "sts.amazonaws.com" + "${local.oidc_issuer}:sub" = "system:serviceaccount:kube-system:efs-csi-controller-sa" + } + } + } + ] + }) + + tags = var.tags +} + +resource "aws_iam_role_policy_attachment" "efs_csi" { + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEFSCSIDriverPolicy" + role = aws_iam_role.efs_csi.name +} + +# AWS Load Balancer Controller IAM Role +resource "aws_iam_role" "aws_load_balancer_controller" { + name = "${var.cluster_name}-aws-load-balancer-controller" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Federated = var.cluster_oidc_provider_arn + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "${local.oidc_issuer}:aud" = "sts.amazonaws.com" + "${local.oidc_issuer}:sub" = "system:serviceaccount:kube-system:aws-load-balancer-controller" + } + } + } + ] + }) + + tags = var.tags +} + +resource "aws_iam_role_policy" "aws_load_balancer_controller" { + name = "${var.cluster_name}-aws-load-balancer-controller" + role = aws_iam_role.aws_load_balancer_controller.id + + policy = file("${path.module}/policies/aws-load-balancer-controller.json") +} + +# cert-manager IAM Role +resource "aws_iam_role" "cert_manager" { + name = "${var.cluster_name}-cert-manager" + + assume_role_policy = jsonencode({ + Version = 
"2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Federated = var.cluster_oidc_provider_arn + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "${local.oidc_issuer}:aud" = "sts.amazonaws.com" + "${local.oidc_issuer}:sub" = "system:serviceaccount:cert-manager:cert-manager" + } + } + } + ] + }) + + tags = var.tags +} + +resource "aws_iam_role_policy" "cert_manager" { + name = "${var.cluster_name}-cert-manager" + role = aws_iam_role.cert_manager.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "route53:GetChange" + ] + Resource = "arn:aws:route53:::change/*" + }, + { + Effect = "Allow" + Action = [ + "route53:ChangeResourceRecordSets", + "route53:ListResourceRecordSets" + ] + Resource = "arn:aws:route53:::hostedzone/*" + }, + { + Effect = "Allow" + Action = "route53:ListHostedZonesByName" + Resource = "*" + } + ] + }) +} + +# External DNS IAM Role +resource "aws_iam_role" "external_dns" { + name = "${var.cluster_name}-external-dns" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Federated = var.cluster_oidc_provider_arn + } + Action = "sts:AssumeRoleWithWebIdentity" + Condition = { + StringEquals = { + "${local.oidc_issuer}:aud" = "sts.amazonaws.com" + "${local.oidc_issuer}:sub" = "system:serviceaccount:external-dns:external-dns" + } + } + } + ] + }) + + tags = var.tags +} + +resource "aws_iam_role_policy" "external_dns" { + name = "${var.cluster_name}-external-dns" + role = aws_iam_role.external_dns.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "route53:ChangeResourceRecordSets" + ] + Resource = "arn:aws:route53:::hostedzone/*" + }, + { + Effect = "Allow" + Action = [ + "route53:ListHostedZones", + "route53:ListResourceRecordSets" + ] + Resource = "*" + } + ] + }) +} + +data "aws_caller_identity" "current" {} +data 
"aws_region" "current" {} diff --git a/terraform/modules/iam/outputs.tf b/terraform/modules/iam/outputs.tf new file mode 100644 index 0000000..de2476c --- /dev/null +++ b/terraform/modules/iam/outputs.tf @@ -0,0 +1,72 @@ +# IAM Module - Outputs +# RFC 0039: ADR-Compliant Foundation Infrastructure + +output "karpenter_role_arn" { + description = "Karpenter controller IAM role ARN" + value = aws_iam_role.karpenter_controller.arn +} + +output "karpenter_role_name" { + description = "Karpenter controller IAM role name" + value = aws_iam_role.karpenter_controller.name +} + +output "karpenter_interruption_queue_name" { + description = "SQS queue name for Karpenter interruption handling" + value = aws_sqs_queue.karpenter_interruption.name +} + +output "karpenter_interruption_queue_arn" { + description = "SQS queue ARN for Karpenter interruption handling" + value = aws_sqs_queue.karpenter_interruption.arn +} + +output "ebs_csi_role_arn" { + description = "EBS CSI driver IAM role ARN" + value = aws_iam_role.ebs_csi.arn +} + +output "ebs_csi_role_name" { + description = "EBS CSI driver IAM role name" + value = aws_iam_role.ebs_csi.name +} + +output "efs_csi_role_arn" { + description = "EFS CSI driver IAM role ARN" + value = aws_iam_role.efs_csi.arn +} + +output "efs_csi_role_name" { + description = "EFS CSI driver IAM role name" + value = aws_iam_role.efs_csi.name +} + +output "aws_load_balancer_controller_role_arn" { + description = "AWS Load Balancer Controller IAM role ARN" + value = aws_iam_role.aws_load_balancer_controller.arn +} + +output "aws_load_balancer_controller_role_name" { + description = "AWS Load Balancer Controller IAM role name" + value = aws_iam_role.aws_load_balancer_controller.name +} + +output "cert_manager_role_arn" { + description = "cert-manager IAM role ARN" + value = aws_iam_role.cert_manager.arn +} + +output "cert_manager_role_name" { + description = "cert-manager IAM role name" + value = aws_iam_role.cert_manager.name +} + +output 
"external_dns_role_arn" { + description = "External DNS IAM role ARN" + value = aws_iam_role.external_dns.arn +} + +output "external_dns_role_name" { + description = "External DNS IAM role name" + value = aws_iam_role.external_dns.name +} diff --git a/terraform/modules/iam/policies/aws-load-balancer-controller.json b/terraform/modules/iam/policies/aws-load-balancer-controller.json new file mode 100644 index 0000000..f038679 --- /dev/null +++ b/terraform/modules/iam/policies/aws-load-balancer-controller.json @@ -0,0 +1,241 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "iam:CreateServiceLinkedRole" + ], + "Resource": "*", + "Condition": { + "StringEquals": { + "iam:AWSServiceName": "elasticloadbalancing.amazonaws.com" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "ec2:DescribeAccountAttributes", + "ec2:DescribeAddresses", + "ec2:DescribeAvailabilityZones", + "ec2:DescribeInternetGateways", + "ec2:DescribeVpcs", + "ec2:DescribeVpcPeeringConnections", + "ec2:DescribeSubnets", + "ec2:DescribeSecurityGroups", + "ec2:DescribeInstances", + "ec2:DescribeNetworkInterfaces", + "ec2:DescribeTags", + "ec2:GetCoipPoolUsage", + "ec2:DescribeCoipPools", + "elasticloadbalancing:DescribeLoadBalancers", + "elasticloadbalancing:DescribeLoadBalancerAttributes", + "elasticloadbalancing:DescribeListeners", + "elasticloadbalancing:DescribeListenerCertificates", + "elasticloadbalancing:DescribeSSLPolicies", + "elasticloadbalancing:DescribeRules", + "elasticloadbalancing:DescribeTargetGroups", + "elasticloadbalancing:DescribeTargetGroupAttributes", + "elasticloadbalancing:DescribeTargetHealth", + "elasticloadbalancing:DescribeTags" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "cognito-idp:DescribeUserPoolClient", + "acm:ListCertificates", + "acm:DescribeCertificate", + "iam:ListServerCertificates", + "iam:GetServerCertificate", + "waf-regional:GetWebACL", + "waf-regional:GetWebACLForResource", + 
"waf-regional:AssociateWebACL", + "waf-regional:DisassociateWebACL", + "wafv2:GetWebACL", + "wafv2:GetWebACLForResource", + "wafv2:AssociateWebACL", + "wafv2:DisassociateWebACL", + "shield:GetSubscriptionState", + "shield:DescribeProtection", + "shield:CreateProtection", + "shield:DeleteProtection" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:AuthorizeSecurityGroupIngress", + "ec2:RevokeSecurityGroupIngress" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:CreateSecurityGroup" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:CreateTags" + ], + "Resource": "arn:aws:ec2:*:*:security-group/*", + "Condition": { + "StringEquals": { + "ec2:CreateAction": "CreateSecurityGroup" + }, + "Null": { + "aws:RequestTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "ec2:CreateTags", + "ec2:DeleteTags" + ], + "Resource": "arn:aws:ec2:*:*:security-group/*", + "Condition": { + "Null": { + "aws:RequestTag/elbv2.k8s.aws/cluster": "true", + "aws:ResourceTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "ec2:AuthorizeSecurityGroupIngress", + "ec2:RevokeSecurityGroupIngress", + "ec2:DeleteSecurityGroup" + ], + "Resource": "*", + "Condition": { + "Null": { + "aws:ResourceTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:CreateLoadBalancer", + "elasticloadbalancing:CreateTargetGroup" + ], + "Resource": "*", + "Condition": { + "Null": { + "aws:RequestTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:CreateListener", + "elasticloadbalancing:DeleteListener", + "elasticloadbalancing:CreateRule", + "elasticloadbalancing:DeleteRule" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:AddTags", + "elasticloadbalancing:RemoveTags" + ], + "Resource": [ + 
"arn:aws:elasticloadbalancing:*:*:targetgroup/*/*", + "arn:aws:elasticloadbalancing:*:*:loadbalancer/net/*/*", + "arn:aws:elasticloadbalancing:*:*:loadbalancer/app/*/*" + ], + "Condition": { + "Null": { + "aws:RequestTag/elbv2.k8s.aws/cluster": "true", + "aws:ResourceTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:AddTags", + "elasticloadbalancing:RemoveTags" + ], + "Resource": [ + "arn:aws:elasticloadbalancing:*:*:listener/net/*/*/*", + "arn:aws:elasticloadbalancing:*:*:listener/app/*/*/*", + "arn:aws:elasticloadbalancing:*:*:listener-rule/net/*/*/*", + "arn:aws:elasticloadbalancing:*:*:listener-rule/app/*/*/*" + ] + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:ModifyLoadBalancerAttributes", + "elasticloadbalancing:SetIpAddressType", + "elasticloadbalancing:SetSecurityGroups", + "elasticloadbalancing:SetSubnets", + "elasticloadbalancing:DeleteLoadBalancer", + "elasticloadbalancing:ModifyTargetGroup", + "elasticloadbalancing:ModifyTargetGroupAttributes", + "elasticloadbalancing:DeleteTargetGroup" + ], + "Resource": "*", + "Condition": { + "Null": { + "aws:ResourceTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:AddTags" + ], + "Resource": [ + "arn:aws:elasticloadbalancing:*:*:targetgroup/*/*", + "arn:aws:elasticloadbalancing:*:*:loadbalancer/net/*/*", + "arn:aws:elasticloadbalancing:*:*:loadbalancer/app/*/*" + ], + "Condition": { + "StringEquals": { + "elasticloadbalancing:CreateAction": [ + "CreateTargetGroup", + "CreateLoadBalancer" + ] + }, + "Null": { + "aws:RequestTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:RegisterTargets", + "elasticloadbalancing:DeregisterTargets" + ], + "Resource": "arn:aws:elasticloadbalancing:*:*:targetgroup/*/*" + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:SetWebAcl", + 
"elasticloadbalancing:ModifyListener", + "elasticloadbalancing:AddListenerCertificates", + "elasticloadbalancing:RemoveListenerCertificates", + "elasticloadbalancing:ModifyRule" + ], + "Resource": "*" + } + ] +} diff --git a/terraform/modules/iam/variables.tf b/terraform/modules/iam/variables.tf new file mode 100644 index 0000000..6d15687 --- /dev/null +++ b/terraform/modules/iam/variables.tf @@ -0,0 +1,23 @@ +# IAM Module - Variables +# RFC 0039: ADR-Compliant Foundation Infrastructure + +variable "cluster_name" { + description = "Name of the EKS cluster" + type = string +} + +variable "cluster_oidc_issuer_url" { + description = "OIDC issuer URL from EKS cluster" + type = string +} + +variable "cluster_oidc_provider_arn" { + description = "OIDC provider ARN from EKS cluster" + type = string +} + +variable "tags" { + description = "Tags to apply to all resources" + type = map(string) + default = {} +} diff --git a/terraform/modules/nlb/main.tf b/terraform/modules/nlb/main.tf new file mode 100644 index 0000000..e042047 --- /dev/null +++ b/terraform/modules/nlb/main.tf @@ -0,0 +1,294 @@ +# NLB Module - Shared Network Load Balancer +# RFC 0039: ADR-Compliant Foundation Infrastructure +# +# Architecture: +# - Single shared NLB for all services (~$16/mo) +# - Ports: 53 (DNS), 25/587/993 (Email), 443 (HTTPS) +# - Cross-zone load balancing enabled +# - Target groups for each service type + +# Network Load Balancer +# RFC 0046: Domain Email Migration - Elastic IP support for stable glue records +resource "aws_lb" "main" { + name = var.name + internal = false + load_balancer_type = "network" + + # Use subnet_mapping with Elastic IPs when enabled (for DNS glue records) + # Otherwise use simple subnets assignment + dynamic "subnet_mapping" { + for_each = var.enable_static_ips ? 
zipmap(var.public_subnet_ids, var.elastic_ip_ids) : {} + content { + subnet_id = subnet_mapping.key + allocation_id = subnet_mapping.value + } + } + + # Fallback to simple subnets when not using static IPs + subnets = var.enable_static_ips ? null : var.public_subnet_ids + + enable_cross_zone_load_balancing = true + enable_deletion_protection = var.enable_deletion_protection + + tags = merge(var.tags, { + Name = var.name + }) +} + +# Target Group for HTTPS (443) - Routes to ALB Ingress Controller +resource "aws_lb_target_group" "https" { + name = "${var.name}-https" + port = 443 + protocol = "TCP" + vpc_id = var.vpc_id + target_type = "ip" + + health_check { + enabled = true + protocol = "TCP" + port = "traffic-port" + healthy_threshold = 3 + unhealthy_threshold = 3 + interval = 30 + } + + tags = merge(var.tags, { + Name = "${var.name}-https" + Service = "ingress" + }) +} + +# Listener for HTTPS (443) +resource "aws_lb_listener" "https" { + load_balancer_arn = aws_lb.main.arn + port = 443 + protocol = "TCP" + + default_action { + type = "forward" + target_group_arn = aws_lb_target_group.https.arn + } + + tags = var.tags +} + +# Target Group for DNS UDP (53) +resource "aws_lb_target_group" "dns_udp" { + name = "${var.name}-dns-udp" + port = 53 + protocol = "UDP" + vpc_id = var.vpc_id + target_type = "ip" + + health_check { + enabled = true + protocol = "TCP" + port = 53 + healthy_threshold = 3 + unhealthy_threshold = 3 + interval = 30 + } + + tags = merge(var.tags, { + Name = "${var.name}-dns-udp" + Service = "dns" + }) +} + +# Listener for DNS UDP (53) +resource "aws_lb_listener" "dns_udp" { + load_balancer_arn = aws_lb.main.arn + port = 53 + protocol = "UDP" + + default_action { + type = "forward" + target_group_arn = aws_lb_target_group.dns_udp.arn + } + + tags = var.tags +} + +# Target Group for DNS TCP (53) +resource "aws_lb_target_group" "dns_tcp" { + name = "${var.name}-dns-tcp" + port = 53 + protocol = "TCP" + vpc_id = var.vpc_id + target_type = "ip" + + 
health_check { + enabled = true + protocol = "TCP" + port = "traffic-port" + healthy_threshold = 3 + unhealthy_threshold = 3 + interval = 30 + } + + tags = merge(var.tags, { + Name = "${var.name}-dns-tcp" + Service = "dns" + }) +} + +# Listener for DNS TCP (53) +resource "aws_lb_listener" "dns_tcp" { + load_balancer_arn = aws_lb.main.arn + port = 53 + protocol = "TCP" + + default_action { + type = "forward" + target_group_arn = aws_lb_target_group.dns_tcp.arn + } + + tags = var.tags +} + +# Target Group for SMTP (25) +resource "aws_lb_target_group" "smtp" { + name = "${var.name}-smtp" + port = 25 + protocol = "TCP" + vpc_id = var.vpc_id + target_type = "ip" + + health_check { + enabled = true + protocol = "TCP" + port = "traffic-port" + healthy_threshold = 3 + unhealthy_threshold = 3 + interval = 30 + } + + tags = merge(var.tags, { + Name = "${var.name}-smtp" + Service = "email" + }) +} + +# Listener for SMTP (25) +resource "aws_lb_listener" "smtp" { + load_balancer_arn = aws_lb.main.arn + port = 25 + protocol = "TCP" + + default_action { + type = "forward" + target_group_arn = aws_lb_target_group.smtp.arn + } + + tags = var.tags +} + +# Target Group for Email Submission (587) +resource "aws_lb_target_group" "submission" { + name = "${var.name}-submission" + port = 587 + protocol = "TCP" + vpc_id = var.vpc_id + target_type = "ip" + + health_check { + enabled = true + protocol = "TCP" + port = "traffic-port" + healthy_threshold = 3 + unhealthy_threshold = 3 + interval = 30 + } + + tags = merge(var.tags, { + Name = "${var.name}-submission" + Service = "email" + }) +} + +# Listener for Email Submission (587) +resource "aws_lb_listener" "submission" { + load_balancer_arn = aws_lb.main.arn + port = 587 + protocol = "TCP" + + default_action { + type = "forward" + target_group_arn = aws_lb_target_group.submission.arn + } + + tags = var.tags +} + +# Target Group for IMAPS (993) +resource "aws_lb_target_group" "imaps" { + name = "${var.name}-imaps" + port = 993 + protocol = 
"TCP" + vpc_id = var.vpc_id + target_type = "ip" + + health_check { + enabled = true + protocol = "TCP" + port = "traffic-port" + healthy_threshold = 3 + unhealthy_threshold = 3 + interval = 30 + } + + tags = merge(var.tags, { + Name = "${var.name}-imaps" + Service = "email" + }) +} + +# Listener for IMAPS (993) +resource "aws_lb_listener" "imaps" { + load_balancer_arn = aws_lb.main.arn + port = 993 + protocol = "TCP" + + default_action { + type = "forward" + target_group_arn = aws_lb_target_group.imaps.arn + } + + tags = var.tags +} + +# HTTP Listener (80) - Redirect to HTTPS +resource "aws_lb_target_group" "http" { + name = "${var.name}-http" + port = 80 + protocol = "TCP" + vpc_id = var.vpc_id + target_type = "ip" + + health_check { + enabled = true + protocol = "TCP" + port = "traffic-port" + healthy_threshold = 3 + unhealthy_threshold = 3 + interval = 30 + } + + tags = merge(var.tags, { + Name = "${var.name}-http" + Service = "ingress" + }) +} + +resource "aws_lb_listener" "http" { + load_balancer_arn = aws_lb.main.arn + port = 80 + protocol = "TCP" + + default_action { + type = "forward" + target_group_arn = aws_lb_target_group.http.arn + } + + tags = var.tags +} diff --git a/terraform/modules/nlb/outputs.tf b/terraform/modules/nlb/outputs.tf new file mode 100644 index 0000000..a3d3532 --- /dev/null +++ b/terraform/modules/nlb/outputs.tf @@ -0,0 +1,67 @@ +# NLB Module - Outputs +# RFC 0039: ADR-Compliant Foundation Infrastructure + +output "arn" { + description = "NLB ARN" + value = aws_lb.main.arn +} + +output "dns_name" { + description = "NLB DNS name" + value = aws_lb.main.dns_name +} + +output "zone_id" { + description = "NLB Route53 zone ID" + value = aws_lb.main.zone_id +} + +output "target_group_https_arn" { + description = "HTTPS target group ARN" + value = aws_lb_target_group.https.arn +} + +output "target_group_http_arn" { + description = "HTTP target group ARN" + value = aws_lb_target_group.http.arn +} + +output "target_group_dns_udp_arn" { + 
description = "DNS UDP target group ARN" + value = aws_lb_target_group.dns_udp.arn +} + +output "target_group_dns_tcp_arn" { + description = "DNS TCP target group ARN" + value = aws_lb_target_group.dns_tcp.arn +} + +output "target_group_smtp_arn" { + description = "SMTP target group ARN" + value = aws_lb_target_group.smtp.arn +} + +output "target_group_submission_arn" { + description = "Email submission target group ARN" + value = aws_lb_target_group.submission.arn +} + +output "target_group_imaps_arn" { + description = "IMAPS target group ARN" + value = aws_lb_target_group.imaps.arn +} + +output "listener_https_arn" { + description = "HTTPS listener ARN" + value = aws_lb_listener.https.arn +} + +output "listener_http_arn" { + description = "HTTP listener ARN" + value = aws_lb_listener.http.arn +} + +output "nlb_ip_addresses" { + description = "List of NLB IP addresses (from subnet mappings when using EIPs)" + value = [for eni in aws_lb.main.subnet_mapping : eni.private_ipv4_address if eni.private_ipv4_address != null] +} diff --git a/terraform/modules/nlb/variables.tf b/terraform/modules/nlb/variables.tf new file mode 100644 index 0000000..dda62ba --- /dev/null +++ b/terraform/modules/nlb/variables.tf @@ -0,0 +1,41 @@ +# NLB Module - Variables +# RFC 0039: ADR-Compliant Foundation Infrastructure + +variable "name" { + description = "Name prefix for all NLB resources" + type = string +} + +variable "vpc_id" { + description = "VPC ID" + type = string +} + +variable "public_subnet_ids" { + description = "List of public subnet IDs for NLB" + type = list(string) +} + +variable "enable_deletion_protection" { + description = "Enable deletion protection for NLB" + type = bool + default = true +} + +variable "tags" { + description = "Tags to apply to all resources" + type = map(string) + default = {} +} + +variable "enable_static_ips" { + description = "Enable Elastic IPs for stable glue records (required for DNS delegation)" + type = bool + default = false +} + +variable 
"elastic_ip_ids" { + description = "List of pre-allocated Elastic IP allocation IDs (one per subnet/AZ)" + type = list(string) + default = [] +} diff --git a/terraform/modules/s3/main.tf b/terraform/modules/s3/main.tf new file mode 100644 index 0000000..1368548 --- /dev/null +++ b/terraform/modules/s3/main.tf @@ -0,0 +1,429 @@ +# S3 Module - Blob Storage Buckets +# RFC 0039: ADR-Compliant Foundation Infrastructure +# +# Additional S3 buckets for: +# - Email blob storage (Stalwart attachments) +# - Loki log chunks +# - Tempo trace storage +# - Git LFS objects (Forgejo) + +# KMS Key for S3 Encryption +resource "aws_kms_key" "s3" { + description = "S3 bucket encryption key" + deletion_window_in_days = 30 + enable_key_rotation = true + + tags = merge(var.tags, { + Name = "${var.name}-s3" + }) +} + +resource "aws_kms_alias" "s3" { + name = "alias/${var.name}-s3" + target_key_id = aws_kms_key.s3.key_id +} + +# ============================================================================ +# EMAIL BLOB STORAGE +# ============================================================================ + +resource "aws_s3_bucket" "email_blobs" { + bucket = "${var.name}-email-blobs-${data.aws_caller_identity.current.account_id}" + + tags = merge(var.tags, { + Name = "${var.name}-email-blobs" + Purpose = "Stalwart email attachments and large messages" + Service = "stalwart" + }) +} + +resource "aws_s3_bucket_versioning" "email_blobs" { + bucket = aws_s3_bucket.email_blobs.id + + versioning_configuration { + status = "Enabled" + } +} + +resource "aws_s3_bucket_server_side_encryption_configuration" "email_blobs" { + bucket = aws_s3_bucket.email_blobs.id + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "aws:kms" + kms_master_key_id = aws_kms_key.s3.arn + } + bucket_key_enabled = true + } +} + +resource "aws_s3_bucket_public_access_block" "email_blobs" { + bucket = aws_s3_bucket.email_blobs.id + + block_public_acls = true + block_public_policy = true + 
ignore_public_acls = true + restrict_public_buckets = true +} + +resource "aws_s3_bucket_lifecycle_configuration" "email_blobs" { + bucket = aws_s3_bucket.email_blobs.id + + rule { + id = "archive-old-attachments" + status = "Enabled" + + transition { + days = 90 + storage_class = "STANDARD_IA" + } + + transition { + days = 365 + storage_class = "GLACIER" + } + + noncurrent_version_expiration { + noncurrent_days = 30 + } + } +} + +# ============================================================================ +# LOKI LOG CHUNKS +# ============================================================================ + +resource "aws_s3_bucket" "loki_chunks" { + bucket = "${var.name}-loki-chunks-${data.aws_caller_identity.current.account_id}" + + tags = merge(var.tags, { + Name = "${var.name}-loki-chunks" + Purpose = "Loki log storage chunks" + Service = "loki" + }) +} + +resource "aws_s3_bucket_versioning" "loki_chunks" { + bucket = aws_s3_bucket.loki_chunks.id + + versioning_configuration { + status = "Suspended" # Not needed for Loki chunks + } +} + +resource "aws_s3_bucket_server_side_encryption_configuration" "loki_chunks" { + bucket = aws_s3_bucket.loki_chunks.id + + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "aws:kms" + kms_master_key_id = aws_kms_key.s3.arn + } + bucket_key_enabled = true + } +} + +resource "aws_s3_bucket_public_access_block" "loki_chunks" { + bucket = aws_s3_bucket.loki_chunks.id + + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true +} + +resource "aws_s3_bucket_lifecycle_configuration" "loki_chunks" { + bucket = aws_s3_bucket.loki_chunks.id + + rule { + id = "expire-old-logs" + status = "Enabled" + + # Delete logs older than retention period + expiration { + days = var.log_retention_days + } + + # Move to cheaper storage after 30 days + transition { + days = 30 + storage_class = "STANDARD_IA" + } + } +} + +# 
# ============================================================================
# TEMPO TRACE STORAGE
# ============================================================================

resource "aws_s3_bucket" "tempo_traces" {
  bucket = "${var.name}-tempo-traces-${data.aws_caller_identity.current.account_id}"

  tags = merge(var.tags, {
    Name    = "${var.name}-tempo-traces"
    Purpose = "Tempo distributed trace storage"
    Service = "tempo"
  })
}

resource "aws_s3_bucket_versioning" "tempo_traces" {
  bucket = aws_s3_bucket.tempo_traces.id

  versioning_configuration {
    status = "Suspended"
  }
}

resource "aws_s3_bucket_server_side_encryption_configuration" "tempo_traces" {
  bucket = aws_s3_bucket.tempo_traces.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm     = "aws:kms"
      kms_master_key_id = aws_kms_key.s3.arn
    }
    bucket_key_enabled = true
  }
}

resource "aws_s3_bucket_public_access_block" "tempo_traces" {
  bucket = aws_s3_bucket.tempo_traces.id

  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
}

resource "aws_s3_bucket_lifecycle_configuration" "tempo_traces" {
  bucket = aws_s3_bucket.tempo_traces.id

  rule {
    id     = "expire-old-traces"
    status = "Enabled"

    # Apply to every object; AWS provider v4+ requires an explicit filter.
    filter {}

    expiration {
      days = var.trace_retention_days
    }

    # The previous 7-day transition to STANDARD_IA was invalid: S3 rejects
    # lifecycle transitions to STANDARD_IA/ONEZONE_IA earlier than 30 days
    # after creation, and with the default 30-day trace retention the objects
    # expire before any transition could take effect, so the rule now only
    # expires traces.
  }
}

# ============================================================================
# GIT LFS STORAGE
# ============================================================================

resource "aws_s3_bucket" "git_lfs" {
  bucket = "${var.name}-git-lfs-${data.aws_caller_identity.current.account_id}"

  tags = merge(var.tags, {
    Name    = "${var.name}-git-lfs"
    Purpose = "Git LFS object storage for Forgejo"
    Service = "forgejo"
  })
}

resource "aws_s3_bucket_versioning" "git_lfs" {
  bucket = aws_s3_bucket.git_lfs.id

  versioning_configuration {
    status = "Enabled"
  }
}

resource "aws_s3_bucket_server_side_encryption_configuration" "git_lfs" {
  bucket = aws_s3_bucket.git_lfs.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm     = "aws:kms"
      kms_master_key_id = aws_kms_key.s3.arn
    }
    bucket_key_enabled = true
  }
}

resource "aws_s3_bucket_public_access_block" "git_lfs" {
  bucket = aws_s3_bucket.git_lfs.id

  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
}

resource "aws_s3_bucket_lifecycle_configuration" "git_lfs" {
  bucket = aws_s3_bucket.git_lfs.id

  rule {
    id     = "intelligent-tiering"
    status = "Enabled"

    # Apply to every object (required filter, see note above).
    filter {}

    transition {
      days          = 30
      storage_class = "INTELLIGENT_TIERING"
    }

    noncurrent_version_expiration {
      noncurrent_days = 90
    }
  }
}

# ============================================================================
# IAM POLICIES FOR SERVICE ACCESS
# ============================================================================
#
# Each policy splits bucket-level and object-level permissions:
# s3:ListBucket only matches the bucket ARN, while Get/Put/DeleteObject only
# match "arn/*" — granting both action sets on both resources (as before) is
# never evaluated and obscures the actual grant.

# Policy for Loki to access its bucket
resource "aws_iam_policy" "loki_s3" {
  name        = "${var.name}-loki-s3-access"
  description = "Allow Loki to access S3 bucket for log storage"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect   = "Allow"
        Action   = ["s3:ListBucket"]
        Resource = [aws_s3_bucket.loki_chunks.arn]
      },
      {
        Effect   = "Allow"
        Action   = ["s3:PutObject", "s3:GetObject", "s3:DeleteObject"]
        Resource = ["${aws_s3_bucket.loki_chunks.arn}/*"]
      },
      {
        Effect   = "Allow"
        Action   = ["kms:Encrypt", "kms:Decrypt", "kms:GenerateDataKey"]
        Resource = [aws_kms_key.s3.arn]
      }
    ]
  })

  tags = var.tags
}

# Policy for Tempo to access its bucket
resource "aws_iam_policy" "tempo_s3" {
  name        = "${var.name}-tempo-s3-access"
  description = "Allow Tempo to access S3 bucket for trace storage"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect   = "Allow"
        Action   = ["s3:ListBucket"]
        Resource = [aws_s3_bucket.tempo_traces.arn]
      },
      {
        Effect   = "Allow"
        Action   = ["s3:PutObject", "s3:GetObject", "s3:DeleteObject"]
        Resource = ["${aws_s3_bucket.tempo_traces.arn}/*"]
      },
      {
        Effect   = "Allow"
        Action   = ["kms:Encrypt", "kms:Decrypt", "kms:GenerateDataKey"]
        Resource = [aws_kms_key.s3.arn]
      }
    ]
  })

  tags = var.tags
}

# Policy for Stalwart to access email blobs
resource "aws_iam_policy" "stalwart_s3" {
  name        = "${var.name}-stalwart-s3-access"
  description = "Allow Stalwart to access S3 bucket for email blob storage"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect   = "Allow"
        Action   = ["s3:ListBucket"]
        Resource = [aws_s3_bucket.email_blobs.arn]
      },
      {
        Effect   = "Allow"
        Action   = ["s3:PutObject", "s3:GetObject", "s3:DeleteObject"]
        Resource = ["${aws_s3_bucket.email_blobs.arn}/*"]
      },
      {
        Effect   = "Allow"
        Action   = ["kms:Encrypt", "kms:Decrypt", "kms:GenerateDataKey"]
        Resource = [aws_kms_key.s3.arn]
      }
    ]
  })

  tags = var.tags
}

# Policy for Forgejo to access Git LFS bucket
resource "aws_iam_policy" "forgejo_s3" {
  name        = "${var.name}-forgejo-s3-access"
  description = "Allow Forgejo to access S3 bucket for Git LFS storage"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect   = "Allow"
        Action   = ["s3:ListBucket"]
        Resource = [aws_s3_bucket.git_lfs.arn]
      },
      {
        Effect   = "Allow"
        Action   = ["s3:PutObject", "s3:GetObject", "s3:DeleteObject"]
        Resource = ["${aws_s3_bucket.git_lfs.arn}/*"]
      },
      {
        Effect   = "Allow"
        Action   = ["kms:Encrypt", "kms:Decrypt", "kms:GenerateDataKey"]
        Resource = [aws_kms_key.s3.arn]
      }
    ]
  })

  tags = var.tags
}

# ============================================================================
# DATA SOURCES
# ============================================================================

data "aws_caller_identity" "current" {}

# ============================================================================
# FILE: terraform/modules/s3/outputs.tf
# ============================================================================
# S3 Module Outputs
# RFC 0039: ADR-Compliant Foundation Infrastructure

output "email_blobs_bucket_id" {
  description = "ID of the email blobs S3 bucket"
  value       = aws_s3_bucket.email_blobs.id
}

output "email_blobs_bucket_arn" {
  description = "ARN of the email blobs S3 bucket"
  value       = aws_s3_bucket.email_blobs.arn
}

output "loki_chunks_bucket_id" {
  description = "ID of the Loki chunks S3 bucket"
  value       = aws_s3_bucket.loki_chunks.id
}

output "loki_chunks_bucket_arn" {
  description = "ARN of the Loki chunks S3 bucket"
  value       = aws_s3_bucket.loki_chunks.arn
}

output "tempo_traces_bucket_id" {
  description = "ID of the Tempo traces S3 bucket"
  value       = aws_s3_bucket.tempo_traces.id
}

output "tempo_traces_bucket_arn" {
  description = "ARN of the Tempo traces S3 bucket"
  value       = aws_s3_bucket.tempo_traces.arn
}

output "git_lfs_bucket_id" {
  description = "ID of the Git LFS S3 bucket"
  value       = aws_s3_bucket.git_lfs.id
}

output "git_lfs_bucket_arn" {
  description = "ARN of the Git LFS S3 bucket"
  value       = aws_s3_bucket.git_lfs.arn
}

output "kms_key_arn" {
  description = "ARN of the KMS key for S3 encryption"
  value       = aws_kms_key.s3.arn
}

output "kms_key_id" {
  description = "ID of the KMS key for S3 encryption"
  value       = aws_kms_key.s3.key_id
}

# IAM Policy ARNs for IRSA
output "loki_s3_policy_arn" {
  description = "ARN of the IAM policy for Loki S3 access"
  value       = aws_iam_policy.loki_s3.arn
}

output "tempo_s3_policy_arn" {
  description = "ARN of the IAM policy for Tempo S3 access"
  value       = aws_iam_policy.tempo_s3.arn
}

output "stalwart_s3_policy_arn" {
  description = "ARN of the IAM policy for Stalwart S3 access"
  value       = aws_iam_policy.stalwart_s3.arn
}

output "forgejo_s3_policy_arn" {
  description = "ARN of the IAM policy for Forgejo S3 access"
  value       = aws_iam_policy.forgejo_s3.arn
}

# ============================================================================
# FILE: terraform/modules/s3/variables.tf
# ============================================================================
# S3 Module Variables
# RFC 0039: ADR-Compliant Foundation Infrastructure

variable "name" {
  description = "Name prefix for resources"
  type        = string
}

variable "tags" {
  description = "Tags to apply to all resources"
  type        = map(string)
  default     = {}
}

variable "log_retention_days" {
  description = "Number of days to retain logs in Loki bucket"
  type        = number
  default     = 90
}

variable "trace_retention_days" {
  description = "Number of days to retain traces in Tempo bucket"
  type        = number
  default     = 30
}

# NOTE(review): the two replication variables below are not referenced by the
# module's visible main.tf — confirm they are used (or wire up replication)
# before relying on them; ADR 0010 (No Dead Code) would otherwise remove them.
variable "enable_cross_region_replication" {
  description = "Enable cross-region replication for critical buckets"
  type        = bool
  default     = false
}

variable "replica_region" {
  description = "AWS region for bucket replication"
  type        = string
  default     = "us-west-2"
}

# ============================================================================
# FILE: terraform/modules/storage/main.tf
# ============================================================================
# Storage Module - EBS, EFS, S3
# RFC 0039: ADR-Compliant Foundation Infrastructure
#
# Architecture:
# - EFS for shared persistent storage (git repos, files)
# - S3 for backups and blob storage
# - KMS for encryption (FIPS compliance)

# KMS Key for Storage Encryption
resource "aws_kms_key" "storage" {
  description             = "Storage encryption key"
  deletion_window_in_days = 30
  enable_key_rotation     = true

  tags = merge(var.tags, {
    Name = "${var.name}-storage"
  })
}

resource "aws_kms_alias" "storage" {
  name          = "alias/${var.name}-storage"
  target_key_id = aws_kms_key.storage.key_id
}

# EFS File System
resource "aws_efs_file_system" "main" {
  creation_token   = var.name
  encrypted        = var.enable_encryption
  kms_key_id       = var.enable_encryption ? aws_kms_key.storage.arn : null
  performance_mode = "generalPurpose"
  throughput_mode  = "bursting"

  # Move cold files to Infrequent Access, pull them back on first access.
  lifecycle_policy {
    transition_to_ia = "AFTER_30_DAYS"
  }

  lifecycle_policy {
    transition_to_primary_storage_class = "AFTER_1_ACCESS"
  }

  tags = merge(var.tags, {
    Name = var.name
  })
}

# EFS Mount Targets (one per AZ)
resource "aws_efs_mount_target" "main" {
  count = length(var.private_subnet_ids)

  file_system_id  = aws_efs_file_system.main.id
  subnet_id       = var.private_subnet_ids[count.index]
  security_groups = [aws_security_group.efs.id]
}

# EFS Security Group — NFS (2049) from anywhere inside the VPC.
resource "aws_security_group" "efs" {
  name        = "${var.name}-efs"
  description = "EFS security group"
  vpc_id      = var.vpc_id

  ingress {
    description = "NFS from VPC"
    from_port   = 2049
    to_port     = 2049
    protocol    = "tcp"
    cidr_blocks = [data.aws_vpc.main.cidr_block]
  }

  egress {
    description = "All outbound"
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = merge(var.tags, {
    Name = "${var.name}-efs"
  })
}

# EFS Access Point for CSI driver — pins all access to /data as uid/gid 1000.
resource "aws_efs_access_point" "main" {
  file_system_id = aws_efs_file_system.main.id

  posix_user {
    gid = 1000
    uid = 1000
  }

  root_directory {
    path = "/data"
    creation_info {
      owner_gid   = 1000
      owner_uid   = 1000
      permissions = "0755"
    }
  }

  tags = merge(var.tags, {
    Name = "${var.name}-data"
  })
}

# S3 Bucket for Backups
resource "aws_s3_bucket" "backups" {
  bucket = "${var.name}-backups-${data.aws_caller_identity.current.account_id}"

  tags = merge(var.tags, {
    Name    = "${var.name}-backups"
    Purpose = "CockroachDB and service backups"
  })
}

resource "aws_s3_bucket_versioning" "backups" {
  bucket = aws_s3_bucket.backups.id

  versioning_configuration {
    status = "Enabled"
  }
}

resource "aws_s3_bucket_server_side_encryption_configuration" "backups" {
  bucket = aws_s3_bucket.backups.id

  rule {
    apply_server_side_encryption_by_default {
      # Fall back to SSE-S3 when KMS encryption is disabled.
      sse_algorithm     = var.enable_encryption ? "aws:kms" : "AES256"
      kms_master_key_id = var.enable_encryption ? aws_kms_key.storage.arn : null
    }
    bucket_key_enabled = true
  }
}

resource "aws_s3_bucket_public_access_block" "backups" {
  bucket = aws_s3_bucket.backups.id

  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
}

resource "aws_s3_bucket_lifecycle_configuration" "backups" {
  bucket = aws_s3_bucket.backups.id

  rule {
    id     = "transition-to-ia"
    status = "Enabled"

    # Apply to every object; AWS provider v4+ requires an explicit filter.
    filter {}

    transition {
      days          = 30
      storage_class = "STANDARD_IA"
    }

    transition {
      days          = 90
      storage_class = "GLACIER"
    }

    noncurrent_version_transition {
      noncurrent_days = 30
      storage_class   = "STANDARD_IA"
    }

    noncurrent_version_expiration {
      noncurrent_days = 365
    }
  }
}

# S3 Bucket for Blob Storage
resource "aws_s3_bucket" "blobs" {
  bucket = "${var.name}-blobs-${data.aws_caller_identity.current.account_id}"

  tags = merge(var.tags, {
    Name    = "${var.name}-blobs"
    Purpose = "Application blob storage"
  })
}

resource "aws_s3_bucket_versioning" "blobs" {
  bucket = aws_s3_bucket.blobs.id

  versioning_configuration {
    status = "Enabled"
  }
}

resource "aws_s3_bucket_server_side_encryption_configuration" "blobs" {
  bucket = aws_s3_bucket.blobs.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm     = var.enable_encryption ? "aws:kms" : "AES256"
      kms_master_key_id = var.enable_encryption ? aws_kms_key.storage.arn : null
    }
    bucket_key_enabled = true
  }
}

resource "aws_s3_bucket_public_access_block" "blobs" {
  bucket = aws_s3_bucket.blobs.id

  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true
}

resource "aws_s3_bucket_cors_configuration" "blobs" {
  bucket = aws_s3_bucket.blobs.id

  cors_rule {
    allowed_headers = ["*"]
    allowed_methods = ["GET", "PUT", "POST", "DELETE", "HEAD"]
    allowed_origins = ["*"] # Restrict in production
    expose_headers  = ["ETag"]
    max_age_seconds = 3000
  }
}

resource "aws_s3_bucket_lifecycle_configuration" "blobs" {
  bucket = aws_s3_bucket.blobs.id

  rule {
    id     = "intelligent-tiering"
    status = "Enabled"

    # Apply to every object (required filter, see note above).
    filter {}

    transition {
      days          = 30
      storage_class = "INTELLIGENT_TIERING"
    }

    noncurrent_version_expiration {
      noncurrent_days = 90
    }
  }
}

# Data sources
data "aws_vpc" "main" {
  id = var.vpc_id
}

data "aws_caller_identity" "current" {}

# ============================================================================
# FILE: terraform/modules/storage/outputs.tf
# ============================================================================
# Storage Module - Outputs
# RFC 0039: ADR-Compliant Foundation Infrastructure

output "efs_id" {
  description = "EFS filesystem ID"
  value       = aws_efs_file_system.main.id
}

output "efs_arn" {
  description = "EFS filesystem ARN"
  value       = aws_efs_file_system.main.arn
}

output "efs_dns_name" {
  description = "EFS filesystem DNS name"
  value       = aws_efs_file_system.main.dns_name
}

output "efs_access_point_id" {
  description = "EFS access point ID"
  value       = aws_efs_access_point.main.id
}

output "efs_security_group_id" {
  description = "EFS security group ID"
  value       = aws_security_group.efs.id
}

output "backup_bucket_name" {
  description = "S3 bucket name for backups"
  value       = aws_s3_bucket.backups.id
}

output "backup_bucket_arn" {
  description = "S3 bucket ARN for backups"
  value       = aws_s3_bucket.backups.arn
}

output "blob_bucket_name" {
  description = "S3 bucket name for blob storage"
  value       = aws_s3_bucket.blobs.id
}

output "blob_bucket_arn" {
  description = "S3 bucket ARN for blob storage"
  value       = aws_s3_bucket.blobs.arn
}

output "kms_key_arn" {
  description = "KMS key ARN for storage encryption"
  value       = aws_kms_key.storage.arn
}

output "kms_key_id" {
  description = "KMS key ID for storage encryption"
  value       = aws_kms_key.storage.key_id
}

# ============================================================================
# FILE: terraform/modules/storage/variables.tf
# ============================================================================
# Storage Module - Variables
# RFC 0039: ADR-Compliant Foundation Infrastructure

variable "name" {
  description = "Name prefix for all storage resources"
  type        = string
}

variable "vpc_id" {
  description = "VPC ID"
  type        = string
}

variable "private_subnet_ids" {
  description = "List of private subnet IDs for EFS mount targets"
  type        = list(string)
}

# NOTE(review): not referenced in the module's visible main.tf (mount targets
# are driven by private_subnet_ids) — confirm usage before removing.
variable "availability_zones" {
  description = "List of availability zones"
  type        = list(string)
}

variable "enable_encryption" {
  description = "Enable encryption for all storage (FIPS compliance)"
  type        = bool
  default     = true
}

variable "tags" {
  description = "Tags to apply to all resources"
  type        = map(string)
  default     = {}
}

# ============================================================================
# FILE: terraform/modules/vpc/main.tf
# ============================================================================
# VPC Module - Multi-AZ Networking
# RFC 0039: ADR-Compliant Foundation Infrastructure
#
# Architecture:
# - 3 AZs for high availability
# - Public subnets for NLB and NAT Gateways
# - Private subnets for EKS nodes and workloads
# - Database subnets for CockroachDB (isolated)
# - NAT Gateway per AZ for HA outbound traffic

locals {
  # Calculate subnet CIDRs
  # /16 VPC -> /20 subnets (4096 IPs each)
  # AZ a: public=10.0.0.0/20,  private=10.0.16.0/20,  database=10.0.32.0/20
  # AZ b: public=10.0.48.0/20, private=10.0.64.0/20,  database=10.0.80.0/20
  # AZ c: public=10.0.96.0/20, private=10.0.112.0/20, database=10.0.128.0/20
  public_subnets   = [for i, az in var.availability_zones : cidrsubnet(var.cidr, 4, i * 3)]
  private_subnets  = [for i, az in var.availability_zones : cidrsubnet(var.cidr, 4, i * 3 + 1)]
  database_subnets = [for i, az in var.availability_zones : cidrsubnet(var.cidr, 4, i * 3 + 2)]
}

# VPC
resource "aws_vpc" "main" {
  cidr_block           = var.cidr
  enable_dns_hostnames = true
  enable_dns_support   = true

  tags = merge(var.tags, {
    Name = var.name
  })
}

# Internet Gateway
resource "aws_internet_gateway" "main" {
  vpc_id = aws_vpc.main.id

  tags = merge(var.tags, {
    Name = "${var.name}-igw"
  })
}

# Public Subnets — tagged for external ELB discovery by the AWS LB controller.
resource "aws_subnet" "public" {
  count = length(var.availability_zones)

  vpc_id                  = aws_vpc.main.id
  cidr_block              = local.public_subnets[count.index]
  availability_zone       = var.availability_zones[count.index]
  map_public_ip_on_launch = true

  tags = merge(var.tags, {
    Name                                = "${var.name}-public-${var.availability_zones[count.index]}"
    "kubernetes.io/role/elb"            = "1"
    "kubernetes.io/cluster/${var.name}" = "shared"
  })
}

# Private Subnets — tagged for internal ELBs and Karpenter node discovery.
resource "aws_subnet" "private" {
  count = length(var.availability_zones)

  vpc_id            = aws_vpc.main.id
  cidr_block        = local.private_subnets[count.index]
  availability_zone = var.availability_zones[count.index]

  tags = merge(var.tags, {
    Name                                = "${var.name}-private-${var.availability_zones[count.index]}"
    "kubernetes.io/role/internal-elb"   = "1"
    "kubernetes.io/cluster/${var.name}" = "shared"
    "karpenter.sh/discovery"            = "true"
  })
}

# Database Subnets (isolated - no internet access)
resource "aws_subnet" "database" {
  count = length(var.availability_zones)

  vpc_id            = aws_vpc.main.id
  cidr_block        = local.database_subnets[count.index]
  availability_zone = var.availability_zones[count.index]

  tags = merge(var.tags, {
    Name = "${var.name}-database-${var.availability_zones[count.index]}"
  })
}

# Elastic IPs for NAT Gateways (1 when single_nat_gateway, else 1 per AZ)
resource "aws_eip" "nat" {
  count = var.enable_nat_gateway ? (var.single_nat_gateway ? 1 : length(var.availability_zones)) : 0

  domain = "vpc"

  tags = merge(var.tags, {
    Name = "${var.name}-nat-${count.index + 1}"
  })

  depends_on = [aws_internet_gateway.main]
}

# NAT Gateways
resource "aws_nat_gateway" "main" {
  count = var.enable_nat_gateway ? (var.single_nat_gateway ? 1 : length(var.availability_zones)) : 0

  allocation_id = aws_eip.nat[count.index].id
  subnet_id     = aws_subnet.public[count.index].id

  tags = merge(var.tags, {
    Name = "${var.name}-nat-${var.availability_zones[count.index]}"
  })

  depends_on = [aws_internet_gateway.main]
}

# Public Route Table
resource "aws_route_table" "public" {
  vpc_id = aws_vpc.main.id

  tags = merge(var.tags, {
    Name = "${var.name}-public"
  })
}

resource "aws_route" "public_internet" {
  route_table_id         = aws_route_table.public.id
  destination_cidr_block = "0.0.0.0/0"
  gateway_id             = aws_internet_gateway.main.id
}

resource "aws_route_table_association" "public" {
  count = length(var.availability_zones)

  subnet_id      = aws_subnet.public[count.index].id
  route_table_id = aws_route_table.public.id
}

# Private Route Tables (one per AZ for HA)
resource "aws_route_table" "private" {
  count = length(var.availability_zones)

  vpc_id = aws_vpc.main.id

  tags = merge(var.tags, {
    Name = "${var.name}-private-${var.availability_zones[count.index]}"
  })
}

resource "aws_route" "private_nat" {
  count = var.enable_nat_gateway ? length(var.availability_zones) : 0

  route_table_id         = aws_route_table.private[count.index].id
  destination_cidr_block = "0.0.0.0/0"
  nat_gateway_id         = var.single_nat_gateway ? aws_nat_gateway.main[0].id : aws_nat_gateway.main[count.index].id
}

resource "aws_route_table_association" "private" {
  count = length(var.availability_zones)

  subnet_id      = aws_subnet.private[count.index].id
  route_table_id = aws_route_table.private[count.index].id
}

# Database Route Table (no internet access)
resource "aws_route_table" "database" {
  vpc_id = aws_vpc.main.id

  tags = merge(var.tags, {
    Name = "${var.name}-database"
  })
}

resource "aws_route_table_association" "database" {
  count = length(var.availability_zones)

  subnet_id      = aws_subnet.database[count.index].id
  route_table_id = aws_route_table.database.id
}

# VPC Flow Logs for security auditing
resource "aws_flow_log" "main" {
  count = var.enable_flow_logs ? 1 : 0

  vpc_id                   = aws_vpc.main.id
  traffic_type             = "ALL"
  log_destination_type     = "cloud-watch-logs"
  log_destination          = aws_cloudwatch_log_group.flow_logs[0].arn
  iam_role_arn             = aws_iam_role.flow_logs[0].arn
  max_aggregation_interval = 60

  tags = merge(var.tags, {
    Name = "${var.name}-flow-logs"
  })
}

resource "aws_cloudwatch_log_group" "flow_logs" {
  count = var.enable_flow_logs ? 1 : 0

  name              = "/aws/vpc/${var.name}/flow-logs"
  retention_in_days = 30

  tags = var.tags
}

resource "aws_iam_role" "flow_logs" {
  count = var.enable_flow_logs ? 1 : 0

  name = "${var.name}-flow-logs"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "vpc-flow-logs.amazonaws.com"
        }
      }
    ]
  })

  tags = var.tags
}

resource "aws_iam_role_policy" "flow_logs" {
  count = var.enable_flow_logs ? 1 : 0

  name = "${var.name}-flow-logs"
  role = aws_iam_role.flow_logs[0].id

  # Scoped to this module's flow-log group instead of "*" (least privilege);
  # ":*" matches the log streams under the group.
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = [
          "logs:CreateLogGroup",
          "logs:CreateLogStream",
          "logs:PutLogEvents",
          "logs:DescribeLogGroups",
          "logs:DescribeLogStreams"
        ]
        Resource = [
          aws_cloudwatch_log_group.flow_logs[0].arn,
          "${aws_cloudwatch_log_group.flow_logs[0].arn}:*"
        ]
      }
    ]
  })
}

# VPC Endpoints for AWS services (cost optimization - no NAT charges)
resource "aws_vpc_endpoint" "s3" {
  vpc_id            = aws_vpc.main.id
  service_name      = "com.amazonaws.${data.aws_region.current.name}.s3"
  vpc_endpoint_type = "Gateway"

  route_table_ids = concat(
    [aws_route_table.public.id],
    aws_route_table.private[*].id,
    [aws_route_table.database.id]
  )

  tags = merge(var.tags, {
    Name = "${var.name}-s3-endpoint"
  })
}

resource "aws_vpc_endpoint" "ecr_api" {
  vpc_id              = aws_vpc.main.id
  service_name        = "com.amazonaws.${data.aws_region.current.name}.ecr.api"
  vpc_endpoint_type   = "Interface"
  subnet_ids          = aws_subnet.private[*].id
  security_group_ids  = [aws_security_group.vpc_endpoints.id]
  private_dns_enabled = true

  tags = merge(var.tags, {
    Name = "${var.name}-ecr-api-endpoint"
  })
}

resource "aws_vpc_endpoint" "ecr_dkr" {
  vpc_id              = aws_vpc.main.id
  service_name        = "com.amazonaws.${data.aws_region.current.name}.ecr.dkr"
  vpc_endpoint_type   = "Interface"
  subnet_ids          = aws_subnet.private[*].id
  security_group_ids  = [aws_security_group.vpc_endpoints.id]
  private_dns_enabled = true

  tags = merge(var.tags, {
    Name = "${var.name}-ecr-dkr-endpoint"
  })
}

resource "aws_vpc_endpoint" "sts" {
  vpc_id              = aws_vpc.main.id
  service_name        = "com.amazonaws.${data.aws_region.current.name}.sts"
  vpc_endpoint_type   = "Interface"
  subnet_ids          = aws_subnet.private[*].id
  security_group_ids  = [aws_security_group.vpc_endpoints.id]
  private_dns_enabled = true

  tags = merge(var.tags, {
    Name = "${var.name}-sts-endpoint"
  })
}

resource "aws_vpc_endpoint" "ec2" {
  vpc_id              = aws_vpc.main.id
  service_name        = "com.amazonaws.${data.aws_region.current.name}.ec2"
  vpc_endpoint_type   = "Interface"
  subnet_ids          = aws_subnet.private[*].id
  security_group_ids  = [aws_security_group.vpc_endpoints.id]
  private_dns_enabled = true

  tags = merge(var.tags, {
    Name = "${var.name}-ec2-endpoint"
  })
}

# Security Group for VPC Endpoints
resource "aws_security_group" "vpc_endpoints" {
  name        = "${var.name}-vpc-endpoints"
  description = "Security group for VPC endpoints"
  vpc_id      = aws_vpc.main.id

  ingress {
    description = "HTTPS from VPC"
    from_port   = 443
    to_port     = 443
    protocol    = "tcp"
    cidr_blocks = [var.cidr]
  }

  egress {
    description = "All outbound"
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = merge(var.tags, {
    Name = "${var.name}-vpc-endpoints"
  })
}

data "aws_region" "current" {}

# ============================================================================
# FILE: terraform/modules/vpc/outputs.tf
# ============================================================================
# VPC Module - Outputs
# RFC 0039: ADR-Compliant Foundation Infrastructure

output "vpc_id" {
  description = "VPC ID"
  value       = aws_vpc.main.id
}

output "vpc_cidr" {
  description = "VPC CIDR block"
  value       = aws_vpc.main.cidr_block
}

output "vpc_arn" {
  description = "VPC ARN"
  value       = aws_vpc.main.arn
}

output "public_subnet_ids" {
  description = "List of public subnet IDs"
  value       = aws_subnet.public[*].id
}

output "public_subnet_cidrs" {
  description = "List of public subnet CIDR blocks"
  value       = aws_subnet.public[*].cidr_block
}

output "private_subnet_ids" {
  description = "List of private subnet IDs"
  value       = aws_subnet.private[*].id
}

output "private_subnet_cidrs" {
  description = "List of private subnet CIDR blocks"
  value       = aws_subnet.private[*].cidr_block
}

output "database_subnet_ids" {
  description = "List of database subnet IDs"
  value       = aws_subnet.database[*].id
}

output "database_subnet_cidrs" {
  description = "List of database subnet CIDR blocks"
  value       = aws_subnet.database[*].cidr_block
}

output "nat_gateway_ips" {
  description = "List of NAT Gateway public IPs"
  value       = aws_eip.nat[*].public_ip
}

output "internet_gateway_id" {
  description = "Internet Gateway ID"
  value       = aws_internet_gateway.main.id
}

output "availability_zones" {
  description = "List of availability zones used"
  value       = var.availability_zones
}

output "vpc_endpoints_security_group_id" {
  description = "Security group ID for VPC endpoints"
  value       = aws_security_group.vpc_endpoints.id
}

output "s3_endpoint_id" {
  description = "S3 VPC endpoint ID"
  value       = aws_vpc_endpoint.s3.id
}

# ============================================================================
# FILE: terraform/modules/vpc/variables.tf
# ============================================================================
# VPC Module - Variables
# RFC 0039: ADR-Compliant Foundation Infrastructure

variable "name" {
  description = "Name prefix for all VPC resources"
  type        = string
}

variable "cidr" {
  description = "VPC CIDR block"
  type        = string
  default     = "10.0.0.0/16"

  validation {
    condition     = can(cidrhost(var.cidr, 0))
    error_message = "CIDR block must be a valid IPv4 CIDR."
  }
}

variable "availability_zones" {
  description = "List of availability zones to use (minimum 3 for HA)"
  type        = list(string)

  validation {
    condition     = length(var.availability_zones) >= 3
    error_message = "Minimum 3 availability zones required for HA."
  }
}

variable "enable_nat_gateway" {
  description = "Enable NAT Gateway for private subnet internet access"
  type        = bool
  default     = true
}

variable "single_nat_gateway" {
  description = "Use a single NAT Gateway instead of one per AZ (cost vs HA tradeoff)"
  type        = bool
  default     = false
}

variable "enable_flow_logs" {
  description = "Enable VPC Flow Logs for security auditing"
  type        = bool
  default     = true
}

variable "tags" {
  description = "Tags to apply to all resources"
  type        = map(string)
  default     = {}
}

# ============================================================================
# FILE: terraform/outputs.tf
# ============================================================================
# Foundation Infrastructure - Outputs
# RFC 0039: ADR-Compliant Foundation Infrastructure

# VPC Outputs
output "vpc_id" {
  description = "VPC ID"
  value       = module.vpc.vpc_id
}

output "vpc_cidr" {
  description = "VPC CIDR block"
  value       = module.vpc.vpc_cidr
}

output "private_subnet_ids" {
  description = "Private subnet IDs"
  value       = module.vpc.private_subnet_ids
}

output "public_subnet_ids" {
  description = "Public subnet IDs"
  value       = module.vpc.public_subnet_ids
}

output "database_subnet_ids" {
  description = "Database subnet IDs"
  value       = module.vpc.database_subnet_ids
}

# EKS Outputs
output "cluster_name" {
  description = "EKS cluster name"
  value       = module.eks.cluster_name
}

output "cluster_endpoint" {
  description = "EKS cluster API endpoint"
  value       = module.eks.cluster_endpoint
}

output "cluster_certificate_authority_data" {
  description = "EKS cluster CA certificate"
  value       = module.eks.cluster_certificate_authority_data
  sensitive   = true
}

output "cluster_oidc_issuer_url" {
  description = "OIDC issuer URL for IRSA"
  value       = module.eks.cluster_oidc_issuer_url
}

# Storage Outputs
output "efs_id" {
  description = "EFS filesystem ID"
  value       = module.storage.efs_id
}

output "backup_bucket_name" {
description = "S3 bucket for backups" + value = module.storage.backup_bucket_name +} + +output "blob_bucket_name" { + description = "S3 bucket for blob storage" + value = module.storage.blob_bucket_name +} + +# NLB Outputs +output "nlb_dns_name" { + description = "NLB DNS name" + value = module.nlb.dns_name +} + +output "nlb_zone_id" { + description = "NLB Route53 zone ID" + value = module.nlb.zone_id +} + +output "nlb_arn" { + description = "NLB ARN" + value = module.nlb.arn +} + +# IAM Outputs +output "karpenter_role_arn" { + description = "Karpenter IAM role ARN" + value = module.iam.karpenter_role_arn +} + +output "ebs_csi_role_arn" { + description = "EBS CSI driver IAM role ARN" + value = module.iam.ebs_csi_role_arn +} + +output "efs_csi_role_arn" { + description = "EFS CSI driver IAM role ARN" + value = module.iam.efs_csi_role_arn +} + +# kubectl Configuration +output "kubectl_config" { + description = "kubectl configuration command" + value = "aws eks update-kubeconfig --region ${var.aws_region} --name ${module.eks.cluster_name}" +} + +# S3 Module Outputs +output "email_blobs_bucket_id" { + description = "S3 bucket ID for email blobs" + value = module.s3.email_blobs_bucket_id +} + +output "loki_chunks_bucket_id" { + description = "S3 bucket ID for Loki log chunks" + value = module.s3.loki_chunks_bucket_id +} + +output "tempo_traces_bucket_id" { + description = "S3 bucket ID for Tempo traces" + value = module.s3.tempo_traces_bucket_id +} + +output "git_lfs_bucket_id" { + description = "S3 bucket ID for Git LFS objects" + value = module.s3.git_lfs_bucket_id +} + +# IAM Policies for IRSA +output "loki_s3_policy_arn" { + description = "IAM policy ARN for Loki S3 access" + value = module.s3.loki_s3_policy_arn +} + +output "tempo_s3_policy_arn" { + description = "IAM policy ARN for Tempo S3 access" + value = module.s3.tempo_s3_policy_arn +} + +output "stalwart_s3_policy_arn" { + description = "IAM policy ARN for Stalwart S3 access" + value = 
module.s3.stalwart_s3_policy_arn +} + +output "forgejo_s3_policy_arn" { + description = "IAM policy ARN for Forgejo S3 access" + value = module.s3.forgejo_s3_policy_arn +} diff --git a/terraform/variables.tf b/terraform/variables.tf new file mode 100644 index 0000000..8652ec9 --- /dev/null +++ b/terraform/variables.tf @@ -0,0 +1,85 @@ +# Foundation Infrastructure - Root Variables +# RFC 0039: ADR-Compliant Foundation Infrastructure + +variable "project_name" { + description = "Project name used for resource naming" + type = string + default = "alignment" +} + +variable "environment" { + description = "Environment name (production, staging)" + type = string + default = "production" + + validation { + condition = contains(["production", "staging"], var.environment) + error_message = "Environment must be 'production' or 'staging'." + } +} + +variable "aws_region" { + description = "AWS region for deployment" + type = string + default = "us-east-1" +} + +variable "vpc_cidr" { + description = "CIDR block for VPC" + type = string + default = "10.0.0.0/16" +} + +variable "enable_fips" { + description = "Enable FIPS 140-2 compliance mode" + type = bool + default = true +} + +variable "kubernetes_version" { + description = "Kubernetes version for EKS" + type = string + default = "1.29" +} + +variable "cockroachdb_node_count" { + description = "Number of CockroachDB nodes (minimum 3 for HA)" + type = number + default = 3 + + validation { + condition = var.cockroachdb_node_count >= 3 + error_message = "CockroachDB requires minimum 3 nodes for HA." 
+ } +} + +variable "cockroachdb_instance_type" { + description = "EC2 instance type for CockroachDB nodes" + type = string + default = "m6i.large" +} + +variable "tags" { + description = "Common tags applied to all resources" + type = map(string) + default = {} +} + +variable "log_retention_days" { + description = "Number of days to retain logs in Loki S3 bucket" + type = number + default = 90 +} + +variable "trace_retention_days" { + description = "Number of days to retain traces in Tempo S3 bucket" + type = number + default = 30 +} + +# RFC 0046: Domain Email Migration - DNS Static IPs +variable "enable_dns_static_ips" { + description = "Enable Elastic IPs for DNS NLB to support stable glue records for domain delegation" + type = bool + default = true +} diff --git a/terraform/vault-pki-smime.tf b/terraform/vault-pki-smime.tf new file mode 100644 index 0000000..365e155 --- /dev/null +++ b/terraform/vault-pki-smime.tf @@ -0,0 +1,212 @@ +# Vault PKI for S/MIME Certificates +# RFC 0041: Self-Hosted DNS and Email +# ADR 0006: Zero-Knowledge Zero-Trust (Layer 3 encryption support) +# +# This configuration creates a PKI secrets engine in Vault for issuing +# S/MIME certificates to users. S/MIME provides end-to-end email encryption +# (Layer 3) where only the sender and recipient can read the email content. 

terraform {
  required_providers {
    vault = {
      source  = "hashicorp/vault"
      version = "~> 4.0"
    }
  }
}

# =============================================================================
# ROOT CA (Offline - used only to sign intermediate)
# =============================================================================

resource "vault_mount" "pki_root_smime" {
  path        = "pki-smime-root"
  type        = "pki"
  description = "S/MIME Root CA - Offline, used only to sign intermediate"

  # Root CA has a longer lifetime
  default_lease_ttl_seconds = 315360000 # 10 years
  max_lease_ttl_seconds     = 315360000 # 10 years
}

# Self-signed root certificate. type = "internal" keeps the private key inside
# Vault (it is never returned or exported).
resource "vault_pki_secret_backend_root_cert" "smime_root" {
  backend     = vault_mount.pki_root_smime.path
  type        = "internal"
  common_name = "Alignment S/MIME Root CA"
  ttl         = "315360000" # 10 years
  key_type    = "rsa"
  key_bits    = 4096

  organization         = "Alignment"
  country              = "US"
  exclude_cn_from_sans = true
}

# =============================================================================
# INTERMEDIATE CA (Online - used to issue user certificates)
# =============================================================================

resource "vault_mount" "pki_smime" {
  path        = "pki-smime"
  type        = "pki"
  description = "S/MIME Intermediate CA - Issues user certificates"

  # Intermediate has shorter lifetime
  default_lease_ttl_seconds = 31536000 # 1 year
  max_lease_ttl_seconds     = 94608000 # 3 years
}

# Generate intermediate CSR
resource "vault_pki_secret_backend_intermediate_cert_request" "smime_intermediate" {
  backend     = vault_mount.pki_smime.path
  type        = "internal"
  common_name = "Alignment S/MIME Intermediate CA"
  key_type    = "rsa"
  key_bits    = 4096

  organization = "Alignment"
  country      = "US"
}

# Sign intermediate with root
resource "vault_pki_secret_backend_root_sign_intermediate" "smime_intermediate" {
  backend     = vault_mount.pki_root_smime.path
  common_name = "Alignment S/MIME Intermediate CA"
  csr         = vault_pki_secret_backend_intermediate_cert_request.smime_intermediate.csr
  ttl         = "157680000" # 5 years

  organization = "Alignment"
  country      = "US"

  # Intermediate CA permissions
  # NOTE(review): this adds an X.509 dNSName name constraint. S/MIME identities
  # are rfc822Name (email) SANs, which a DNS-only constraint does not restrict —
  # TODO confirm whether an email name constraint was intended here.
  permitted_dns_domains = ["alignment.dev"]
}

# Set the signed intermediate certificate
resource "vault_pki_secret_backend_intermediate_set_signed" "smime_intermediate" {
  backend     = vault_mount.pki_smime.path
  certificate = vault_pki_secret_backend_root_sign_intermediate.smime_intermediate.certificate
}

# =============================================================================
# S/MIME CERTIFICATE ROLE
# =============================================================================

resource "vault_pki_secret_backend_role" "smime_user" {
  backend = vault_mount.pki_smime.path
  name    = "smime-user"

  # Certificate lifetime
  ttl     = 31536000 # 1 year
  max_ttl = 63072000 # 2 years

  # Domain restrictions
  allow_any_name    = false
  allowed_domains   = ["alignment.dev"]
  allow_subdomains  = false
  enforce_hostnames = false

  # Allow email addresses
  # (Vault validates the domain part of a requested email against
  # allowed_domains when bare domains are allowed — verify against the
  # deployed Vault version's PKI role docs.)
  allow_bare_domains = true

  # Key configuration
  key_type  = "rsa"
  key_bits  = 4096
  key_usage = ["DigitalSignature", "KeyEncipherment", "ContentCommitment"]

  # S/MIME specific - Email Protection extended key usage
  ext_key_usage = ["EmailProtection"]

  # Certificate properties
  require_cn          = true
  use_csr_common_name = true
  use_csr_sans        = true

  # Organization defaults
  organization = ["Alignment"]
  country      = ["US"]

  # Don't include SANs automatically
  allow_ip_sans      = false
  allow_localhost    = false
  allowed_uri_sans   = []
  allowed_other_sans = []

  # Generate certificates (not just sign CSRs)
  # NOTE(review): generate_lease = true creates a Vault lease per issued
  # certificate; HashiCorp's PKI guidance recommends leaving this false to
  # avoid lease/storage growth. Confirm the revocation workflow actually
  # relies on lease revocation before keeping this.
  generate_lease = true
  no_store       = false
}

# =============================================================================
# POLICY FOR S/MIME CERTIFICATE ISSUANCE
# =============================================================================

# NOTE(review): in Vault ACLs, an allowed_parameters entry whose value is the
# empty list [] permits ANY value for that parameter. As written, this policy
# therefore does NOT restrict common_name/alt_names to the requesting user's
# own email, despite the comment embedded in the policy text — any holder of
# this policy can request a certificate for any email. Consider ACL templating
# (e.g. {{identity.entity.metadata.email}}) or per-user policies to enforce it.
resource "vault_policy" "smime_user_issue" {
  name   = "smime-user-issue"
  policy = <<-EOT
    # Allow users to issue S/MIME certificates for their own email
    # Users authenticate via OIDC, and their email claim is used to verify
    # they can only request certificates for their own email address.

    # Issue S/MIME certificate
    path "pki-smime/issue/smime-user" {
      capabilities = ["create", "update"]

      # Restrict to user's own email
      allowed_parameters = {
        "common_name" = []
        "alt_names" = []
        "ttl" = []
      }
    }

    # Read certificate chain
    path "pki-smime/ca/pem" {
      capabilities = ["read"]
    }

    path "pki-smime/ca_chain" {
      capabilities = ["read"]
    }

    # List own certificates
    path "pki-smime/certs" {
      capabilities = ["list"]
    }
  EOT
}

# =============================================================================
# CRL CONFIGURATION
# =============================================================================

# AIA/CRL/OCSP URLs embedded in issued certificates so mail clients can fetch
# the chain and check revocation.
resource "vault_pki_secret_backend_config_urls" "smime_urls" {
  backend = vault_mount.pki_smime.path

  issuing_certificates = [
    "https://vault.alignment.dev/v1/pki-smime/ca"
  ]

  crl_distribution_points = [
    "https://vault.alignment.dev/v1/pki-smime/crl"
  ]

  ocsp_servers = [
    "https://vault.alignment.dev/v1/pki-smime/ocsp"
  ]
}

# =============================================================================
# OUTPUTS
# =============================================================================

output "smime_ca_chain" {
  description = "S/MIME CA certificate chain"
  value       = vault_pki_secret_backend_intermediate_set_signed.smime_intermediate.certificate
  sensitive   = false
}

output "smime_issue_path" {
  description = "Path to issue S/MIME certificates"
  value       = "${vault_mount.pki_smime.path}/issue/${vault_pki_secret_backend_role.smime_user.name}"
}
diff --git a/terraform/versions.tf b/terraform/versions.tf
new file mode 100644
index 0000000..1f60354
--- /dev/null
+++ b/terraform/versions.tf
@@ -0,0 +1,35 @@
# Foundation Infrastructure - Terraform Configuration
# RFC 0039: ADR-Compliant Foundation Infrastructure
# ADR 0003: CockroachDB Self-Hosted FIPS
# ADR 0004: Set It and Forget It Architecture
# ADR 0005: Full-Stack Self-Hosting

terraform {
  required_version = ">= 1.6.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.30"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.24"
    }
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.12"
    }
    # NOTE(review): gavinbunney/kubectl is no longer actively maintained;
    # the alekc/kubectl fork is the commonly recommended replacement —
    # verify before first deploy.
    kubectl = {
      source  = "gavinbunney/kubectl"
      version = "~> 1.14"
    }
    tls = {
      source  = "hashicorp/tls"
      version = "~> 4.0"
    }
  }

  # Backend configuration - use S3 for state storage
  # Configure in environments/production/backend.tf
}