hearth/terraform/minimal/user-data.sh
Eric Garcia 3879d2fe35 fix: install Traefik CRDs for IngressRouteTCP SSH routing
The IngressRouteTCP resource was being silently ignored because
Traefik CRDs were never installed. This caused SSH traffic on
port 22 to be handled as HTTP, returning 400 Bad Request.

Add CRD installation step before Traefik deployment.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 13:00:28 -05:00

437 lines
12 KiB
Bash

#!/bin/bash
# Hearth Minimal: EC2 user-data bootstrap.
# Installs k3s on the instance and deploys Forgejo on top of it.
#
# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Route stdout and stderr through tee so the whole run is also captured in
# /var/log/user-data.log.
exec > >(tee /var/log/user-data.log) 2>&1
echo "Starting user-data script at $(date)"

# -----------------------------------------------------------------------------
# Values injected by Terraform when this template is rendered
# -----------------------------------------------------------------------------
S3_BUCKET="${s3_bucket}"
SSH_PORT="${ssh_port}"
LETSENCRYPT_EMAIL="${letsencrypt_email}"
DOMAIN="${domain}"
# -----------------------------------------------------------------------------
# System Setup
# -----------------------------------------------------------------------------
# Bring the base image fully up to date before adding anything to it.
dnf --assumeyes update

# Tools installed up front: Docker, git, jq and the AWS CLI.
dnf --assumeyes install \
  docker \
  git \
  jq \
  awscli

# Register Docker for boot and start it right away (kept available for image
# builds if they are ever needed).
systemctl --now enable docker
# Move the host's sshd to the admin port so that port 22 is free for the Git
# SSH traffic that the Traefik Service later in this script exposes.
#
# Every existing "Port N" line is removed, whether commented out or active,
# and a single directive is inserted at the top of the file. A plain
# substitution of "#Port 22" silently does nothing when the stock config is
# worded differently, which would leave sshd listening on 22.
sshd_config=/etc/ssh/sshd_config
sed -i -E '/^[[:space:]]*#?[[:space:]]*Port[[:space:]]+[0-9]+[[:space:]]*$/d' "$sshd_config"
sed -i "1i Port $SSH_PORT" "$sshd_config"

# Validate before restarting: with errexit active, a broken config aborts the
# script here instead of taking down the running daemon.
/usr/sbin/sshd -t
systemctl restart sshd
# Authorize the admin public key supplied by Terraform for ec2-user.
# An empty value skips the whole step.
SSH_KEY="${ssh_public_key}"
if [[ -n "$SSH_KEY" ]]; then
  ec2_ssh_dir=/home/ec2-user/.ssh
  mkdir -p "$ec2_ssh_dir"
  echo "$SSH_KEY" >> "$ec2_ssh_dir/authorized_keys"
  # The commands above run as root, so tighten the modes and hand the
  # directory tree back to ec2-user.
  chmod 700 "$ec2_ssh_dir"
  chmod 600 "$ec2_ssh_dir/authorized_keys"
  chown -R ec2-user:ec2-user "$ec2_ssh_dir"
fi
# Unattended updates: install dnf-automatic, flip apply_updates to yes in its
# config, and activate the install timer.
# NOTE(review): upgrade_type is not touched here, so whether only security
# updates are applied depends on the packaged default - confirm.
dnf --assumeyes install dnf-automatic
sed --in-place 's|apply_updates = no|apply_updates = yes|' /etc/dnf/automatic.conf
systemctl --now enable dnf-automatic-install.timer
# -----------------------------------------------------------------------------
# Install k3s
# -----------------------------------------------------------------------------
echo "Installing k3s..."
# The bundled Traefik is disabled because this script deploys its own Traefik
# further down. The kubeconfig is written with mode 644.
curl -sfL https://get.k3s.io | sh -s - \
  --disable traefik \
  --write-kubeconfig-mode 644

# Wait for the node to report the Ready condition.
# Grepping "kubectl get nodes" output for "Ready" also matches "NotReady",
# so the node condition is queried directly instead. kubectl wait fails while
# no node is registered yet, which simply keeps the loop going.
echo "Waiting for k3s to be ready..."
k3s_ready_deadline=$((SECONDS + 600))
until kubectl wait --for=condition=Ready node --all --timeout=10s >/dev/null 2>&1; do
  # Bounded wait: fail loudly instead of hanging the bootstrap forever.
  if (( SECONDS >= k3s_ready_deadline )); then
    echo "ERROR: k3s node did not become Ready within 600s" >&2
    exit 1
  fi
  sleep 5
done
echo "k3s is ready"
# -----------------------------------------------------------------------------
# Install Traefik CRDs (required for IngressRouteTCP)
# -----------------------------------------------------------------------------
echo "Installing Traefik CRDs..."
# The CRDs must exist before the IngressRouteTCP resource further down is
# applied.
kubectl apply -f https://raw.githubusercontent.com/traefik/traefik/v3.2/docs/content/reference/dynamic-configuration/kubernetes-crd-definition-v1.yml

# Wait for the two CRDs this script relies on to be Established.
# The wait stays best-effort (the script continues either way), but a timeout
# is now logged instead of being swallowed silently.
echo "Waiting for Traefik CRDs to be ready..."
for traefik_crd in ingressroutetcps.traefik.io ingressroutes.traefik.io; do
  if ! kubectl wait --for=condition=Established "crd/$traefik_crd" --timeout=60s; then
    echo "WARNING: CRD $traefik_crd not reported Established within 60s; continuing" >&2
  fi
done
# -----------------------------------------------------------------------------
# Install Traefik with Let's Encrypt
# -----------------------------------------------------------------------------
echo "Installing Traefik..."
# Deploys Traefik v3.2 into its own namespace:
#   - ServiceAccount + ClusterRole/Binding with read access to the resources
#     the kubernetesingress and kubernetescrd providers watch
#   - Deployment with entrypoints web (:80), websecure (:443) and ssh (:22/tcp),
#     HTTP redirected to websecure, and a Let's Encrypt resolver using the
#     HTTP challenge on the web entrypoint
#   - LoadBalancer Service exposing ports 80, 443 and 22
#
# ACME state (/data/acme.json) is kept on the host under /data/traefik rather
# than in an emptyDir, so issued certificates survive the pod being replaced
# instead of being re-requested from Let's Encrypt every time.
mkdir -p /data/traefik
chmod 700 /data/traefik

cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Namespace
metadata:
  name: traefik
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: traefik
  namespace: traefik
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: traefik
rules:
  - apiGroups: [""]
    resources: [services, endpoints, secrets, nodes, pods]
    verbs: [get, list, watch]
  - apiGroups: [extensions, networking.k8s.io]
    resources: [ingresses, ingressclasses]
    verbs: [get, list, watch]
  - apiGroups: [extensions, networking.k8s.io]
    resources: [ingresses/status]
    verbs: [update]
  - apiGroups: [traefik.io]
    resources: ["*"]
    verbs: [get, list, watch]
  - apiGroups: [discovery.k8s.io]
    resources: [endpointslices]
    verbs: [get, list, watch]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: traefik
subjects:
  - kind: ServiceAccount
    name: traefik
    namespace: traefik
roleRef:
  kind: ClusterRole
  name: traefik
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: traefik
  namespace: traefik
spec:
  replicas: 1
  selector:
    matchLabels:
      app: traefik
  template:
    metadata:
      labels:
        app: traefik
    spec:
      serviceAccountName: traefik
      containers:
        - name: traefik
          image: traefik:v3.2
          args:
            - --api.insecure=false
            - --providers.kubernetesingress
            - --providers.kubernetescrd
            - --entrypoints.web.address=:80
            - --entrypoints.websecure.address=:443
            - --entrypoints.ssh.address=:22/tcp
            - --entrypoints.web.http.redirections.entrypoint.to=websecure
            - --certificatesresolvers.letsencrypt.acme.email=${letsencrypt_email}
            - --certificatesresolvers.letsencrypt.acme.storage=/data/acme.json
            - --certificatesresolvers.letsencrypt.acme.httpchallenge.entrypoint=web
          ports:
            - name: web
              containerPort: 80
            - name: websecure
              containerPort: 443
            - name: ssh
              containerPort: 22
          volumeMounts:
            - name: acme
              mountPath: /data
      volumes:
        - name: acme
          hostPath:
            path: /data/traefik
            type: DirectoryOrCreate
---
apiVersion: v1
kind: Service
metadata:
  name: traefik
  namespace: traefik
spec:
  type: LoadBalancer
  externalTrafficPolicy: Local
  selector:
    app: traefik
  ports:
    - name: web
      port: 80
      targetPort: 80
    - name: websecure
      port: 443
      targetPort: 443
    - name: ssh
      port: 22
      targetPort: 22
EOF

# Wait for the Deployment to actually roll out instead of sleeping a fixed
# 15 seconds. Still best-effort: a timeout is logged and the script continues.
echo "Waiting for Traefik..."
if ! kubectl -n traefik rollout status deployment/traefik --timeout=180s; then
  echo "WARNING: Traefik rollout not complete after 180s; continuing" >&2
fi
# -----------------------------------------------------------------------------
# Create Forgejo Namespace and Resources
# -----------------------------------------------------------------------------
echo "Creating Forgejo namespace..."
# Render the Namespace client-side and pipe it to apply, so the step succeeds
# whether or not the namespace already exists.
kubectl create namespace forgejo --dry-run=client -o yaml | kubectl apply -f -

# Host directory backing the Forgejo volume, owned by uid/gid 1000 (the same
# id the Forgejo Deployment uses as fsGroup).
mkdir -p /data/forgejo
chown 1000:1000 /data/forgejo

# Static hostPath PersistentVolume plus the claim the Forgejo Deployment
# mounts. Both use storageClassName "local" and request 10Gi ReadWriteOnce;
# the PV is retained if the claim is ever deleted.
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolume
metadata:
  name: forgejo-data
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  hostPath:
    path: /data/forgejo
  storageClassName: local
  persistentVolumeReclaimPolicy: Retain
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: forgejo-data
  namespace: forgejo
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
  storageClassName: local
EOF
# -----------------------------------------------------------------------------
# Deploy Forgejo
# -----------------------------------------------------------------------------
echo "Deploying Forgejo..."
# Applies four resources in the forgejo namespace:
#   - Deployment: one Forgejo 9 replica (Recreate strategy), configured through
#     FORGEJO__section__KEY environment variables, with /data backed by the
#     forgejo-data claim and HTTP health probes on /api/healthz
#   - Service: port 3000 (http) and port 22 (ssh)
#   - Ingress: HTTPS for the configured domain via Traefik's websecure
#     entrypoint, with certificates from the letsencrypt resolver
#   - IngressRouteTCP: forwards everything arriving on Traefik's ssh
#     entrypoint to the Forgejo Service on port 22
#
# The backticks in the HostSNI matcher are escaped because the heredoc is
# unquoted; unescaped they would start a command substitution.
#
# NOTE(review): INSTALL_LOCK and DISABLE_REGISTRATION are both "false", so the
# web installer and sign-up are open until setup is completed - confirm this
# is the intended first-boot state.
cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: forgejo
  namespace: forgejo
spec:
  replicas: 1
  selector:
    matchLabels:
      app: forgejo
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: forgejo
    spec:
      securityContext:
        fsGroup: 1000
      containers:
        - name: forgejo
          image: codeberg.org/forgejo/forgejo:9
          ports:
            - name: http
              containerPort: 3000
            - name: ssh
              containerPort: 22
          env:
            - name: FORGEJO__server__DOMAIN
              value: "${domain}"
            - name: FORGEJO__server__ROOT_URL
              value: "https://${domain}"
            - name: FORGEJO__server__SSH_DOMAIN
              value: "${domain}"
            - name: FORGEJO__server__SSH_PORT
              value: "22"
            - name: FORGEJO__server__LFS_START_SERVER
              value: "true"
            - name: FORGEJO__database__DB_TYPE
              value: "sqlite3"
            - name: FORGEJO__database__PATH
              value: "/data/gitea/gitea.db"
            - name: FORGEJO__security__INSTALL_LOCK
              value: "false"
            - name: FORGEJO__service__DISABLE_REGISTRATION
              value: "false"
            - name: FORGEJO__log__MODE
              value: "console"
            - name: FORGEJO__log__LEVEL
              value: "Info"
          volumeMounts:
            - name: data
              mountPath: /data
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 1000m
              memory: 1Gi
          livenessProbe:
            httpGet:
              path: /api/healthz
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /api/healthz
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 10
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: forgejo-data
---
apiVersion: v1
kind: Service
metadata:
  name: forgejo
  namespace: forgejo
spec:
  selector:
    app: forgejo
  ports:
    - name: http
      port: 3000
      targetPort: 3000
    - name: ssh
      port: 22
      targetPort: 22
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: forgejo
  namespace: forgejo
  annotations:
    traefik.ingress.kubernetes.io/router.entrypoints: websecure
    traefik.ingress.kubernetes.io/router.tls: "true"
    traefik.ingress.kubernetes.io/router.tls.certresolver: letsencrypt
spec:
  rules:
    - host: ${domain}
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: forgejo
                port:
                  number: 3000
---
apiVersion: traefik.io/v1alpha1
kind: IngressRouteTCP
metadata:
  name: forgejo-ssh
  namespace: forgejo
spec:
  entryPoints:
    - ssh
  routes:
    - match: HostSNI(\`*\`)
      services:
        - name: forgejo
          port: 22
EOF
# -----------------------------------------------------------------------------
# Setup Backup Cron Job
# -----------------------------------------------------------------------------
echo "Setting up backup cron..."
# The backup script calls the sqlite3 CLI and is scheduled through
# /etc/cron.d, but nothing earlier in this script installs either the sqlite
# client or a cron daemon. Install and enable them here.
# NOTE(review): Amazon Linux 2023 base images reportedly ship without cronie -
# confirm for the AMI in use.
# Best-effort, like the rest of this section: a failure is logged, not fatal.
if ! dnf install -y sqlite cronie; then
  echo "WARNING: could not install sqlite/cronie; scheduled backups may not run" >&2
fi
if ! systemctl enable --now crond; then
  echo "WARNING: could not enable crond; scheduled backups will not run" >&2
fi

# The heredoc delimiter is quoted so the shell leaves the script body
# untouched when writing it out.
cat <<'BACKUP_SCRIPT' > /usr/local/bin/backup-forgejo.sh
#!/bin/bash
# Back up the Forgejo SQLite database and the k3s server state to S3.
#
# Usage:   backup-forgejo.sh <s3-bucket>
# Output:  s3://<s3-bucket>/backups/backup-<timestamp>.tar.gz
# Returns: non-zero if the bucket argument is missing or any step fails.
set -euo pipefail

if [ "$#" -lt 1 ] || [ -z "$1" ]; then
  echo "Usage: $0 <s3-bucket>" >&2
  exit 2
fi
S3_BUCKET="$1"
TIMESTAMP=$(date +%Y%m%d-%H%M%S)

# Private scratch directory with an unpredictable name; removed on every exit
# path so a failed upload does not leave archives piling up in /tmp.
WORK_DIR=$(mktemp -d /tmp/backup-XXXXXX)
trap 'rm -rf -- "$WORK_DIR"' EXIT
BACKUP_DIR="$WORK_DIR/contents"
ARCHIVE="$WORK_DIR/backup-$TIMESTAMP.tar.gz"
mkdir -p "$BACKUP_DIR"

# Backup Forgejo SQLite database (skipped until Forgejo has created it).
# sqlite3's .backup is used rather than copying the file directly.
if [ -f /data/forgejo/gitea/gitea.db ]; then
  sqlite3 /data/forgejo/gitea/gitea.db ".backup '$BACKUP_DIR/gitea.db'"
fi

# Backup k3s state. Deliberately best-effort: a missing or unreadable
# directory must not fail the whole backup.
cp -r /var/lib/rancher/k3s/server/db "$BACKUP_DIR/k3s-db" 2>/dev/null || true

# Create tarball
tar -czf "$ARCHIVE" -C "$BACKUP_DIR" .

# Upload to S3
aws s3 cp "$ARCHIVE" "s3://$S3_BUCKET/backups/backup-$TIMESTAMP.tar.gz"

# Retention is not handled here; old backups are expected to be expired by an
# S3 lifecycle policy on the bucket.
echo "Backup completed: s3://$S3_BUCKET/backups/backup-$TIMESTAMP.tar.gz"
BACKUP_SCRIPT
chmod +x /usr/local/bin/backup-forgejo.sh

# Add cron job for daily backup at 3 AM (run as root, output appended to
# /var/log/backup.log).
echo "0 3 * * * root /usr/local/bin/backup-forgejo.sh ${s3_bucket} >> /var/log/backup.log 2>&1" > /etc/cron.d/forgejo-backup

# Initial backup. Best-effort: it must not fail the bootstrap, but a failure
# is reported instead of being discarded.
if ! /usr/local/bin/backup-forgejo.sh "${s3_bucket}"; then
  echo "WARNING: initial backup failed; see output above" >&2
fi
# -----------------------------------------------------------------------------
# Done
# -----------------------------------------------------------------------------
# Final summary for whoever reads the log: completion time, the access URLs,
# and the remaining manual steps (DNS, then the web setup).
cat <<EOF
User-data script completed at $(date)

==========================================
Forgejo deployment complete!
==========================================

Web URL: https://${domain}
Git SSH: git@${domain}:ORG/REPO.git
Admin SSH: ssh -p ${ssh_port} ec2-user@<ELASTIC_IP>

Next steps:
1. Point DNS: ${domain} -> <ELASTIC_IP>
2. Wait for DNS propagation
3. Visit https://${domain} to complete setup
==========================================
EOF