diff --git a/appsets/node-problem-detector.yaml b/appsets/node-problem-detector.yaml new file mode 100644 index 0000000..59a196e --- /dev/null +++ b/appsets/node-problem-detector.yaml @@ -0,0 +1,46 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: universal-node-problem-detector + annotations: + argocd.argoproj.io/sync-wave: "3" +spec: + generators: + - list: + elements: + - cluster: cherry + url: https://kubernetes.default.svc + - cluster: lychee + url: https://172.16.152.1:6443 + syncPolicy: + applicationsSync: sync + template: + metadata: + name: "{{cluster}}-node-problem-detector" + annotations: + argocd.argoproj.io/sync-wave: "5" + spec: + project: monitoring + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + source: + chart: node-problem-detector + repoURL: https://charts.deliveryhero.io/ + targetRevision: 2.3.10 + helm: + releaseName: node-problem-detector + valuesObject: + metrics: + enabled: true + serviceMonitor: + enabled: true + prometheusRule: + enabled: true + + destination: + server: "{{url}}" + namespace: monitoring diff --git a/appsets/prometheus.yaml b/appsets/prometheus.yaml new file mode 100644 index 0000000..5fab5a8 --- /dev/null +++ b/appsets/prometheus.yaml @@ -0,0 +1,103 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: universal-prometheus + annotations: + argocd.argoproj.io/sync-wave: "2" +spec: + generators: + - list: + elements: + - cluster: lychee + url: https://172.16.152.1:6443 + syncPolicy: + applicationsSync: create-update + preserveResourcesOnDeletion: true + template: + metadata: + name: "{{cluster}}-prometheus" + annotations: + argocd.argoproj.io/sync-wave: "5" + spec: + project: monitoring + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + - ServerSideApply=true + source: + chart: kube-prometheus-stack + repoURL: https://prometheus-community.github.io/helm-charts + targetRevision: 
51.2.0 + helm: + releaseName: prometheus + values: | + fullnameOverride: "prometheus" + + prometheus: + ingress: + enabled: true + ingressClassName: nginx + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - prometheus.{{cluster}}.k8s.vxm.cz + paths: + - / + pathType: Prefix + tls: + - hosts: + - prometheus.{{cluster}}.k8s.vxm.cz + secretName: prometheus-ingress-tls + + prometheusSpec: + serviceMonitorSelectorNilUsesHelmValues: false + podMonitorSelectorNilUsesHelmValues: false + ruleSelectorNilUsesHelmValues: false + + storageSpec: + volumeClaimTemplate: + spec: + storageClassName: hcloud-volumes + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 20Gi + + serviceMonitor: + selfMonitor: true + + grafana: + enabled: false + + alertmanager: + enabled: true + ingress: + enabled: true + ingressClassName: nginx + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - alertmanager.{{cluster}}.k8s.vxm.cz + paths: + - / + pathType: Prefix + tls: + - hosts: + - alertmanager.{{cluster}}.k8s.vxm.cz + secretName: alertmanager-ingress-tls + + alertmanagerSpec: + resources: + limits: + cpu: 250m + memory: 256Mi + requests: + cpu: 100m + memory: 256Mi + + destination: + server: "{{url}}" + namespace: monitoring diff --git a/clusters/cherry/apps/blackbox-exporter.yaml b/clusters/cherry/apps/blackbox-exporter.yaml new file mode 100644 index 0000000..2d8c782 --- /dev/null +++ b/clusters/cherry/apps/blackbox-exporter.yaml @@ -0,0 +1,38 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: cherry-blackbox-exporter + namespace: argocd + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: monitoring + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + destination: + server: https://kubernetes.default.svc + namespace: monitoring + source: + chart: prometheus-blackbox-exporter + repoURL: 
https://prometheus-community.github.io/helm-charts + targetRevision: 8.4.0 + helm: + releaseName: blackbox-exporter + valuesObject: + releaseLabel: true + pspEnabled: false + resources: + limits: + cpu: 100m + memory: 300Mi + requests: + cpu: 50m + memory: 50Mi + + serviceMonitor: + selfMonitor: + enabled: true diff --git a/clusters/cherry/apps/hcloud-cloud-controller-manager.yaml b/clusters/cherry/apps/hcloud-cloud-controller-manager.yaml new file mode 100644 index 0000000..8d4fe38 --- /dev/null +++ b/clusters/cherry/apps/hcloud-cloud-controller-manager.yaml @@ -0,0 +1,26 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: cherry-hcloud-cloud-controller-manager + namespace: argocd +spec: + project: infrastructure + syncPolicy: + automated: + prune: true + selfHeal: true + destination: + server: https://kubernetes.default.svc + namespace: kube-system + source: + chart: hcloud-cloud-controller-manager + repoURL: https://charts.hetzner.cloud + targetRevision: 1.18.0 + helm: + releaseName: hcloud-cloud-controller-manager + valuesObject: + controller: + hcloudToken: + existingSecret: + name: hcloud-token + key: token diff --git a/clusters/cherry/apps/ingress-nginx.yaml b/clusters/cherry/apps/ingress-nginx.yaml index 04d94c4..fa6e1d9 100644 --- a/clusters/cherry/apps/ingress-nginx.yaml +++ b/clusters/cherry/apps/ingress-nginx.yaml @@ -55,6 +55,10 @@ spec: type: ClusterIP hostPort: enabled: true + metrics: + enabled: true + serviceMonitor: + enabled: true config: allow-snippet-annotations: "true" generate-request-id: "true" diff --git a/clusters/cherry/apps/kube-prometheus-stack.yaml b/clusters/cherry/apps/kube-prometheus-stack.yaml index 8f4db7b..b3ae6ba 100644 --- a/clusters/cherry/apps/kube-prometheus-stack.yaml +++ b/clusters/cherry/apps/kube-prometheus-stack.yaml @@ -107,6 +107,34 @@ spec: requests: storage: 50Gi + additionalScrapeConfigs: + - job_name: "node-exporter" + metrics_path: "/metrics" + static_configs: + - targets: + - 
172.16.1.1:9100 + - 172.16.1.2:9100 + - maple.vxm.cz:9100 + - alder.vxm.cz:9100 + + # See: https://developers.mattermost.com/blog/cloud-monitoring/ + - job_name: "federate" + scrape_interval: 15s + + honor_labels: true + metrics_path: "/federate" + + params: + 'match[]': + - '{job="prometheus"}' + - '{__name__=~"job:.*"}' + + static_configs: + - targets: + - prometheus.lychee.k8s.vxm.cz. + labels: + clusterID: lychee + serviceMonitor: selfMonitor: true diff --git a/clusters/cherry/apps/uptime-kuma.yaml b/clusters/cherry/apps/uptime-kuma.yaml index 58ae2bd..0ad7b61 100644 --- a/clusters/cherry/apps/uptime-kuma.yaml +++ b/clusters/cherry/apps/uptime-kuma.yaml @@ -3,6 +3,8 @@ kind: Application metadata: name: uptime-kuma namespace: argocd + finalizers: + - resources-finalizer.argocd.argoproj.io spec: project: monitoring syncPolicy: diff --git a/clusters/cherry/manifests/vault/ca.yaml b/clusters/cherry/manifests/vault/ca.yaml new file mode 100644 index 0000000..d199432 --- /dev/null +++ b/clusters/cherry/manifests/vault/ca.yaml @@ -0,0 +1,33 @@ +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: selfsigned +spec: + selfSigned: {} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: server-selfsigned-ca +spec: + isCA: true + commonName: Vault Server CA + secretName: server-ca + duration: 87660h # 10 years + privateKey: + algorithm: RSA + size: 4096 + issuerRef: + name: selfsigned + kind: Issuer + group: cert-manager.io + additionalOutputFormats: + - type: CombinedPEM +--- +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: server-ca-issuer +spec: + ca: + secretName: server-ca diff --git a/clusters/cherry/manifests/vault/certificate.yaml b/clusters/cherry/manifests/vault/certificate.yaml new file mode 100644 index 0000000..f2eb44c --- /dev/null +++ b/clusters/cherry/manifests/vault/certificate.yaml @@ -0,0 +1,21 @@ +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: server +spec: + secretName: server-tls + 
duration: 24h + renewBefore: 144m # 10% of 24h + dnsNames: + - vault-0.vault-internal + - vault-1.vault-internal + - vault-2.vault-internal + - vault-0.vault-internal.vault + - vault-1.vault-internal.vault + - vault-2.vault-internal.vault + - vault-0.vault-internal.vault.svc + - vault-1.vault-internal.vault.svc + - vault-2.vault-internal.vault.svc + issuerRef: + name: server-ca-issuer + commonName: Server Certificate diff --git a/vault-unseal.sh b/vault-unseal.sh new file mode 100755 index 0000000..48d1321 --- /dev/null +++ b/vault-unseal.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +UNSEAL_KEYS="${UNSEAL_KEYS:-xxx yyy zzz}" + +VAULT_PODS=$(kubectl get pods -n vault -l app.kubernetes.io/name=vault -o jsonpath='{.items[*].metadata.name}') + +echo "" +echo "Unsealing Vault..." +echo "" + +echo "Deleting existing pods..." +echo "" + +kubectl delete pods -n vault $VAULT_PODS >> /dev/null + +echo "Waiting for new pods to become ready..." +echo "" + +kubectl wait --for=condition=ready --timeout=180s pods -n vault -l app.kubernetes.io/name=vault >> /dev/null + +echo "Unsealing Vault pods..." +echo "" + +for pod in $VAULT_PODS +do + echo "Unsealing $pod..." + + for key in $UNSEAL_KEYS + do + kubectl exec -n vault "$pod" -- vault operator unseal "$key" >> /dev/null + sleep 1 + done + + echo "$pod unsealed!" + echo "" +done + +echo "Vault unsealed!" +echo "" +echo "" +echo "" + + +