From be21c1c2987373fd2aae1b85e6370197f12ea751 Mon Sep 17 00:00:00 2001 From: Vojtech Mares Date: Tue, 3 Oct 2023 15:49:51 +0200 Subject: [PATCH 01/10] feat(cherry/ingress-nginx): enable metrics --- clusters/cherry/apps/ingress-nginx.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clusters/cherry/apps/ingress-nginx.yaml b/clusters/cherry/apps/ingress-nginx.yaml index 04d94c4..fa6e1d9 100644 --- a/clusters/cherry/apps/ingress-nginx.yaml +++ b/clusters/cherry/apps/ingress-nginx.yaml @@ -55,6 +55,10 @@ spec: type: ClusterIP hostPort: enabled: true + metrics: + enabled: true + serviceMonitor: + enabled: true config: allow-snippet-annotations: "true" generate-request-id: "true" From f38e6d5a3c7ec443647e9ce73fd585e7221906d8 Mon Sep 17 00:00:00 2001 From: Vojtech Mares Date: Tue, 3 Oct 2023 15:50:24 +0200 Subject: [PATCH 02/10] feat(cherry/uptime-kuma): add argocd finalizers --- clusters/cherry/apps/uptime-kuma.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clusters/cherry/apps/uptime-kuma.yaml b/clusters/cherry/apps/uptime-kuma.yaml index 58ae2bd..0ad7b61 100644 --- a/clusters/cherry/apps/uptime-kuma.yaml +++ b/clusters/cherry/apps/uptime-kuma.yaml @@ -3,6 +3,8 @@ kind: Application metadata: name: uptime-kuma namespace: argocd + finalizers: + - resources-finalizer.argocd.argoproj.io spec: project: monitoring syncPolicy: From 9ed52183d5aa289a4fef3401d9708d67fcacb2c1 Mon Sep 17 00:00:00 2001 From: Vojtech Mares Date: Tue, 3 Oct 2023 19:24:21 +0200 Subject: [PATCH 03/10] feat(cherry/kube-prometheus-stack): add additionalScrapeConfigs --- clusters/cherry/apps/kube-prometheus-stack.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/clusters/cherry/apps/kube-prometheus-stack.yaml b/clusters/cherry/apps/kube-prometheus-stack.yaml index 8f4db7b..be7b9c0 100644 --- a/clusters/cherry/apps/kube-prometheus-stack.yaml +++ b/clusters/cherry/apps/kube-prometheus-stack.yaml @@ -107,6 +107,16 @@ spec: requests: storage: 50Gi + additionalScrapeConfigs: 
+ - job_name: "node-exporter" + metrics_path: "/metrics" + static_configs: + - targets: + - 172.16.1.1:9100 + - 172.16.1.2:9100 + - maple.vxm.cz:9100 + - alder.vxm.cz:9100 + serviceMonitor: selfMonitor: true From 005fe949d6a61b619122e2a11c511f5f7f4b63ed Mon Sep 17 00:00:00 2001 From: Vojtech Mares Date: Tue, 3 Oct 2023 20:11:58 +0200 Subject: [PATCH 04/10] feat(appsets): add prometheus --- appsets/prometheus.yaml | 101 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 appsets/prometheus.yaml diff --git a/appsets/prometheus.yaml b/appsets/prometheus.yaml new file mode 100644 index 0000000..6fca6bf --- /dev/null +++ b/appsets/prometheus.yaml @@ -0,0 +1,101 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: universal-prometheus + annotations: + argocd.argoproj.io/sync-wave: "2" +spec: + generators: + - list: + elements: + - cluster: lychee + url: https://172.16.152.1:6443 + syncPolicy: + applicationsSync: create-update + preserveResourcesOnDeletion: true + template: + metadata: + name: "{{cluster}}-prometheus" + annotations: + argocd.argoproj.io/sync-wave: "5" + spec: + project: monitoring + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + source: + chart: kube-prometheus-stack + repoURL: https://prometheus-community.github.io/helm-charts + targetRevision: 51.2.0 + helm: + releaseName: prometheus + values: | + fullnameOverride: "prometheus" + + prometheus: + prometheusSpec: + serviceMonitorSelectorNilUsesHelmValues: false + podMonitorSelectorNilUsesHelmValues: false + ruleSelectorNilUsesHelmValues: false + + storageSpec: + volumeClaimTemplate: + spec: + storageClassName: hcloud-volumes + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 20Gi + + ingress: + enabled: true + pathType: Prefix + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - prometheus.{{cluster}}.k8s.vxm.cz + paths: + - / + tls: + - hosts: + 
- prometheus.{{cluster}}.k8s.vxm.cz + secretName: prometheus-ingress-tls + + serviceMonitor: + selfMonitor: true + + grafana: + enabled: false + + alertmanager: + enabled: true + ingress: + enabled: true + ingressClassName: nginx + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - alertmanager.{{cluster}}.k8s.vxm.cz + paths: + - / + pathType: Prefix + tls: + - hosts: + - alertmanager.{{cluster}}.k8s.vxm.cz + secretName: alertmanager-ingress-tls + + alertmanagerSpec: + resources: + limits: + cpu: 250m + memory: 256Mi + requests: + cpu: 100m + memory: 256Mi + + destination: + server: "{{url}}" + namespace: monitoring From 3122d80ee128aa3e73628d84573ff74cdc426da6 Mon Sep 17 00:00:00 2001 From: Vojtech Mares Date: Tue, 3 Oct 2023 20:12:17 +0200 Subject: [PATCH 05/10] feat(appsets): add node-problem-detector --- appsets/node-problem-detector.yaml | 46 ++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 appsets/node-problem-detector.yaml diff --git a/appsets/node-problem-detector.yaml b/appsets/node-problem-detector.yaml new file mode 100644 index 0000000..551f480 --- /dev/null +++ b/appsets/node-problem-detector.yaml @@ -0,0 +1,46 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: universal-node-problem-detector + annotations: + argocd.argoproj.io/sync-wave: "3" +spec: + generators: + - list: + elements: + - cluster: cherry + url: https://kubernetes.default.svc + - cluster: lychee + url: https://172.16.152.1:6443 + syncPolicy: + applicationsSync: sync + template: + metadata: + name: "{{cluster}}-node-problem-detector" + annotations: + argocd.argoproj.io/sync-wave: "5" + spec: + project: monitoring + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + source: + chart: node-problem-detector + repoURL: https://charts.deliveryhero.io/ + targetRevision: 2.3.11 + helm: + releaseName: node-problem-detector + valuesObject: + metrics: + enabled: true + 
serviceMonitor: + enabled: true + prometheusRule: + enabled: true + + destination: + server: "{{url}}" + namespace: monitoring From 88e41eb5cc80c522d318bf15eed3b3bf2e1a2d96 Mon Sep 17 00:00:00 2001 From: Vojtech Mares Date: Tue, 3 Oct 2023 20:16:32 +0200 Subject: [PATCH 06/10] fix(appsets/prometheus): syncOptions: ServerSideApply=true --- appsets/prometheus.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/appsets/prometheus.yaml b/appsets/prometheus.yaml index 6fca6bf..1847549 100644 --- a/appsets/prometheus.yaml +++ b/appsets/prometheus.yaml @@ -26,6 +26,7 @@ spec: selfHeal: true syncOptions: - CreateNamespace=true + - ServerSideApply=true source: chart: kube-prometheus-stack repoURL: https://prometheus-community.github.io/helm-charts From 12525660cd7bcde50cb8afa75d78f85b4dd6518d Mon Sep 17 00:00:00 2001 From: Vojtech Mares Date: Tue, 3 Oct 2023 20:23:16 +0200 Subject: [PATCH 07/10] fix(appsets/prometheus): missing ingressClassName --- appsets/prometheus.yaml | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/appsets/prometheus.yaml b/appsets/prometheus.yaml index 1847549..5fab5a8 100644 --- a/appsets/prometheus.yaml +++ b/appsets/prometheus.yaml @@ -37,6 +37,21 @@ spec: fullnameOverride: "prometheus" prometheus: + ingress: + enabled: true + ingressClassName: nginx + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - prometheus.{{cluster}}.k8s.vxm.cz + paths: + - / + pathType: Prefix + tls: + - hosts: + - prometheus.{{cluster}}.k8s.vxm.cz + secretName: prometheus-ingress-tls + prometheusSpec: serviceMonitorSelectorNilUsesHelmValues: false podMonitorSelectorNilUsesHelmValues: false ruleSelectorNilUsesHelmValues: false storageSpec: volumeClaimTemplate: spec: storageClassName: hcloud-volumes accessModes: ["ReadWriteOnce"] resources: requests: storage: 20Gi - ingress: - enabled: true - pathType: Prefix - annotations: - cert-manager.io/cluster-issuer: letsencrypt-prod - hosts: - prometheus.{{cluster}}.k8s.vxm.cz - paths: - / - tls: - hosts: - prometheus.{{cluster}}.k8s.vxm.cz - secretName: 
prometheus-ingress-tls - serviceMonitor: selfMonitor: true From 4577bacd0f727631b47d261df0996619b6f5f4bd Mon Sep 17 00:00:00 2001 From: Vojtech Mares Date: Tue, 3 Oct 2023 20:34:24 +0200 Subject: [PATCH 08/10] refactor(appsets/node-problem-detector): downgrade to helm chart 2.3.10 --- appsets/node-problem-detector.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appsets/node-problem-detector.yaml b/appsets/node-problem-detector.yaml index 551f480..59a196e 100644 --- a/appsets/node-problem-detector.yaml +++ b/appsets/node-problem-detector.yaml @@ -30,7 +30,7 @@ spec: source: chart: node-problem-detector repoURL: https://charts.deliveryhero.io/ - targetRevision: 2.3.11 + targetRevision: 2.3.10 helm: releaseName: node-problem-detector valuesObject: From 2eed37953308e9e806e24969b0b19bfff2bcd237 Mon Sep 17 00:00:00 2001 From: Vojtech Mares Date: Tue, 3 Oct 2023 20:51:29 +0200 Subject: [PATCH 09/10] feat(cherry/kube-prometheus-stack): federate data from lychee prometheus --- .../cherry/apps/kube-prometheus-stack.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/clusters/cherry/apps/kube-prometheus-stack.yaml b/clusters/cherry/apps/kube-prometheus-stack.yaml index be7b9c0..b3ae6ba 100644 --- a/clusters/cherry/apps/kube-prometheus-stack.yaml +++ b/clusters/cherry/apps/kube-prometheus-stack.yaml @@ -117,6 +117,24 @@ spec: - maple.vxm.cz:9100 - alder.vxm.cz:9100 + # See: https://developers.mattermost.com/blog/cloud-monitoring/ + - job_name: "federate" + scrape_interval: 15s + + honor_labels: true + metrics_path: "/federate" + + params: + 'match[]': + - '{job="prometheus"}' + - '{__name__=~"job:.*"}' + + static_configs: + - targets: + - prometheus.lychee.k8s.vxm.cz. 
+ labels: + clusterID: lychee + serviceMonitor: selfMonitor: true From d410d564f34dc39806d7b97916289ce44fe491e4 Mon Sep 17 00:00:00 2001 From: Vojtech Mares Date: Thu, 25 Jan 2024 17:07:18 +0100 Subject: [PATCH 10/10] wip --- clusters/cherry/apps/blackbox-exporter.yaml | 38 ++++++++++++++++ .../apps/hcloud-cloud-controller-manager.yaml | 26 +++++++++++ clusters/cherry/manifests/vault/ca.yaml | 33 ++++++++++++++ .../cherry/manifests/vault/certificate.yaml | 21 +++++++++ vault-unseal.sh | 44 +++++++++++++++++++ 5 files changed, 162 insertions(+) create mode 100644 clusters/cherry/apps/blackbox-exporter.yaml create mode 100644 clusters/cherry/apps/hcloud-cloud-controller-manager.yaml create mode 100644 clusters/cherry/manifests/vault/ca.yaml create mode 100644 clusters/cherry/manifests/vault/certificate.yaml create mode 100755 vault-unseal.sh diff --git a/clusters/cherry/apps/blackbox-exporter.yaml b/clusters/cherry/apps/blackbox-exporter.yaml new file mode 100644 index 0000000..2d8c782 --- /dev/null +++ b/clusters/cherry/apps/blackbox-exporter.yaml @@ -0,0 +1,38 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: cherry-blackbox-exporter + namespace: argocd + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: monitoring + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + destination: + server: https://kubernetes.default.svc + namespace: monitoring + source: + chart: prometheus-blackbox-exporter + repoURL: https://prometheus-community.github.io/helm-charts + targetRevision: 8.4.0 + helm: + releaseName: blackbox-exporter + valuesObject: + releaseLabel: true + pspEnabled: false + resources: + limits: + cpu: 100m + memory: 300Mi + requests: + cpu: 50m + memory: 50Mi + + serviceMonitor: + selfMonitor: + enabled: true diff --git a/clusters/cherry/apps/hcloud-cloud-controller-manager.yaml b/clusters/cherry/apps/hcloud-cloud-controller-manager.yaml new file mode 100644 index 
0000000..8d4fe38 --- /dev/null +++ b/clusters/cherry/apps/hcloud-cloud-controller-manager.yaml @@ -0,0 +1,26 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: cherry-hcloud-cloud-controller-manager + namespace: argocd +spec: + project: infrastructure + syncPolicy: + automated: + prune: true + selfHeal: true + destination: + server: https://kubernetes.default.svc + namespace: kube-system + source: + chart: hcloud-cloud-controller-manager + repoURL: https://charts.hetzner.cloud + targetRevision: 1.18.0 + helm: + releaseName: hcloud-cloud-controller-manager + valuesObject: + controller: + hcloudToken: + existingSecret: + name: hcloud-token + key: token diff --git a/clusters/cherry/manifests/vault/ca.yaml b/clusters/cherry/manifests/vault/ca.yaml new file mode 100644 index 0000000..d199432 --- /dev/null +++ b/clusters/cherry/manifests/vault/ca.yaml @@ -0,0 +1,33 @@ +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: selfsigned +spec: + selfSigned: {} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: server-selfsigned-ca +spec: + isCA: true + commonName: Vault Server CA + secretName: server-ca + duration: 87660h # 10 years + privateKey: + algorithm: RSA + size: 4096 + issuerRef: + name: selfsigned + kind: Issuer + group: cert-manager.io + additionalOutputFormats: + - type: CombinedPEM +--- +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: server-ca-issuer +spec: + ca: + secretName: server-ca diff --git a/clusters/cherry/manifests/vault/certificate.yaml b/clusters/cherry/manifests/vault/certificate.yaml new file mode 100644 index 0000000..f2eb44c --- /dev/null +++ b/clusters/cherry/manifests/vault/certificate.yaml @@ -0,0 +1,21 @@ +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: server +spec: + secretName: server-tls + duration: 24h + renewBefore: 144m # 10% of 24h + dnsNames: + - vault-0.vault-internal + - vault-1.vault-internal + - vault-2.vault-internal + - 
vault-0.vault-internal.vault + - vault-1.vault-internal.vault + - vault-2.vault-internal.vault + - vault-0.vault-internal.vault.svc + - vault-1.vault-internal.vault.svc + - vault-2.vault-internal.vault.svc + issuerRef: + name: server-ca-issuer + commonName: Server Certificate diff --git a/vault-unseal.sh b/vault-unseal.sh new file mode 100755 index 0000000..48d1321 --- /dev/null +++ b/vault-unseal.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +UNSEAL_KEYS="xxx yyy zzz" + +VAULT_PODS=$(kubectl get pods -n vault -l app.kubernetes.io/name=vault -o jsonpath='{.items[*].metadata.name}') + +echo "" +echo "Unsealing Vault..." +echo "" + +echo "Deleting existing pods..." +echo "" + +kubectl delete pods -n vault $VAULT_PODS >> /dev/null + +echo "Waiting for new pods to become ready..." +echo "" + +kubectl wait --for=condition=ready --timeout=180s pods -n vault -l app.kubernetes.io/name=vault >> /dev/null + +echo "Unsealing Vault pods..." +echo "" + +for pod in $VAULT_PODS +do + echo "Unsealing $pod..." + + for key in $UNSEAL_KEYS + do + kubectl exec -n vault -it $pod -- vault operator unseal $key >> /dev/null + sleep 1 + done + + echo "$pod unsealed!" + echo "" +done + +echo "Vault unsealed!" +echo "" +echo "" +echo "" + + +