Bladeren bron

simple role to add storage to monitoring and optionally enable uwm (with storage as well)

Grega Bremec 2 dagen geleden
bovenliggende
commit
2677d5bcdc

+ 20 - 0
p0f/operators/roles/cluster-monitoring/defaults/main.yml

@@ -0,0 +1,20 @@
+---
+# Role defaults for cluster-monitoring. Each of these is expected to be
+# overridden by the caller whenever the stock value does not fit.
+
+# Administrator kubeconfig handed to the kubernetes.core modules.
+kubeadmin_config: "tmp/kubeconfig-ocp4"
+
+# Switch for user workload monitoring (UWM) and its configuration.
+enable_user_workload: true
+
+# Platform monitoring stack: Prometheus (prometheusK8s).
+main_prom_pvc: "100Gi"
+main_prom_sc: "odf-cluster-ceph-rbd"
+main_prom_retain_time: "21d"
+main_prom_retain_size: "80GiB"
+
+# Platform monitoring stack: Alertmanager (alertmanagerMain).
+main_alrt_pvc: "10Gi"
+main_alrt_sc: "odf-cluster-ceph-rbd"
+
+# User workload monitoring: Prometheus.
+user_prom_pvc: "40Gi"
+user_prom_sc: "odf-cluster-ceph-rbd"
+user_prom_retain_time: "21d"
+user_prom_retain_size: "30GiB"
+
+# User workload monitoring: Alertmanager.
+user_alrt_pvc: "10Gi"
+user_alrt_sc: "odf-cluster-ceph-rbd"
+
+# User workload monitoring: Thanos Ruler.
+user_thanos_pvc: "10Gi"
+user_thanos_sc: "odf-cluster-ceph-rbd"
+user_thanos_retain_time: "21d"
+...

+ 85 - 0
p0f/operators/roles/cluster-monitoring/tasks/main.yml

@@ -0,0 +1,85 @@
+---
+# Configure cluster monitoring:
+#  - storage for Prometheus and AlertManager
+#  - retention settings
+#  - UWM
+#  - storage for Prometheus, AlertManager, and ThanosRuler
+#  - retention settings
+#  - user RBAC to allow them to query and view metrics (cluster-monitoring-view) TODO
+#
+# Required variables:
+#
+#   NONE
+#
+# Optional variables:
+#
+#   kubeadmin_config          the administrator kubeconfig file (tmp/kubeconfig-ocp4)
+#
+#   enable_user_workload      defaults to true
+#   main_prom_pvc             prometheusK8s PVC size (100Gi)
+#   main_prom_sc              prometheusK8s PVC storage class (odf-cluster-ceph-rbd)
+#   main_prom_retain_time     system metric retention time (21d)
+#   main_prom_retain_size     system metric retention size in GiB (80GiB)
+#   main_alrt_pvc             alertmanagerMain PVC size (10Gi)
+#   main_alrt_sc              alertmanagerMain PVC storage class (odf-cluster-ceph-rbd)
+#   user_prom_pvc             prometheus PVC size (40Gi)
+#   user_prom_sc              prometheus PVC storage class (odf-cluster-ceph-rbd)
+#   user_prom_retain_time     user metric retention time (21d)
+#   user_prom_retain_size     user metric retention size in GiB (30GiB)
+#   user_alrt_pvc             alertmanager PVC size (10Gi)
+#   user_alrt_sc              alertmanager PVC storage class (odf-cluster-ceph-rbd)
+#   user_thanos_pvc           thanos ruler PVC size (10Gi)
+#   user_thanos_sc            thanos ruler PVC storage class (odf-cluster-ceph-rbd)
+#   user_thanos_retain_time   thanos ruler retention time (21d)
+#
+# OPTIONAL TODOs:
+#  - nodeSelector
+#  - taints
+#  - tolerations
+#
+# NOTES:
+#  symptoms: disk pressure, https://access.redhat.com/solutions/5341801 and
+#                           https://access.redhat.com/solutions/6738851
+#
+# Create or update the cluster-monitoring-config ConfigMap in the
+# openshift-monitoring namespace. The manifest is rendered from the role
+# template, which fills in the main_prom_* and main_alrt_* variables and adds
+# the enableUserWorkload switch when enable_user_workload is set.
+- name: Apply cluster monitoring configmap
+  kubernetes.core.k8s:
+    kubeconfig: "{{ kubeadmin_config }}"
+    # NOTE(review): TLS verification of the API endpoint is turned off here;
+    # presumably the target clusters use self-signed certificates - confirm.
+    validate_certs: false
+    api_version: v1
+    # Canonical resource kind, matching the kind declared in the template.
+    kind: ConfigMap
+    namespace: openshift-monitoring
+    name: cluster-monitoring-config
+    template: templates/cluster-monitoring.yml.j2
+
+# Configure user workload monitoring (UWM): wait until the operator pod in
+# openshift-user-workload-monitoring reports Ready, then apply the
+# user-workload-monitoring-config ConfigMap rendered from the role template.
+- name: Apply user monitoring settings if required
+  # The bool filter makes the switch work for real booleans as well as for
+  # string values such as "true"/"false" (e.g. when passed as key=value extra
+  # vars), which a comparison against True would silently treat as disabled.
+  when: enable_user_workload | bool
+  block:
+    - name: Wait for UWM operator pod to become ready
+      kubernetes.core.k8s_info:
+        kubeconfig: "{{ kubeadmin_config }}"
+        validate_certs: false
+        api_version: v1
+        kind: Pod
+        namespace: openshift-user-workload-monitoring
+        label_selectors:
+          - app.kubernetes.io/component=controller
+      register: uwm_op_ready
+      # Poll up to 6 times, 5 seconds apart, until exactly one matching pod
+      # exists and its Ready condition is 'True'. A pod that has no Ready
+      # condition (or no conditions at all) yet counts as not ready instead of
+      # raising a templating error, so the retry loop keeps going.
+      until:
+        - uwm_op_ready.resources is defined
+        - uwm_op_ready.resources | length == 1
+        - uwm_op_ready.resources[0].status is defined
+        - >-
+          (uwm_op_ready.resources[0].status
+          | community.general.json_query('conditions[?type==`Ready`].status')
+          | default([], true) | first | default('False')) == 'True'
+      retries: 6
+      delay: 5
+
+    - name: Apply user monitoring configmap if required
+      kubernetes.core.k8s:
+        kubeconfig: "{{ kubeadmin_config }}"
+        validate_certs: false
+        api_version: v1
+        kind: ConfigMap
+        namespace: openshift-user-workload-monitoring
+        name: user-workload-monitoring-config
+        template: templates/user-monitoring.yml.j2
+...

+ 26 - 0
p0f/operators/roles/cluster-monitoring/templates/cluster-monitoring.yml.j2

@@ -0,0 +1,26 @@
+# ConfigMap manifest for the platform monitoring stack, rendered by the
+# "Apply cluster monitoring configmap" task of the cluster-monitoring role.
+# Inputs: enable_user_workload, main_prom_pvc, main_prom_sc,
+# main_prom_retain_time, main_prom_retain_size, main_alrt_pvc, main_alrt_sc.
+# The enable_user_workload switch is passed through the bool filter so that a
+# string value such as "false" does not count as enabled.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: cluster-monitoring-config
+  namespace: openshift-monitoring
+data:
+  config.yaml: |
+{% if enable_user_workload | bool %}
+    enableUserWorkload: true
+{% endif %}
+    prometheusK8s:
+      retention: {{ main_prom_retain_time }}
+      retentionSize: {{ main_prom_retain_size }}
+      volumeClaimTemplate:
+        spec:
+          storageClassName: {{ main_prom_sc }}
+          resources:
+            requests:
+              storage: {{ main_prom_pvc }}
+    alertmanagerMain:
+      volumeClaimTemplate:
+        spec:
+          storageClassName: {{ main_alrt_sc }}
+          resources:
+            requests:
+              storage: {{ main_alrt_pvc }}

+ 33 - 0
p0f/operators/roles/cluster-monitoring/templates/user-monitoring.yml.j2

@@ -0,0 +1,33 @@
+# ConfigMap manifest for user workload monitoring, rendered by the
+# "Apply user monitoring configmap if required" task of the
+# cluster-monitoring role (that task only runs when enable_user_workload is
+# set). Inputs: user_prom_pvc, user_prom_sc, user_prom_retain_time,
+# user_prom_retain_size, user_alrt_pvc, user_alrt_sc, user_thanos_pvc,
+# user_thanos_sc, user_thanos_retain_time.
+# Everything below "config.yaml: |" is a literal block scalar, so its content
+# ends up verbatim as the string value of the config.yaml key.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: user-workload-monitoring-config
+  namespace: openshift-user-workload-monitoring
+data:
+  config.yaml: |
+    prometheus:
+      retention: {{ user_prom_retain_time }}
+      retentionSize: {{ user_prom_retain_size }}
+      volumeClaimTemplate:
+        spec:
+          storageClassName: {{ user_prom_sc }}
+          resources:
+            requests:
+              storage: {{ user_prom_pvc }}
+    alertmanager:
+      enabled: true
+      enableAlertmanagerConfig: true
+      volumeClaimTemplate:
+        spec:
+          storageClassName: {{ user_alrt_sc }}
+          resources:
+            requests:
+              storage: {{ user_alrt_pvc }}
+    thanosRuler:
+      retention: {{ user_thanos_retain_time }}
+      volumeClaimTemplate:
+        spec:
+          storageClassName: {{ user_thanos_sc }}
+          resources:
+            requests:
+              storage: {{ user_thanos_pvc }}