main.yml 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179
  1. ---
  # some data sanity checks: stop early, with a readable message, when one of
  # the variables this file relies on is missing.
  - name: make sure cluster is defined
    assert:
      that:
        - cluster is defined
      fail_msg: "ERROR: Variable cluster is not defined, but is required."
      success_msg: "OK, cluster is defined - federating {{ cluster }}"
  # The three clauses are listed separately (they are still all required) so a
  # failure reports the exact clause that did not hold.
  - name: make sure clusters is defined and contains the selected cluster
    assert:
      that:
        - clusters is defined
        - (clusters.keys() | length) > 0
        - clusters[cluster] is defined
      fail_msg: "ERROR: Variable clusters is not defined or is missing cluster {{ cluster }}, but is required."
      success_msg: "OK, clusters are defined and cluster is found."
  - name: make sure api_ep is defined
    assert:
      that:
        - api_ep is defined
      fail_msg: "ERROR: Variable api_ep is not defined, but is required."
      success_msg: "OK, api_ep is defined."
  - name: make sure api_token is defined
    assert:
      that:
        - api_token is defined
      fail_msg: "ERROR: Variable api_token is not defined, but is required."
      success_msg: "OK, api_token is defined."
  # is there anything to do?
  # Fetch the cluster list from central's /v1/clusters endpoint and keep the
  # response in cluster_query for the checks that follow.
  - name: check for cluster definitions in central
    register: cluster_query
    uri:
      url: "https://{{ api_ep }}/v1/clusters"
      method: GET
      headers:
        Accept: "application/json"
        Authorization: "Bearer {{ api_token }}"
      return_content: true
      # NOTE(review): certificate verification is switched off for this call.
      validate_certs: false
  # Default the flag to false, then flip it only when central's answer already
  # lists this cluster with a status that carries a sensorVersion.
  - name: assume cluster isn't found in the result
    set_fact:
      cluster_found: false
  - name: unless found
    when:
      - (cluster_query.json.clusters | length) > 0
      - (cluster_query.json.clusters | items2dict(key_name='name', value_name='status'))[clusters[cluster].name] is defined
      - ((cluster_query.json.clusters | items2dict(key_name='name', value_name='status'))[clusters[cluster].name]).sensorVersion is defined
      # (the sensorVersion check is there because roxctl creates a cluster
      # record but leaves its status at null until services check in)
    set_fact:
      cluster_found: true
  # step 1: we could have lots of fun (authentication in place)
  # Only the 'operator' and 'helm' methods pull in init-bundles.yml, and only
  # when the cluster was not already found in central.
  # no init bundles for method 'roxctl'
  - include_tasks: init-bundles.yml
    when:
      - clusters[cluster].method in ['operator', 'helm']
      - not cluster_found
  # step 2: there's so much we can do (not really, just make sure artifacts are either present or created)
  # The task file is chosen by the cluster's method (<method>.yml) and is
  # skipped when the cluster was already found in central.
  - include_tasks: "{{ clusters[cluster].method }}.yml"
    when: not cluster_found
  # step 3: there is just you and me (wait for pods to pop up)
  # Query the target cluster (via its per-cluster kubeconfig) until at least
  # one pod labelled app=sensor exists in the cluster's namespace; up to 30
  # attempts, 5 seconds apart.
  - name: wait for sensor to show up
    kubernetes.core.k8s_info:
      kubeconfig: "{{ ansible_facts['user_dir'] }}/kubeconfig-{{ cluster }}"
      # NOTE(review): certificate verification is switched off for this call.
      validate_certs: false
      api_version: v1
      kind: Pod
      namespace: "{{ clusters[cluster].namespace }}"
      label_selectors:
        - app=sensor
    register: sensor_pod
    until:
      # guard first: the result may not carry "resources" on a failed attempt
      - sensor_pod.resources is defined
      - (sensor_pod.resources | length) > 0
    retries: 30
    delay: 5
  # Query the target cluster until at least one pod labelled
  # app=admission-control exists in the cluster's namespace; up to 30 attempts,
  # 5 seconds apart.
  - name: wait for admission-control to show up
    kubernetes.core.k8s_info:
      kubeconfig: "{{ ansible_facts['user_dir'] }}/kubeconfig-{{ cluster }}"
      # NOTE(review): certificate verification is switched off for this call.
      validate_certs: false
      api_version: v1
      kind: Pod
      namespace: "{{ clusters[cluster].namespace }}"
      label_selectors:
        - app=admission-control
    register: admctl_pod
    until:
      # guard first: the result may not carry "resources" on a failed attempt
      - admctl_pod.resources is defined
      - (admctl_pod.resources | length) > 0
    retries: 30
    delay: 5
  # Query the target cluster until at least one pod labelled app=collector
  # exists in the cluster's namespace; up to 30 attempts, 5 seconds apart.
  - name: wait for collector to show up
    kubernetes.core.k8s_info:
      kubeconfig: "{{ ansible_facts['user_dir'] }}/kubeconfig-{{ cluster }}"
      # NOTE(review): certificate verification is switched off for this call.
      validate_certs: false
      api_version: v1
      kind: Pod
      namespace: "{{ clusters[cluster].namespace }}"
      label_selectors:
        - app=collector
    register: collect_pod
    until:
      # guard first: the result may not carry "resources" on a failed attempt
      - collect_pod.resources is defined
      - (collect_pod.resources | length) > 0
    retries: 30
    delay: 5
  # step 4: i can give you more (any sort of corrections needed? pending pods?)
  # Collect every pod in the cluster's namespace whose phase is Pending; the
  # two "fix pending ..." tasks below key off this list.
  - name: any pending pods?
    kubernetes.core.k8s_info:
      kubeconfig: "{{ ansible_facts['user_dir'] }}/kubeconfig-{{ cluster }}"
      # NOTE(review): certificate verification is switched off for this call.
      validate_certs: false
      api_version: v1
      kind: Pod
      namespace: "{{ clusters[cluster].namespace }}"
      field_selectors:
        - status.phase=Pending
    register: pending_pods
  # Replace the CPU request of the first container in the sensor deployment
  # with 750m when a sensor pod is stuck in Pending.
  - name: fix pending sensor by decreasing requests
    kubernetes.core.k8s_json_patch:
      kubeconfig: "{{ ansible_facts['user_dir'] }}/kubeconfig-{{ cluster }}"
      # NOTE(review): certificate verification is switched off for this call.
      validate_certs: false
      api_version: apps/v1
      kind: Deployment
      name: sensor
      namespace: "{{ clusters[cluster].namespace }}"
      patch:
        - op: replace
          path: /spec/template/spec/containers/0/resources/requests/cpu
          value: 750m
    when:
      # Look at every pending pod, not just the first one in the list, and
      # skip pods that carry no "app" label at all.
      - >-
        pending_pods.resources
        | selectattr('metadata.labels.app', 'defined')
        | selectattr('metadata.labels.app', 'equalto', 'sensor')
        | list | length > 0
  # When a collector pod is stuck in Pending, delete the listed operator
  # deployments from the target cluster.
  # NOTE(review): presumably this frees node resources so the collectors can
  # be scheduled - confirm that removing these deployments is acceptable.
  - name: fix pending collectors by deleting random operators
    kubernetes.core.k8s:
      kubeconfig: "{{ ansible_facts['user_dir'] }}/kubeconfig-{{ cluster }}"
      # NOTE(review): certificate verification is switched off for this call.
      validate_certs: false
      api_version: apps/v1
      kind: Deployment
      name: "{{ item.name }}"
      namespace: "{{ item.namespace }}"
      state: absent
    loop:
      - name: cluster-autoscaler-operator
        namespace: openshift-machine-api
      - name: cluster-baremetal-operator
        namespace: openshift-machine-api
      - name: csi-snapshot-controller-operator
        namespace: openshift-cluster-storage-operator
      - name: csi-snapshot-controller
        namespace: openshift-cluster-storage-operator
    when:
      # Look at every pending pod, not just the first one in the list, and
      # skip pods that carry no "app" label at all.
      - >-
        pending_pods.resources
        | selectattr('metadata.labels.app', 'defined')
        | selectattr('metadata.labels.app', 'equalto', 'collector')
        | list | length > 0
  # step 5: don't you know the time has arrived (just recheck the cluster in central - it should be healthy)
  # Poll central's /v1/clusters until this cluster is listed with a status
  # that carries a sensorVersion; up to 30 attempts, 5 seconds apart.
  - name: check that the cluster is marked as discovered
    uri:
      method: GET
      return_content: true
      # NOTE(review): certificate verification is switched off for this call.
      validate_certs: false
      url: "https://{{ api_ep }}/v1/clusters"
      headers:
        Authorization: "Bearer {{ api_token }}"
        Accept: application/json
    register: cluster_query_fin
    until:
      # Guard first, as the k8s waits do: an attempt that yields no parsed
      # JSON body (or no "clusters" key) should count as "not yet" and be
      # retried, not trip over an undefined value in the checks below.
      - cluster_query_fin.json is defined
      - cluster_query_fin.json.clusters is defined
      - cluster_query_fin.json.clusters | length > 0
      - (cluster_query_fin.json.clusters | items2dict(key_name='name', value_name='status'))[clusters[cluster].name] is defined
      - ((cluster_query_fin.json.clusters | items2dict(key_name='name', value_name='status'))[clusters[cluster].name]).sensorVersion is defined
    retries: 30
    delay: 5
  163. ...