@cichy I know this isn't as easy as what you're asking for, but I wrote some terrible python code.
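It relies on health checks being defined as VM tags, or at least on the management agent being detected. Note that the script looks for the tag key spelled `healtcheck` (sic), so the tags must use that exact spelling. For example, in my Terraform code I have these tags on a test Postgres instance and on test nginx instances, respectively: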
# postgres
tags = [
"bootOrder/agent-detect-timeout=45",
"bootOrder/ip=${jsonencode("auto")}",
"bootOrder/healtcheck/tcp=${jsonencode({
"port" : 5432,
})}",
]
# nginx
tags = [
"bootOrder/agent-detect-timeout=45",
"bootOrder/ip=${jsonencode("auto")}",
"bootOrder/healtcheck/http=${jsonencode({
"port" : 80,
"scheme" : "http",
"path" : "/"
})}",
]
Then the actual Python script:
#!/usr/bin/env python3
import urllib3
import json
import os
import sys
import socket
import time
import logging
logging.basicConfig(level=logging.INFO)
# Boot plan: an ordered list of groups, each group being a list of VM UUIDs.
# Groups are processed first to last.
BOOT_ORDER = [
    # Group 1: Postgres
    [
        "55e88cb4-0c50-8384-2149-cf73e40b8c8e",
    ],
    # Group 2: nginx
    [
        "ba620f01-69d1-ddd8-b1d4-c256abe07e05",
        "bbe333bd-380a-1f94-4052-881c763b6177",
    ],
]

# Fallback number of seconds to wait for a VM's management agent when the VM
# carries no "agent-detect-timeout" tag.
DEFAULT_AGENT_DETECT_TIMEOUT_SECONDS = 60
class HealthCheck:
    """Base class for a health check that is retried a bounded number of times.

    Subclasses set ``self.type`` and implement ``run()``, typically as
    ``while self._retry(): <attempt>``.
    """

    def __init__(self, target: str, config: dict) -> None:
        """Store the check target and its settings.

        Args:
            target: Host or IP address the check is run against.
            config: Check-specific settings (read by subclasses).
        """
        self.type = "base"
        self.target = target
        self.config = config
        # Per-attempt timeout, handed by subclasses to their network call.
        self.timeout = 3
        # Maximum number of attempts before the check is declared failed.
        self.retry_max_count = 5
        # Number of attempts granted so far.
        self.retry_cur_count = 0
        # Seconds slept between two consecutive attempts.
        self.retry_sleep = 10

    def run(self) -> bool:
        """Run the check; return True on success, False once retries run out."""
        raise NotImplementedError(f"{type(self).__name__} must implement run()")

    def _retry(self) -> bool:
        """Tell the caller whether another attempt may be made.

        The first call logs the start of the check and returns immediately.
        Later calls sleep ``retry_sleep`` seconds before granting the next
        attempt, or log a warning and return False once the attempt budget is
        used up.

        Returns:
            True if the caller should (re)try, False if it should give up.
        """
        if self.retry_cur_count == 0:
            logging.info("Starting %s healtcheck against %s", self.type, self.target)
            self.retry_cur_count += 1
            return True
        # ">=" rather than "==": with a budget below 1 (or a counter that has
        # overshot) an equality test never fires and the loop never ends.
        if self.retry_cur_count >= self.retry_max_count:
            logging.warning('Failed Healtcheck of type %s for %s', self.type, self.target)
            return False
        time.sleep(self.retry_sleep)
        self.retry_cur_count += 1
        return True
class TCPHealthCheck(HealthCheck):
    """Health check that passes once a TCP connection to the target succeeds.

    Reads the ``port`` key from the check config.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.type = "TCP"

    def run(self):
        """Try to connect to ``target:port`` until it works or retries run out.

        Returns:
            True as soon as a connection is established, False otherwise.

        Raises:
            ValueError: If no target or no port is configured.
        """
        port = self.config.get("port")
        # Fail early and clearly. A None host would otherwise be resolved to
        # the local machine by create_connection and give a false positive.
        if not self.target:
            raise ValueError("TCP healthcheck has no target address")
        if port is None:
            raise ValueError(f"TCP healthcheck for {self.target} has no 'port' configured")
        while self._retry():
            try:
                # create_connection resolves the name and supports both IPv4
                # and IPv6; every failure (refused, timeout, DNS) is an OSError.
                with socket.create_connection((self.target, port), timeout=self.timeout):
                    return True
            except OSError as err:
                logging.debug("TCP connect to %s:%s failed: %s", self.target, port, err)
        return False
class HttpHealthCheck(HealthCheck):
    """Health check that passes once an HTTP GET returns a 2xx status.

    Config keys read: ``scheme`` (default ``"http"``), ``port`` (default 80),
    ``path`` (default ``""``) and ``tls_verification`` (default True).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.type = "HTTP"

    def run(self):
        """GET the configured URL until it answers 2xx or retries run out.

        Returns:
            True on the first 2xx response, False otherwise.
        """
        verify_tls = self.config.get("tls_verification", True)
        scheme = self.config.get("scheme", "http")
        port = self.config.get("port", 80)
        path = self.config.get("path", "").lstrip("/")
        url = f"{scheme}://{self.target}:{port}/{path}"
        # Nothing above changes between attempts, so build the pool once.
        http = urllib3.PoolManager(
            cert_reqs="CERT_REQUIRED" if verify_tls else "CERT_NONE",
        )
        while self._retry():
            try:
                response = http.request("GET", url, timeout=self.timeout)
            except urllib3.exceptions.HTTPError as err:
                # A service that is not up yet surfaces as an exception
                # (refused, timeout, TLS error). Treat it as a failed attempt
                # instead of letting it abort the retry loop.
                logging.debug("HTTP request to %s failed: %s", url, err)
                continue
            if 200 <= response.status < 300:
                return True
        return False
class XoaClient:
    """Small client for the XOA REST API (``/rest/v0``) used by the boot script.

    Boot settings are read from VM tags of the form
    ``bootOrder/<key>=<JSON value>``.
    """

    def __init__(self, base_url: str, token: str) -> None:
        """Prepare the HTTP pool and the authentication cookie.

        Args:
            base_url: Base URL of the XOA instance; a trailing "/" is dropped.
            token: Authentication token, sent as the ``token`` cookie.
        """
        self.base_url = base_url.rstrip("/")
        self.tags_prefix = "bootOrder/"
        self.token = token
        self.http = urllib3.PoolManager()
        self.headers = {
            "Content-Type": "application/json",
            "Cookie": f"token={self.token}",
        }
        # Last response seen per VM URL. It is only written, never read back:
        # callers poll VM state, and serving it from here would return stale data.
        self._vm_cache = {}

    def vm_ip(self, uuid):
        """Return the address to health-check for the VM.

        Uses the ``ip`` tag when it is set to something other than ``"auto"``,
        otherwise the VM's ``mainIpAddress`` (None when the API reports none).
        """
        vm = self._get_vm(uuid)  # one fetch serves both the tags and the address
        ip = self._parse_tags(vm, uuid).get("ip", "auto")
        if ip != "auto":
            return ip
        return vm.get("mainIpAddress")

    def vm_healthcheck(self, uuid):
        """Return the ``(tcp, http)`` healthcheck configs; each is None if unset."""
        vm_tags = self._extract_vm_tags(uuid)
        return vm_tags.get("healtcheck/tcp"), vm_tags.get("healtcheck/http")

    def _get_vm(self, uuid: str):
        """Fetch the VM object from the API and return the decoded JSON."""
        url = f"{self.base_url}/rest/v0/vms/{uuid}"
        response = self.http.request("GET", url, headers=self.headers)
        result = self._handle_json_response(response)
        self._vm_cache[url] = result
        return result

    def _extract_vm_tags(self, uuid: str) -> dict:
        """Fetch the VM and return its prefixed tags as ``{key: decoded value}``."""
        return self._parse_tags(self._get_vm(uuid), uuid)

    def _parse_tags(self, vm: dict, uuid: str) -> dict:
        """Decode the ``bootOrder/`` tags of an already fetched VM object.

        Tags without the prefix are ignored. Prefixed tags without an
        ``=value`` part carry no setting and are skipped with a warning.

        Raises:
            ValueError: If a prefixed tag's value is not valid JSON.
        """
        parsed = {}
        for tag in vm.get("tags") or []:  # tolerate a missing or null tag list
            if not tag.startswith(self.tags_prefix):
                continue
            key, sep, raw = tag.partition("=")
            if not sep:
                logging.warning("Ignoring tag %r on VM %s: it has no '=value' part", tag, uuid)
                continue
            try:
                parsed[key[len(self.tags_prefix):]] = json.loads(raw)
            except json.JSONDecodeError as err:
                raise ValueError(
                    f"Tag {tag!r} on VM {uuid} does not hold a valid JSON value"
                ) from err
        return parsed

    def start_vm(self, uuid: str):
        """Start the VM unless it is already running.

        Raises:
            Exception: If the start action does not answer with HTTP 204.
        """
        if self._get_vm(uuid).get("power_state") == "Running":
            return
        url = f"{self.base_url}/rest/v0/vms/{uuid}/actions/start?sync=true"
        response = self.http.request("POST", url, headers=self.headers)
        if response.status != 204:
            raise Exception(f"HTTP {response.status}: {response.data.decode('utf-8')}")

    def management_agent_detected(self, uuid: str) -> bool:
        """Return whether the API currently reports ``managementAgentDetected``."""
        return bool(self._get_vm(uuid).get("managementAgentDetected"))

    def vm_agent_detection_timeout(self, uuid: str, default_seconds: int = 60) -> int:
        """Return the VM's ``agent-detect-timeout`` tag, or the default, in seconds.

        The value is coerced to ``int`` because callers feed it to ``range()``.
        """
        tags = self._extract_vm_tags(uuid)
        return int(tags.get("agent-detect-timeout", default_seconds))

    def _handle_json_response(self, response):
        """Decode a 2xx response body as JSON; raise for any other status."""
        if 200 <= response.status < 300:
            return json.loads(response.data.decode("utf-8"))
        raise Exception(f"HTTP {response.status}: {response.data.decode('utf-8')}")
def _wait_for_agent(client, uuid):
    """Poll once per second until the VM's management agent is detected.

    Raises:
        Exception: If the agent is not detected within the VM's timeout.
    """
    timeout = client.vm_agent_detection_timeout(
        uuid=uuid,
        default_seconds=DEFAULT_AGENT_DETECT_TIMEOUT_SECONDS,
    )
    # int(): the timeout comes from a JSON-decoded tag and feeds range().
    for _ in range(int(timeout)):
        if client.management_agent_detected(uuid):
            return
        time.sleep(1)
    raise Exception(f"No management agent detected in host {uuid}")


def _run_healthchecks(client, uuid):
    """Run the VM's configured TCP/HTTP healthchecks; raise if one fails."""
    target = client.vm_ip(uuid)
    tcp, http = client.vm_healthcheck(uuid)
    checks = []
    if tcp:
        checks.append(TCPHealthCheck(target=target, config=tcp))
    if http:
        checks.append(HttpHealthCheck(target=target, config=http))
    # An address is only needed when there is something to check.
    if checks and not target:
        raise Exception(f"No IP address known for host {uuid}")
    for hc in checks:
        # run() reports failure through its return value. Stop here rather
        # than booting later groups on top of an unhealthy dependency.
        if not hc.run():
            raise Exception(f"{hc.type} healthcheck failed for host {uuid} ({target})")
    logging.info("All healthchecks passed for %s", target)


def main():
    """Boot every group of BOOT_ORDER in sequence, one VM at a time.

    Reads the XOA_URL and XOA_TOKEN environment variables and exits with
    status 1 when either is missing.
    """
    xoa_url = os.getenv("XOA_URL")
    xoa_token = os.getenv("XOA_TOKEN")
    if not xoa_url:
        logging.fatal("Missing XOA_URL environment variable")
        sys.exit(1)
    if not xoa_token:
        logging.fatal("Missing XOA_TOKEN environment variable")
        sys.exit(1)
    client = XoaClient(xoa_url, xoa_token)
    for group_number, boot_group in enumerate(BOOT_ORDER, start=1):
        logging.info("Starting to boot group %s, length %s", group_number, len(boot_group))
        # These should be booted in parallel, but aren't
        for uuid in boot_group:
            client.start_vm(uuid)
            _wait_for_agent(client, uuid)
            _run_healthchecks(client, uuid)


if __name__ == "__main__":
    main()
It'll boot each VM in order and wait for its agent to be detected, then wait for all its health checks to pass before moving on to the next VM.
This is by no means production-ready code, but it might be a decent solution.
Finally, a systemd timer can be set up on the XOA instance to run this script automatically on boot.