I've run into an error a few times now when attempting to live-migrate VMs within a pool.
The migration seems to start successfully and sends the memory to the target host; however, the process gets stuck at around the 85% mark and never completes. Ultimately it fails with the error: Xenops_interface.Xenopsd_error([S(Failed_to_suspend);[S(121d53f3-71e7-be03-f58d-8eacf7bb55d1);F(1200)]])
The hosts have shared storage so the disk is not being migrated.
The hosts are connected with a 10Gig management network.
The Control domain memory is set to 8GiB (and doesn't seem to be running out).
Everything is up to date: all patches were installed just yesterday, followed by a reboot (this was done manually, since we can't use the rolling pool upgrade due to this issue).
Running Xen Orchestra v5.97.
VMs are running CloudLinux 8 or 9 with Management Agent 7.30.0-11 or 7.30.0-12.
This has happened with multiple different VMs on multiple hosts (in this one pool). The only similarity that I can see is that they all have large disks (over 1TB) - but I can't see how this is relevant given that the disk contents are not being migrated. VMs with a smaller disk seem to work reliably.
My other theory is that something running on the VMs is preventing the suspend operation (e.g. an active NFS connection or some other running process). Is this possible? Can a running process prevent a suspend operation?
{
"id": "0m04goylw",
"properties": {
"method": "vm.migrate",
"params": {
"vm": "121d53f3-71e7-be03-f58d-8eacf7bb55d1",
"migrationNetwork": "a189fcf5-2bc1-a8e6-52fe-c8a88b9c4a57",
"targetHost": "349dbfa5-184b-416c-9fc7-2fe68cff9ac3"
},
"name": "API call: vm.migrate",
"userId": "b03288de-9b3b-4f5f-84f2-a8079a601de7",
"type": "api.call"
},
"start": 1724281513268,
"status": "failure",
"updatedAt": 1724282833944,
"end": 1724282833944,
"result": {
"code": "INTERNAL_ERROR",
"params": [
"Xenops_interface.Xenopsd_error([S(Failed_to_suspend);[S(121d53f3-71e7-be03-f58d-8eacf7bb55d1);F(1200)]])"
],
"task": {
"uuid": "3a3b04f8-2272-e1d5-45a4-f8b7aab31915",
"name_label": "Async.VM.migrate_send",
"name_description": "",
"allowed_operations": [],
"current_operations": {},
"created": "20240821T23:05:13Z",
"finished": "20240821T23:27:13Z",
"status": "failure",
"resident_on": "OpaqueRef:06124cbc-db3e-454f-815b-55744f966d70",
"progress": 1,
"type": "<none/>",
"result": "",
"error_info": [
"INTERNAL_ERROR",
"Xenops_interface.Xenopsd_error([S(Failed_to_suspend);[S(121d53f3-71e7-be03-f58d-8eacf7bb55d1);F(1200)]])"
],
"other_config": {},
"subtask_of": "OpaqueRef:NULL",
"subtasks": [],
"backtrace": "(((process xenopsd-xc)(filename xc/xenops_server_xen.ml)(line 2545))((process xenopsd-xc)(filename xc/domain.ml)(line 1658))((process xenopsd-xc)(filename xc/emu_manager.ml)(line 239))((process xenopsd-xc)(filename xc/emu_manager.ml)(line 244))((process xenopsd-xc)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 24))((process xenopsd-xc)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 35))((process xenopsd-xc)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 24))((process xenopsd-xc)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 35))((process xenopsd-xc)(filename xc/domain.ml)(line 1770))((process xenopsd-xc)(filename xc/domain.ml)(line 1763))((process xenopsd-xc)(filename xc/xenops_server_xen.ml)(line 2536))((process xenopsd-xc)(filename lib/xenops_server.ml)(line 2140))((process xenopsd-xc)(filename list.ml)(line 121))((process xenopsd-xc)(filename lib/xenops_server.ml)(line 2133))((process xenopsd-xc)(filename lib/xenops_server.ml)(line 2458))((process xenopsd-xc)(filename lib/xenops_server.ml)(line 2470))((process xenopsd-xc)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 24))((process xenopsd-xc)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 35))((process xenopsd-xc)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 24))((process xenopsd-xc)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 35))((process xenopsd-xc)(filename stunnel/stunnel.ml)(line 349))((process xenopsd-xc)(filename lib/xenops_server.ml)(line 2393))((process xenopsd-xc)(filename lib/xenops_server.ml)(line 2834))((process xenopsd-xc)(filename lib/xenops_server.ml)(line 2844))((process xenopsd-xc)(filename lib/xenops_server.ml)(line 2864))((process xenopsd-xc)(filename lib/task_server.ml)(line 177))((process xapi)(filename ocaml/xapi/xapi_xenops.ml)(line 3302))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 24))((process xapi)(filename 
lib/xapi-stdext-pervasives/pervasiveext.ml)(line 35))((process xapi)(filename ocaml/xapi/xapi_xenops.ml)(line 3470))((process xapi)(filename ocaml/xapi/xapi_vm_migrate.ml)(line 219))((process xapi)(filename ocaml/xapi/xapi_vm_migrate.ml)(line 225))((process xapi)(filename ocaml/xapi/xapi_vm_migrate.ml)(line 249))((process xapi)(filename ocaml/xapi/xapi_vm_migrate.ml)(line 1433))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 24))((process xapi)(filename ocaml/xapi/xapi_vm_migrate.ml)(line 1573))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 24))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 35))((process xapi)(filename ocaml/xapi/message_forwarding.ml)(line 131))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 24))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 35))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 24))((process xapi)(filename ocaml/xapi/rbac.ml)(line 205))((process xapi)(filename ocaml/xapi/server_helpers.ml)(line 95)))"
},
"message": "INTERNAL_ERROR(Xenops_interface.Xenopsd_error([S(Failed_to_suspend);[S(121d53f3-71e7-be03-f58d-8eacf7bb55d1);F(1200)]]))",
"name": "XapiError",
"stack": "XapiError: INTERNAL_ERROR(Xenops_interface.Xenopsd_error([S(Failed_to_suspend);[S(121d53f3-71e7-be03-f58d-8eacf7bb55d1);F(1200)]]))\n at Function.wrap (file:///opt/xo/xo-builds/xen-orchestra-202408051306/packages/xen-api/_XapiError.mjs:16:12)\n at default (file:///opt/xo/xo-builds/xen-orchestra-202408051306/packages/xen-api/_getTaskResult.mjs:13:29)\n at Xapi._addRecordToCache (file:///opt/xo/xo-builds/xen-orchestra-202408051306/packages/xen-api/index.mjs:1076:24)\n at file:///opt/xo/xo-builds/xen-orchestra-202408051306/packages/xen-api/index.mjs:1110:14\n at Array.forEach (<anonymous>)\n at Xapi._processEvents (file:///opt/xo/xo-builds/xen-orchestra-202408051306/packages/xen-api/index.mjs:1100:12)\n at Xapi._watchEvents (file:///opt/xo/xo-builds/xen-orchestra-202408051306/packages/xen-api/index.mjs:1273:14)"
}
}
Edit: I've spent some time digging through the source code to try to work this one out. I haven't found anything conclusive, but here are my findings so far.
The error is triggered in xenopsd/xc/xenops_server_xen.ml at the following line:
if not (wait_shutdown task vm Suspend 1200.) then
raise (Xenopsd_error (Failed_to_suspend (vm.Vm.id, 1200.)))
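This calls the wait_shutdown function with a timeout of 1200 (presumably seconds, i.e. 20 minutes, which is consistent with the task running for about 22 minutes in the log above). That function looks like this: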
let wait_shutdown task vm reason timeout =
event_wait internal_updates task timeout (function
| Dynamic.Vm id when id = vm.Vm.id ->
debug "EVENT on our VM: %s" id ;
on_domain (fun xc xs _ vm di -> di.Xenctrl.shutdown) task vm
| Dynamic.Vm id ->
debug "EVENT on other VM: %s" id ;
false
| _ ->
debug "OTHER EVENT" ; false
)
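This is basically waiting for events on the target VM and, each time one arrives, reading the Xenctrl.shutdown flag from the VM's domain info (it is a field being checked, not a method being called).
So the issue is that the VM is failing to shut down (i.e. its domain never reports the shutdown state) within the timeout after being instructed to suspend.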