Hi,
I tried running a Rolling Pool Update on my servers today. The first host was updated and rebooted correctly, but the process got stuck on the second host.
After investigating, I found that one of my VMs was not being migrated off the host, which is why the update process failed.
The initial Rolling Pool Update error was this:
pool.rollingUpdate
{
"pool": "fe688bb2-b9ac-db7b-737a-cc457195f095"
}
{
"code": "VM_SUSPEND_TIMEOUT",
"params": [
"OpaqueRef:5c1818ff-cb37-4103-993a-4b80fa8c8231",
"1200."
],
"task": {
"uuid": "4547170a-ec23-4ad7-128c-69958985de34
",
"name_label": "Async.host.evacuate",
"name_description": "",
"allowed_operations": [],
"current_operations": {},
"created": "20250210T12:37:20Z",
"finished": "20250210T12:58:42Z",
"status": "failure",
"resident_on": "OpaqueRef:010eebba-be27-489f-9f87-d06c8b675f19",
"progress": 1,
"type": "<none/>",
"result": "",
"error_info": [
"VM_SUSPEND_TIMEOUT",
"OpaqueRef:5c1818ff-cb37-4103-993a-4b80fa8c8231",
"1200."
],
"other_config": {},
"subtask_of": "OpaqueRef:NULL",
"subtasks": [],
"backtrace": "(((process xapi)(filename ocaml/xapi-client/client.ml)(line 7))((process xapi)(filename ocaml/xapi-client/client.ml)(line 19))((process xapi)(filename ocaml/xapi-client/client.ml)(line 6172))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 24))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 35))((process xapi)(filename ocaml/xapi/xapi_host.ml)(line 612))((process xapi)(filename ocaml/xapi/xapi_host.ml)(line 621))((process xapi)(filename hashtbl.ml)(line 266))((process xapi)(filename hashtbl.ml)(line 272))((process xapi)(filename hashtbl.ml)(line 277))((process xapi)(filename ocaml/xapi/xapi_host.ml)(line 629))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 24))((process xapi)(filename ocaml/xapi/rbac.ml)(line 205))((process xapi)(filename ocaml/xapi/server_helpers.ml)(line 95)))"
},
"message": "VM_SUSPEND_TIMEOUT(OpaqueRef:5c1818ff-cb37-4103-993a-4b80fa8c8231, 1200.)",
"name": "XapiError",
"stack": "XapiError: VM_SUSPEND_TIMEOUT(OpaqueRef:5c1818ff-cb37-4103-993a-4b80fa8c8231, 1200.)
at Function.wrap (file:///opt/xo/xo-builds/xen-orchestra-202410070635/packages/xen-api/_XapiError.mjs:16:12)
at default (file:///opt/xo/xo-builds/xen-orchestra-202410070635/packages/xen-api/_getTaskResult.mjs:13:29)
at Xapi._addRecordToCache (file:///opt/xo/xo-builds/xen-orchestra-202410070635/packages/xen-api/index.mjs:1041:24)
at file:///opt/xo/xo-builds/xen-orchestra-202410070635/packages/xen-api/index.mjs:1075:14
at Array.forEach (<anonymous>)
at Xapi._processEvents (file:///opt/xo/xo-builds/xen-orchestra-202410070635/packages/xen-api/index.mjs:1065:12)
at Xapi._watchEvents (file:///opt/xo/xo-builds/xen-orchestra-202410070635/packages/xen-api/index.mjs:1238:14)"
}
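Judging by the error, the failing step is host.evacuate suspending the VM, with the "1200." parameter presumably being a 1200-second timeout. What I was planning to try next to isolate it (just a sketch, using the VM UUID from the vm.migrate call below, assuming that's the same VM the evacuate got stuck on) is suspending the VM directly from the pool master:

# On the pool master, try the suspend on its own:
xe vm-suspend uuid=0c012493-da75-832d-3f3a-cadce8afb757
# If it hangs, watch the pending task from another shell:
xe task-list
# Resume afterwards (or after cancelling the task):
xe vm-resume uuid=0c012493-da75-832d-3f3a-cadce8afb757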
I then tried migrating the VM manually, which also failed:
vm.migrate
{
"vm": "0c012493-da75-832d-3f3a-cadce8afb757",
"migrationNetwork": "1f6f4495-1045-6fe2-3da6-4e43862e623d",
"sr": "6b24cd1c-22ad-0994-5b6b-a75389a6ddba",
"targetHost": "48bf1075-066f-4ed1-ba54-a350da4a426c"
}
{
"code": "INTERNAL_ERROR",
"params": [
"Storage_error ([S(Internal_error);S(Xmlrpc_client.Connection_reset)])"
],
"task": {
"uuid": "92d36644-b919-fdf3-e879-475be5b4d8c9",
"name_label": "Async.VM.migrate_send",
"name_description": "",
"allowed_operations": [],
"current_operations": {},
"created": "20250210T13:40:48Z",
"finished": "20250210T13:40:54Z",
"status": "failure",
"resident_on": "OpaqueRef:010eebba-be27-489f-9f87-d06c8b675f19",
"progress": 1,
"type": "<none/>",
"result": "",
"error_info": [
"INTERNAL_ERROR",
"Storage_error ([S(Internal_error);S(Xmlrpc_client.Connection_reset)])"
],
"other_config": {},
"subtask_of": "OpaqueRef:NULL",
"subtasks": [],
"backtrace": "(((process xapi)(filename ocaml/xapi/helpers.ml)(line 1690))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 24))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 35))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 24))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 35))((process xapi)(filename ocaml/xapi/message_forwarding.ml)(line 134))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 24))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 35))((process xapi)(filename lib/xapi-stdext-pervasives/pervasiveext.ml)(line 24))((process xapi)(filename ocaml/xapi/rbac.ml)(line 205))((process xapi)(filename ocaml/xapi/server_helpers.ml)(line 95)))"
},
"message": "INTERNAL_ERROR(Storage_error ([S(Internal_error);S(Xmlrpc_client.Connection_reset)]))",
"name": "XapiError",
"stack": "XapiError: INTERNAL_ERROR(Storage_error ([S(Internal_error);S(Xmlrpc_client.Connection_reset)]))
at Function.wrap (file:///opt/xo/xo-builds/xen-orchestra-202410070635/packages/xen-api/_XapiError.mjs:16:12)
at default (file:///opt/xo/xo-builds/xen-orchestra-202410070635/packages/xen-api/_getTaskResult.mjs:13:29)
at Xapi._addRecordToCache (file:///opt/xo/xo-builds/xen-orchestra-202410070635/packages/xen-api/index.mjs:1041:24)
at file:///opt/xo/xo-builds/xen-orchestra-202410070635/packages/xen-api/index.mjs:1075:14
at Array.forEach (<anonymous>)
at Xapi._processEvents (file:///opt/xo/xo-builds/xen-orchestra-202410070635/packages/xen-api/index.mjs:1065:12)
at Xapi._watchEvents (file:///opt/xo/xo-builds/xen-orchestra-202410070635/packages/xen-api/index.mjs:1238:14)"
}
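The Storage_error / Connection_reset part makes it look like the storage side of migrate_send dropped, not the VM itself. If anyone wants more detail, this is what I can pull from the hosts around the failure time (13:40 UTC), assuming the standard XCP-ng log locations:

# On both the source and destination host:
# storage manager log
grep -i connection_reset /var/log/SMlog
# xapi log, around the Async.VM.migrate_send task
grep -i migrate_send /var/log/xensource.log
# Both files rotate, so older entries may be in SMlog.1 / xensource.log.1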
I also tried a warm migration; that job just ended up "Interrupted", and I found no log for it.
After a little digging, I suspect the issue is actually in the VM being migrated; I found this in its kern.log:
Feb 10 13:58:34 lapio kernel: [9681908.582545] Freezing user space processes ...
Feb 10 13:58:34 lapio kernel: [9681928.590134] Freezing of tasks failed after 20.007 seconds (1 tasks refusing to freeze, wq_busy=0):
Feb 10 13:58:34 lapio kernel: [9681928.590463] tar D ffff880009a7b978 0 26564 1 0x00000006
Feb 10 13:58:34 lapio kernel: [9681928.590469] ffff880009a7b978 ffff880120d0a800 ffff88020444c740 ffff8802012f3900
Feb 10 13:58:34 lapio kernel: [9681928.590473] ffff880009a7c000 ffff880205f162c0 7fffffffffffffff ffffffff81867910
Feb 10 13:58:34 lapio kernel: [9681928.590477] ffff880009a7bab8 ffff880009a7b990 ffffffff81867065 0000000000000000
Feb 10 13:58:34 lapio kernel: [9681928.590481] Call Trace:
Feb 10 13:58:34 lapio kernel: [9681928.590494] [<ffffffff81867910>] ? bit_wait+0x60/0x60
Feb 10 13:58:34 lapio kernel: [9681928.590499] [<ffffffff81867065>] schedule+0x35/0x80
Feb 10 13:58:34 lapio kernel: [9681928.590503] [<ffffffff8186a7d8>] schedule_timeout+0x208/0x280
Feb 10 13:58:34 lapio kernel: [9681928.590528] [<ffffffffc050ea49>] ? cifsFileInfo_put+0xa9/0x3f0 [cifs]
Feb 10 13:58:34 lapio kernel: [9681928.590534] [<ffffffff81023d95>] ? xen_clocksource_get_cycles+0x15/0x20
Feb 10 13:58:34 lapio kernel: [9681928.590538] [<ffffffff81867910>] ? bit_wait+0x60/0x60
Feb 10 13:58:34 lapio kernel: [9681928.590542] [<ffffffff818667b4>] io_schedule_timeout+0xa4/0x110
Feb 10 13:58:34 lapio kernel: [9681928.590546] [<ffffffff8186792b>] bit_wait_io+0x1b/0x70
Feb 10 13:58:34 lapio kernel: [9681928.590550] [<ffffffff818674bf>] __wait_on_bit+0x5f/0x90
Feb 10 13:58:34 lapio kernel: [9681928.590555] [<ffffffff8119857b>] wait_on_page_bit+0xcb/0xf0
Feb 10 13:58:34 lapio kernel: [9681928.590561] [<ffffffff810cad70>] ? autoremove_wake_function+0x40/0x40
Feb 10 13:58:34 lapio kernel: [9681928.590565] [<ffffffff81198693>] __filemap_fdatawait_range+0xf3/0x160
Feb 10 13:58:34 lapio kernel: [9681928.590569] [<ffffffff81198714>] filemap_fdatawait_range+0x14/0x30
Feb 10 13:58:34 lapio kernel: [9681928.590573] [<ffffffff8119a63a>] filemap_write_and_wait+0x6a/0x70
Feb 10 13:58:34 lapio kernel: [9681928.590584] [<ffffffffc0512b43>] cifs_flush+0x43/0x90 [cifs]
Feb 10 13:58:34 lapio kernel: [9681928.590589] [<ffffffff81219972>] filp_close+0x32/0x80
Feb 10 13:58:34 lapio kernel: [9681928.590594] [<ffffffff8123b3f5>] put_files_struct+0x75/0xd0
Feb 10 13:58:34 lapio kernel: [9681928.590598] [<ffffffff8123b4f7>] exit_files+0x47/0x50
Feb 10 13:58:34 lapio kernel: [9681928.590603] [<ffffffff810885ae>] do_exit+0x2ae/0xb90
Feb 10 13:58:34 lapio kernel: [9681928.590607] [<ffffffff811989fb>] ? __lock_page_killable+0xbb/0xe0
Feb 10 13:58:34 lapio kernel: [9681928.590611] [<ffffffff81088f17>] do_group_exit+0x47/0xb0
Feb 10 13:58:34 lapio kernel: [9681928.590616] [<ffffffff810957e1>] get_signal+0x171/0x950
Feb 10 13:58:34 lapio kernel: [9681928.590621] [<ffffffff8102e467>] do_signal+0x37/0x6f0
Feb 10 13:58:34 lapio kernel: [9681928.590631] [<ffffffffc0513577>] ? cifs_strict_readv+0xa7/0x100 [cifs]
Feb 10 13:58:34 lapio kernel: [9681928.590636] [<ffffffff810034fc>] exit_to_usermode_loop+0x8c/0xd0
Feb 10 13:58:34 lapio kernel: [9681928.590640] [<ffffffff81003c9a>] syscall_return_slowpath+0x5a/0x60
Feb 10 13:58:34 lapio kernel: [9681928.590644] [<ffffffff8186bd18>] int_ret_from_sys_call+0x25/0xa3
Feb 10 13:58:34 lapio kernel: [9681928.590668]
Feb 10 13:58:34 lapio kernel: [9681928.590679] Restarting tasks ... done.
Feb 10 13:58:34 lapio kernel: [9681928.602642] xen:manage: do_suspend: freeze processes failed -16
I see CIFS mentioned a couple of times in the VM's kern.log: the trace shows a tar process (PID 26564, state D) stuck flushing a CIFS file while exiting, and the freezer gives up with -16 (-EBUSY). Could CIFS mounts on the VM be preventing the freeze, and therefore the suspend? I haven't had this issue with this particular VM before, though.
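In case it helps anyone with the same symptom, my current plan is to check and unmount the CIFS shares inside the guest before retrying the migration. A rough sketch (the mount point is just an example, not my actual share):

# Inside the guest: list active CIFS mounts
mount -t cifs
# Is the stuck tar (PID 26564 in the trace above) still in D state?
ps -eo pid,stat,comm | grep tar
# Unmount the share(s) before migrating; lazy unmount as a last resort
umount /mnt/share          # example path, adjust to the actual mount
umount -l /mnt/share       # if a process still holds files open

If the tar process really is stuck in uninterruptible sleep on CIFS I/O, I assume even a lazy unmount may not release it without rebooting the guest.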