XCP-ng

    S3 backup not retrying after error

    Xen Orchestra
    22 Posts 4 Posters 1.6k Views 4 Watching
    • A Offline
      Andrew Top contributor
      last edited by

      @olivierlambert @julien-f I have been getting nightly 400 errors from Wasabi during S3 delta backups. Looking at the XO code, it seems like it should retry (3 times by default), but it does not and just fails. Using XO from source, commit 5dc99...

          {
            "data": {
              "type": "VM",
              "id": "96c1cc6a-e59c-c48d-69dd-17441b667f18"
            },
            "id": "1662960463806:0",
            "message": "backup VM",
            "start": 1662960463806,
            "status": "failure",
            "tasks": [
              {
                "id": "1662960463910",
                "message": "clean-vm",
                "start": 1662960463910,
                "status": "success",
                "end": 1662960497360,
                "result": {
                  "merge": false
                }
              },
              {
                "id": "1662960517777",
                "message": "snapshot",
                "start": 1662960517777,
                "status": "success",
                "end": 1662960520235,
                "result": "8c7f7a53-dfda-7d40-fdc3-d566b064022b"
              },
              {
                "data": {
                  "id": "db9ad0a8-bce6-4a2b-b9fd-5c4cecf059c4",
                  "isFull": false,
                  "type": "remote"
                },
                "id": "1662960520235:0",
                "message": "export",
                "start": 1662960520235,
                "status": "failure",
                "tasks": [
                  {
                    "id": "1662960520560",
                    "message": "transfer",
                    "start": 1662960520560,
                    "status": "success",
                    "end": 1662960551132,
                    "result": {
                      "size": 667090432
                    }
                  },
                  {
                    "id": "1662960552733",
                    "message": "clean-vm",
                    "start": 1662960552733,
                    "status": "failure",
                    "tasks": [
                      {
                        "id": "1662960580247",
                        "message": "merge",
                        "start": 1662960580247,
                        "status": "failure",
                        "end": 1662960617830,
                        "result": {
                          "name": "InternalError",
                          "$fault": "client",
                          "$metadata": {
                            "httpStatusCode": 400,
                            "extendedRequestId": "ZCIiTVghBw8KS511Q1PuTbYfQsuT/gHfHqqSdKSGJead6PGtBqUgi85dLqhG6FZ7+Yh35E6KrOyQ",
                            "attempts": 1,
                            "totalRetryDelay": 0
                          },
                          "Code": "InternalError",
                          "Detail": "None:UnexpectedError",
                          "RequestId": "2E7A2B6402806D7E",
                          "HostId": "ZCIiTVghBw8KS511Q1PuTbYfQsuT/gHfHqqSdKSGJead6PGtBqUgi85dLqhG6FZ7+Yh35E6KrOyQ",
                          "message": "We encountered an internal error.  Please retry the operation again later.",
                          "chain": [
                            "/xo-vm-backups/96c1cc6a-e59c-c48d-69dd-17441b667f18/vdis/f9405e51-da48-4eb1-980d-99da34359b3c/76857192-9016-491e-8bdc-a4ff3a5c7bfc/20220821T052933Z.alias.vhd",
                            "/xo-vm-backups/96c1cc6a-e59c-c48d-69dd-17441b667f18/vdis/f9405e51-da48-4eb1-980d-99da34359b3c/76857192-9016-491e-8bdc-a4ff3a5c7bfc/20220822T052859Z.alias.vhd"
                          ],
                          "stack": "InternalError: We encountered an internal error.  Please retry the operation again later.\n    at throwDefaultError (/opt/xo/xo-builds/xen-orchestra-202209110933/node_modules/@aws-sdk/smithy-client/dist-cjs/default-error-handler.js:8:22)\n    at deserializeAws_restXmlCopyObjectCommandError (/opt/xo/xo-builds/xen-orchestra-202209110933/node_modules/@aws-sdk/client-s3/dist-cjs/protocols/Aws_restXml.js:3131:51)\n    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n    at async /opt/xo/xo-builds/xen-orchestra-202209110933/node_modules/@aws-sdk/middleware-serde/dist-cjs/deserializerMiddleware.js:7:24\n    at async /opt/xo/xo-builds/xen-orchestra-202209110933/node_modules/@aws-sdk/middleware-signing/dist-cjs/middleware.js:11:20\n    at async StandardRetryStrategy.retry (/opt/xo/xo-builds/xen-orchestra-202209110933/node_modules/@aws-sdk/middleware-retry/dist-cjs/StandardRetryStrategy.js:51:46)\n    at async /opt/xo/xo-builds/xen-orchestra-202209110933/node_modules/@aws-sdk/middleware-logger/dist-cjs/loggerMiddleware.js:6:22\n    at async S3Handler._copy (/opt/xo/xo-builds/xen-orchestra-202209110933/@xen-orchestra/fs/dist/s3.js:184:7)"
                        }
                      }
                    ],
                    "end": 1662960617831,
                    "result": {
                      "name": "InternalError",
                      "$fault": "client",
                      "$metadata": {
                        "httpStatusCode": 400,
                        "extendedRequestId": "ZCIiTVghBw8KS511Q1PuTbYfQsuT/gHfHqqSdKSGJead6PGtBqUgi85dLqhG6FZ7+Yh35E6KrOyQ",
                        "attempts": 1,
                        "totalRetryDelay": 0
                      },
                      "Code": "InternalError",
                      "Detail": "None:UnexpectedError",
                      "RequestId": "2E7A2B6402806D7E",
                      "HostId": "ZCIiTVghBw8KS511Q1PuTbYfQsuT/gHfHqqSdKSGJead6PGtBqUgi85dLqhG6FZ7+Yh35E6KrOyQ",
                      "message": "We encountered an internal error.  Please retry the operation again later.",
                      "chain": [
                        "/xo-vm-backups/96c1cc6a-e59c-c48d-69dd-17441b667f18/vdis/f9405e51-da48-4eb1-980d-99da34359b3c/76857192-9016-491e-8bdc-a4ff3a5c7bfc/20220821T052933Z.alias.vhd",
                        "/xo-vm-backups/96c1cc6a-e59c-c48d-69dd-17441b667f18/vdis/f9405e51-da48-4eb1-980d-99da34359b3c/76857192-9016-491e-8bdc-a4ff3a5c7bfc/20220822T052859Z.alias.vhd"
                      ],
                      "stack": "InternalError: We encountered an internal error.  Please retry the operation again later.\n    at throwDefaultError (/opt/xo/xo-builds/xen-orchestra-202209110933/node_modules/@aws-sdk/smithy-client/dist-cjs/default-error-handler.js:8:22)\n    at deserializeAws_restXmlCopyObjectCommandError (/opt/xo/xo-builds/xen-orchestra-202209110933/node_modules/@aws-sdk/client-s3/dist-cjs/protocols/Aws_restXml.js:3131:51)\n    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n    at async /opt/xo/xo-builds/xen-orchestra-202209110933/node_modules/@aws-sdk/middleware-serde/dist-cjs/deserializerMiddleware.js:7:24\n    at async /opt/xo/xo-builds/xen-orchestra-202209110933/node_modules/@aws-sdk/middleware-signing/dist-cjs/middleware.js:11:20\n    at async StandardRetryStrategy.retry (/opt/xo/xo-builds/xen-orchestra-202209110933/node_modules/@aws-sdk/middleware-retry/dist-cjs/StandardRetryStrategy.js:51:46)\n    at async /opt/xo/xo-builds/xen-orchestra-202209110933/node_modules/@aws-sdk/middleware-logger/dist-cjs/loggerMiddleware.js:6:22\n    at async S3Handler._copy (/opt/xo/xo-builds/xen-orchestra-202209110933/@xen-orchestra/fs/dist/s3.js:184:7)"
                    }
                  }
                ],
                "end": 1662960617834
              }
            ],
            "end": 1662960617834
          },
      
      julien-fJ 1 Reply Last reply Reply Quote 0
      • olivierlambertO Online
        olivierlambert Vates 🪐 Co-Founder CEO
        last edited by

        Adding @florent who is the main S3 dev 🙂

        1 Reply Last reply Reply Quote 0
        • julien-fJ Offline
          julien-f Vates 🪐 Co-Founder XO Team @Andrew
          last edited by

          @Andrew I don't think it makes sense for us to retry on a 400 (Bad Request) error; it's exactly the status code I would expect the client NOT to retry.

          I would be far less opposed to retrying on 429 (Too Many Requests) or 503 (Service Unavailable).

          Do you have any documentation, for the S3 protocol or for your specific backend, regarding this error?

          Also, keep in mind that only AWS S3 is officially supported; other backends are best effort.
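
          For illustration, a minimal sketch (not actual XO code) of the kind of status-based check I have in mind, given the error shape the AWS SDK v3 reports ($metadata.httpStatusCode, visible in the log above):

              // Hypothetical helper: decide whether an S3 error looks retryable
              // from the HTTP status code the AWS SDK v3 exposes on the error.
              const RETRYABLE_STATUS = new Set([429, 500, 502, 503, 504]) // one possible choice

              function isRetryableS3Error (error) {
                const status = error?.$metadata?.httpStatusCode
                // 400 Bad Request is treated as a client error and NOT retried
                return status !== undefined && RETRYABLE_STATUS.has(status)
              }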

          A 1 Reply Last reply Reply Quote 0
          • A Offline
            Andrew Top contributor @julien-f
            last edited by

            @julien-f Here is the Wasabi API guide.

            Is the error text "Please retry the operation again later" generated by the Wasabi S3 server or by the XO software? If it comes from Wasabi, then they should be sending a 503, not a 400.

            julien-fJ 1 Reply Last reply Reply Quote 0
            • julien-fJ Offline
              julien-f Vates 🪐 Co-Founder XO Team @Andrew
              last edited by

              @Andrew It comes from Wasabi.

              A 1 Reply Last reply Reply Quote 0
              • A Offline
                Andrew Top contributor @julien-f
                last edited by

                @julien-f Ticket opened with Wasabi. They are looking into the error.

                julien-fJ 1 Reply Last reply Reply Quote 0
                • julien-fJ Offline
                  julien-f Vates 🪐 Co-Founder XO Team @Andrew
                  last edited by

                  @Andrew Great! Let me know how it goes and if there is something that should be fixed in XO 🙂

                  A 1 Reply Last reply Reply Quote 0
                  • A Offline
                    Andrew Top contributor @julien-f
                    last edited by

                    @julien-f Wasabi says they checked their logs and only sent back 500 errors, not 400 errors:

                    We encountered an internal error. Please retry the operation again later. InternalError 500

                    Any chance of an XO programming error that reassigned the result to 400 somewhere?

                    julien-fJ 1 Reply Last reply Reply Quote 0
                    • julien-fJ Offline
                      julien-f Vates 🪐 Co-Founder XO Team @Andrew
                      last edited by

                      @Andrew I don't think so, but I don't entirely trust the AWS SDK lib 🤔

                      A 1 Reply Last reply Reply Quote 0
                      • A Offline
                        Andrew Top contributor @julien-f
                        last edited by

                        @julien-f Can you add some better error/diagnostic reporting that lists the exact S3 call/command/operation in use when the S3 error is returned? That would help find the actual problem.
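
                        Even something simple would help here; purely as a hypothetical sketch (none of these names exist in XO), wrapping the SDK's send() so a failure records which command was in flight:

                            // Hypothetical sketch, not XO code: annotate SDK errors with the
                            // command being sent (CopyObjectCommand, GetObjectCommand, ...).
                            async function sendWithContext (client, command) {
                              try {
                                return await client.send(command)
                              } catch (error) {
                                error.s3Operation = command.constructor.name
                                error.s3Target = { Bucket: command.input?.Bucket, Key: command.input?.Key }
                                throw error
                              }
                            }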

                        julien-fJ 1 Reply Last reply Reply Quote 0
                        • julien-fJ Offline
                          julien-f Vates 🪐 Co-Founder XO Team @Andrew
                          last edited by

                          @florent Can you take a look? I wonder if there is a debug option we can enable in the AWS lib.

                          A florentF 2 Replies Last reply Reply Quote 0
                          • A Offline
                            Andrew Top contributor @julien-f
                            last edited by Andrew

                            @julien-f AWS docs state:

                            Transient errors (HTTP status codes 400, 408, 500, 502, 503, and 504) 
                            and throttling errors (HTTP status codes 400, 403, 429, 502, 503, and 509) can all be retried.
                            

                            So there are more than a few that can be retried.

                            They also have adaptive and standard retry modes, as well as a configurable number of retries.
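
                            With the AWS SDK for JavaScript v3 (the one XO uses, per the stack traces above), both knobs can be set directly on the client; a rough example (the region and endpoint values are just placeholders):

                                import { S3Client } from '@aws-sdk/client-s3'

                                // 'standard' and 'adaptive' are the SDK's built-in retry modes;
                                // maxAttempts counts the initial attempt plus retries.
                                const client = new S3Client({
                                  region: 'us-east-1',                  // placeholder
                                  endpoint: 'https://s3.wasabisys.com', // placeholder endpoint
                                  maxAttempts: 5,
                                  retryMode: 'adaptive',
                                })

                            The same settings can also be supplied through the AWS_MAX_ATTEMPTS and AWS_RETRY_MODE environment variables.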

                            1 Reply Last reply Reply Quote 0
                            • olivierlambertO Online
                              olivierlambert Vates 🪐 Co-Founder CEO
                              last edited by

                              I don't think we've had a lot of issues reported on AWS; our concern is with "similar" implementations that do not behave the same way.

                              A 1 Reply Last reply Reply Quote 0
                              • florentF Offline
                                florent Vates 🪐 XO Team @julien-f
                                last edited by olivierlambert

                                @julien-f I didn't find a debug option, but we can plug in a custom logger. I did a quick test logging directly to the console, and the volume of data logged is huge. I'm trying to make a more reasonable logger.
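
                                For reference, the v3 clients accept a logger object in their configuration, so one option is to forward only warnings and errors; a rough sketch (not the final code):

                                    import { S3Client } from '@aws-sdk/client-s3'

                                    // The SDK calls trace/debug/info/warn/error on whatever logger is
                                    // supplied; silencing the chatty levels keeps the volume manageable.
                                    const quietLogger = {
                                      trace: () => {},
                                      debug: () => {},
                                      info: () => {},
                                      warn: (...args) => console.warn('[s3]', ...args),
                                      error: (...args) => console.error('[s3]', ...args),
                                    }

                                    const client = new S3Client({ logger: quietLogger })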

                                1 Reply Last reply Reply Quote 1
                                • A Offline
                                  Andrew Top contributor @olivierlambert
                                  last edited by

                                  @olivierlambert @florent I chose Wasabi because of cost. I understand that non-AWS S3 vendors may not always work as expected, but Wasabi is not an insignificant player... Local testing against MinIO S3 has never given me any problems.

                                  With tens of thousands of S3 operations, I only want to know about the one that failed so I can yell at Wasabi and get them to fix their issue. I have recently been having more nightly issues with them (same retry failures).

                                  So it's a twofold issue: Wasabi is causing problems, and XO needs to be more tolerant of problems.

                                  florentF 2 Replies Last reply Reply Quote 0
                                  • olivierlambertO Online
                                    olivierlambert Vates 🪐 Co-Founder CEO
                                    last edited by

                                    I'm not saying to use only AWS 😉 I'm just telling you that we do not have errors reported from AWS customers on S3. That can be for a lot of reasons.

                                    We have invested 10k€ in test hardware to build our own "decent" S3 cluster, so we take seriously the effort to make this work on other platforms. But as you can see, there's a limit to the number of platforms we can test. Indeed, the solution is to get more debug output, but it's not enjoyable to lose time on potential issues related to 3rd-party S3 providers 😕

                                    1 Reply Last reply Reply Quote 0
                                    • florentF Offline
                                      florent Vates 🪐 XO Team @Andrew
                                      last edited by

                                      @Andrew I am sure we'll be able to tune XO to be a little more robust in the near future.

                                      1 Reply Last reply Reply Quote 0
                                      • florentF Offline
                                        florent Vates 🪐 XO Team @Andrew
                                        last edited by

                                        @Andrew Hi,

                                        I made a PR that retries 5 times, even on a 400 error. Can you test it?
                                        https://github.com/vatesfr/xen-orchestra/pull/6433
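
                                        The general idea is just to wrap the failing S3 call in a small retry loop with a short pause between attempts; the sketch below only illustrates that approach (the real change is in the PR above):

                                            // Illustrative sketch only (see the PR for the actual change):
                                            // retry an async S3 operation up to `attempts` times, even on a 400,
                                            // waiting a little between tries.
                                            const delay = ms => new Promise(resolve => setTimeout(resolve, ms))

                                            async function withRetry (fn, attempts = 5, waitMs = 1000) {
                                              for (let i = 1; i <= attempts; i++) {
                                                try {
                                                  return await fn()
                                                } catch (error) {
                                                  if (i === attempts) throw error
                                                  await delay(waitMs)
                                                }
                                              }
                                            }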

                                        Regards
                                        Florent

                                        fbeauchamp opened this pull request in vatesfr/xen-orchestra

                                        closed fix(s3): retry upload even on error 400 #6433

                                        A 2 Replies Last reply Reply Quote 0
                                        • A Offline
                                          Andrew Top contributor @florent
                                          last edited by

                                          @florent Thanks. I'm running it. I'll report after a few days.

                                          1 Reply Last reply Reply Quote 1
                                          • A Offline
                                            Andrew Top contributor @florent
                                            last edited by

                                            @florent Last night's failure (commit 81ae8)...

                                            {
                                                  "data": {
                                                    "type": "VM",
                                                    "id": "f80fdf51-65e5-132d-bb2a-936bbd2814fc"
                                                  },
                                                  "id": "1663912365483:2",
                                                  "message": "backup VM",
                                                  "start": 1663912365483,
                                                  "status": "failure",
                                                  "tasks": [
                                                    {
                                                      "id": "1663912365570",
                                                      "message": "clean-vm",
                                                      "start": 1663912365570,
                                                      "status": "failure",
                                                      "end": 1663912403372,
                                                      "result": {
                                                        "name": "InternalError",
                                                        "$fault": "client",
                                                        "$metadata": {
                                                          "httpStatusCode": 500,
                                                          "extendedRequestId": "jOYV90/W5XHJFnOq1mlfpaMT/T9EV4/EnSluEni+p9TJQykrtI0cJMntJqFThy/PvX/LN0XX4xXS",
                                                          "attempts": 3,
                                                          "totalRetryDelay": 369
                                                        },
                                                        "Code": "InternalError",
                                                        "Detail": "None:UnexpectedError",
                                                        "RequestId": "85780FD1B7DFCB7C",
                                                        "HostId": "jOYV90/W5XHJFnOq1mlfpaMT/T9EV4/EnSluEni+p9TJQykrtI0cJMntJqFThy/PvX/LN0XX4xXS",
                                                        "message": "We encountered an internal error.  Please retry the operation again later.",
                                                        "stack": "InternalError: We encountered an internal error.  Please retry the operation again later.\n    at throwDefaultError (/opt/xo/xo-builds/xen-orchestra-202209221033/node_modules/@aws-sdk/smithy-client/dist-cjs/default-error-handler.js:8:22)\n    at deserializeAws_restXmlGetObjectCommandError (/opt/xo/xo-builds/xen-orchestra-202209221033/node_modules/@aws-sdk/client-s3/dist-cjs/protocols/Aws_restXml.js:4356:51)\n    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n    at async /opt/xo/xo-builds/xen-orchestra-202209221033/node_modules/@aws-sdk/middleware-serde/dist-cjs/deserializerMiddleware.js:7:24\n    at async /opt/xo/xo-builds/xen-orchestra-202209221033/node_modules/@aws-sdk/middleware-signing/dist-cjs/middleware.js:11:20\n    at async StandardRetryStrategy.retry (/opt/xo/xo-builds/xen-orchestra-202209221033/node_modules/@aws-sdk/middleware-retry/dist-cjs/StandardRetryStrategy.js:51:46)\n    at async /opt/xo/xo-builds/xen-orchestra-202209221033/node_modules/@aws-sdk/middleware-flexible-checksums/dist-cjs/flexibleChecksumsMiddleware.js:56:20\n    at async /opt/xo/xo-builds/xen-orchestra-202209221033/node_modules/@aws-sdk/middleware-logger/dist-cjs/loggerMiddleware.js:6:22\n    at async S3Handler._createReadStream (/opt/xo/xo-builds/xen-orchestra-202209221033/@xen-orchestra/fs/dist/s3.js:261:15)\n    at async S3Handler.readFile (/opt/xo/xo-builds/xen-orchestra-202209221033/@xen-orchestra/fs/dist/abstract.js:326:18)"
                                                      }
                                                    },
                                                    {
                                                      "id": "1663912517635",
                                                      "message": "snapshot",
                                                      "start": 1663912517635,
                                                      "status": "success",
                                                      "end": 1663912520335,
                                                      "result": "85b00101-5704-c847-8c91-8806195154b4"
                                                    },
                                                    {
                                                      "data": {
                                                        "id": "db9ad0a8-bce6-4a2b-b9fd-5c4cecf059c4",
                                                        "isFull": false,
                                                        "type": "remote"
                                                      },
                                                      "id": "1663912520336",
                                                      "message": "export",
                                                      "start": 1663912520336,
                                                      "status": "success",
                                                      "tasks": [
                                                        {
                                                          "id": "1663912520634",
                                                          "message": "transfer",
                                                          "start": 1663912520634,
                                                          "status": "success",
                                                          "end": 1663912549741,
                                                          "result": {
                                                            "size": 251742720
                                                          }
                                                        },
                                                        {
                                                          "id": "1663912551469",
                                                          "message": "clean-vm",
                                                          "start": 1663912551469,
                                                          "status": "success",
                                                          "end": 1663912629752,
                                                          "result": {
                                                            "merge": false
                                                          }
                                                        }
                                                      ],
                                                      "end": 1663912629752
                                                    }
                                                  ],
                                                  "end": 1663912629752
                                                },
                                            
                                            florentF 1 Reply Last reply Reply Quote 0