diff options
author | Jane Zhu <jzhu116@bloomberg.net> | 2023-09-05 20:56:15 +0200 |
---|---|---|
committer | Juan Zhu <jzhu4@dev-10-34-20-139.pw1.bcc.bloomberg.com> | 2024-01-16 16:50:45 +0100 |
commit | e200499bb3c5703862b92a4d7fb534d98601f1bf (patch) | |
tree | 456339f2847e1ff37d6198867bd1c6c16cfd695f /src/rgw/driver/rados/rgw_data_sync.cc | |
parent | Merge pull request #54966 from yuvalif/wip-yuval-63855 (diff) | |
download | ceph-e200499bb3c5703862b92a4d7fb534d98601f1bf.tar.xz ceph-e200499bb3c5703862b92a4d7fb534d98601f1bf.zip |
rgw/multisite: maintain endpoints connectable status and retry the requests to them when appropriate
Signed-off-by: Juan Zhu <jzhu4@dev-10-34-20-139.pw1.bcc.bloomberg.com>
Diffstat (limited to 'src/rgw/driver/rados/rgw_data_sync.cc')
-rw-r--r-- | src/rgw/driver/rados/rgw_data_sync.cc | 150 |
1 files changed, 87 insertions, 63 deletions
diff --git a/src/rgw/driver/rados/rgw_data_sync.cc b/src/rgw/driver/rados/rgw_data_sync.cc index 7b273e68129..0a7d21a7277 100644 --- a/src/rgw/driver/rados/rgw_data_sync.cc +++ b/src/rgw/driver/rados/rgw_data_sync.cc @@ -233,6 +233,9 @@ class RGWReadRemoteDataLogShardInfoCR : public RGWCoroutine { int shard_id; RGWDataChangesLogInfo *shard_info; + int tries{0}; + int op_ret{0}; + public: RGWReadRemoteDataLogShardInfoCR(RGWDataSyncCtx *_sc, int _shard_id, RGWDataChangesLogInfo *_shard_info) : RGWCoroutine(_sc->cct), @@ -243,41 +246,48 @@ public: shard_info(_shard_info) { } - ~RGWReadRemoteDataLogShardInfoCR() override { - if (http_op) { - http_op->put(); - } - } - int operate(const DoutPrefixProvider *dpp) override { reenter(this) { - yield { - char buf[16]; - snprintf(buf, sizeof(buf), "%d", shard_id); - rgw_http_param_pair pairs[] = { { "type" , "data" }, - { "id", buf }, - { "info" , NULL }, - { NULL, NULL } }; + static constexpr int NUM_ENPOINT_IOERROR_RETRIES = 20; + for (tries = 0; tries < NUM_ENPOINT_IOERROR_RETRIES; tries++) { + ldpp_dout(dpp, 20) << "read remote datalog shard info. shard_id=" << shard_id << " retries=" << tries << dendl; - string p = "/admin/log/"; + yield { + char buf[16]; + snprintf(buf, sizeof(buf), "%d", shard_id); + rgw_http_param_pair pairs[] = { { "type" , "data" }, + { "id", buf }, + { "info" , NULL }, + { NULL, NULL } }; - http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager); + string p = "/admin/log/"; - init_new_io(http_op); + http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager); - int ret = http_op->aio_read(dpp); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl; - log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; - return set_cr_error(ret); + init_new_io(http_op); + + int ret = http_op->aio_read(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl; + log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; + http_op->put(); + return set_cr_error(ret); + } + + return io_block(0); + } + yield { + op_ret = http_op->wait(shard_info, null_yield); + http_op->put(); } - return io_block(0); - } - yield { - int ret = http_op->wait(shard_info, null_yield); - if (ret < 0) { - return set_cr_error(ret); + if (op_ret < 0) { + if (op_ret == -EIO && tries < NUM_ENPOINT_IOERROR_RETRIES - 1) { + ldpp_dout(dpp, 20) << "failed to fetch remote datalog shard info. retry. shard_id=" << shard_id << dendl; + continue; + } else { + return set_cr_error(op_ret); + } } return set_cr_done(); } @@ -315,6 +325,9 @@ class RGWReadRemoteDataLogShardCR : public RGWCoroutine { read_remote_data_log_response response; std::optional<TOPNSPC::common::PerfGuard> timer; + int tries{0}; + int op_ret{0}; + public: RGWReadRemoteDataLogShardCR(RGWDataSyncCtx *_sc, int _shard_id, const std::string& marker, string *pnext_marker, @@ -324,53 +337,62 @@ public: shard_id(_shard_id), marker(marker), pnext_marker(pnext_marker), entries(_entries), truncated(_truncated) { } - ~RGWReadRemoteDataLogShardCR() override { - if (http_op) { - http_op->put(); - } - } int operate(const DoutPrefixProvider *dpp) override { reenter(this) { - yield { - char buf[16]; - snprintf(buf, sizeof(buf), "%d", shard_id); - rgw_http_param_pair pairs[] = { { "type" , "data" }, - { "id", buf }, - { "marker", marker.c_str() }, - { "extra-info", "true" }, - { NULL, NULL } }; + static constexpr int NUM_ENPOINT_IOERROR_RETRIES = 20; + for (tries = 0; tries < NUM_ENPOINT_IOERROR_RETRIES; tries++) { + ldpp_dout(dpp, 20) << "read remote datalog shard. shard_id=" << shard_id << " retries=" << tries << dendl; - string p = "/admin/log/"; + yield { + char buf[16]; + snprintf(buf, sizeof(buf), "%d", shard_id); + rgw_http_param_pair pairs[] = { { "type" , "data" }, + { "id", buf }, + { "marker", marker.c_str() }, + { "extra-info", "true" }, + { NULL, NULL } }; - http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager); + string p = "/admin/log/"; - init_new_io(http_op); + http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager); + + init_new_io(http_op); - if (sync_env->counters) { - timer.emplace(sync_env->counters, sync_counters::l_poll); - } - int ret = http_op->aio_read(dpp); - if (ret < 0) { - ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl; - log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; if (sync_env->counters) { - sync_env->counters->inc(sync_counters::l_poll_err); + timer.emplace(sync_env->counters, sync_counters::l_poll); + } + int ret = http_op->aio_read(dpp); + if (ret < 0) { + ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl; + log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl; + if (sync_env->counters) { + sync_env->counters->inc(sync_counters::l_poll_err); + } + http_op->put(); + return set_cr_error(ret); } - return set_cr_error(ret); + + return io_block(0); + } + yield { + timer.reset(); + op_ret = http_op->wait(&response, null_yield); + http_op->put(); } - return io_block(0); - } - yield { - timer.reset(); - int ret = http_op->wait(&response, null_yield); - if (ret < 0) { - if (sync_env->counters && ret != -ENOENT) { - sync_env->counters->inc(sync_counters::l_poll_err); + if (op_ret < 0) { + if (op_ret == -EIO && tries < NUM_ENPOINT_IOERROR_RETRIES - 1) { + ldpp_dout(dpp, 20) << "failed to read remote datalog shard. retry. shard_id=" << shard_id << dendl; + continue; + } else { + if (sync_env->counters && op_ret != -ENOENT) { + sync_env->counters->inc(sync_counters::l_poll_err); + } + return set_cr_error(op_ret); } - return set_cr_error(ret); } + entries->clear(); entries->swap(response.entries); *pnext_marker = response.marker; @@ -421,6 +443,8 @@ bool RGWReadRemoteDataLogInfoCR::spawn_next() { } class RGWListRemoteDataLogShardCR : public RGWSimpleCoroutine { + static constexpr int NUM_ENPOINT_IOERROR_RETRIES = 20; + RGWDataSyncCtx *sc; RGWDataSyncEnv *sync_env; RGWRESTReadResource *http_op; @@ -434,7 +458,7 @@ public: RGWListRemoteDataLogShardCR(RGWDataSyncCtx *sc, int _shard_id, const string& _marker, uint32_t _max_entries, rgw_datalog_shard_data *_result) - : RGWSimpleCoroutine(sc->cct), sc(sc), sync_env(sc->env), http_op(NULL), + : RGWSimpleCoroutine(sc->cct, NUM_ENPOINT_IOERROR_RETRIES), sc(sc), sync_env(sc->env), http_op(NULL), shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {} int send_request(const DoutPrefixProvider *dpp) override { @@ -474,7 +498,7 @@ public: int ret = http_op->wait(result, null_yield); http_op->put(); if (ret < 0 && ret != -ENOENT) { - ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to list remote datalog shard, ret=" << ret << dendl; + ldpp_dout(sync_env->dpp, 5) << "ERROR: failed to list remote datalog shard, ret=" << ret << dendl; return ret; } return 0; |