summary | refs | log | tree | commit | diff | stats
path: root/src/rgw/driver/rados/rgw_data_sync.cc
diff options
context:
space:
mode:
author: Jane Zhu <jzhu116@bloomberg.net> 2023-09-05 20:56:15 +0200
committer: Juan Zhu <jzhu4@dev-10-34-20-139.pw1.bcc.bloomberg.com> 2024-01-16 16:50:45 +0100
commit: e200499bb3c5703862b92a4d7fb534d98601f1bf (patch)
tree: 456339f2847e1ff37d6198867bd1c6c16cfd695f /src/rgw/driver/rados/rgw_data_sync.cc
parent: Merge pull request #54966 from yuvalif/wip-yuval-63855 (diff)
download: ceph-e200499bb3c5703862b92a4d7fb534d98601f1bf.tar.xz
          ceph-e200499bb3c5703862b92a4d7fb534d98601f1bf.zip
rgw/multisite: maintain endpoints connectable status and retry the requests to them when appropriate
Signed-off-by: Juan Zhu <jzhu4@dev-10-34-20-139.pw1.bcc.bloomberg.com>
Diffstat (limited to 'src/rgw/driver/rados/rgw_data_sync.cc')
-rw-r--r-- src/rgw/driver/rados/rgw_data_sync.cc 150
1 files changed, 87 insertions, 63 deletions
diff --git a/src/rgw/driver/rados/rgw_data_sync.cc b/src/rgw/driver/rados/rgw_data_sync.cc
index 7b273e68129..0a7d21a7277 100644
--- a/src/rgw/driver/rados/rgw_data_sync.cc
+++ b/src/rgw/driver/rados/rgw_data_sync.cc
@@ -233,6 +233,9 @@ class RGWReadRemoteDataLogShardInfoCR : public RGWCoroutine {
int shard_id;
RGWDataChangesLogInfo *shard_info;
+ int tries{0};
+ int op_ret{0};
+
public:
RGWReadRemoteDataLogShardInfoCR(RGWDataSyncCtx *_sc,
int _shard_id, RGWDataChangesLogInfo *_shard_info) : RGWCoroutine(_sc->cct),
@@ -243,41 +246,48 @@ public:
shard_info(_shard_info) {
}
- ~RGWReadRemoteDataLogShardInfoCR() override {
- if (http_op) {
- http_op->put();
- }
- }
-
int operate(const DoutPrefixProvider *dpp) override {
reenter(this) {
- yield {
- char buf[16];
- snprintf(buf, sizeof(buf), "%d", shard_id);
- rgw_http_param_pair pairs[] = { { "type" , "data" },
- { "id", buf },
- { "info" , NULL },
- { NULL, NULL } };
+ static constexpr int NUM_ENPOINT_IOERROR_RETRIES = 20;
+ for (tries = 0; tries < NUM_ENPOINT_IOERROR_RETRIES; tries++) {
+ ldpp_dout(dpp, 20) << "read remote datalog shard info. shard_id=" << shard_id << " retries=" << tries << dendl;
- string p = "/admin/log/";
+ yield {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "%d", shard_id);
+ rgw_http_param_pair pairs[] = { { "type" , "data" },
+ { "id", buf },
+ { "info" , NULL },
+ { NULL, NULL } };
- http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager);
+ string p = "/admin/log/";
- init_new_io(http_op);
+ http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager);
- int ret = http_op->aio_read(dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
- log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
- return set_cr_error(ret);
+ init_new_io(http_op);
+
+ int ret = http_op->aio_read(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
+ log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+ http_op->put();
+ return set_cr_error(ret);
+ }
+
+ return io_block(0);
+ }
+ yield {
+ op_ret = http_op->wait(shard_info, null_yield);
+ http_op->put();
}
- return io_block(0);
- }
- yield {
- int ret = http_op->wait(shard_info, null_yield);
- if (ret < 0) {
- return set_cr_error(ret);
+ if (op_ret < 0) {
+ if (op_ret == -EIO && tries < NUM_ENPOINT_IOERROR_RETRIES - 1) {
+ ldpp_dout(dpp, 20) << "failed to fetch remote datalog shard info. retry. shard_id=" << shard_id << dendl;
+ continue;
+ } else {
+ return set_cr_error(op_ret);
+ }
}
return set_cr_done();
}
@@ -315,6 +325,9 @@ class RGWReadRemoteDataLogShardCR : public RGWCoroutine {
read_remote_data_log_response response;
std::optional<TOPNSPC::common::PerfGuard> timer;
+ int tries{0};
+ int op_ret{0};
+
public:
RGWReadRemoteDataLogShardCR(RGWDataSyncCtx *_sc, int _shard_id,
const std::string& marker, string *pnext_marker,
@@ -324,53 +337,62 @@ public:
shard_id(_shard_id), marker(marker), pnext_marker(pnext_marker),
entries(_entries), truncated(_truncated) {
}
- ~RGWReadRemoteDataLogShardCR() override {
- if (http_op) {
- http_op->put();
- }
- }
int operate(const DoutPrefixProvider *dpp) override {
reenter(this) {
- yield {
- char buf[16];
- snprintf(buf, sizeof(buf), "%d", shard_id);
- rgw_http_param_pair pairs[] = { { "type" , "data" },
- { "id", buf },
- { "marker", marker.c_str() },
- { "extra-info", "true" },
- { NULL, NULL } };
+ static constexpr int NUM_ENPOINT_IOERROR_RETRIES = 20;
+ for (tries = 0; tries < NUM_ENPOINT_IOERROR_RETRIES; tries++) {
+ ldpp_dout(dpp, 20) << "read remote datalog shard. shard_id=" << shard_id << " retries=" << tries << dendl;
- string p = "/admin/log/";
+ yield {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "%d", shard_id);
+ rgw_http_param_pair pairs[] = { { "type" , "data" },
+ { "id", buf },
+ { "marker", marker.c_str() },
+ { "extra-info", "true" },
+ { NULL, NULL } };
- http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager);
+ string p = "/admin/log/";
- init_new_io(http_op);
+ http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager);
+
+ init_new_io(http_op);
- if (sync_env->counters) {
- timer.emplace(sync_env->counters, sync_counters::l_poll);
- }
- int ret = http_op->aio_read(dpp);
- if (ret < 0) {
- ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
- log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
if (sync_env->counters) {
- sync_env->counters->inc(sync_counters::l_poll_err);
+ timer.emplace(sync_env->counters, sync_counters::l_poll);
+ }
+ int ret = http_op->aio_read(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
+ log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+ if (sync_env->counters) {
+ sync_env->counters->inc(sync_counters::l_poll_err);
+ }
+ http_op->put();
+ return set_cr_error(ret);
}
- return set_cr_error(ret);
+
+ return io_block(0);
+ }
+ yield {
+ timer.reset();
+ op_ret = http_op->wait(&response, null_yield);
+ http_op->put();
}
- return io_block(0);
- }
- yield {
- timer.reset();
- int ret = http_op->wait(&response, null_yield);
- if (ret < 0) {
- if (sync_env->counters && ret != -ENOENT) {
- sync_env->counters->inc(sync_counters::l_poll_err);
+ if (op_ret < 0) {
+ if (op_ret == -EIO && tries < NUM_ENPOINT_IOERROR_RETRIES - 1) {
+ ldpp_dout(dpp, 20) << "failed to read remote datalog shard. retry. shard_id=" << shard_id << dendl;
+ continue;
+ } else {
+ if (sync_env->counters && op_ret != -ENOENT) {
+ sync_env->counters->inc(sync_counters::l_poll_err);
+ }
+ return set_cr_error(op_ret);
}
- return set_cr_error(ret);
}
+
entries->clear();
entries->swap(response.entries);
*pnext_marker = response.marker;
@@ -421,6 +443,8 @@ bool RGWReadRemoteDataLogInfoCR::spawn_next() {
}
class RGWListRemoteDataLogShardCR : public RGWSimpleCoroutine {
+ static constexpr int NUM_ENPOINT_IOERROR_RETRIES = 20;
+
RGWDataSyncCtx *sc;
RGWDataSyncEnv *sync_env;
RGWRESTReadResource *http_op;
@@ -434,7 +458,7 @@ public:
RGWListRemoteDataLogShardCR(RGWDataSyncCtx *sc, int _shard_id,
const string& _marker, uint32_t _max_entries,
rgw_datalog_shard_data *_result)
- : RGWSimpleCoroutine(sc->cct), sc(sc), sync_env(sc->env), http_op(NULL),
+ : RGWSimpleCoroutine(sc->cct, NUM_ENPOINT_IOERROR_RETRIES), sc(sc), sync_env(sc->env), http_op(NULL),
shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {}
int send_request(const DoutPrefixProvider *dpp) override {
@@ -474,7 +498,7 @@ public:
int ret = http_op->wait(result, null_yield);
http_op->put();
if (ret < 0 && ret != -ENOENT) {
- ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to list remote datalog shard, ret=" << ret << dendl;
+ ldpp_dout(sync_env->dpp, 5) << "ERROR: failed to list remote datalog shard, ret=" << ret << dendl;
return ret;
}
return 0;