author | Adam C. Emerson <aemerson@redhat.com> | 2022-10-11 20:49:58 +0200
---|---|---
committer | Adam C. Emerson <aemerson@redhat.com> | 2023-01-13 00:13:46 +0100
commit | b5d5da9c4d2a7bb07bdc6b51b9f58831d220d99b |
tree | fe413929201d6929fef90d33db38ba08754ca6e0 /src/rgw/driver/rados/rgw_data_sync.cc |
parent | rgw: `SimpleRadosWriteAttrsCR` uses an async RADOS call |
rgw: Reduce data sync parallelism in response to RADOS lock latency
Lock latency in RGWContinuousLeaseCR gets high enough under load that
the locks end up timing out, leading to incorrect behavior.
Monitor lock latency and cut concurrent operations in half if it goes
above ten seconds.
Cut concurrency to one if it goes above twenty seconds.
Signed-off-by: Adam C. Emerson <aemerson@redhat.com>
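
The `sc->lcc` object threaded through the diff below is the shared latency tracker: the lease coroutine reports how long each lock call took, and the sync coroutines pass their configured spawn windows through its `adj_concurrency()` before spawning work. The following is a minimal sketch of that thresholding, assuming a hypothetical `LatencyConcurrencyControl` class; the name, members, and hard-coded thresholds are illustrative, not the patch's actual code:

    #include <algorithm>
    #include <atomic>
    #include <cstdint>

    // Hypothetical sketch of a latency-aware concurrency limiter, modeled on
    // the behavior described in the commit message above. The real object
    // referenced as sc->lcc in the diff may be structured differently.
    class LatencyConcurrencyControl {
      std::atomic<int64_t> latency_secs{0};  // most recently observed lock latency

    public:
      // Called by the lease coroutine after each lock acquire/renew round trip.
      void report_latency(int64_t secs) {
        latency_secs.store(secs, std::memory_order_relaxed);
      }

      // Scale a configured spawn window by current lock latency:
      // full concurrency below ten seconds, half between ten and twenty,
      // down to one above twenty.
      uint64_t adj_concurrency(uint64_t window) const {
        const int64_t l = latency_secs.load(std::memory_order_relaxed);
        if (l > 20) return 1;
        if (l > 10) return std::max<uint64_t>(window / 2, 1);
        return window;
      }
    };

Halving first and only serializing past twenty seconds lets throughput degrade gradually instead of collapsing the moment the lock slows down.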
Diffstat (limited to 'src/rgw/driver/rados/rgw_data_sync.cc')
-rw-r--r-- | src/rgw/driver/rados/rgw_data_sync.cc | 25
1 file changed, 13 insertions, 12 deletions
diff --git a/src/rgw/driver/rados/rgw_data_sync.cc b/src/rgw/driver/rados/rgw_data_sync.cc
index 1b5ad619bef..e51690974eb 100644
--- a/src/rgw/driver/rados/rgw_data_sync.cc
+++ b/src/rgw/driver/rados/rgw_data_sync.cc
@@ -561,7 +561,7 @@ public:
       sc->env->async_rados, sc->env->driver,
       { sc->env->svc->zone->get_zone_params().log_pool,
         RGWDataSyncStatusManager::sync_status_oid(sc->source_zone) },
-      string(lock_name), lock_duration, caller);
+      string(lock_name), lock_duration, caller, &sc->lcc);
   }

   int operate(const DoutPrefixProvider *dpp) override {
@@ -1534,7 +1534,7 @@ public:
           tn->log(10, SSTR("writing shard_id " << sid << " of gen " << each->gen << " to error repo for retry"));
           yield_spawn_window(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
                                                        rgw::error_repo::encode_key(bs, each->gen),
-                                                       timestamp), cct->_conf->rgw_data_sync_spawn_window,
+                                                       timestamp), sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window),
                              [&](uint64_t stack_id, int ret) {
                                if (ret < 0) {
                                  retcode = ret;
@@ -1660,7 +1660,7 @@ public:
             tn->log(10, SSTR("Write " << source_bs.shard_id << " to error repo for retry"));
             yield_spawn_window(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
                                                          rgw::error_repo::encode_key(source_bs, each->gen),
-                                                         timestamp), cct->_conf->rgw_data_sync_spawn_window, std::nullopt);
+                                                         timestamp), sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window), std::nullopt);
           } else {
             shard_cr = data_sync_single_entry(sc, source_bs, each->gen, key, timestamp,
                                               lease_cr, bucket_shard_cache, nullptr, error_repo, tn, false);
@@ -1669,7 +1669,7 @@ public:
               yield call(shard_cr);
               first_shard = false;
             } else {
-              yield_spawn_window(shard_cr, cct->_conf->rgw_data_sync_spawn_window,
+              yield_spawn_window(shard_cr, sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window),
                                  [&](uint64_t stack_id, int ret) {
                                    if (ret < 0) {
                                      retcode = ret;
@@ -1807,7 +1807,7 @@ public:
                                sc, pool, source_bs, iter->first, sync_status,
                                error_repo, entry_timestamp, lease_cr,
                                bucket_shard_cache, &*marker_tracker, tn),
-                             cct->_conf->rgw_data_sync_spawn_window,
+                             sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window),
                              std::nullopt);
         }
         sync_marker.marker = iter->first;
@@ -2026,7 +2026,7 @@ public:
                              log_iter->log_id, log_iter->log_timestamp, lease_cr,
                              bucket_shard_cache, &*marker_tracker, error_repo, tn, false),
-                            cct->_conf->rgw_data_sync_spawn_window, std::nullopt);
+                            sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window), std::nullopt);
         }
       }
@@ -2179,7 +2179,8 @@ public:
     auto driver = sync_env->driver;
     lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, driver,
                                             rgw_raw_obj(pool, status_oid),
-                                            lock_name, lock_duration, this));
+                                            lock_name, lock_duration, this,
+                                            &sc->lcc));
     lease_stack.reset(spawn(lease_cr.get(), false));
   }
 };
@@ -4511,7 +4512,7 @@ int RGWBucketFullSyncCR::operate(const DoutPrefixProvider *dpp)
                                                  entry->key, &marker_tracker, zones_trace, tn),
               false);
       }
-      drain_with_cb(cct->_conf->rgw_bucket_sync_spawn_window,
+      drain_with_cb(sc->lcc.adj_concurrency(cct->_conf->rgw_bucket_sync_spawn_window),
                     [&](uint64_t stack_id, int ret) {
                       if (ret < 0) {
                         tn->log(10, "a sync operation returned error");
@@ -4908,7 +4909,7 @@ int RGWBucketShardIncrementalSyncCR::operate(const DoutPrefixProvider *dpp)
                 false);
         }
       // }
-      drain_with_cb(cct->_conf->rgw_bucket_sync_spawn_window,
+      drain_with_cb(sc->lcc.adj_concurrency(cct->_conf->rgw_bucket_sync_spawn_window),
                     [&](uint64_t stack_id, int ret) {
                       if (ret < 0) {
                         tn->log(10, "a sync operation returned error");
@@ -5169,7 +5170,7 @@ int RGWRunBucketSourcesSyncCR::operate(const DoutPrefixProvider *dpp)
       yield_spawn_window(sync_bucket_shard_cr(sc, lease_cr, sync_pair,
                                               gen, tn, &*cur_shard_progress),
-                         cct->_conf->rgw_bucket_sync_spawn_window,
+                         sc->lcc.adj_concurrency(cct->_conf->rgw_bucket_sync_spawn_window),
                          [&](uint64_t stack_id, int ret) {
                            if (ret < 0) {
                              tn->log(10, SSTR("ERROR: a sync operation returned error: " << ret));
@@ -5662,7 +5663,7 @@ int RGWSyncBucketCR::operate(const DoutPrefixProvider *dpp)
         tn->log(20, "remote bilog indicates that sync was stopped");
         if (!bucket_lease_cr) {
           bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj,
-                                                         lock_name, lock_duration, this));
+                                                         lock_name, lock_duration, this, &sc->lcc));
           yield spawn(bucket_lease_cr.get(), false);
           while (!bucket_lease_cr->is_locked()) {
             if (bucket_lease_cr->is_done()) {
@@ -5720,7 +5721,7 @@ int RGWSyncBucketCR::operate(const DoutPrefixProvider *dpp)
       // different shards from duplicating the init and full sync
       if (!bucket_lease_cr) {
         bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj,
-                                                       lock_name, lock_duration, this));
+                                                       lock_name, lock_duration, this, &sc->lcc));
         yield spawn(bucket_lease_cr.get(), false);
         while (!bucket_lease_cr->is_locked()) {
           if (bucket_lease_cr->is_done()) {
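
To see the effect at a call site, here is a self-contained toy mirroring the pattern above, where the value handed to `yield_spawn_window()`/`drain_with_cb()` is the adjusted window rather than the raw config option. The `Limiter` struct, the window value of 20, and the simulated 12-second lock round trip are all made up for the demo:

    #include <algorithm>
    #include <chrono>
    #include <cstdint>
    #include <iostream>

    // Hypothetical stand-in for the shared limiter sketched earlier.
    struct Limiter {
      int64_t latency_secs = 0;
      uint64_t adj_concurrency(uint64_t w) const {
        if (latency_secs > 20) return 1;
        if (latency_secs > 10) return std::max<uint64_t>(w / 2, 1);
        return w;
      }
    };

    int main() {
      Limiter lcc;
      const uint64_t spawn_window = 20;  // stand-in for rgw_data_sync_spawn_window

      // Simulate a lock renewal that took 12 seconds end to end.
      auto start = std::chrono::steady_clock::now();
      auto end = start + std::chrono::seconds(12);
      lcc.latency_secs =
          std::chrono::duration_cast<std::chrono::seconds>(end - start).count();

      // While latency sits between 10 and 20 seconds, the spawn window
      // passed to the coroutine machinery shrinks to half: prints 10.
      std::cout << lcc.adj_concurrency(spawn_window) << "\n";
    }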