Diffstat:
-rw-r--r--  .gitmodules  6
-rw-r--r--  .mailmap  4
-rw-r--r--  .organizationmap  18
-rw-r--r--  CMakeLists.txt  40
-rw-r--r--  PendingReleaseNotes  12
-rw-r--r--  README.git-subtree  48
m---------  ceph-object-corpus  0
-rw-r--r--  ceph.spec.in  36
-rw-r--r--  cmake/modules/Findbabeltrace.cmake  2
-rw-r--r--  cmake/modules/Findcryptopp.cmake  2
-rw-r--r--  cmake/modules/Findfuse.cmake  2
-rw-r--r--  cmake/modules/Findrocksdb.cmake  2
-rw-r--r--  debian/ceph-base.dirs  1
-rw-r--r--  debian/ceph-base.install  1
-rwxr-xr-x[-rw-r--r--]  debian/ceph-common.install  8
-rw-r--r--  debian/ceph-test.install  2
-rw-r--r--  debian/changelog  6
-rw-r--r--  debian/control  14
-rw-r--r--  debian/rados-objclass-dev.install  1
-rw-r--r--  debian/radosgw.install  3
-rw-r--r--  doc/cephfs/client-config-ref.rst  6
-rw-r--r--  doc/cephfs/multimds.rst  32
-rw-r--r--  doc/dev/blkin.rst  167
-rw-r--r--  doc/dev/index.rst  37
-rw-r--r--  doc/dev/perf_counters.rst  182
-rw-r--r--  doc/install/build-ceph.rst  3
-rw-r--r--  doc/install/get-packages.rst  4
-rw-r--r--  doc/install/manual-deployment.rst  2
-rw-r--r--  doc/install/mirrors.rst  1
-rw-r--r--  doc/man/8/ceph-detect-init.rst  11
-rw-r--r--  doc/man/8/ceph-disk.rst  6
-rw-r--r--  doc/man/8/ceph.rst  12
-rw-r--r--  doc/man/8/radosgw-admin.rst  10
-rw-r--r--  doc/man/8/rbd.rst  2
-rw-r--r--  doc/rados/api/index.rst  3
-rw-r--r--  doc/rados/api/objclass-sdk.rst  37
-rw-r--r--  doc/rados/configuration/mon-config-ref.rst  429
-rw-r--r--  doc/rados/configuration/mon-lookup-dns.rst  7
-rw-r--r--  doc/rados/configuration/mon-osd-interaction.rst  165
-rw-r--r--  doc/rados/configuration/pool-pg-config-ref.rst  84
-rw-r--r--  doc/rados/operations/erasure-code.rst  2
-rw-r--r--  doc/rados/troubleshooting/log-and-debug.rst  4
-rw-r--r--  doc/radosgw/admin.rst  96
-rw-r--r--  doc/radosgw/adminops.rst  11
-rw-r--r--  doc/radosgw/barbican.rst  3
-rw-r--r--  doc/radosgw/bucketpolicy.rst  133
-rw-r--r--  doc/radosgw/index.rst  1
-rw-r--r--  doc/radosgw/multisite.rst  6
-rw-r--r--  doc/radosgw/swift.rst  2
-rw-r--r--  doc/start/quick-ceph-deploy.rst  7
-rwxr-xr-x  install-deps.sh  1
-rw-r--r--  mirroring/MIRRORS  1
-rw-r--r--  qa/erasure-code/ec-rados-plugin=isa-k=2-m=1.yaml  1
-rw-r--r--  qa/releases/luminous-with-mgr.yaml  11
-rw-r--r--  qa/releases/luminous.yaml  1
-rw-r--r--  qa/suites/fs/basic_functional/tasks/mds-full.yaml  7
-rw-r--r--  qa/suites/krbd/rbd-nomount/tasks/krbd_exclusive_option.yaml  5
-rw-r--r--  qa/suites/krbd/wac/sysfs/% (renamed from qa/suites/mgr/basic/%)  0
-rw-r--r--  qa/suites/krbd/wac/sysfs/ceph/ceph.yaml (renamed from qa/suites/rados/verify/1thrash/none.yaml)  0
l---------  qa/suites/krbd/wac/sysfs/clusters/fixed-1.yaml  1
-rw-r--r--  qa/suites/krbd/wac/sysfs/conf.yaml  7
-rw-r--r--  qa/suites/krbd/wac/sysfs/tasks/stable_pages_required.yaml  5
-rw-r--r--  qa/suites/krbd/wac/wac/% (renamed from qa/suites/rados/thrash/z-require-luminous/at-mkfs.yaml)  0
-rw-r--r--  qa/suites/krbd/wac/wac/ceph/ceph.yaml  3
l---------  qa/suites/krbd/wac/wac/clusters/fixed-3.yaml  1
-rw-r--r--  qa/suites/krbd/wac/wac/conf.yaml  7
-rw-r--r--  qa/suites/krbd/wac/wac/tasks/wac.yaml  11
-rw-r--r--  qa/suites/krbd/wac/wac/verify/many-resets.yaml  10
-rw-r--r--  qa/suites/krbd/wac/wac/verify/no-resets.yaml  5
-rw-r--r--  qa/suites/mgr/basic/tasks/failover.yaml  7
-rw-r--r--  qa/suites/multimds/basic/tasks/cephfs_test_exports.yaml  4
-rw-r--r--  qa/suites/rados/basic/ceph.yaml  3
l---------  qa/suites/rados/basic/d-require-luminous  1
-rw-r--r--  qa/suites/rados/basic/tasks/rados_api_tests.yaml  2
-rw-r--r--  qa/suites/rados/basic/tasks/rados_cls_all.yaml  10
-rw-r--r--  qa/suites/rados/basic/tasks/rados_python.yaml  6
-rw-r--r--  qa/suites/rados/basic/tasks/rados_stress_watch.yaml  2
-rw-r--r--  qa/suites/rados/basic/tasks/rados_striper.yaml  2
-rw-r--r--  qa/suites/rados/basic/tasks/rados_workunit_loadgen_big.yaml  2
-rw-r--r--  qa/suites/rados/basic/tasks/rados_workunit_loadgen_mix.yaml  2
-rw-r--r--  qa/suites/rados/basic/tasks/rados_workunit_loadgen_mostlyread.yaml  2
-rw-r--r--  qa/suites/rados/basic/tasks/readwrite.yaml  4
-rw-r--r--  qa/suites/rados/basic/tasks/repair_test.yaml  2
-rw-r--r--  qa/suites/rados/basic/tasks/rgw_snaps.yaml  2
-rw-r--r--  qa/suites/rados/basic/tasks/scrub_test.yaml  2
l---------  qa/suites/rados/basic/z-require-luminous  1
-rw-r--r--  qa/suites/rados/mgr/%  0
-rw-r--r--  qa/suites/rados/mgr/clusters/2-node-mgr.yaml (renamed from qa/suites/mgr/basic/clusters/2-node-mgr.yaml)  2
-rw-r--r--  qa/suites/rados/mgr/debug/mgr.yaml (renamed from qa/suites/mgr/basic/debug/mgr.yaml)  0
l---------  qa/suites/rados/mgr/objectstore (renamed from qa/suites/mgr/basic/objectstore)  0
-rw-r--r--  qa/suites/rados/mgr/tasks/failover.yaml  10
-rw-r--r--  qa/suites/rados/monthrash/ceph.yaml (renamed from qa/suites/rados/monthrash/ceph/ceph.yaml)  0
l---------  qa/suites/rados/monthrash/d-require-luminous  1
l---------  qa/suites/rados/monthrash/z-require-luminous  1
l---------  qa/suites/rados/multimon/z-require-luminous  1
-rw-r--r--  qa/suites/rados/singleton-bluestore/%  0
-rw-r--r--  qa/suites/rados/singleton-bluestore/all/cephtool.yaml (renamed from qa/suites/rados/singleton/all/cephtool.yaml)  0
l---------  qa/suites/rados/singleton-bluestore/msgr  1
-rw-r--r--  qa/suites/rados/singleton-bluestore/msgr-failures/few.yaml  5
-rw-r--r--  qa/suites/rados/singleton-bluestore/msgr-failures/many.yaml  5
l---------  qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp.yaml  1
l---------  qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml  1
l---------  qa/suites/rados/singleton-bluestore/rados.yaml  1
-rw-r--r--  qa/suites/rados/singleton-nomsgr/all/health-warnings.yaml  9
l---------  qa/suites/rados/thrash-erasure-code-big/ceph.yaml  1
l---------  qa/suites/rados/thrash-erasure-code-big/d-require-luminous  1
-rw-r--r--  qa/suites/rados/thrash-erasure-code-big/thrashers/default.yaml  6
-rw-r--r--  qa/suites/rados/thrash-erasure-code-big/thrashers/fastread.yaml  6
-rw-r--r--  qa/suites/rados/thrash-erasure-code-big/thrashers/mapgap.yaml  10
-rw-r--r--  qa/suites/rados/thrash-erasure-code-big/thrashers/morepggrow.yaml  6
-rw-r--r--  qa/suites/rados/thrash-erasure-code-big/thrashers/pggrow.yaml  6
l---------  qa/suites/rados/thrash-erasure-code-isa/ceph.yaml  1
l---------  qa/suites/rados/thrash-erasure-code-isa/d-require-luminous  1
l---------  qa/suites/rados/thrash-erasure-code-overwrites/ceph.yaml  1
l---------  qa/suites/rados/thrash-erasure-code-overwrites/d-require-luminous  1
l---------  qa/suites/rados/thrash-erasure-code-shec/ceph.yaml  1
l---------  qa/suites/rados/thrash-erasure-code-shec/d-require-luminous  1
-rw-r--r--  qa/suites/rados/thrash-erasure-code-shec/thrashers/default.yaml  6
-rw-r--r--  qa/suites/rados/thrash-erasure-code/ceph.yaml  3
l---------  qa/suites/rados/thrash-erasure-code/d-require-luminous  1
-rw-r--r--  qa/suites/rados/thrash-erasure-code/thrashers/default.yaml  6
-rw-r--r--  qa/suites/rados/thrash-erasure-code/thrashers/fastread.yaml  6
-rw-r--r--  qa/suites/rados/thrash-erasure-code/thrashers/mapgap.yaml  4
-rw-r--r--  qa/suites/rados/thrash-erasure-code/thrashers/morepggrow.yaml  6
-rw-r--r--  qa/suites/rados/thrash-erasure-code/thrashers/pggrow.yaml  6
l---------  qa/suites/rados/thrash-erasure-code/z-require-luminous  1
-rw-r--r--  qa/suites/rados/thrash/ceph.yaml  3
-rw-r--r--  qa/suites/rados/thrash/d-require-luminous/at-end.yaml  23
-rw-r--r--  qa/suites/rados/thrash/d-require-luminous/at-mkfs.yaml  0
-rw-r--r--  qa/suites/rados/thrash/thrashers/default.yaml  6
-rw-r--r--  qa/suites/rados/thrash/thrashers/mapgap.yaml  10
-rw-r--r--  qa/suites/rados/thrash/thrashers/morepggrow.yaml  6
-rw-r--r--  qa/suites/rados/thrash/thrashers/none.yaml  3
-rw-r--r--  qa/suites/rados/thrash/thrashers/pggrow.yaml  6
-rw-r--r--  qa/suites/rados/thrash/workloads/small-objects.yaml  3
-rw-r--r--  qa/suites/rados/thrash/z-require-luminous/at-end.yaml  14
-rw-r--r--  qa/suites/rados/verify/ceph.yaml  3
l---------  qa/suites/rados/verify/d-require-luminous  1
-rw-r--r--  qa/suites/rados/verify/d-thrash/default.yaml (renamed from qa/suites/rados/verify/1thrash/default.yaml)  6
-rw-r--r--  qa/suites/rados/verify/d-thrash/none.yaml  0
-rw-r--r--  qa/suites/rados/verify/tasks/rados_cls_all.yaml  8
l---------  qa/suites/rados/verify/z-require-luminous  1
l---------  qa/suites/smoke/1node/distros/ubuntu_latest.yaml  1
l---------  qa/suites/smoke/1node/filestore-xfs.yaml  1
l---------  qa/suites/smoke/1node/objectstore/filestore-xfs.yaml  1
l---------  qa/suites/smoke/basic/bluestore.yaml  1
l---------  qa/suites/smoke/basic/objectstore  1
l---------  qa/suites/smoke/basic/objectstore/bluestore.yaml  1
l---------  qa/suites/smoke/systemd/objectstore/filestore-xfs.yaml  1
l---------  qa/suites/upgrade/hammer-jewel-x/stress-split/4-mon  1
l---------  qa/suites/upgrade/hammer-jewel-x/stress-split/4-workload  1
l---------  qa/suites/upgrade/hammer-jewel-x/stress-split/5-finish-upgrade.yaml  1
l---------  qa/suites/upgrade/hammer-jewel-x/stress-split/5-workload  1
l---------  qa/suites/upgrade/hammer-jewel-x/stress-split/6-luminous.yaml (renamed from qa/suites/upgrade/kraken-x/stress-split/6-luminous.yaml)  0
l---------  qa/suites/upgrade/hammer-jewel-x/stress-split/6-next-mon  1
l---------  qa/suites/upgrade/hammer-jewel-x/stress-split/7-final-workload  1
l---------  qa/suites/upgrade/hammer-jewel-x/stress-split/7-workload  1
l---------  qa/suites/upgrade/hammer-jewel-x/stress-split/8-next-mon  1
l---------  qa/suites/upgrade/hammer-jewel-x/stress-split/9-workload  1
-rw-r--r--  qa/suites/upgrade/jewel-x/parallel/1.5-final-scrub.yaml  11
-rw-r--r--  qa/suites/upgrade/jewel-x/parallel/2-workload/blogbench.yaml  16
-rw-r--r--  qa/suites/upgrade/jewel-x/parallel/2-workload/cache-pool-snaps.yaml  41
-rw-r--r--  qa/suites/upgrade/jewel-x/parallel/2-workload/ec-rados-default.yaml  36
l---------  qa/suites/upgrade/jewel-x/parallel/kraken.yaml  1
-rw-r--r--  qa/suites/upgrade/jewel-x/point-to-point-x/point-to-point-upgrade.yaml  1
l---------  qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.5-final-scrub.yaml  1
l---------  qa/suites/upgrade/jewel-x/stress-split/1.5-final-scrub.yaml  1
-rw-r--r--  qa/suites/upgrade/kraken-x/parallel/0-cluster/start.yaml  1
-rw-r--r--  qa/suites/upgrade/kraken-x/parallel/3-upgrade-sequence/upgrade-all.yaml  2
-rw-r--r--  qa/suites/upgrade/kraken-x/parallel/3-upgrade-sequence/upgrade-mon-osd-mds.yaml  2
l---------  qa/suites/upgrade/kraken-x/parallel/4-luminous-with-mgr.yaml  1
l---------  qa/suites/upgrade/kraken-x/parallel/4-luminous.yaml  1
l---------  qa/suites/upgrade/kraken-x/parallel/objectstore  2
l---------  qa/suites/upgrade/kraken-x/stress-split-erasure-code/6-luminous-with-mgr.yaml  1
l---------  qa/suites/upgrade/kraken-x/stress-split-erasure-code/6-luminous.yaml  1
l---------  qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore  2
-rw-r--r--  qa/suites/upgrade/kraken-x/stress-split/0-cluster/start.yaml  1
-rw-r--r--  qa/suites/upgrade/kraken-x/stress-split/2-partial-upgrade/firsthalf.yaml  2
-rw-r--r--  qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml  1
-rw-r--r--  qa/suites/upgrade/kraken-x/stress-split/5-finish-upgrade.yaml  1
l---------  qa/suites/upgrade/kraken-x/stress-split/6-luminous-with-mgr.yaml  1
l---------  qa/suites/upgrade/kraken-x/stress-split/objectstore  1
l---------  qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml  1
l---------  qa/suites/upgrade/kraken-x/stress-split/objectstore/filestore-xfs.yaml  1
-rw-r--r--  qa/tasks/ceph_deploy.py  6
-rw-r--r--  qa/tasks/ceph_manager.py  9
-rw-r--r--  qa/tasks/cephfs/filesystem.py  17
-rw-r--r--  qa/tasks/cephfs/mount.py  30
-rw-r--r--  qa/tasks/cephfs/test_data_scan.py  19
-rw-r--r--  qa/tasks/cephfs/test_exports.py  103
-rw-r--r--  qa/tasks/cephfs/test_failover.py  41
-rw-r--r--  qa/tasks/cephfs/test_full.py  3
-rw-r--r--  qa/tasks/cephfs/test_journal_repair.py  47
-rw-r--r--  qa/tasks/rados.py  6
-rw-r--r--  qa/tasks/radosgw_admin.py  8
-rw-r--r--  qa/tasks/s3tests.py  4
-rwxr-xr-x  qa/workunits/cephtool/test.sh  7
-rwxr-xr-x  qa/workunits/cls/test_cls_sdk.sh  5
-rwxr-xr-x  qa/workunits/mon/test_mon_config_key.py  73
-rwxr-xr-x  qa/workunits/rados/test_health_warnings.sh  36
-rwxr-xr-x  qa/workunits/rbd/cli_generic.sh  12
-rwxr-xr-x  qa/workunits/rbd/krbd_exclusive_option.sh  165
-rwxr-xr-x  qa/workunits/rbd/krbd_stable_pages_required.sh  17
-rwxr-xr-x  qa/workunits/rbd/rbd_mirror.sh  19
-rwxr-xr-x  qa/workunits/rbd/run_devstack_tempest.sh  14
-rwxr-xr-x  qa/workunits/rgw/run-s3tests.sh  74
-rwxr-xr-x  qa/workunits/suites/wac.sh  12
m---------  src/Beast  0
-rw-r--r--  src/CMakeLists.txt  57
-rw-r--r--  src/arch/intel.c  25
-rw-r--r--  src/auth/AuthServiceHandler.h  7
-rw-r--r--  src/auth/cephx/CephxProtocol.cc  2
-rw-r--r--  src/auth/cephx/CephxProtocol.h  2
-rw-r--r--  src/auth/cephx/CephxSessionHandler.cc  6
m---------  src/blkin  0
-rwxr-xr-x  src/ceph-create-keys  14
-rw-r--r--  src/ceph-detect-init/ceph_detect_init/__init__.py  18
-rw-r--r--  src/ceph-detect-init/ceph_detect_init/debian/__init__.py  22
-rw-r--r--  src/ceph-detect-init/tests/test_all.py  77
-rwxr-xr-x  src/ceph-disk/ceph_disk/main.py  145
-rw-r--r--  src/ceph-disk/tests/test_main.py  3
-rwxr-xr-x  src/ceph.in  201
-rw-r--r--  src/ceph_fuse.cc  4
-rw-r--r--  src/ceph_syn.cc  2
-rw-r--r--  src/ceph_ver.c  1
-rw-r--r--  src/client/Client.cc  178
-rw-r--r--  src/client/Client.h  42
-rw-r--r--  src/client/ObjecterWriteback.h  5
-rw-r--r--  src/client/SyntheticClient.cc  4
-rw-r--r--  src/client/UserPerm.h  6
-rw-r--r--  src/cls/CMakeLists.txt  5
-rw-r--r--  src/cls/journal/cls_journal_types.h  1
-rw-r--r--  src/cls/log/cls_log.cc  2
-rw-r--r--  src/cls/lua/cls_lua_client.cc  5
-rw-r--r--  src/cls/lua/cls_lua_client.h  8
-rw-r--r--  src/cls/lua/cls_lua_ops.h  4
-rw-r--r--  src/cls/rbd/cls_rbd.cc  12
-rw-r--r--  src/cls/rbd/cls_rbd.h  5
-rw-r--r--  src/cls/rgw/cls_rgw.cc  22
-rw-r--r--  src/cls/rgw/cls_rgw_types.cc  2
-rw-r--r--  src/cls/rgw/cls_rgw_types.h  8
-rw-r--r--  src/cls/sdk/cls_sdk.cc  131
-rw-r--r--  src/cls/statelog/cls_statelog.cc  11
-rw-r--r--  src/cls/statelog/cls_statelog_client.cc  1
-rw-r--r--  src/common/BackTrace.cc  1
-rw-r--r--  src/common/ConfUtils.cc  3
-rw-r--r--  src/common/DecayCounter.h  22
-rw-r--r--  src/common/Finisher.cc  22
-rw-r--r--  src/common/Finisher.h  7
-rw-r--r--  src/common/Formatter.cc  4
-rw-r--r--  src/common/Formatter.h  4
-rw-r--r--  src/common/Graylog.cc  3
-rw-r--r--  src/common/HTMLFormatter.cc  9
-rw-r--r--  src/common/HTMLFormatter.h  8
-rw-r--r--  src/common/LogClient.cc  4
-rw-r--r--  src/common/LogEntry.cc  2
-rw-r--r--  src/common/PrebufferedStreambuf.h  1
-rw-r--r--  src/common/RefCountedObj.h  5
-rw-r--r--  src/common/SloppyCRCMap.cc  3
-rw-r--r--  src/common/SloppyCRCMap.h  5
-rw-r--r--  src/common/SubProcess.h  10
-rw-r--r--  src/common/TextTable.h  2
-rw-r--r--  src/common/Thread.cc  10
-rw-r--r--  src/common/Throttle.cc  5
-rw-r--r--  src/common/Throttle.h  7
-rw-r--r--  src/common/Timer.cc  8
-rw-r--r--  src/common/Timer.h  3
-rw-r--r--  src/common/TracepointProvider.h  5
-rw-r--r--  src/common/TrackedOp.cc  7
-rw-r--r--  src/common/TrackedOp.h  12
-rw-r--r--  src/common/WorkQueue.cc  10
-rw-r--r--  src/common/WorkQueue.h  3
-rw-r--r--  src/common/addr_parsing.c  1
-rw-r--r--  src/common/backport14.h  64
-rw-r--r--  src/common/bit_vector.hpp  5
-rw-r--r--  src/common/blkdev.cc  1
-rw-r--r--  src/common/bloom_filter.cc  11
-rw-r--r--  src/common/bloom_filter.hpp  1
-rw-r--r--  src/common/buffer.cc  43
-rw-r--r--  src/common/ceph_context.cc  36
-rw-r--r--  src/common/ceph_crypto.cc  8
-rw-r--r--  src/common/ceph_crypto.h  8
-rw-r--r--  src/common/ceph_fs.cc  40
-rw-r--r--  src/common/cmdparse.cc  2
-rw-r--r--  src/common/cmdparse.h  8
-rw-r--r--  src/common/common_init.cc  2
-rw-r--r--  src/common/config.cc  34
-rw-r--r--  src/common/config.h  11
-rw-r--r--  src/common/config_opts.h  52
-rw-r--r--  src/common/crc32c_intel_baseline.c  2
-rw-r--r--  src/common/crc32c_intel_fast_asm.s (renamed from src/common/crc32c_intel_fast_asm.S)  0
-rw-r--r--  src/common/crc32c_intel_fast_zero_asm.s (renamed from src/common/crc32c_intel_fast_zero_asm.S)  0
-rw-r--r--  src/common/crc32c_ppc_asm.s (renamed from src/common/crc32c_ppc_asm.S)  0
-rw-r--r--  src/common/dout.h  4
-rw-r--r--  src/common/entity_name.h  2
-rw-r--r--  src/common/hobject.cc  4
-rw-r--r--  src/common/io_priority.cc  2
-rw-r--r--  src/common/ipaddr.cc  7
-rw-r--r--  src/common/iso_8601.cc  209
-rw-r--r--  src/common/iso_8601.h  44
-rw-r--r--  src/common/mime.c  1
-rw-r--r--  src/common/module.c  2
-rw-r--r--  src/common/obj_bencher.cc  4
-rw-r--r--  src/common/perf_counters.cc  120
-rw-r--r--  src/common/perf_counters.h  65
-rw-r--r--  src/common/perf_histogram.h  1
-rw-r--r--  src/common/pick_address.cc  1
-rw-r--r--  src/common/pipe.c  2
-rw-r--r--  src/common/run_cmd.cc  1
-rw-r--r--  src/common/sctp_crc32.c  2
-rw-r--r--  src/common/secret.c  2
-rw-r--r--  src/common/str_map.cc  2
-rw-r--r--  src/common/strtol.cc  3
-rw-r--r--  src/common/strtol.h  1
-rw-r--r--  src/common/tracked_int_ptr.hpp  6
-rw-r--r--  src/common/utf8.c  1
-rw-r--r--  src/common/util.cc  2
-rw-r--r--  src/common/version.cc  2
-rw-r--r--  src/common/xattr.c  1
-rw-r--r--  src/common/zipkin_trace.h  91
-rw-r--r--  src/compressor/zlib/CompressionPluginZlib.h  7
-rw-r--r--  src/compressor/zstd/CMakeLists.txt  4
-rw-r--r--  src/crush/CrushCompiler.cc  2
-rw-r--r--  src/crush/CrushCompiler.h  2
-rw-r--r--  src/crush/CrushWrapper.cc  214
-rw-r--r--  src/crush/CrushWrapper.h  42
-rw-r--r--  src/crush/builder.c  6
-rw-r--r--  src/crush/builder.h  73
-rw-r--r--  src/crush/crush.h  18
-rw-r--r--  src/crush/mapper.c  7
-rw-r--r--  src/dmclock/.gitignore  4
-rw-r--r--  src/dmclock/CMakeLists.txt  32
-rw-r--r--  src/dmclock/README.md  45
-rw-r--r--  src/dmclock/benchmark/README.md  42
-rw-r--r--  src/dmclock/benchmark/configs/dmc_sim_100_100.conf  31
-rw-r--r--  src/dmclock/benchmark/configs/dmc_sim_8_6.conf  43
-rwxr-xr-x  src/dmclock/benchmark/data_gen.sh  73
-rwxr-xr-x  src/dmclock/benchmark/data_parser.py  191
-rwxr-xr-x  src/dmclock/benchmark/plot_gen.sh  60
-rwxr-xr-x  src/dmclock/benchmark/run.sh  24
-rw-r--r--  src/dmclock/cmake/modules/Findboost.cmake  15
-rw-r--r--  src/dmclock/cmake/modules/Findgtest.cmake  48
-rw-r--r--  src/dmclock/dmclock-config.cmake.in  17
-rw-r--r--  src/dmclock/dmclock-targets.cmake  1
-rw-r--r--  src/dmclock/sim/CMakeLists.txt  1
-rw-r--r--  src/dmclock/sim/dmc_sim_100th.conf  32
-rw-r--r--  src/dmclock/sim/dmc_sim_example.conf  43
-rw-r--r--  src/dmclock/sim/src/CMakeLists.txt  42
-rw-r--r--  src/dmclock/sim/src/ConfUtils.cc  574
-rw-r--r--  src/dmclock/sim/src/ConfUtils.h  83
-rw-r--r--  src/dmclock/sim/src/config.cc  171
-rw-r--r--  src/dmclock/sim/src/config.h  138
-rw-r--r--  src/dmclock/sim/src/sim_client.h  329
-rw-r--r--  src/dmclock/sim/src/sim_recs.h  122
-rw-r--r--  src/dmclock/sim/src/sim_server.h  225
-rw-r--r--  src/dmclock/sim/src/simulate.h  430
-rw-r--r--  src/dmclock/sim/src/ssched/ssched_client.h  44
-rw-r--r--  src/dmclock/sim/src/ssched/ssched_recs.h  37
-rw-r--r--  src/dmclock/sim/src/ssched/ssched_server.h  182
-rw-r--r--  src/dmclock/sim/src/str_list.cc  106
-rw-r--r--  src/dmclock/sim/src/str_list.h  94
-rw-r--r--  src/dmclock/sim/src/test_dmclock.cc  40
-rw-r--r--  src/dmclock/sim/src/test_dmclock.h  56
-rw-r--r--  src/dmclock/sim/src/test_dmclock_main.cc  322
-rw-r--r--  src/dmclock/sim/src/test_ssched.cc  33
-rw-r--r--  src/dmclock/sim/src/test_ssched.h  57
-rw-r--r--  src/dmclock/sim/src/test_ssched_main.cc  187
-rw-r--r--  src/dmclock/src/CMakeLists.txt  19
-rw-r--r--  src/dmclock/src/dmclock_client.h  194
-rw-r--r--  src/dmclock/src/dmclock_recs.h  61
-rw-r--r--  src/dmclock/src/dmclock_server.h  1588
-rw-r--r--  src/dmclock/src/dmclock_util.cc  27
-rw-r--r--  src/dmclock/src/dmclock_util.h  45
-rw-r--r--  src/dmclock/support/CMakeLists.txt  1
-rw-r--r--  src/dmclock/support/src/debug.h  17
-rw-r--r--  src/dmclock/support/src/heap.h  240
-rw-r--r--  src/dmclock/support/src/indirect_intrusive_heap.h  549
-rw-r--r--  src/dmclock/support/src/intrusive_heap.h  214
-rw-r--r--  src/dmclock/support/src/profile.h  114
-rw-r--r--  src/dmclock/support/src/run_every.cc  73
-rw-r--r--  src/dmclock/support/src/run_every.h  68
-rw-r--r--  src/dmclock/support/test/CMakeLists.txt  29
-rw-r--r--  src/dmclock/support/test/test_ind_intru_heap.cc  82
-rw-r--r--  src/dmclock/support/test/test_indirect_intrusive_heap.cc  930
-rw-r--r--  src/dmclock/support/test/test_intrusive_heap.cc  86
-rw-r--r--  src/dmclock/test/CMakeLists.txt  35
-rw-r--r--  src/dmclock/test/test_dmclock_client.cc  219
-rw-r--r--  src/dmclock/test/test_dmclock_server.cc  826
-rw-r--r--  src/dmclock/test/test_test_client.cc  123
-rw-r--r--  src/include/CMakeLists.txt  1
-rw-r--r--  src/include/buffer.h  22
-rw-r--r--  src/include/byteorder.h  148
-rwxr-xr-x  src/include/ceph_features.h  21
-rw-r--r--  src/include/ceph_fs.h  14
-rw-r--r--  src/include/config-h.in.cmake  21
-rw-r--r--  src/include/coredumpctl.h  39
-rw-r--r--  src/include/crc32c.h  3
-rw-r--r--  src/include/denc.h  52
-rw-r--r--  src/include/encoding.h  83
-rw-r--r--  src/include/fs_types.h  1
-rw-r--r--  src/include/int_types.h  55
-rw-r--r--  src/include/intarith.h  66
-rw-r--r--  src/include/ipaddr.h  4
-rw-r--r--  src/include/mempool.h  5
-rw-r--r--  src/include/msgr.h  2
-rw-r--r--  src/include/rados.h  2
-rw-r--r--  src/include/rados/librados.h  78
-rw-r--r--  src/include/rados/librados.hpp  23
-rw-r--r--  src/include/rados/objclass.h  181
-rw-r--r--  src/include/rbd/librbd.h  17
-rw-r--r--  src/include/rbd/librbd.hpp  13
-rw-r--r--  src/include/small_encoding.h  245
-rw-r--r--  src/include/str_list.h  1
-rw-r--r--  src/include/utime.h  8
-rwxr-xr-x  src/init-ceph.in  2
-rw-r--r--  src/krbd.cc  6
-rw-r--r--  src/kv/KeyValueDB.cc  15
-rw-r--r--  src/kv/RocksDBStore.cc  5
-rw-r--r--  src/libcephfs.cc  11
-rw-r--r--  src/librados/IoCtxImpl.cc  111
-rw-r--r--  src/librados/IoCtxImpl.h  19
-rw-r--r--  src/librados/RadosXattrIter.h  2
-rw-r--r--  src/librados/librados.cc  144
-rw-r--r--  src/libradosstriper/RadosStriperImpl.cc  19
-rw-r--r--  src/librbd/AsyncObjectThrottle.cc  2
-rw-r--r--  src/librbd/AsyncRequest.cc  2
-rw-r--r--  src/librbd/ExclusiveLock.cc  10
-rw-r--r--  src/librbd/ExclusiveLock.h  1
-rw-r--r--  src/librbd/ImageCtx.cc  13
-rw-r--r--  src/librbd/ImageCtx.h  7
-rw-r--r--  src/librbd/Journal.cc  34
-rw-r--r--  src/librbd/Journal.h  11
-rw-r--r--  src/librbd/LibrbdWriteback.cc  50
-rw-r--r--  src/librbd/LibrbdWriteback.h  4
-rw-r--r--  src/librbd/ManagedLock.cc  15
-rw-r--r--  src/librbd/ManagedLock.h  1
-rw-r--r--  src/librbd/ObjectMap.cc  5
-rw-r--r--  src/librbd/ObjectMap.h  23
-rw-r--r--  src/librbd/Utils.h  24
-rw-r--r--  src/librbd/Watcher.cc  3
-rw-r--r--  src/librbd/api/Group.cc  8
-rw-r--r--  src/librbd/api/Mirror.cc  8
-rw-r--r--  src/librbd/cache/ImageWriteback.cc  13
-rw-r--r--  src/librbd/image/OpenRequest.cc  26
-rw-r--r--  src/librbd/internal.cc  105
-rw-r--r--  src/librbd/internal.h  5
-rw-r--r--  src/librbd/io/AioCompletion.cc  12
-rw-r--r--  src/librbd/io/CopyupRequest.cc  74
-rw-r--r--  src/librbd/io/CopyupRequest.h  12
-rw-r--r--  src/librbd/io/ImageRequest.cc  94
-rw-r--r--  src/librbd/io/ImageRequest.h  64
-rw-r--r--  src/librbd/io/ImageRequestWQ.cc  141
-rw-r--r--  src/librbd/io/ImageRequestWQ.h  13
-rw-r--r--  src/librbd/io/ObjectRequest.cc  171
-rw-r--r--  src/librbd/io/ObjectRequest.h  120
-rw-r--r--  src/librbd/io/ReadResult.cc  7
-rw-r--r--  src/librbd/journal/DemoteRequest.cc  2
-rw-r--r--  src/librbd/journal/PromoteRequest.cc  87
-rw-r--r--  src/librbd/journal/PromoteRequest.h  35
-rw-r--r--  src/librbd/journal/Replay.cc  23
-rw-r--r--  src/librbd/journal/Replay.h  2
-rw-r--r--  src/librbd/journal/Types.cc  16
-rw-r--r--  src/librbd/journal/Types.h  9
-rw-r--r--  src/librbd/librbd.cc  121
-rw-r--r--  src/librbd/object_map/UpdateRequest.cc  5
-rw-r--r--  src/librbd/object_map/UpdateRequest.h  15
-rw-r--r--  src/librbd/operation/FlattenRequest.cc  2
-rw-r--r--  src/librbd/operation/Request.cc  2
-rw-r--r--  src/librbd/operation/TrimRequest.cc  14
-rw-r--r--  src/log/test.cc  6
-rw-r--r--  src/mds/CDir.cc  28
-rw-r--r--  src/mds/CDir.h  2
-rw-r--r--  src/mds/CInode.cc  114
-rw-r--r--  src/mds/CInode.h  9
-rw-r--r--  src/mds/Locker.cc  57
-rw-r--r--  src/mds/Locker.h  4
-rw-r--r--  src/mds/MDBalancer.cc  218
-rw-r--r--  src/mds/MDBalancer.h  160
-rw-r--r--  src/mds/MDCache.cc  78
-rw-r--r--  src/mds/MDCache.h  7
-rw-r--r--  src/mds/MDLog.cc  2
-rw-r--r--  src/mds/MDSDaemon.h  2
-rw-r--r--  src/mds/MDSRank.cc  118
-rw-r--r--  src/mds/MDSRank.h  14
-rw-r--r--  src/mds/Mantle.cc  147
-rw-r--r--  src/mds/Mantle.h  18
-rw-r--r--  src/mds/Migrator.cc  130
-rw-r--r--  src/mds/Migrator.h  191
-rw-r--r--  src/mds/Server.cc  189
-rw-r--r--  src/mds/Server.h  28
-rw-r--r--  src/mds/balancers/greedyspill.lua  47
-rw-r--r--  src/mds/journal.cc  27
-rw-r--r--  src/mds/mdstypes.cc  13
-rw-r--r--  src/mds/mdstypes.h  5
-rw-r--r--  src/messages/MBackfillReserve.h  13
-rw-r--r--  src/messages/MCommand.h  2
-rw-r--r--  src/messages/MForward.h  18
-rw-r--r--  src/messages/MMDSBeacon.h  20
-rw-r--r--  src/messages/MMDSLoadTargets.h  1
-rw-r--r--  src/messages/MMonElection.h  21
-rw-r--r--  src/messages/MMonPaxos.h  3
-rw-r--r--  src/messages/MMonScrub.h  8
-rw-r--r--  src/messages/MOSDBoot.h  19
-rw-r--r--  src/messages/MOSDECSubOpRead.h  17
-rw-r--r--  src/messages/MOSDECSubOpReadReply.h  17
-rw-r--r--  src/messages/MOSDECSubOpWrite.h  17
-rw-r--r--  src/messages/MOSDECSubOpWriteReply.h  17
-rw-r--r--  src/messages/MOSDFailure.h  15
-rw-r--r--  src/messages/MOSDFastDispatchOp.h  3
-rw-r--r--  src/messages/MOSDMarkMeDown.h  4
-rwxr-xr-x  src/messages/MOSDOp.h  5
-rw-r--r--  src/messages/MOSDOpReply.h  6
-rw-r--r--  src/messages/MOSDPGBackfill.h  23
-rw-r--r--  src/messages/MOSDPGCreate.h  31
-rw-r--r--  src/messages/MOSDPGInfo.h  157
-rw-r--r--  src/messages/MOSDPGLog.h  18
-rw-r--r--  src/messages/MOSDPGNotify.h  174
-rw-r--r--  src/messages/MOSDPGPull.h  22
-rw-r--r--  src/messages/MOSDPGPush.h  23
-rw-r--r--  src/messages/MOSDPGPushReply.h  23
-rw-r--r--  src/messages/MOSDPGQuery.h  70
-rw-r--r--  src/messages/MOSDPGRemove.h  55
-rw-r--r--  src/messages/MOSDPGScan.h  16
-rw-r--r--  src/messages/MOSDPGTrim.h  7
-rw-r--r--  src/messages/MOSDPGUpdateLogMissing.h  17
-rw-r--r--  src/messages/MOSDPGUpdateLogMissingReply.h  16
-rw-r--r--  src/messages/MOSDPing.h  5
-rw-r--r--  src/messages/MOSDRepOp.h  26
-rw-r--r--  src/messages/MOSDRepOpReply.h  28
-rw-r--r--  src/messages/MOSDRepScrub.h  48
-rw-r--r--  src/messages/MOSDScrub.h  8
-rw-r--r--  src/messages/MRecoveryReserve.h  7
-rw-r--r--  src/messages/MRoute.h  16
-rw-r--r--  src/mgr/DaemonServer.cc  281
-rw-r--r--  src/mgr/DaemonServer.h  2
-rw-r--r--  src/mgr/DaemonState.cc  34
-rw-r--r--  src/mgr/DaemonState.h  4
-rw-r--r--  src/mgr/Mgr.cc  73
-rw-r--r--  src/mgr/Mgr.h  4
-rw-r--r--  src/mgr/MgrClient.h  1
-rw-r--r--  src/mgr/MgrPyModule.cc  10
-rw-r--r--  src/mgr/MgrPyModule.h  2
-rw-r--r--  src/mgr/MgrStandby.cc  93
-rw-r--r--  src/mgr/MgrStandby.h  11
-rw-r--r--  src/mgr/PyModules.cc  58
-rw-r--r--  src/mgr/PyModules.h  9
-rw-r--r--  src/mgr/PyState.cc  68
-rw-r--r--  src/mon/ConfigKeyService.cc  21
-rw-r--r--  src/mon/ConfigKeyService.h  1
-rw-r--r--  src/mon/Elector.cc  19
-rw-r--r--  src/mon/Elector.h  3
-rw-r--r--  src/mon/MDSMonitor.cc  6
-rw-r--r--  src/mon/MgrMap.h  7
-rw-r--r--  src/mon/MgrMonitor.cc  38
-rw-r--r--  src/mon/MgrMonitor.h  7
-rw-r--r--  src/mon/MonClient.cc  3
-rw-r--r--  src/mon/MonClient.h  2
-rw-r--r--  src/mon/MonCommands.h  13
-rw-r--r--  src/mon/Monitor.cc  75
-rw-r--r--  src/mon/Monitor.h  60
-rw-r--r--  src/mon/MonitorDBStore.h  11
-rw-r--r--  src/mon/OSDMonitor.cc  426
-rw-r--r--  src/mon/OSDMonitor.h  18
-rw-r--r--  src/mon/PGMap.cc  50
-rw-r--r--  src/mon/PGMap.h  22
-rw-r--r--  src/mon/PGMonitor.cc  8
-rw-r--r--  src/mon/Paxos.cc  85
-rw-r--r--  src/mon/PaxosService.cc  17
-rw-r--r--  src/mon/PaxosService.h  2
-rw-r--r--  src/mon/QuorumService.h  19
-rw-r--r--  src/mount/mount.ceph.c  2
-rw-r--r--  src/msg/Message.cc  41
-rw-r--r--  src/msg/Message.h  12
-rw-r--r--  src/msg/Messenger.cc  23
-rw-r--r--  src/msg/Messenger.h  19
-rw-r--r--  src/msg/async/AsyncConnection.cc  7
-rw-r--r--  src/msg/async/net_handler.cc  120
-rw-r--r--  src/msg/async/rdma/RDMAConnectedSocketImpl.cc  5
-rw-r--r--  src/msg/async/rdma/RDMAStack.cc  5
-rw-r--r--  src/msg/async/rdma/RDMAStack.h  1
-rw-r--r--  src/msg/simple/Pipe.cc  50
-rw-r--r--  src/msg/simple/SimpleMessenger.cc  1
-rw-r--r--  src/msg/xio/XioConnection.cc  5
-rw-r--r--  src/msg/xio/XioMessenger.cc  5
-rw-r--r--  src/msg/xio/XioMsg.cc  1
-rw-r--r--  src/msg/xio/XioPortal.h  2
-rw-r--r--  src/msg/xio/XioSubmit.h  1
-rw-r--r--  src/objclass/objclass.h  54
-rw-r--r--  src/os/CMakeLists.txt  3
-rw-r--r--  src/os/bluestore/BlockDevice.cc  8
-rw-r--r--  src/os/bluestore/BlockDevice.h  29
-rw-r--r--  src/os/bluestore/BlueFS.cc  79
-rw-r--r--  src/os/bluestore/BlueFS.h  3
-rw-r--r--  src/os/bluestore/BlueStore.cc  517
-rw-r--r--  src/os/bluestore/BlueStore.h  91
-rw-r--r--  src/os/bluestore/KernelDevice.cc  181
-rw-r--r--  src/os/bluestore/KernelDevice.h  20
-rw-r--r--  src/os/bluestore/NVMEDevice.cc  40
-rw-r--r--  src/os/bluestore/NVMEDevice.h  1
-rw-r--r--  src/os/bluestore/bluefs_types.cc  43
-rw-r--r--  src/os/bluestore/bluefs_types.h  27
-rw-r--r--  src/os/bluestore/bluestore_types.h  1
-rw-r--r--  src/os/filestore/FileJournal.cc  29
-rw-r--r--  src/os/filestore/FileJournal.h  11
-rw-r--r--  src/os/filestore/FileStore.cc  42
-rw-r--r--  src/os/filestore/FileStore.h  5
-rw-r--r--  src/os/filestore/Journal.h  1
-rw-r--r--  src/os/filestore/chain_xattr.cc  24
-rw-r--r--  src/os/filestore/chain_xattr.h  9
-rw-r--r--  src/os/fs/FS.cc  44
-rw-r--r--  src/os/fs/FS.h  75
-rw-r--r--  src/os/fs/aio.cc  51
-rw-r--r--  src/os/fs/aio.h  89
-rw-r--r--  src/os/kv.h  10
-rw-r--r--  src/osd/ECBackend.cc  115
-rw-r--r--  src/osd/ECBackend.h  22
-rw-r--r--  src/osd/ECTransaction.cc  7
-rw-r--r--  src/osd/OSD.cc  579
-rw-r--r--  src/osd/OSD.h  20
-rw-r--r--  src/osd/OSDMap.cc  177
-rw-r--r--  src/osd/OSDMap.h  187
-rw-r--r--  src/osd/OSDMapMapping.h  1
-rw-r--r--  src/osd/OpRequest.h  1
-rw-r--r--  src/osd/PG.cc  1083
-rw-r--r--  src/osd/PG.h  179
-rw-r--r--  src/osd/PGBackend.h  2
-rw-r--r--  src/osd/PGLog.h  3
-rw-r--r--  src/osd/PrimaryLogPG.cc  652
-rw-r--r--  src/osd/PrimaryLogPG.h  26
-rw-r--r--  src/osd/ReplicatedBackend.cc  61
-rw-r--r--  src/osd/SnapMapper.cc  3
-rw-r--r--  src/osd/osd_internal_types.h  11
-rw-r--r--  src/osd/osd_types.cc  1087
-rw-r--r--  src/osd/osd_types.h  568
-rw-r--r--  src/osdc/Filer.cc  105
-rw-r--r--  src/osdc/Filer.h  27
-rw-r--r--  src/osdc/ObjectCacher.cc  188
-rw-r--r--  src/osdc/ObjectCacher.h  32
-rw-r--r--  src/osdc/Objecter.cc  63
-rw-r--r--  src/osdc/Objecter.h  108
-rw-r--r--  src/osdc/WritebackHandler.h  7
-rwxr-xr-x  src/pybind/ceph_daemon.py  144
-rw-r--r--  src/pybind/cephfs/cephfs.pyx  65
-rw-r--r--  src/pybind/mgr/rest/module.py  2
-rw-r--r--  src/pybind/rados/rados.pyx  79
-rw-r--r--  src/pybind/rbd/rbd.pyx  151
-rw-r--r--  src/pybind/rgw/rgw.pyx  18
m---------  src/rapidjson  0
-rw-r--r--  src/rbd_replay/ActionTypes.cc  18
-rw-r--r--  src/rgw/CMakeLists.txt  38
-rw-r--r--  src/rgw/rgw_acl.h  2
-rw-r--r--  src/rgw/rgw_acl_s3.h  3
-rw-r--r--  src/rgw/rgw_acl_swift.cc  40
-rw-r--r--  src/rgw/rgw_acl_swift.h  4
-rw-r--r--  src/rgw/rgw_admin.cc  316
-rw-r--r--  src/rgw/rgw_asio_client.cc  78
-rw-r--r--  src/rgw/rgw_asio_client.h  113
-rw-r--r--  src/rgw/rgw_asio_frontend.cc  115
-rw-r--r--  src/rgw/rgw_auth.cc  57
-rw-r--r--  src/rgw/rgw_auth.h  9
-rw-r--r--  src/rgw/rgw_auth_filters.h  5
-rw-r--r--  src/rgw/rgw_basic_types.cc  28
-rw-r--r--  src/rgw/rgw_basic_types.h  82
-rw-r--r--  src/rgw/rgw_bucket.cc  21
-rw-r--r--  src/rgw/rgw_bucket.h  18
-rw-r--r--  src/rgw/rgw_cache.h  12
-rw-r--r--  src/rgw/rgw_civetweb_frontend.cc  18
-rw-r--r--  src/rgw/rgw_common.cc  325
-rw-r--r--  src/rgw/rgw_common.h  83
-rw-r--r--  src/rgw/rgw_coroutine.cc  17
-rw-r--r--  src/rgw/rgw_coroutine.h  11
-rw-r--r--  src/rgw/rgw_cors_s3.h  3
-rw-r--r--  src/rgw/rgw_cors_swift.h  2
-rw-r--r--  src/rgw/rgw_cr_rados.cc  72
-rw-r--r--  src/rgw/rgw_cr_rados.h  61
-rw-r--r--  src/rgw/rgw_crypt.cc  14
-rw-r--r--  src/rgw/rgw_crypt.h  19
-rw-r--r--  src/rgw/rgw_data_sync.cc  175
-rw-r--r--  src/rgw/rgw_file.cc  7
-rw-r--r--  src/rgw/rgw_file.h  20
-rw-r--r--  src/rgw/rgw_gc.cc  4
-rw-r--r--  src/rgw/rgw_gc.h  5
-rw-r--r--  src/rgw/rgw_http_client.cc  16
-rw-r--r--  src/rgw/rgw_http_client.h  9
-rw-r--r--  src/rgw/rgw_http_errors.h  139
-rw-r--r--  src/rgw/rgw_iam_policy.cc  1514
-rw-r--r--  src/rgw/rgw_iam_policy.h  466
-rw-r--r--  src/rgw/rgw_iam_policy_keywords.gperf  127
-rw-r--r--  src/rgw/rgw_iam_policy_keywords.h  139
-rw-r--r--  src/rgw/rgw_json_enc.cc  2
-rw-r--r--  src/rgw/rgw_keystone.cc  2
-rw-r--r--  src/rgw/rgw_keystone.h  6
-rw-r--r--  src/rgw/rgw_lc.cc  30
-rw-r--r--  src/rgw/rgw_lc.h  9
-rw-r--r--  src/rgw/rgw_lc_s3.h  6
-rw-r--r--  src/rgw/rgw_ldap.cc  1
-rw-r--r--  src/rgw/rgw_loadgen_process.cc  12
-rw-r--r--  src/rgw/rgw_log.cc  2
-rw-r--r--  src/rgw/rgw_log.h  1
-rw-r--r--  src/rgw/rgw_main.cc  19
-rw-r--r--  src/rgw/rgw_meta_sync_status.h  9
-rw-r--r--  src/rgw/rgw_metadata.cc  178
-rw-r--r--  src/rgw/rgw_metadata.h  32
-rw-r--r--  src/rgw/rgw_object_expirer_core.cc  4
-rw-r--r--  src/rgw/rgw_object_expirer_core.h  5
-rw-r--r--  src/rgw/rgw_op.cc  909
-rw-r--r--  src/rgw/rgw_op.h  109
-rw-r--r--  src/rgw/rgw_period_history.cc  9
-rw-r--r--  src/rgw/rgw_period_history.h  3
-rw-r--r--  src/rgw/rgw_period_pusher.cc  5
-rw-r--r--  src/rgw/rgw_process.h  4
-rw-r--r--  src/rgw/rgw_quota.cc  8
-rw-r--r--  src/rgw/rgw_quota.h  4
-rw-r--r--  src/rgw/rgw_rados.cc  185
-rw-r--r--  src/rgw/rgw_rados.h  47
-rw-r--r--  src/rgw/rgw_realm_reloader.cc  8
-rw-r--r--  src/rgw/rgw_realm_reloader.h  13
-rw-r--r--  src/rgw/rgw_replica_log.h  2
-rw-r--r--  src/rgw/rgw_request.h  7
-rw-r--r--  src/rgw/rgw_rest.cc  469
-rw-r--r--  src/rgw/rgw_rest.h  64
-rw-r--r--  src/rgw/rgw_rest_conn.cc  33
-rw-r--r--  src/rgw/rgw_rest_conn.h  7
-rw-r--r--  src/rgw/rgw_rest_log.cc  4
-rw-r--r--  src/rgw/rgw_rest_realm.cc  4
-rw-r--r--  src/rgw/rgw_rest_role.cc  24
-rw-r--r--  src/rgw/rgw_rest_role.h  2
-rw-r--r--  src/rgw/rgw_rest_s3.cc  423
-rw-r--r--  src/rgw/rgw_rest_s3.h  50
-rw-r--r--  src/rgw/rgw_rest_swift.cc  321
-rw-r--r--  src/rgw/rgw_rest_swift.h  91
-rw-r--r--  src/rgw/rgw_role.cc  81
-rw-r--r--  src/rgw/rgw_role.h  44
-rw-r--r--  src/rgw/rgw_sync.cc  924
-rw-r--r--  src/rgw/rgw_sync.h  27
-rw-r--r--  src/rgw/rgw_tools.cc  2
-rw-r--r--  src/rgw/rgw_torrent.h  1
-rw-r--r--  src/rgw/rgw_user.h  2
-rw-r--r--  src/rgw/rgw_xml.h  3
m---------  src/rocksdb  0
-rw-r--r--  src/test/CMakeLists.txt  1
-rw-r--r--  src/test/bufferlist.cc  44
-rw-r--r--  src/test/ceph_compatset.cc  8
-rw-r--r--  src/test/ceph_crypto.cc  5
-rw-r--r--  src/test/cli/crushtool/arg-order-checks.t  1
-rw-r--r--  src/test/cli/crushtool/build.t  1
-rw-r--r--  src/test/cli/osdmaptool/clobber.t  2
-rw-r--r--  src/test/cli/osdmaptool/create-print.t  8
-rw-r--r--  src/test/cli/osdmaptool/create-racks.t  76
-rw-r--r--  src/test/cli/osdmaptool/crush.t  2
-rw-r--r--  src/test/cli/rbd/help.t  3
-rw-r--r--  src/test/cls_refcount/test_cls_refcount.cc  1
-rw-r--r--  src/test/cls_sdk/CMakeLists.txt  17
-rw-r--r--  src/test/cls_sdk/test_cls_sdk.cc  35
-rw-r--r--  src/test/common/CMakeLists.txt  6
-rw-r--r--  src/test/common/test_blkdev.cc  4
-rw-r--r--  src/test/common/test_context.cc  7
-rw-r--r--  src/test/common/test_iso_8601.cc  60
-rw-r--r--  src/test/common/test_mutex.cc  4
-rw-r--r--  src/test/common/test_sloppy_crc_map.cc  4
-rw-r--r--  src/test/compressor/test_compression.cc  21
-rw-r--r--  src/test/crush/CrushWrapper.cc  134
-rw-r--r--  src/test/encoding.cc  71
-rwxr-xr-x  src/test/encoding/generate-corpus-objects.sh  58
-rw-r--r--  src/test/encoding/types.h  1
-rw-r--r--  src/test/fio/README.md  4
-rw-r--r--  src/test/fio/fio_ceph_objectstore.cc  4
-rw-r--r--  src/test/formatter.cc  1
-rw-r--r--  src/test/libcephd/CMakeLists.txt  2
-rw-r--r--  src/test/librados/TestCase.cc  45
-rw-r--r--  src/test/librados/TestCase.h  46
-rw-r--r--  src/test/librados/aio.cc  150
-rw-r--r--  src/test/librados/c_read_operations.cc  55
-rw-r--r--  src/test/librados/c_write_operations.cc  51
-rw-r--r--  src/test/librados/list.cc  38
-rw-r--r--  src/test/librados/misc.cc  30
-rw-r--r--  src/test/librados/pool.cc  3
-rw-r--r--  src/test/librados/test.cc  28
-rwxr-xr-x  src/test/librados/tier.cc  22
-rw-r--r--  src/test/librados/watch_notify.cc  4
-rw-r--r--  src/test/librados_test_stub/LibradosTestStub.cc  13
-rw-r--r--  src/test/librados_test_stub/MockTestMemIoCtxImpl.h  2
-rw-r--r--  src/test/librbd/fsx.cc  126
-rw-r--r--  src/test/librbd/image/test_mock_RemoveRequest.cc  4
-rw-r--r--  src/test/librbd/io/test_mock_ImageRequest.cc  46
-rw-r--r--  src/test/librbd/journal/test_mock_PromoteRequest.cc  125
-rw-r--r--  src/test/librbd/journal/test_mock_Replay.cc  11
-rw-r--r--  src/test/librbd/mock/MockImageCtx.h  12
-rw-r--r--  src/test/librbd/mock/MockJournal.h  16
-rw-r--r--  src/test/librbd/mock/MockObjectMap.h  12
-rw-r--r--  src/test/librbd/object_map/test_mock_UpdateRequest.cc  10
-rw-r--r--  src/test/librbd/operation/test_mock_ResizeRequest.cc  8
-rw-r--r--  src/test/librbd/operation/test_mock_SnapshotRollbackRequest.cc  5
-rw-r--r--  src/test/librbd/test_internal.cc  28
-rw-r--r--  src/test/librbd/test_librbd.cc  52
-rw-r--r--  src/test/librbd/test_mock_ExclusiveLock.cc  53
-rw-r--r--  src/test/librbd/test_mock_Journal.cc  4
-rw-r--r--  src/test/librbd/test_mock_ObjectMap.cc  13
-rwxr-xr-x  src/test/librbd/test_notify.py  6
-rw-r--r--  src/test/librbd/test_support.cc  2
-rw-r--r--  src/test/messenger/CMakeLists.txt  8
-rw-r--r--  src/test/messenger/message_helper.h  5
-rw-r--r--  src/test/mon/test_mon_workloadgen.cc  10
-rw-r--r--  src/test/msgr/test_async_driver.cc  1
-rw-r--r--  src/test/objectstore/Allocator_test.cc  16
-rw-r--r--  src/test/objectstore/chain_xattr.cc  2
-rw-r--r--  src/test/objectstore/store_test.cc  56
-rw-r--r--  src/test/objectstore/test_bluestore_types.cc  74
-rw-r--r--  src/test/osd/TestOSDMap.cc  12
-rw-r--r--  src/test/osd/TestPGLog.cc  2
-rwxr-xr-x  src/test/osd/osd-dup.sh  3
-rwxr-xr-x  src/test/osd/osd-scrub-snaps.sh  61
-rw-r--r--  src/test/osd/types.cc  407
-rw-r--r--  src/test/osdc/FakeWriteback.cc  8
-rw-r--r--  src/test/osdc/FakeWriteback.h  5
-rw-r--r--  src/test/osdc/MemWriteback.cc  8
-rw-r--r--  src/test/osdc/MemWriteback.h  5
-rw-r--r--  src/test/perf_counters.cc  3
-rwxr-xr-x  src/test/pybind/test_ceph_argparse.py  3
-rw-r--r--  src/test/pybind/test_rados.py  2
-rw-r--r--  src/test/pybind/test_rbd.py  20
-rw-r--r--  src/test/rbd_mirror/CMakeLists.txt  1
-rw-r--r--  src/test/rbd_mirror/image_replayer/test_mock_BootstrapRequest.cc  329
-rw-r--r--  src/test/rbd_mirror/image_replayer/test_mock_PrepareLocalImageRequest.cc  196
-rw-r--r--  src/test/rbd_mirror/image_sync/test_mock_ObjectCopyRequest.cc  8
-rw-r--r--  src/test/rbd_mirror/test_ImageDeleter.cc  60
-rw-r--r--  src/test/rbd_mirror/test_ImageReplayer.cc  10
-rw-r--r--  src/test/rbd_mirror/test_PoolWatcher.cc  4
-rw-r--r--  src/test/rbd_mirror/test_mock_ImageReplayer.cc  125
-rw-r--r--  src/test/rbd_mirror/test_mock_InstanceReplayer.cc  8
-rw-r--r--  src/test/rbd_mirror/test_mock_PoolWatcher.cc  13
-rw-r--r--  src/test/rgw/CMakeLists.txt  33
-rw-r--r--  src/test/rgw/rgw_multi/multisite.py  66
-rw-r--r--  src/test/rgw/rgw_multi/tests.py  122
-rw-r--r--  src/test/rgw/test_multi.py  11
-rw-r--r--  src/test/rgw/test_rgw_iam_policy.cc  507
-rw-r--r--  src/test/signals.cc  6
-rw-r--r--  src/test/simple_spin.cc  14
-rw-r--r--  src/test/test_ipaddr.cc  1
-rw-r--r--  src/test/test_texttable.cc  2
-rwxr-xr-x  src/test/vstart_wrapper.sh  2
-rw-r--r--  src/tools/RadosDump.h  12
-rw-r--r--  src/tools/ceph_kvstore_tool.cc  30
-rw-r--r--  src/tools/ceph_objectstore_tool.cc  98
-rw-r--r--  src/tools/ceph_objectstore_tool.h  11
-rw-r--r--  src/tools/osdmaptool.cc  5
-rw-r--r--  src/tools/rbd/action/Group.cc  3
-rw-r--r--  src/tools/rbd/action/Import.cc  14
-rw-r--r--  src/tools/rbd/action/Info.cc  20
-rw-r--r--  src/tools/rbd/action/Kernel.cc  10
-rw-r--r--  src/tools/rbd/action/List.cc  2
-rw-r--r--  src/tools/rbd/action/MirrorPool.cc  27
-rw-r--r--  src/tools/rbd/action/Trash.cc  3
-rw-r--r--  src/tools/rbd_mirror/CMakeLists.txt  1
-rw-r--r--  src/tools/rbd_mirror/ImageDeleter.cc  97
-rw-r--r--  src/tools/rbd_mirror/ImageDeleter.h  24
-rw-r--r--  src/tools/rbd_mirror/ImageReplayer.cc  67
-rw-r--r--  src/tools/rbd_mirror/ImageReplayer.h  27
-rw-r--r--  src/tools/rbd_mirror/InstanceReplayer.cc  31
-rw-r--r--  src/tools/rbd_mirror/Mirror.cc  16
-rw-r--r--  src/tools/rbd_mirror/Mirror.h  12
-rw-r--r--  src/tools/rbd_mirror/PoolReplayer.cc  187
-rw-r--r--  src/tools/rbd_mirror/PoolReplayer.h  55
-rw-r--r--  src/tools/rbd_mirror/PoolWatcher.cc  5
-rw-r--r--  src/tools/rbd_mirror/PoolWatcher.h  4
-rw-r--r--  src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc  53
-rw-r--r--  src/tools/rbd_mirror/image_replayer/BootstrapRequest.h  7
-rw-r--r--  src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc  160
-rw-r--r--  src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h  92
-rw-r--r--  src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.cc  8
-rw-r--r--  src/tools/rbd_mirror/image_sync/ObjectCopyRequest.cc  2
-rw-r--r--  src/tools/rbd_nbd/rbd-nbd.cc  25
-rw-r--r--  src/tools/rebuild_mondb.cc  2
-rw-r--r--  src/tracing/librados.tp  87
-rw-r--r--  src/tracing/librbd.tp  2
-rw-r--r--  src/tracing/osd.tp  22
-rwxr-xr-x  src/vstart.sh  6
-rw-r--r--  systemd/ceph-mgr@.service  2
878 files changed, 33523 insertions, 9447 deletions
diff --git a/.gitmodules b/.gitmodules
index aa04fde0446..85a78ace76b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -52,3 +52,9 @@
[submodule "src/crypto/isa-l/isa-l_crypto"]
path = src/crypto/isa-l/isa-l_crypto
url = https://github.com/01org/isa-l_crypto
+[submodule "src/blkin"]
+ path = src/blkin
+ url = https://github.com/ceph/blkin
+[submodule "src/rapidjson"]
+ path = src/rapidjson
+ url = https://github.com/ceph/rapidjson
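These two entries register new submodules; on an existing checkout their
directories stay empty until initialized. A minimal sketch (paths follow the
.gitmodules entries above):

    git submodule update --init --recursive src/blkin src/rapidjson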
diff --git a/.mailmap b/.mailmap
index 671a3226d5e..32f4b9a3a3a 100644
--- a/.mailmap
+++ b/.mailmap
@@ -218,6 +218,7 @@ Kiseleva Alyona <akiselyova@mirantis.com>
Kongming Wu <wu.kongming@h3c.com>
Lan De <lan.de3@zte.com.cn>
Laszlo Boszormenyi <gcs@debian.hu>
+Leo Zhang <nguzcf@gmail.com>
Luo Kexue <luo.kexue@zte.com.cn>
Luo Runbing <runsisi@hust.edu.cn>
Luo Runbing <runsisi@zte.com.cn>
@@ -403,8 +404,7 @@ Warren Usui <warren.usui@inktank.com>
Warren Usui <wusui@redhat.com> <wusui@users.noreply.github.com>
Warren Usui <wusui@redhat.com> <wusui@magna002.ceph.redhat.com>
Warren Usui <wusui@redhat.com> <wusui@ubuntu.(none)>
-Weibing Zhang <zhangweibing@unitedstack.com>
-Weibing Zhang <zhangweibing@unitedstack.com> <atheism.zhang@gmail.com>
+Weibing Zhang <atheism.zhang@gmail.com>
Weijun Duan <duanweijun@h3c.com>
Wei Jin <jinwei15@wanda.cn>
Wei Jin <jinwei15@wanda.cn> <wjin.cn@gmail.com>
diff --git a/.organizationmap b/.organizationmap
index b3086dc66f1..25c35a17ce6 100644
--- a/.organizationmap
+++ b/.organizationmap
@@ -27,6 +27,19 @@ Acaleph <contact@acale.ph> Alistair Israel <aisrael@gmail.com>
Alcatel Lucent <contact@alcatel-lucent.com> Joseph McDonald <joseph.mcdonald@alcatel-lucent.com>
Alcatel Lucent <contact@alcatel-lucent.com> Ker Liu <ker.liu@alcatel-lucent.com>
Aliyun <contact@aliyun.com> Jeffrey Lu <lzhng2000@aliyun.com>
+Alibaba <contact@alibaba-inc.com> James Liu <james.liu@alibaba-inc.com>
+Alibaba <contact@alibaba-inc.com> Pan Liu <wanjun.lp@alibaba-inc.com>
+Alibaba <contact@alibaba-inc.com> Jin Cai <caijin.caij@alibaba-inc.com>
+Alibaba <contact@alibaba-inc.com> Ming Lin <ming.lin@alibaba-inc.com>
+Alibaba <contact@alibaba-inc.com> Sheng Qiu <sheng.qiu@alibaba-inc.com>
+Alibaba <contact@alibaba-inc.com> Jianjian Huo <j.huo@alibaba-inc.com>
+Alibaba <contact@alibaba-inc.com> Yanjiang Xu <yanjiang.xyj@alibaba-inc.com>
+Alibaba <contact@alibaba-inc.com> Jin Cai <caijin.caij@alibaba-inc.com>
+Alibaba <contact@alibaba-inc.com> Ke Lin <haoqi.lk@alibaba-inc.com>
+Alibaba <contact@alibaba-inc.com> Wentao Zhang <wentao.zwt@alibaba-inc.com>
+Alibaba <contact@alibaba-inc.com> Rongyao Chen <rongyao.cry@alibaba-inc.com>
+Alibaba <contact@alibaba-inc.com> Yi Zhang <boqian.zy@alibaba-inc.com>
+Alibaba <contact@alibaba-inc.com> Yongyou <yongyou.yl@alibaba-inc.com>
Anchor Hosting <contact@anchor.com.au> Christian Marie <pingu@anchor.net.au>
Anchor Hosting <contact@anchor.com.au> Sharif Olorin <sio@tesser.org>
ArtiBit <contact@artibit.com> Rutger ter Borg <rutger@terborg.net>
@@ -476,6 +489,7 @@ SUSE <contact@suse.com> Thorsten Behrens <tbehrens@suse.com>
SUSE <contact@suse.com> Tim Serong <tserong@suse.com>
SWITCH <contact@switch.ch> Jens-Christian Fischer <jens-christian.fischer@switch.ch>
SWITCH <contact@switch.ch> Simon Leinen <simon.leinen@switch.ch>
+T2Cloud <contact@t2cloud.net> Leo Zhang <nguzcf@gmail.com>
TCloud Computing <contact@tcloudcomputing.com> CC Lien <cc_lien@tcloudcomputing.com>
TCloud Computing <contact@tcloudcomputing.com> Henry C Chang <henry_c_chang@tcloudcomputing.com>
TCloud Computing <contact@tcloudcomputing.com> Herb Shiu <herb_shiu@tcloudcomputing.com>
@@ -495,6 +509,7 @@ The Linux Box <contact@linuxbox.com> Matt Benjamin <matt@linuxbox.com>
The University of Arizona <contact@arizona.edu> James Ryan Cresawn <jrcresawn@gmail.com>
Time Warner Cable Inc. <contact@twcable.com> Bryan Stillwell <bryan.stillwell@twcable.com>
Trendy Tech <contact@trendytech.com.cn> shiqi <m13913886148@gmail.com>
+Trendy Tech <contact@trendytech.com.cn> Lei Zhang <243290414@qq.com>
Ubuntu Kylin <contact@ubuntukylin.com> Min Chen <minchen@ubuntukylin.com>
UMCloud <contact@umcloud.com> Jiaying Ren <mikulely@gmail.com>
UMCloud <contact@umcloud.com> Rongze Zhu <zrzhit@gmail.com>
@@ -610,7 +625,6 @@ Unaffiliated <no@organization.net> koleosfuscus <koleosfuscus@yahoo.com>
Unaffiliated <no@organization.net> Kyr Shatskyy <kyrylo.shatskyy@gmail.com>
Unaffiliated <no@organization.net> Laurent Guerby <laurent@guerby.net>
Unaffiliated <no@organization.net> Lee Revell <rlrevell@gmail.com>
-Unaffiliated <no@organization.net> Leo Zhang <nguzcf@gmail.com>
Unaffiliated <no@organization.net> Lucas Fantinel <lucas.fantinel@gmail.com>
Unaffiliated <no@organization.net> Коренберг Марк <socketpair@gmail.com>
Unaffiliated <no@organization.net> Matt Richards <mattjrichards@gmail.com>
@@ -690,6 +704,7 @@ Unaffiliated <no@organization.net> YiQiang Chen <cyqsign@163.com>
Unaffiliated <no@organization.net> Yongyue Sun <abioy.sun@gmail.com>
Unaffiliated <no@organization.net> You Ji <jiyou09@gmail.com>
Unaffiliated <no@organization.net> Zhang Shaowen <zhang_shaowen@139.com>
+Unaffiliated <no@organization.net> Weibing Zhang <atheism.zhang@gmail.com>
Unaffiliated <no@organization.net> Zhao Junwang <zhjwpku@gmail.com>
Unaffiliated <no@organization.net> Zhao Kun <jswps2011@163.com>
Unaffiliated <no@organization.net> Zhe Zhang <zzxuanyuan@gmail.com>
@@ -705,7 +720,6 @@ UnitedStack <contact@unitedstack.com> Kun Huang <academicgareth@gmail.com>
UnitedStack <contact@unitedstack.com> Li Tianqing <tianqing@unitedstack.com>
UnitedStack <contact@unitedstack.com> Ning Yao <yaoning@unitedstack.com>
UnitedStack <contact@unitedstack.com> wangsongbo <wangsongbo@unitedstack.com>
-UnitedStack <contact@unitedstack.com> Weibing Zhang <zhangweibing@unitedstack.com>
UnitedStack <contact@unitedstack.com> Yankun Li <liyankun@unitedstack.com>
University of California, Santa Cruz <contact@cs.ucsc.edu> Adam Crume <adamcrume@gmail.com>
University of California, Santa Cruz <contact@cs.ucsc.edu> Andrew Leung <aleung@cs.ucsc.edu>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 039209fcea0..005c8567ef8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 2.8.11)
project(ceph)
-set(VERSION 12.0.1)
+set(VERSION 12.0.2)
if(POLICY CMP0046)
# Tweak policies (this one disables "missing" dependency warning)
@@ -76,11 +76,6 @@ if(FREEBSD)
list(APPEND CMAKE_REQUIRED_INCLUDES /usr/local/include)
endif(FREEBSD)
-#put all the libs and binaries in one place
-set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
-set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
-
#Check Includes
include(CheckIncludeFiles)
include(CheckIncludeFileCXX)
@@ -111,10 +106,8 @@ CHECK_FUNCTION_EXISTS(pthread_setname_np HAVE_PTHREAD_SETNAME_NP)
CHECK_FUNCTION_EXISTS(pthread_getname_np HAVE_PTHREAD_GETNAME_NP)
CHECK_FUNCTION_EXISTS(eventfd HAVE_EVENTFD)
-CHECK_INCLUDE_FILES("inttypes.h" HAVE_INTTYPES_H)
CHECK_INCLUDE_FILES("linux/types.h" HAVE_LINUX_TYPES_H)
CHECK_INCLUDE_FILES("linux/version.h" HAVE_LINUX_VERSION_H)
-CHECK_INCLUDE_FILES("stdint.h" HAVE_STDINT_H)
CHECK_INCLUDE_FILES("arpa/nameser_compat.h" HAVE_ARPA_NAMESER_COMPAT_H)
CHECK_INCLUDE_FILES("sys/mount.h" HAVE_SYS_MOUNT_H)
CHECK_INCLUDE_FILES("sys/param.h" HAVE_SYS_PARAM_H)
@@ -342,10 +335,16 @@ find_package(dpdk)
set(HAVE_DPDK ${DPDK_FOUND})
endif(WITH_DPDK)
+option(WITH_BLKIN "Use blkin to emit LTTng tracepoints for Zipkin" OFF)
+if(WITH_BLKIN)
+ set(BLKIN_LIBRARIES blkin lttng-ust)
+ include_directories(src/blkin/blkin-lib)
+endif(WITH_BLKIN)
+
#option for RGW
option(WITH_RADOSGW "Rados Gateway is enabled" ON)
option(WITH_RADOSGW_FCGI_FRONTEND "Rados Gateway's FCGI frontend is enabled" ON)
-option(WITH_RADOSGW_ASIO_FRONTEND "Rados Gateway's ASIO frontend is enabled" ON)
+option(WITH_RADOSGW_BEAST_FRONTEND "Rados Gateway's Beast frontend is enabled" ON)
if(WITH_RADOSGW)
find_package(EXPAT REQUIRED)
if(WITH_RADOSGW_FCGI_FRONTEND)
@@ -513,7 +512,9 @@ endif()
option(WITH_SYSTEM_BOOST "require and build with system Boost" OFF)
set(BOOST_COMPONENTS
- thread system regex random program_options date_time iostreams)
+ thread system regex random program_options date_time iostreams coroutine context)
+set(BOOST_HEADER_COMPONENTS container)
+
if(WITH_MGR)
list(APPEND BOOST_COMPONENTS python)
endif()
@@ -533,6 +534,7 @@ else()
set(BOOST_SOURCE_DIR "${PROJECT_SOURCE_DIR}/src/boost")
set(BOOST_PREFIX "${PROJECT_BINARY_DIR}/boost")
set(BOOST_BUILD "${PROJECT_BINARY_DIR}/boost-build")
+ list(APPEND BOOST_COMPONENTS ${BOOST_HEADER_COMPONENTS})
string(REPLACE ";" "," BOOST_WITH_LIBS "${BOOST_COMPONENTS}")
execute_process(COMMAND "./bootstrap.sh"
"--prefix=${BOOST_PREFIX}"
@@ -548,6 +550,24 @@ else()
endif()
list(APPEND b2
variant=release link=static threading=multi cxxflags=${BOOST_CFLAGS})
+ if(NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL CMAKE_SYSTEM_PROCESSOR)
+ # we are crosscompiling
+ if(CMAKE_CXX_COMPILER_ID STREQUAL GNU)
+ set(b2_cc gcc)
+ elseif(CMAKE_CXX_COMPILER_ID STREQUAL Clang)
+ set(b2_cc clang)
+ else()
+ message(SEND_ERROR "unknown compiler: ${CMAKE_CXX_COMPILER_ID}")
+ endif()
+    # edit the config.jam so b2 will be able to use the specified toolset
+ execute_process(
+ COMMAND
+ sed -i
+ "s|using ${b2_cc} ;|using ${b2_cc} : ${CMAKE_SYSTEM_PROCESSOR} : ${CMAKE_CXX_COMPILER} ;|"
+ ${PROJECT_SOURCE_DIR}/src/boost/project-config.jam)
+ # use ${CMAKE_SYSTEM_PROCESSOR} as the version identifier of compiler
+ list(APPEND b2 toolset=${b2_cc}-${CMAKE_SYSTEM_PROCESSOR})
+ endif()
# 2. install headers
execute_process(COMMAND
${b2}
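The new WITH_BLKIN option defaults to OFF and, per the snippet above, links
against blkin and lttng-ust, so the src/blkin submodule must be checked out
and LTTng-UST installed. A minimal sketch of a configure run that enables it
(the build directory name is illustrative):

    mkdir build && cd build
    cmake -DWITH_BLKIN=ON ..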
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 6386dd9770f..f4718eb179f 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -121,3 +121,15 @@
* The 'rados df' JSON output now prints numeric values as numbers instead of
strings.
+* There was a bug introduced in Jewel (#19119) that broke the mapping behavior
+ when an "out" OSD that still existed in the CRUSH map was removed with 'osd rm'.
+ This could result in 'misdirected op' and other errors. The bug is now fixed,
+ but the fix itself introduces the same risk because the behavior may vary between
+ clients and OSDs. To avoid problems, please ensure that all OSDs are removed
+ from the CRUSH map before deleting them. That is, be sure to do::
+
+ ceph osd crush rm osd.123
+
+ before::
+
+ ceph osd rm osd.123
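In practice the two commands above sit inside a longer decommissioning
sequence; one commonly documented ordering (osd.123 is illustrative) is:

    ceph osd out osd.123        # stop placing new data on the OSD
    ceph osd crush rm osd.123   # remove it from the CRUSH map first
    ceph auth del osd.123       # delete its cephx key
    ceph osd rm osd.123         # only then delete the OSD id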
diff --git a/README.git-subtree b/README.git-subtree
new file mode 100644
index 00000000000..7219bee8d31
--- /dev/null
+++ b/README.git-subtree
@@ -0,0 +1,48 @@
+Some libraries that Ceph uses are incorporated into the build tree
+through a technique known as git subtrees. This is an alternative to
+git submodules, which are also used in Ceph.
+
+One such library is the dmclock library. Here are some basic notes on
+the use of git subtrees.
+
+When a subtree is added to the repo, commands such as these are run
+from the top-level ceph directory:
+
+ git subtree add --prefix src/dmclock \
+ git@github.com:ceph/dmclock.git master --squash
+
+That essentially brings a full copy of the library into the
+subdirectory src/dmclock, but squashes all of its commits into a
+single one.
+
+If in time the library is updated and you'd like to bring the updates
+in, you could run:
+
+ git subtree pull --prefix src/dmclock \
+ git@github.com:ceph/dmclock.git master --squash
+
+WARNINGS
+
+1. A rebase should NEVER include the commits by which a subtree is
+added or pulled. Those commits do not include the prefix that was used
+for the subtree add/pull, and therefore the commits are applied to the
+wrong files or, more likely, to non-existent files. If something like
+this must be done, a workable approach is to a) do an interactive
+rebase, b) remove the commits for the former subtree add/pull, and
+either c) replace those commits with executions (x/exec) of the
+commands used to add/pull the subtrees, or d) do those commands from
+the command-line by using (e/edit) on preceding commits.
+
+2. If you'd like to modify the library contained in a subtree you'll
+need to choose whether to just change your subtree and maintain those
+differences into the future (until the library incorporates them) or,
+if you're able, modify the library and use a "git subtree pull ..." to
+bring them in.
+
+3. If you modify the library within the ceph tree, then it's best not
+to combine changes within the subtree and outside the subtree in a
+single commit. Each commit should either only contain changes within
+the subtree or outside the subtree. That gives you the option to
+cleanly push those changes back to the library's repo. That way if you
+ultimately decide to make the changes to the library, you can easily
+remove the subtree commits.
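To make warning 1 concrete, here is a sketch of the interactive-rebase
workaround it describes (the base commit is illustrative):

    git rebase -i <base>
    # In the todo list, delete the pick lines for the old subtree
    # add/pull commits and substitute an exec line, e.g.:
    #   exec git subtree pull --prefix src/dmclock \
    #        git@github.com:ceph/dmclock.git master --squash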
diff --git a/ceph-object-corpus b/ceph-object-corpus
-Subproject 71fb1a5dffccc7e66db88491d2bdc7d4d0d6eb0
+Subproject ccc273180b8074f13e375ed2ecf4a05bc32e596
diff --git a/ceph.spec.in b/ceph.spec.in
index 0b2f1a71927..0b02efabe35 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -97,13 +97,14 @@ BuildRequires: selinux-policy-devel
BuildRequires: /usr/share/selinux/devel/policyhelp
%endif
BuildRequires: bc
+BuildRequires: gperf
BuildRequires: cmake
BuildRequires: cryptsetup
BuildRequires: fuse-devel
BuildRequires: gcc-c++
BuildRequires: gdbm
%if 0%{with tcmalloc}
-BuildRequires: gperftools-devel
+BuildRequires: gperftools-devel >= 2.4
%endif
BuildRequires: jq
BuildRequires: leveldb-devel > 1.2
@@ -121,8 +122,8 @@ BuildRequires: pkgconfig
BuildRequires: python
BuildRequires: python-devel
BuildRequires: python-nose
+BuildRequires: python-prettytable
BuildRequires: python-requests
-BuildRequires: python-sphinx
BuildRequires: python-virtualenv
BuildRequires: snappy-devel
BuildRequires: udev
@@ -152,6 +153,7 @@ BuildRequires: libopenssl-devel
BuildRequires: lsb-release
BuildRequires: openldap2-devel
BuildRequires: python-Cython
+BuildRequires: python-Sphinx
%endif
%if 0%{?fedora} || 0%{?rhel}
Requires: systemd
@@ -163,6 +165,7 @@ BuildRequires: openldap-devel
BuildRequires: openssl-devel
BuildRequires: redhat-lsb-core
BuildRequires: Cython
+BuildRequires: python-sphinx
%endif
# python34-... for RHEL, python3-... for all other supported distros
%if 0%{?rhel}
@@ -256,6 +259,7 @@ Requires: python-rados = %{epoch}:%{version}-%{release}
Requires: python-rbd = %{epoch}:%{version}-%{release}
Requires: python-cephfs = %{epoch}:%{version}-%{release}
Requires: python-rgw = %{epoch}:%{version}-%{release}
+Requires: python-prettytable
Requires: python-requests
%{?systemd_requires}
%if 0%{?suse_version}
@@ -718,6 +722,15 @@ This package contains the Java libraries for the Ceph File System.
%endif
+%package -n rados-objclass-devel
+Summary: RADOS object class development kit
+Group: Development/Libraries
+License: LGPL-2.0
+Requires: librados2-devel = %{epoch}:%{version}-%{release}
+%description -n rados-objclass-devel
+This package contains libraries and headers needed to develop RADOS object
+class plugins.
+
%if 0%{with selinux}
%package selinux
@@ -892,6 +905,7 @@ mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/radosgw
mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/bootstrap-osd
mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/bootstrap-mds
mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/bootstrap-rgw
+mkdir -p %{buildroot}%{_localstatedir}/lib/ceph/bootstrap-mgr
%if 0%{?suse_version}
# create __pycache__ directories and their contents
@@ -929,13 +943,14 @@ rm -rf %{buildroot}
%{_libdir}/ceph/erasure-code/libec_*.so*
%dir %{_libdir}/ceph/compressor
%{_libdir}/ceph/compressor/libceph_*.so*
+%ifarch x86_64
%dir %{_libdir}/ceph/crypto
%{_libdir}/ceph/crypto/libceph_*.so*
+%endif
%if %{with lttng}
%{_libdir}/libos_tp.so*
%{_libdir}/libosd_tp.so*
%endif
-%config %{_sysconfdir}/bash_completion.d/ceph
%config(noreplace) %{_sysconfdir}/logrotate.d/ceph
%if 0%{?fedora} || 0%{?rhel}
%config(noreplace) %{_sysconfdir}/sysconfig/ceph
@@ -960,6 +975,7 @@ rm -rf %{buildroot}
%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-osd
%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-mds
%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-rgw
+%attr(750,ceph,ceph) %dir %{_localstatedir}/lib/ceph/bootstrap-mgr
%post base
/sbin/ldconfig
@@ -1007,6 +1023,7 @@ DISABLE_RESTART_ON_UPDATE="yes"
%{_bindir}/cephfs-journal-tool
%{_bindir}/cephfs-table-tool
%{_bindir}/rados
+%{_bindir}/radosgw-admin
%{_bindir}/rbd
%{_bindir}/rbd-replay
%{_bindir}/rbd-replay-many
@@ -1030,6 +1047,7 @@ DISABLE_RESTART_ON_UPDATE="yes"
%{_mandir}/man8/ceph.8*
%{_mandir}/man8/mount.ceph.8*
%{_mandir}/man8/rados.8*
+%{_mandir}/man8/radosgw-admin.8*
%{_mandir}/man8/rbd.8*
%{_mandir}/man8/rbdmap.8*
%{_mandir}/man8/rbd-replay.8*
@@ -1040,8 +1058,10 @@ DISABLE_RESTART_ON_UPDATE="yes"
%{_datadir}/ceph/id_rsa_drop.ceph.com
%{_datadir}/ceph/id_rsa_drop.ceph.com.pub
%dir %{_sysconfdir}/ceph/
+%config %{_sysconfdir}/bash_completion.d/ceph
%config %{_sysconfdir}/bash_completion.d/rados
%config %{_sysconfdir}/bash_completion.d/rbd
+%config %{_sysconfdir}/bash_completion.d/radosgw-admin
%config(noreplace) %{_sysconfdir}/ceph/rbdmap
%{_unitdir}/rbdmap.service
%{python_sitelib}/ceph_argparse.py*
@@ -1307,12 +1327,9 @@ fi
%files radosgw
%defattr(-,root,root,-)
%{_bindir}/radosgw
-%{_bindir}/radosgw-admin
%{_bindir}/radosgw-token
%{_bindir}/radosgw-object-expirer
%{_mandir}/man8/radosgw.8*
-%{_mandir}/man8/radosgw-admin.8*
-%config %{_sysconfdir}/bash_completion.d/radosgw-admin
%dir %{_localstatedir}/lib/ceph/radosgw
%{_unitdir}/ceph-radosgw@.service
%{_unitdir}/ceph-radosgw.target
@@ -1620,6 +1637,8 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
%{_bindir}/ceph-osdomap-tool
%{_bindir}/ceph-kvstore-tool
%{_bindir}/ceph-debugpack
+%{_bindir}/dmclock-tests
+%{_bindir}/dmclock-data-struct-tests
%{_mandir}/man8/ceph-debugpack.8*
%dir %{_libdir}/ceph
%{_libdir}/ceph/ceph-monstore-update-crush.sh
@@ -1644,6 +1663,11 @@ ln -sf %{_libdir}/librbd.so.1 /usr/lib64/qemu/librbd.so.1
%{_javadir}/libcephfs-test.jar
%endif
+%files -n rados-objclass-devel
+%defattr(-,root,root,-)
+%dir %{_includedir}/rados
+%{_includedir}/rados/objclass.h
+
%if 0%{with selinux}
%files selinux
%defattr(-,root,root,-)
diff --git a/cmake/modules/Findbabeltrace.cmake b/cmake/modules/Findbabeltrace.cmake
index 76f6210578a..6b29a246b21 100644
--- a/cmake/modules/Findbabeltrace.cmake
+++ b/cmake/modules/Findbabeltrace.cmake
@@ -16,7 +16,7 @@ find_program(BABELTRACE_EXECUTABLE
NAMES babeltrace babeltrace-ctf)
include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(BABELTRACE DEFAULT_MSG
+find_package_handle_standard_args(babeltrace DEFAULT_MSG
BABELTRACE_INCLUDE_DIR BABELTRACE_LIBRARY)
set(BABELTRACE_LIBRARIES ${BABELTRACE_LIBRARY})
mark_as_advanced(BABELTRACE_INCLUDE_DIR BABELTRACE_LIBRARY)
diff --git a/cmake/modules/Findcryptopp.cmake b/cmake/modules/Findcryptopp.cmake
index 74a01e83ac3..f7c3b9b4641 100644
--- a/cmake/modules/Findcryptopp.cmake
+++ b/cmake/modules/Findcryptopp.cmake
@@ -104,5 +104,5 @@ SET (CRYPTOPP_LIBRARIES ${CRYPTOPP_LIBRARY})
MARK_AS_ADVANCED (CRYPTOPP_INCLUDE_DIR CRYPTOPP_LIBRARY CRYPTOPP_LIBRARY_DEBUG
CRYPTOPP_LIBRARY_RELEASE)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS (CryptoPP REQUIRED_VARS CRYPTOPP_ROOT_DIR
+FIND_PACKAGE_HANDLE_STANDARD_ARGS (cryptopp REQUIRED_VARS CRYPTOPP_ROOT_DIR
CRYPTOPP_INCLUDE_DIR CRYPTOPP_LIBRARY VERSION_VAR CRYPTOPP_VERSION)
diff --git a/cmake/modules/Findfuse.cmake b/cmake/modules/Findfuse.cmake
index 58c50cb9808..e7a7ff08d89 100644
--- a/cmake/modules/Findfuse.cmake
+++ b/cmake/modules/Findfuse.cmake
@@ -21,7 +21,7 @@ find_library(FUSE_LIBRARIES
PATHS /usr/local/lib64 /usr/local/lib)
include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(FUSE DEFAULT_MSG
+find_package_handle_standard_args(fuse DEFAULT_MSG
FUSE_INCLUDE_DIRS FUSE_LIBRARIES)
mark_as_advanced(
diff --git a/cmake/modules/Findrocksdb.cmake b/cmake/modules/Findrocksdb.cmake
index 8dbec262ff2..e1aac574113 100644
--- a/cmake/modules/Findrocksdb.cmake
+++ b/cmake/modules/Findrocksdb.cmake
@@ -10,7 +10,7 @@ find_path(ROCKSDB_INCLUDE_DIR rocksdb/db.h)
find_library(ROCKSDB_LIBRARIES rocksdb)
include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(Rocksdb DEFAULT_MSG
+find_package_handle_standard_args(rocksdb DEFAULT_MSG
ROCKSDB_LIBRARIES ROCKSDB_INCLUDE_DIR)
mark_as_advanced(
diff --git a/debian/ceph-base.dirs b/debian/ceph-base.dirs
index 1f52f14b9c6..97d66adf00d 100644
--- a/debian/ceph-base.dirs
+++ b/debian/ceph-base.dirs
@@ -2,3 +2,4 @@ var/lib/ceph/tmp
var/lib/ceph/bootstrap-osd
var/lib/ceph/bootstrap-mds
var/lib/ceph/bootstrap-rgw
+var/lib/ceph/bootstrap-mgr
diff --git a/debian/ceph-base.install b/debian/ceph-base.install
index 55ba5e20469..95ffe0a8485 100644
--- a/debian/ceph-base.install
+++ b/debian/ceph-base.install
@@ -1,4 +1,3 @@
-etc/bash_completion.d/ceph
etc/init.d/ceph
usr/sbin/ceph-create-keys
usr/bin/ceph-detect-init
diff --git a/debian/ceph-common.install b/debian/ceph-common.install
index e38ede33f46..11e24f49472 100644..100755
--- a/debian/ceph-common.install
+++ b/debian/ceph-common.install
@@ -1,4 +1,8 @@
+#! /usr/bin/dh-exec --with=install
+
+etc/bash_completion.d/ceph
etc/bash_completion.d/rados
+etc/bash_completion.d/radosgw-admin
etc/bash_completion.d/rbd
usr/bin/ceph
usr/bin/ceph-authtool
@@ -11,6 +15,7 @@ usr/bin/cephfs-data-scan
usr/bin/cephfs-journal-tool
usr/bin/cephfs-table-tool
usr/bin/rados
+usr/bin/radosgw-admin
usr/bin/rbd
usr/bin/rbdmap
usr/bin/rbd-replay*
@@ -18,7 +23,7 @@ usr/bin/ceph-post-file
usr/bin/ceph-brag
usr/sbin/mount.ceph sbin
usr/lib/ceph/compressor/*
-usr/lib/ceph/crypto/*
+usr/lib/ceph/crypto/* [amd64]
usr/share/man/man8/ceph-authtool.8
usr/share/man/man8/ceph-conf.8
usr/share/man/man8/ceph-dencoder.8
@@ -28,6 +33,7 @@ usr/share/man/man8/ceph-post-file.8
usr/share/man/man8/ceph.8
usr/share/man/man8/mount.ceph.8
usr/share/man/man8/rados.8
+usr/share/man/man8/radosgw-admin.8
usr/share/man/man8/rbd.8
usr/share/man/man8/rbdmap.8
usr/share/man/man8/rbd-replay*.8
diff --git a/debian/ceph-test.install b/debian/ceph-test.install
index 15b04664eaa..4fdf179e304 100644
--- a/debian/ceph-test.install
+++ b/debian/ceph-test.install
@@ -26,5 +26,7 @@ usr/bin/ceph_xattr_bench
usr/bin/ceph-monstore-tool
usr/bin/ceph-osdomap-tool
usr/bin/ceph-kvstore-tool
+usr/bin/dmclock-tests
+usr/bin/dmclock-data-struct-tests
usr/share/java/libcephfs-test.jar
usr/lib/ceph/ceph-monstore-update-crush.sh
diff --git a/debian/changelog b/debian/changelog
index e7ab1540bb7..adf0b980f59 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+ceph (12.0.2-1) stable; urgency=medium
+
+ * New upstream release
+
+ -- Ceph Release Team <ceph-maintainers@ceph.com> Thu, 20 Apr 2017 19:59:57 +0000
+
ceph (12.0.1-1) stable; urgency=medium
* New upstream release
diff --git a/debian/control b/debian/control
index 1ae3a8a7662..43b582b4e0d 100644
--- a/debian/control
+++ b/debian/control
@@ -9,12 +9,14 @@ Uploaders: Ken Dreyer <kdreyer@redhat.com>,
Alfredo Deza <adeza@redhat.com>
Build-Depends: bc,
btrfs-tools,
+ gperf,
cmake,
cpio,
cryptsetup-bin | cryptsetup,
cython,
cython3,
debhelper (>= 9),
+ dh-exec,
dh-python,
dh-systemd,
default-jdk,
@@ -50,6 +52,7 @@ Build-Depends: bc,
python (>= 2.7),
python-all-dev,
python-nose,
+ python-prettytable,
python-setuptools,
python-sphinx,
python3-all-dev,
@@ -363,7 +366,8 @@ Depends: librbd1 (= ${binary:Version}), ${misc:Depends}, ${shlibs:Depends},
python-rbd (= ${binary:Version}),
python-rgw (= ${binary:Version}),
${python:Depends},
- python-requests
+ python-requests,
+ python-prettytable
Conflicts: ceph-client-tools
Replaces: ceph-client-tools,
ceph (<< 10),
@@ -824,3 +828,11 @@ Section: java
Depends: libcephfs2 (= ${binary:Version}), ${java:Depends},
${misc:Depends}, ${shlibs:Depends}
Description: Java Native Interface library for CephFS Java bindings
+
+Package: rados-objclass-dev
+Architecture: linux-any
+Section: libdevel
+Depends: librados-dev (= ${binary:Version}), ${misc:Depends}
+Description: RADOS object class development kit.
+ .
+ This package contains development files needed for building RADOS object class plugins.
diff --git a/debian/rados-objclass-dev.install b/debian/rados-objclass-dev.install
new file mode 100644
index 00000000000..ac8f90ee2aa
--- /dev/null
+++ b/debian/rados-objclass-dev.install
@@ -0,0 +1 @@
+usr/include/rados/objclass.h
diff --git a/debian/radosgw.install b/debian/radosgw.install
index f70afed150d..192f2329bfa 100644
--- a/debian/radosgw.install
+++ b/debian/radosgw.install
@@ -1,7 +1,4 @@
-etc/bash_completion.d/radosgw-admin
usr/bin/radosgw
-usr/bin/radosgw-admin
usr/bin/radosgw-token
usr/bin/radosgw-object-expirer
-usr/share/man/man8/radosgw-admin.8
usr/share/man/man8/radosgw.8
diff --git a/doc/cephfs/client-config-ref.rst b/doc/cephfs/client-config-ref.rst
index ce5562fae40..6a149acf08f 100644
--- a/doc/cephfs/client-config-ref.rst
+++ b/doc/cephfs/client-config-ref.rst
@@ -35,7 +35,7 @@
``client_dirsize_rbytes``
-:Description: If set to `true`, use the recursive size of a directory (that is, total of all descendants).
+:Description: If set to ``true``, use the recursive size of a directory (that is, total of all descendants).
:Type: Boolean
:Default: ``true``
@@ -71,7 +71,7 @@
``client_mountpoint``
-:Description: Directory to mount on the CephFS file system. An alternative to the `-r` option of the `ceph-fuse` command.
+:Description: Directory to mount on the CephFS file system. An alternative to the ``-r`` option of the ``ceph-fuse`` command.
:Type: String
:Default: ``"/"``
@@ -208,7 +208,7 @@ Developer Options
``client_trace``
-:Description: The path to the trace file for all file operations. The output is designed to be used by the Ceph `synthetic client <../man/8/ceph-syn>`.
+:Description: The path to the trace file for all file operations. The output is designed to be used by the Ceph `synthetic client <../../man/8/ceph-syn>`_.
:Type: String
:Default: ``""`` (disabled)
diff --git a/doc/cephfs/multimds.rst b/doc/cephfs/multimds.rst
index a1832bb9382..942a786d623 100644
--- a/doc/cephfs/multimds.rst
+++ b/doc/cephfs/multimds.rst
@@ -107,3 +107,35 @@ When a daemon finishes stopping, it will respawn itself and go
back to being a standby.
+Manually pinning directory trees to a particular rank
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In multiple active metadata server configurations, a balancer runs which works
+to spread metadata load evenly across the cluster. This usually works well
+enough for most users but sometimes it is desirable to override the dynamic
+balancer with explicit mappings of metadata to particular ranks. This can allow
+the administrator or users to evenly spread application load or limit the
+impact of users' metadata requests on the entire cluster.
+
+The mechanism provided for this purpose is called an ``export pin``, an
+extended attribute of directories. The name of this extended attribute is
+``ceph.dir.pin``. Users can set this attribute using standard commands:
+
+::
+
+ setfattr -n ceph.dir.pin -v 2 path/to/dir
+
+The value of the extended attribute is the rank to assign the directory subtree
+to. A default value of ``-1`` indicates the directory is not pinned.
+
+A directory's export pin is inherited from its closest parent with a set export
+pin. In this way, setting the export pin on a directory affects all of its
+children. However, the parent's pin can be overridden by setting the child
+directory's export pin. For example:
+
+::
+
+ mkdir -p a/b
+ # "a" and "a/b" both start without an export pin set
+ setfattr -n ceph.dir.pin -v 1 a/
+ # a and b are now pinned to rank 1
+ setfattr -n ceph.dir.pin -v 0 a/b
+ # a/b is now pinned to rank 0 and a/ and the rest of its children are still pinned to rank 1
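+
+To inspect a pin, or to remove one, the standard ``getfattr`` and
+``setfattr`` commands can be used as well (a sketch; whether ``getfattr``
+can read CephFS virtual attributes may depend on the client):
+
+::
+
+    getfattr -n ceph.dir.pin a/b
+    # restore the default of -1 to unpin the directory
+    setfattr -n ceph.dir.pin -v -1 a/b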
diff --git a/doc/dev/blkin.rst b/doc/dev/blkin.rst
new file mode 100644
index 00000000000..9427202496c
--- /dev/null
+++ b/doc/dev/blkin.rst
@@ -0,0 +1,167 @@
+=========================
+ Tracing Ceph With BlkKin
+=========================
+
+Ceph can use Blkin, a library created by Marios Kogias and others,
+which enables tracking a specific request from the time it enters
+the system at higher levels until it is finally served by RADOS.
+
+In general, Blkin implements the Dapper_ tracing semantics
+in order to show the causal relationships between the different
+processing phases that an IO request may trigger. The goal is an
+end-to-end visualisation of the request's route in the system,
+accompanied by information concerning latencies in each processing
+phase. Thanks to LTTng this can happen with a minimal overhead and
+in realtime. The LTTng traces can then be visualized with Twitter's
+Zipkin_.
+
+.. _Dapper: http://static.googleusercontent.com/media/research.google.com/el//pubs/archive/36356.pdf
+.. _Zipkin: http://twitter.github.io/zipkin/
+
+
+Installing Blkin
+================
+
+You can install Marios Kogias' upstream Blkin_ by hand.::
+
+ cd blkin/
+ make && make install
+
+or build distribution packages using DistroReadyBlkin_, which also comes with
+pkgconfig support. If you choose the latter, then you must generate the
+configure and make files first.::
+
+ cd blkin
+ autoreconf -i
+
+.. _Blkin: https://github.com/marioskogias/blkin
+.. _DistroReadyBlkin: https://github.com/agshew/blkin
+
+
+Configuring Ceph with Blkin
+===========================
+
+If you built and installed Blkin by hand, rather than building and
+installing packages, then set these variables before configuring
+Ceph.::
+
+ export BLKIN_CFLAGS=-Iblkin/
+ export BLKIN_LIBS=-lzipkin-cpp
+
+Since there are separate LTTng and Blkin changes to Ceph, you may
+want to configure with something like::
+
+ ./configure --with-blkin --without-lttng --with-debug
+
+
+Testing Blkin
+=============
+
+It's easy to test Ceph's Blkin tracing. Let's assume you don't have
+Ceph already running, and you compiled Ceph with Blkin support but
+you didn't install it. Then launch Ceph with the ``vstart.sh`` script
+in Ceph's src directory so you can see the possible tracepoints.::
+
+ cd src
+ OSD=3 MON=3 RGW=1 ./vstart.sh -n
+ lttng list --userspace
+
+You'll see something like the following::
+
+ UST events:
+ -------------
+ PID: 8987 - Name: ./ceph-osd
+ zipkin:timestamp (loglevel: TRACE_WARNING (4)) (type: tracepoint)
+ zipkin:keyval (loglevel: TRACE_WARNING (4)) (type: tracepoint)
+ ust_baddr_statedump:soinfo (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
+
+ PID: 8407 - Name: ./ceph-mon
+ zipkin:timestamp (loglevel: TRACE_WARNING (4)) (type: tracepoint)
+ zipkin:keyval (loglevel: TRACE_WARNING (4)) (type: tracepoint)
+ ust_baddr_statedump:soinfo (loglevel: TRACE_DEBUG_LINE (13)) (type: tracepoint)
+
+ ...
+
+Next, stop Ceph so that the tracepoints can be enabled.::
+
+ ./stop.sh
+
+Start up an LTTng session and enable the tracepoints.::
+
+ lttng create blkin-test
+ lttng enable-event --userspace zipkin:timestamp
+ lttng enable-event --userspace zipkin:keyval
+ lttng start
+
+Then start up Ceph again.::
+
+ OSD=3 MON=3 RGW=1 ./vstart.sh -n
+
+You may want to check that ceph is up.::
+
+ ./ceph status
+
+Now put something in using rados, check that it made it, get it back, and remove it.::
+
+ ./rados mkpool test-blkin
+ ./rados put test-object-1 ./vstart.sh --pool=test-blkin
+ ./rados -p test-blkin ls
+ ./ceph osd map test-blkin test-object-1
+ ./rados get test-object-1 ./vstart-copy.sh --pool=test-blkin
+ md5sum vstart*
+ ./rados rm test-object-1 --pool=test-blkin
+
+You could also use the example in ``examples/librados/`` or ``rados bench``.
+
+Then stop the LTTng session and see what was collected.::
+
+ lttng stop
+ lttng view
+
+You'll see something like::
+
+ [13:09:07.755054973] (+?.?????????) scruffy zipkin:timestamp: { cpu_id = 5 }, { trace_name = "Main", service_name = "MOSDOp", port_no = 0, ip = "0.0.0.0", trace_id = 7492589359882233221, span_id = 2694140257089376129, parent_span_id = 0, event = "Message allocated" }
+ [13:09:07.755071569] (+0.000016596) scruffy zipkin:keyval: { cpu_id = 5 }, { trace_name = "Main", service_name = "MOSDOp", port_no = 0, ip = "0.0.0.0", trace_id = 7492589359882233221, span_id = 2694140257089376129, parent_span_id = 0, key = "Type", val = "MOSDOp" }
+ [13:09:07.755074217] (+0.000002648) scruffy zipkin:keyval: { cpu_id = 5 }, { trace_name = "Main", service_name = "MOSDOp", port_no = 0, ip = "0.0.0.0", trace_id = 7492589359882233221, span_id = 2694140257089376129, parent_span_id = 0, key = "Reqid", val = "client.4126.0:1" }
+ ...
+
+
+Install Zipkin
+===============
+One of the points of using Blkin is to be able to look at the traces in
+Zipkin. Zipkin acts as both a tracepoint collector and a web service,
+which means you need to run three services: zipkin-collector, zipkin-query
+and zipkin-web.
+
+Download Zipkin Package::
+
+ wget https://github.com/twitter/zipkin/archive/1.1.0.tar.gz
+ tar zxf 1.1.0.tar.gz
+ cd zipkin-1.1.0
+ bin/collector cassandra &
+ bin/query cassandra &
+ bin/web &
+
+Check Zipkin::
+
+ bin/test
+ Browse http://${zipkin-web-ip}:8080
+
+
+Show Ceph's Blkin Traces in Zipkin-web
+======================================
+Blkin provides a script which translates LTTng results into Zipkin
+(Dapper) semantics.
+
+Send lttng data to Zipkin::
+
+ python3 babeltrace_zipkin.py ${lttng-traces-dir}/${blkin-test}/ust/uid/0/64-bit/ -p ${zipkin-collector-port(9410 by default)} -s ${zipkin-collector-ip}
+
+Example::
+
+ python3 babeltrace_zipkin.py ~/lttng-traces-dir/blkin-test-20150225-160222/ust/uid/0/64-bit/ -p 9410 -s 127.0.0.1
+
+Check Ceph traces on webpage::
+
+ Browse http://${zipkin-web-ip}:8080
+ Click "Find traces"
diff --git a/doc/dev/index.rst b/doc/dev/index.rst
index 38b64eaf636..d3217b72dff 100644
--- a/doc/dev/index.rst
+++ b/doc/dev/index.rst
@@ -165,6 +165,39 @@ Building from source
See instructions at :doc:`/install/build-ceph`.
+Using ccache to speed up local builds
+-------------------------------------
+
+Rebuilds of the ceph source tree can benefit significantly from the use of
+`ccache`_. When switching between branches, one might see build failures on
+certain older branches, mostly due to stale build artifacts; such rebuilds
+in particular benefit from ccache. For a fully clean source tree, one could
+do ::
+
+ $ make clean
+
+ # note the following will nuke everything in the source tree that
+ # isn't tracked by git, so make sure to backup any log files /conf options
+
+ $ git clean -fdx; git submodule foreach git clean -fdx
+
+ccache is available as a package in most distros. To build ceph with ccache one
+can::
+
+ $ cmake -DWITH_CCACHE=ON ..
+
+ccache can also be used for speeding up all builds in the system. For more
+details refer to the `run modes`_ section of the ccache manual. The default
+settings of ``ccache`` can be displayed with ``ccache -s``.
+
+.. note:: It is recommended to override the ``max_size``, which is the size
+   of the cache, defaulting to 10G, to a larger size like 25G or so. Refer
+   to the `configuration`_ section of the ccache manual.
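+
+For example, to raise the cache size limit to 25G (the value is only
+illustrative)::
+
+  $ ccache -M 25G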
+
+.. _`ccache`: https://ccache.samba.org/
+.. _`run modes`: https://ccache.samba.org/manual.html#_run_modes
+.. _`configuration`: https://ccache.samba.org/manual.html#_configuration
+
Development-mode cluster
------------------------
@@ -1154,8 +1187,8 @@ proceed to the next step.
To start with a clean slate, login to your tenant via the Horizon dashboard and:
* terminate the ``teuthology`` and ``packages-repository`` instances, if any
-* delete the ``teuthology`` security group
-* delete the ``teuthology`` and ``teuthology-myself`` key pairs
+* delete the ``teuthology`` and ``teuthology-worker`` security groups, if any
+* delete the ``teuthology`` and ``teuthology-myself`` key pairs, if any
Also do the above if you ever get key-related errors ("invalid key", etc.) when
trying to schedule suites.
diff --git a/doc/dev/perf_counters.rst b/doc/dev/perf_counters.rst
index 11ecd20e9c6..398d51a00ee 100644
--- a/doc/dev/perf_counters.rst
+++ b/doc/dev/perf_counters.rst
@@ -39,89 +39,121 @@ The ``perf schema`` command dumps a json description of which values are availab
+------+-------------------------------------+
| 2 | unsigned 64-bit integer value |
+------+-------------------------------------+
-| 4 | average (sum + count pair) |
+| 4    | average (sum + count pair)          |
+------+-------------------------------------+
| 8 | counter (vs gauge) |
+------+-------------------------------------+
-Every value will have either bit 1 or 2 set to indicate the type (float or integer). If bit 8 is set (counter), the reader may want to subtract off the previously read value to get the delta during the previous interval.
+Every value will have either bit 1 or 2 set to indicate the type
+(float or integer).
-If bit 4 is set (average), there will be two values to read, a sum and a count. If it is a counter, the average for the previous interval would be sum delta (since the previous read) divided by the count delta. Alternatively, dividing the values outright would provide the lifetime average value. Normally these are used to measure latencies (number of requests and a sum of request latencies), and the average for the previous interval is what is interesting.
+If bit 8 is set (counter), the value is monotonically increasing and
+the reader may want to subtract off the previously read value to get
+the delta during the previous interval.
+
+If bit 4 is set (average), there will be two values to read, a sum and
+a count. If it is a counter, the average for the previous interval
+would be sum delta (since the previous read) divided by the count
+delta. Alternatively, dividing the values outright would provide the
+lifetime average value. Normally these are used to measure latencies
+(number of requests and a sum of request latencies), and the average
+for the previous interval is what is interesting.
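+
+For example, a minimal sketch of computing the interval average from two
+reads of ``perf dump`` might look like the following Python, assuming the
+average is exposed as the usual ``avgcount``/``sum`` pair; ``osd.0``, the
+section and the counter names are illustrative::
+
+    import json
+    import subprocess
+    import time
+
+    def perf_dump():
+        # query a local daemon's admin socket
+        out = subprocess.check_output(
+            ["ceph", "daemon", "osd.0", "perf", "dump"])
+        return json.loads(out)
+
+    a = perf_dump()
+    time.sleep(10)
+    b = perf_dump()
+
+    pa = a["throttle-bluestore_throttle_bytes"]["wait"]
+    pb = b["throttle-bluestore_throttle_bytes"]["wait"]
+    dcount = pb["avgcount"] - pa["avgcount"]
+    dsum = pb["sum"] - pa["sum"]
+    # average wait latency over the 10 second interval
+    print(dsum / dcount if dcount else 0.0)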
+
+Instead of interpreting the bit fields, the ``metric_type`` property
+has a value of either ``gauge`` or ``counter``, and the ``value_type``
+property will be one of ``real``, ``integer``, ``real-integer-pair``
+(for a real sum + integer count pair), or ``integer-integer-pair``
+(for an integer sum + integer count pair).
Here is an example of the schema output::
- {
- "throttle-msgr_dispatch_throttler-hbserver" : {
- "get_or_fail_fail" : {
- "type" : 10
- },
- "get_sum" : {
- "type" : 10
- },
- "max" : {
- "type" : 10
- },
- "put" : {
- "type" : 10
- },
- "val" : {
- "type" : 10
- },
- "take" : {
- "type" : 10
- },
- "get_or_fail_success" : {
- "type" : 10
- },
- "wait" : {
- "type" : 5
- },
- "get" : {
- "type" : 10
- },
- "take_sum" : {
- "type" : 10
- },
- "put_sum" : {
- "type" : 10
- }
- },
- "throttle-msgr_dispatch_throttler-client" : {
- "get_or_fail_fail" : {
- "type" : 10
- },
- "get_sum" : {
- "type" : 10
- },
- "max" : {
- "type" : 10
- },
- "put" : {
- "type" : 10
- },
- "val" : {
- "type" : 10
- },
- "take" : {
- "type" : 10
- },
- "get_or_fail_success" : {
- "type" : 10
- },
- "wait" : {
- "type" : 5
- },
- "get" : {
- "type" : 10
- },
- "take_sum" : {
- "type" : 10
- },
- "put_sum" : {
- "type" : 10
- }
- }
- }
+ {
+ "throttle-bluestore_throttle_bytes": {
+ "val": {
+ "type": 2,
+ "metric_type": "gauge",
+ "value_type": "integer",
+ "description": "Currently available throttle",
+ "nick": ""
+ },
+ "max": {
+ "type": 2,
+ "metric_type": "gauge",
+ "value_type": "integer",
+ "description": "Max value for throttle",
+ "nick": ""
+ },
+ "get_started": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Number of get calls, increased before wait",
+ "nick": ""
+ },
+ "get": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Gets",
+ "nick": ""
+ },
+ "get_sum": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Got data",
+ "nick": ""
+ },
+ "get_or_fail_fail": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Get blocked during get_or_fail",
+ "nick": ""
+ },
+ "get_or_fail_success": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Successful get during get_or_fail",
+ "nick": ""
+ },
+ "take": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Takes",
+ "nick": ""
+ },
+ "take_sum": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Taken data",
+ "nick": ""
+ },
+ "put": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Puts",
+ "nick": ""
+ },
+ "put_sum": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Put data",
+ "nick": ""
+ },
+ "wait": {
+ "type": 5,
+ "metric_type": "gauge",
+ "value_type": "real-integer-pair",
+ "description": "Waiting latency",
+ "nick": ""
+ }
+      }
+   }
Dump
diff --git a/doc/install/build-ceph.rst b/doc/install/build-ceph.rst
index cc61e0a8e62..9c834b98727 100644
--- a/doc/install/build-ceph.rst
+++ b/doc/install/build-ceph.rst
@@ -28,9 +28,8 @@ Ceph is built using cmake. To build Ceph, navigate to your cloned Ceph
repository and execute the following::
cd ceph
- mkdir build
+ ./do_cmake.sh
cd build
- cmake ..
make
.. topic:: Hyperthreading
diff --git a/doc/install/get-packages.rst b/doc/install/get-packages.rst
index 4e96c2b3f5b..02a24cd14c5 100644
--- a/doc/install/get-packages.rst
+++ b/doc/install/get-packages.rst
@@ -107,9 +107,7 @@ You may find releases for CentOS/RHEL and others (installed with YUM) at::
https://download.ceph.com/rpm-{release-name}
-The major releases of Ceph are summarized at:
-
- http://docs.ceph.com/docs/master/releases/
+The major releases of Ceph are summarized at: :doc:`/releases`.
Every second major release is considered Long Term Stable (LTS). Critical
bugfixes are backported to LTS releases until their retirement. Since retired
diff --git a/doc/install/manual-deployment.rst b/doc/install/manual-deployment.rst
index cf14d4b838a..5e58f57c0e1 100644
--- a/doc/install/manual-deployment.rst
+++ b/doc/install/manual-deployment.rst
@@ -58,7 +58,7 @@ a number of things:
For example, when you run multiple clusters in a `federated architecture`_,
the cluster name (e.g., ``us-west``, ``us-east``) identifies the cluster for
the current CLI session. **Note:** To identify the cluster name on the
- command line interface, specify the a Ceph configuration file with the
+ command line interface, specify the Ceph configuration file with the
cluster name (e.g., ``ceph.conf``, ``us-west.conf``, ``us-east.conf``, etc.).
Also see CLI usage (``ceph --cluster {cluster-name}``).
diff --git a/doc/install/mirrors.rst b/doc/install/mirrors.rst
index 5810571d521..49742d12aa0 100644
--- a/doc/install/mirrors.rst
+++ b/doc/install/mirrors.rst
@@ -24,6 +24,7 @@ These mirrors are available on the following locations:
- **UK: UK**: http://uk.ceph.com
- **US-East: US East Coast**: http://us-east.ceph.com/
- **US-West: US West Coast**: http://us-west.ceph.com/
+- **CN: China**: http://cn.ceph.com/
You can replace all download.ceph.com URLs with any of the mirrors, for example:
diff --git a/doc/man/8/ceph-detect-init.rst b/doc/man/8/ceph-detect-init.rst
index aeb3316e503..c409a949d43 100644
--- a/doc/man/8/ceph-detect-init.rst
+++ b/doc/man/8/ceph-detect-init.rst
@@ -41,6 +41,17 @@ Options
Display additional information for debugging.
+Bugs
+====
+
+:program:`ceph-detect-init` is used by :program:`ceph-disk` to figure out the init system to manage the mount directory of an OSD, but only the following combinations are fully tested:
+
+- `upstart` on `Ubuntu 14.04`
+- `systemd` on `Ubuntu 15.04` and up
+- `systemd` on `Debian 8` and up
+- `systemd` on `RHEL/CentOS 7` and up
+- `systemd` on `Fedora 22` and up
+
Availability
============
diff --git a/doc/man/8/ceph-disk.rst b/doc/man/8/ceph-disk.rst
index ed938b8b570..4635937a498 100644
--- a/doc/man/8/ceph-disk.rst
+++ b/doc/man/8/ceph-disk.rst
@@ -78,6 +78,11 @@ the subcommands ``deactivate`` and ``destroy``.
The documentation for each subcommand (prepare, activate, etc.) can be displayed
with its ``--help`` option. For instance ``ceph-disk prepare --help``.
+Bugs
+====
+
+See also the ``Bugs`` section in :doc:`ceph-detect-init <ceph-detect-init>`\(8).
+
Availability
============
@@ -87,5 +92,6 @@ the Ceph documentation at http://ceph.com/docs for more information.
See also
========
+:doc:`ceph-detect-init <ceph-detect-init>`\(8)
:doc:`ceph-osd <ceph-osd>`\(8),
:doc:`ceph-deploy <ceph-deploy>`\(8)
diff --git a/doc/man/8/ceph.rst b/doc/man/8/ceph.rst
index b2489126848..ba3ccdfe276 100644
--- a/doc/man/8/ceph.rst
+++ b/doc/man/8/ceph.rst
@@ -13,7 +13,7 @@ Synopsis
| **ceph** **compact**
-| **ceph** **config-key** [ *del* | *exists* | *get* | *list* | *put* ] ...
+| **ceph** **config-key** [ *del* | *exists* | *get* | *list* | *dump* | *put* ] ...
| **ceph** **daemon** *<name>* \| *<path>* *<command>* ...
@@ -39,7 +39,7 @@ Synopsis
| **ceph** **mon_status**
-| **ceph** **osd** [ *blacklist* \| *blocked-by* \| *create* \| *deep-scrub* \| *df* \| *down* \| *dump* \| *erasure-code-profile* \| *find* \| *getcrushmap* \| *getmap* \| *getmaxosd* \| *in* \| *lspools* \| *map* \| *metadata* \| *out* \| *pause* \| *perf* \| *pg-temp* \| *primary-affinity* \| *primary-temp* \| *repair* \| *reweight* \| *reweight-by-pg* \| *rm* \| *scrub* \| *set* \| *setcrushmap* \| *setmaxosd* \| *stat* \| *thrash* \| *tree* \| *unpause* \| *unset* ] ...
+| **ceph** **osd** [ *blacklist* \| *blocked-by* \| *create* \| *deep-scrub* \| *df* \| *down* \| *dump* \| *erasure-code-profile* \| *find* \| *getcrushmap* \| *getmap* \| *getmaxosd* \| *in* \| *lspools* \| *map* \| *metadata* \| *out* \| *pause* \| *perf* \| *pg-temp* \| *primary-affinity* \| *primary-temp* \| *repair* \| *reweight* \| *reweight-by-pg* \| *rm* \| *scrub* \| *set* \| *setcrushmap* \| *setmaxosd* \| *stat* \| *tree* \| *unpause* \| *unset* ] ...
| **ceph** **osd** **crush** [ *add* \| *add-bucket* \| *create-or-move* \| *dump* \| *get-tunable* \| *link* \| *move* \| *remove* \| *rename-bucket* \| *reweight* \| *reweight-all* \| *reweight-subtree* \| *rm* \| *rule* \| *set* \| *set-tunable* \| *show-tunables* \| *tunables* \| *unlink* ] ...
@@ -201,7 +201,13 @@ Usage::
ceph config-key list
-Subcommand ``put`` puts configuration key and values.
+Subcommand ``dump`` dumps configuration keys and values.
+
+Usage::
+
+ ceph config-key dump
+
+Subcommand ``put`` puts configuration key and value.
Usage::
diff --git a/doc/man/8/radosgw-admin.rst b/doc/man/8/radosgw-admin.rst
index 8da402e60ab..0a34df80b78 100644
--- a/doc/man/8/radosgw-admin.rst
+++ b/doc/man/8/radosgw-admin.rst
@@ -457,7 +457,15 @@ Remove a user and all associated buckets with their contents::
Remove a bucket::
- $ radosgw-admin bucket unlink --bucket=foo
+ $ radosgw-admin bucket rm --bucket=foo
+
+Link bucket to specified user::
+
+ $ radosgw-admin bucket link --bucket=foo --bucket_id=<bucket id> --uid=johnny
+
+Unlink bucket from specified user::
+
+ $ radosgw-admin bucket unlink --bucket=foo --uid=johnny
Show the logs of a bucket from April 1st, 2012::
diff --git a/doc/man/8/rbd.rst b/doc/man/8/rbd.rst
index a3471ffc10c..d821c3de486 100644
--- a/doc/man/8/rbd.rst
+++ b/doc/man/8/rbd.rst
@@ -484,6 +484,8 @@ Per mapping (block device) `rbd map` options:
* lock_on_read - Acquire exclusive lock on reads, in addition to writes and
discards (since 4.9).
+* exclusive - Disable automatic exclusive lock transitions (since 4.12).
+
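+For example, mapping an image with automatic lock transitions disabled
+might look like this (pool and image names are illustrative)::
+
+    rbd map -o exclusive rbd/myimage
+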
`rbd unmap` options:
* force - Force the unmapping of a block device that is open (since 4.9). The
diff --git a/doc/rados/api/index.rst b/doc/rados/api/index.rst
index 4030e70d9aa..cccc153cf1d 100644
--- a/doc/rados/api/index.rst
+++ b/doc/rados/api/index.rst
@@ -18,4 +18,5 @@ Ceph, your own interface to Ceph, etc.).
Introduction to librados <librados-intro>
librados (C) <librados>
librados (C++) <libradospp>
- librados (Python) <python> \ No newline at end of file
+ librados (Python) <python>
+ object class <objclass-sdk>
diff --git a/doc/rados/api/objclass-sdk.rst b/doc/rados/api/objclass-sdk.rst
new file mode 100644
index 00000000000..6b1162fd494
--- /dev/null
+++ b/doc/rados/api/objclass-sdk.rst
@@ -0,0 +1,37 @@
+===========================
+SDK for Ceph Object Classes
+===========================
+
+`Ceph` can be extended by creating shared object classes called `Ceph Object
+Classes`. The existing framework to build these object classes has dependencies
+on the internal functionality of `Ceph`, which restricts users to build object
+classes within the tree. The aim of this project is to create an independent
+object class interface, which can be used to build object classes outside the
+`Ceph` tree. This allows us to have two types of object classes: 1) those that
+have in-tree dependencies and reside in the tree and 2) those that can make use
+of the `Ceph Object Class SDK framework` and can be built outside of the `Ceph`
+tree because they do not depend on any internal implementation of `Ceph`. This
+project decouples object class development from Ceph and encourages creation
+and distribution of object classes as packages.
+
+In order to demonstrate the use of this framework, we have provided an example
+called ``cls_sdk``, which is a very simple object class that makes use of the
+SDK framework. This object class resides in the ``src/cls`` directory.
+
+Installing objclass.h
+---------------------
+
+The object class interface that enables out-of-tree development of object
+classes resides in ``src/include/rados/`` and gets installed with `Ceph`
+installation. After running ``make install``, you should be able to see it
+in ``<prefix>/include/rados``. ::
+
+ ls /usr/local/include/rados
+
+Using the SDK example
+---------------------
+
+The ``cls_sdk`` object class resides in ``src/cls/sdk/``. This gets built and
+loaded into Ceph with the Ceph build process. You can run the
+``ceph_test_cls_sdk`` unittest, which resides in ``src/test/cls_sdk/``,
+to test this class.
diff --git a/doc/rados/configuration/mon-config-ref.rst b/doc/rados/configuration/mon-config-ref.rst
index b19461f7a62..b27a8c8619c 100644
--- a/doc/rados/configuration/mon-config-ref.rst
+++ b/doc/rados/configuration/mon-config-ref.rst
@@ -106,6 +106,11 @@ A consensus requires a majority of monitors running to establish a quorum for
consensus about the cluster map (e.g., 1; 2 out of 3; 3 out of 5; 4 out of 6;
etc.).
+``mon force quorum join``
+
+:Description: Force monitor to join quorum even if it has been previously removed from the map
+:Type: Boolean
+:Default: ``False``
.. index:: Ceph Monitor; consistency
@@ -314,6 +319,126 @@ by setting it in the ``[mon]`` section of the configuration file.
:Default: ``/var/lib/ceph/mon/$cluster-$id``
+``mon data size warn``
+
+:Description: Issue a ``HEALTH_WARN`` in cluster log when the monitor's data
+ store goes over 15GB.
+:Type: Integer
+:Default: 15*1024*1024*1024
+
+
+``mon data avail warn``
+
+:Description: Issue a ``HEALTH_WARN`` in cluster log when the available disk
+              space of the monitor's data store is lower than or equal to this
+ percentage.
+:Type: Integer
+:Default: 30
+
+
+``mon data avail crit``
+
+:Description: Issue a ``HEALTH_ERR`` in cluster log when the available disk
+              space of the monitor's data store is lower than or equal to this
+ percentage.
+:Type: Integer
+:Default: 5
+
+
+``mon warn on cache pools without hit sets``
+
+:Description: Issue a ``HEALTH_WARN`` in cluster log if a cache pool does not
+              have the hit set type set.
+ See `hit set type <../operations/pools#hit-set-type>`_ for more
+ details.
+:Type: Boolean
+:Default: True
+
+
+``mon warn on crush straw calc version zero``
+
+:Description: Issue a ``HEALTH_WARN`` in cluster log if the CRUSH
+ ``straw_calc_version`` is zero. See
+ `CRUSH map tunables <../operations/crush-map#tunables>`_ for
+ details.
+:Type: Boolean
+:Default: True
+
+
+``mon warn on legacy crush tunables``
+
+:Description: Issue a ``HEALTH_WARN`` in cluster log if
+              CRUSH tunables are too old (older than ``mon_crush_min_required_version``)
+:Type: Boolean
+:Default: True
+
+
+``mon crush min required version``
+
+:Description: The minimum tunable profile version required by the cluster.
+ See
+ `CRUSH map tunables <../operations/crush-map#tunables>`_ for
+ details.
+:Type: String
+:Default: ``firefly``
+
+
+``mon warn on osd down out interval zero``
+
+:Description: Issue a ``HEALTH_WARN`` in cluster log if
+ ``mon osd down out interval`` is zero. Having this option set to
+ zero on the leader acts much like the ``noout`` flag. It's hard
+              to figure out what's going wrong with clusters without the
+ ``noout`` flag set but acting like that just the same, so we
+ report a warning in this case.
+:Type: Boolean
+:Default: True
+
+
+``mon cache target full warn ratio``
+
+:Description: Position between a pool's ``cache_target_full`` and
+              ``target_max_objects`` where we start warning.
+:Type: Float
+:Default: ``0.66``
+
+
+``mon health data update interval``
+
+:Description: How often (in seconds) the monitor in quorum shares its health
+              status with its peers (a negative number disables it).
+:Type: Float
+:Default: ``60``
+
+
+``mon health to clog``
+
+:Description: Enable sending health summary to cluster log periodically.
+:Type: Boolean
+:Default: True
+
+
+``mon health to clog tick interval``
+
+:Description: How often (in seconds) the monitor sends the health summary to the
+              cluster log (a non-positive number disables it). If the current
+              health summary is empty or identical to the previous one, the
+              monitor will not send it to the cluster log.
+:Type: Integer
+:Default: 3600
+
+
+``mon health to clog interval``
+
+:Description: How often (in seconds) the monitor sends the health summary to the
+              cluster log (a non-positive number disables it). The monitor will
+              always send the summary to the cluster log, whether or not the
+              summary has changed.
+:Type: Integer
+:Default: 60
+
+
+
.. index:: Ceph Storage Cluster; capacity planning, Ceph Monitor; capacity planning
Storage Capacity
@@ -546,7 +671,9 @@ Trimming requires that the placement groups are ``active + clean``.
``mon sync timeout``
-:Description:
+:Description: Number of seconds the monitor will wait for the next update
+              message from its sync provider before it gives up and bootstraps
+ again.
:Type: Double
:Default: ``30.0``
@@ -560,39 +687,123 @@ Trimming requires that the placement groups are ``active + clean``.
``mon sync max payload size``
-:Description: The maximum size for a sync payload.
+:Description: The maximum size for a sync payload (in bytes).
:Type: 32-bit Integer
:Default: ``1045676``
-``mon accept timeout``
+``paxos max join drift``
-:Description: Number of seconds the Leader will wait for the Requester(s) to
- accept a Paxos update. It is also used during the Paxos recovery
- phase for similar purposes.
+:Description: The maximum number of Paxos iterations a monitor may lag behind
+              before we must first sync the monitor data stores. When a monitor
+              finds that its peer is too far ahead of it, it will first sync
+              with the data stores before moving on.
+:Type: Integer
+:Default: ``10``
-:Type: Float
-:Default: ``10.0``
+``paxos stash full interval``
+:Description: How often (in commits) to stash a full copy of the PaxosService state.
+              Currently this setting only affects ``mds``, ``mon``, ``auth`` and ``mgr``
+ PaxosServices.
+:Type: Integer
+:Default: 25
``paxos propose interval``
:Description: Gather updates for this time interval before proposing
- a map update.
-
+ a map update.
:Type: Double
:Default: ``1.0``
+``paxos min``
+
+:Description: The minimum number of paxos states to keep around
+:Type: Integer
+:Default: 500
+
+
``paxos min wait``
:Description: The minimum amount of time to gather updates after a period of
inactivity.
-
:Type: Double
:Default: ``0.05``
+``paxos trim min``
+
+:Description: Number of extra proposals tolerated before trimming
+:Type: Integer
+:Default: 250
+
+
+``paxos trim max``
+
+:Description: The maximum number of extra proposals to trim at a time
+:Type: Integer
+:Default: 500
+
+
+``paxos service trim min``
+
+:Description: The minimum amount of versions to trigger a trim (0 disables it)
+:Type: Integer
+:Default: 250
+
+
+``paxos service trim max``
+
+:Description: The maximum amount of versions to trim during a single proposal (0 disables it)
+:Type: Integer
+:Default: 500
+
+
+``mon max log epochs``
+
+:Description: The maximum amount of log epochs to trim during a single proposal
+:Type: Integer
+:Default: 500
+
+
+``mon max pgmap epochs``
+
+:Description: The maximum amount of pgmap epochs to trim during a single proposal
+:Type: Integer
+:Default: 500
+
+
+``mon mds force trim to``
+
+:Description: Force monitor to trim mdsmaps to this point (0 disables it;
+              dangerous, use with care).
+:Type: Integer
+:Default: 0
+
+
+``mon osd force trim to``
+
+:Description: Force monitor to trim osdmaps to this point, even if there are
+              PGs that are not clean at the specified epoch (0 disables it;
+              dangerous, use with care).
+:Type: Integer
+:Default: 0
+
+``mon osd cache size``
+
+:Description: The size of the osdmap cache, so that we do not rely on the
+              underlying store's cache.
+:Type: Integer
+:Default: 10
+
+
+``mon election timeout``
+
+:Description: On the election proposer, the maximum waiting time in seconds
+              for all ACKs.
+:Type: Float
+:Default: ``5``
+
+
``mon lease``
:Description: The length (in seconds) of the lease on the monitor's versions.
@@ -600,22 +811,30 @@ Trimming requires that the placement groups are ``active + clean``.
:Default: ``5``
-``mon lease renew interval``
+``mon lease renew interval factor``
-:Description: The interval (in seconds) for the Leader to renew the other
- monitor's leases.
-
+:Description: ``mon lease`` \* ``mon lease renew interval factor`` will be the
+ interval for the Leader to renew the other monitor's leases. The
+ factor should be less than ``1.0``.
:Type: Float
-:Default: ``3``
+:Default: ``0.6``
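+
+For example, with the defaults the Leader renews leases every
+``5 * 0.6 = 3`` seconds, which matches the old ``mon lease renew interval``
+default of ``3``.
+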
-``mon lease ack timeout``
+``mon lease ack timeout factor``
-:Description: The number of seconds the Leader will wait for the Providers to
- acknowledge the lease extension.
-
+:Description: The Leader will wait ``mon lease`` \* ``mon lease ack timeout factor``
+ for the Providers to acknowledge the lease extension.
:Type: Float
-:Default: ``10.0``
+:Default: ``2.0``
+
+
+``mon accept timeout factor``
+
+:Description: The Leader will wait ``mon lease`` \* ``mon accept timeout factor``
+ for the Requester(s) to accept a Paxos update. It is also used
+ during the Paxos recovery phase for similar purposes.
+:Type: Float
+:Default: ``2.0``
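+
+With the defaults, for example, this gives ``5 * 2.0 = 10`` seconds, which
+matches the old ``mon accept timeout`` default of ``10.0``.
+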
``mon min osdmap epochs``
@@ -640,42 +859,6 @@ Trimming requires that the placement groups are ``active + clean``.
-
-Slurp
------
-
-In Ceph version 0.58 and earlier, when a Paxos service drifts beyond a given
-number of versions, Ceph triggers the `slurp` mechanism, which establishes a
-connection with the quorum Leader and obtains every single version the Leader
-has for every service that has drifted. In Ceph versions 0.59 and later, slurp
-will not work, because there is a single Paxos instance for all services.
-
-.. deprecated:: 0.58
-
-``paxos max join drift``
-
-:Description: The maximum Paxos iterations before we must first sync the
- monitor data stores.
-:Type: Integer
-:Default: ``10``
-
-
-``mon slurp timeout``
-
-:Description: The number of seconds the monitor has to recover using slurp
- before the process is aborted and the monitor bootstraps.
-
-:Type: Double
-:Default: ``10.0``
-
-
-``mon slurp bytes``
-
-:Description: Limits the slurp messages to the specified number of bytes.
-:Type: 32-bit Integer
-:Default: ``256 * 1024``
-
-
.. index:: Ceph Monitor; clock
Clock
@@ -739,12 +922,19 @@ acceptable values.
``mon timecheck interval``
:Description: The time check interval (clock drift check) in seconds
- for the leader.
+ for the Leader.
:Type: Float
:Default: ``300.0``
+``mon timecheck skew interval``
+
+:Description: The time check interval (clock drift check) in seconds for the
+              Leader, used when a clock skew is detected.
+:Type: Float
+:Default: ``30.0``
+
Client
------
@@ -897,6 +1087,129 @@ Miscellaneous
:Default: ``0.5``
+``mon osd prime pg temp max time estimate``
+
+:Description: Maximum estimate of time spent on each PG before we prime all PGs
+ in parallel.
+:Type: Float
+:Default: ``0.25``
+
+
+``mon osd allow primary affinity``
+
+:Description: Allow ``primary_affinity`` to be set in the osdmap.
+:Type: Boolean
+:Default: False
+
+
+``mon osd pool ec fast read``
+
+:Description: Whether to turn on fast read on the pool or not. It will be used
+              as the default setting for newly created erasure coded pools if
+              ``fast_read`` is not specified at create time.
+:Type: Boolean
+:Default: False
+
+
+``mon mds skip sanity``
+
+:Description: Skip safety assertions on FSMap (in case of bugs where we want to
+              continue anyway). The monitor terminates if the FSMap sanity check
+              fails, but we can disable that by enabling this option.
+:Type: Boolean
+:Default: False
+
+
+``mon max mdsmap epochs``
+
+:Description: The maximum amount of mdsmap epochs to trim during a single proposal.
+:Type: Integer
+:Default: 500
+
+
+``mon config key max entry size``
+
+:Description: The maximum size of a config-key entry (in bytes)
+:Type: Integer
+:Default: 4096
+
+
+``mon scrub interval``
+
+:Description: How often (in seconds) the monitor scrubs its store by comparing
+              the stored checksums with the computed ones for all stored
+              keys.
+:Type: Integer
+:Default: 3600*24
+
+
+``mon scrub max keys``
+
+:Description: The maximum number of keys to scrub each time.
+:Type: Integer
+:Default: 100
+
+
+``mon compact on start``
+
+:Description: Compact the database used as Ceph Monitor store on
+              ``ceph-mon`` start. A manual compaction helps to shrink the
+              monitor database and improve its performance if the regular
+              compaction fails to work.
+:Type: Boolean
+:Default: False
+
+
+``mon compact on bootstrap``
+
+:Description: Compact the database used as Ceph Monitor store on
+              bootstrap. Monitors start probing each other to create
+              a quorum after bootstrap. If a monitor times out before joining
+              the quorum, it will start over and bootstrap itself again.
+:Type: Boolean
+:Default: False
+
+
+``mon compact on trim``
+
+:Description: Compact a certain prefix (including paxos) when we trim its old states.
+:Type: Boolean
+:Default: True
+
+
+``mon cpu threads``
+
+:Description: Number of threads for performing CPU intensive work on the monitor.
+:Type: Integer
+:Default: 4
+
+
+``mon osd mapping pgs per chunk``
+
+:Description: We calculate the mapping from placement group to OSDs in chunks.
+ This option specifies the number of placement groups per chunk.
+:Type: Integer
+:Default: 4096
+
+
+``mon osd max split count``
+
+:Description: Largest number of PGs per "involved" OSD to let split create.
+ When we increase the ``pg_num`` of a pool, the placement groups
+              will be split on all OSDs serving that pool. We want to avoid
+ extreme multipliers on PG splits.
+:Type: Integer
+:Default: 300
+
+
+``mon session timeout``
+
+:Description: The monitor will terminate inactive sessions that stay idle over
+              this time limit.
+:Type: Integer
+:Default: 300
+
+
.. _Paxos: http://en.wikipedia.org/wiki/Paxos_(computer_science)
.. _Monitor Keyrings: ../../../dev/mon-bootstrap#secret-keys
diff --git a/doc/rados/configuration/mon-lookup-dns.rst b/doc/rados/configuration/mon-lookup-dns.rst
index 0b0cb5699a9..9aa1d37396e 100644
--- a/doc/rados/configuration/mon-lookup-dns.rst
+++ b/doc/rados/configuration/mon-lookup-dns.rst
@@ -12,6 +12,13 @@ This allows for less configuration on clients and monitors. Using a DNS update c
By default clients and daemons will look for the TCP service called *ceph-mon* which is configured by the *mon_dns_srv_name* configuration directive.
+
+``mon dns srv name``
+
+:Description: The service name used when querying the DNS for the monitor hosts/addresses
+:Type: String
+:Default: ``ceph-mon``
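+
+For example, a cluster that registers its monitors under a different
+service name could set (the name is illustrative)::
+
+    [global]
+    mon dns srv name = ceph-mon-internal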
+
Example
-------
When the DNS search domain is set to *example.com* a DNS zone file might contain the following elements.
diff --git a/doc/rados/configuration/mon-osd-interaction.rst b/doc/rados/configuration/mon-osd-interaction.rst
index c71b400ed0d..ab57cb069a8 100644
--- a/doc/rados/configuration/mon-osd-interaction.rst
+++ b/doc/rados/configuration/mon-osd-interaction.rst
@@ -66,39 +66,46 @@ or by setting the value at runtime.
|----+ Mark |
| | OSD 2 |
|<---+ Down |
-
+
.. index:: OSD down report
OSDs Report Down OSDs
=====================
-By default, a Ceph OSD Daemon must report to the Ceph Monitors that another Ceph
-OSD Daemon is ``down`` three times before the Ceph Monitors acknowledge that the
-reported Ceph OSD Daemon is ``down``. By default, only one
-Ceph OSD Daemon is required to report another Ceph OSD Daemon ``down``. You can
-change the number of Ceph OSD Daemones required to report a Ceph OSD Daemon
-``down`` to a Ceph Monitor by adding an ``mon osd min down reporters`` setting
-(``osd min down reporters`` prior to v0.62) under the ``[mon]`` section of your
-Ceph configuration file, or by setting the value at runtime.
-
-
-.. ditaa:: +---------+ +---------+
- | OSD 1 | | Monitor |
- +---------+ +---------+
- | |
- | OSD 2 Is Down |
- |-------------->|
- | |
- | OSD 2 Is Down |
- |-------------->|
- | |
- | OSD 2 Is Down |
- |-------------->|
- | |
- | |----------+ Mark
- | | | OSD 2
- | |<---------+ Down
+By default, two Ceph OSD Daemons from different hosts must report to the Ceph
+Monitors that another Ceph OSD Daemon is ``down`` before the Ceph Monitors
+acknowledge that the reported Ceph OSD Daemon is ``down``. But there is a chance
+that all the OSDs reporting the failure are hosted in a rack with a bad switch
+which has trouble connecting to another OSD. To avoid this sort of false alarm,
+we consider the peers reporting a failure a proxy for a potential "subcluster"
+of the overall cluster that is similarly laggy. This is clearly not true in
+all cases, but will sometimes help us localize the grace correction to a subset
+of the system that is unhappy. ``mon osd reporter subtree level`` is used to
+group the peers into the "subcluster" by their common ancestor type in the CRUSH
+map. By default, only two reports from different subtrees are required to report
+another Ceph OSD Daemon ``down``. You can change the number of reporters from
+unique subtrees and the common ancestor type required to report a Ceph OSD
+Daemon ``down`` to a Ceph Monitor by adding the ``mon osd min down reporters``
+and ``mon osd reporter subtree level`` settings under the ``[mon]`` section of
+your Ceph configuration file, or by setting the value at runtime.
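+
+For example, to require three reporters spread across different racks (the
+values are illustrative)::
+
+    [mon]
+    mon osd min down reporters = 3
+    mon osd reporter subtree level = rack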
+
+
+.. ditaa:: +---------+ +---------+ +---------+
+ | OSD 1 | | OSD 2 | | Monitor |
+ +---------+ +---------+ +---------+
+ | | |
+ | OSD 3 Is Down | |
+ |---------------+--------------->|
+ | | |
+ | | |
+ | | OSD 3 Is Down |
+ | |--------------->|
+ | | |
+ | | |
+ | | |---------+ Mark
+ | | | | OSD 3
+ | | |<--------+ Down
.. index:: peering failure
@@ -118,13 +125,13 @@ setting the value at runtime.
+---------+ +---------+ +-------+ +---------+
| | | |
| Request To | | |
- | Peer | | |
+ | Peer | | |
|-------------->| | |
|<--------------| | |
| Peering | |
| | |
| Request To | |
- | Peer | |
+ | Peer | |
|----------------------------->| |
| |
|----+ OSD Monitor |
@@ -135,7 +142,7 @@ setting the value at runtime.
|-------------------------------------------->|
|<--------------------------------------------|
| Receive New Cluster Map |
-
+
.. index:: OSD status
@@ -149,10 +156,10 @@ event such as a failure, a change in placement group stats, a change in
``up_thru`` or when it boots within 5 seconds. You can change the Ceph OSD
Daemon minimum report interval by adding an ``osd mon report interval min``
setting under the ``[osd]`` section of your Ceph configuration file, or by
-setting the value at runtime. A Ceph OSD Daemon sends a report to a Ceph
-Monitor every 120 seconds irrespective of whether any notable changes occur.
-You can change the Ceph Monitor report interval by adding an ``osd mon report
-interval max`` setting under the ``[osd]`` section of your Ceph configuration
+setting the value at runtime. A Ceph OSD Daemon sends a report to a Ceph
+Monitor every 120 seconds irrespective of whether any notable changes occur.
+You can change the Ceph Monitor report interval by adding an ``osd mon report
+interval max`` setting under the ``[osd]`` section of your Ceph configuration
file, or by setting the value at runtime.
@@ -207,18 +214,18 @@ Monitor Settings
``mon osd min up ratio``
-:Description: The minimum ratio of ``up`` Ceph OSD Daemons before Ceph will
+:Description: The minimum ratio of ``up`` Ceph OSD Daemons before Ceph will
mark Ceph OSD Daemons ``down``.
-
+
:Type: Double
:Default: ``.3``
``mon osd min in ratio``
-:Description: The minimum ratio of ``in`` Ceph OSD Daemons before Ceph will
+:Description: The minimum ratio of ``in`` Ceph OSD Daemons before Ceph will
mark Ceph OSD Daemons ``out``.
-
+
:Type: Double
:Default: ``.75``
@@ -237,6 +244,15 @@ Monitor Settings
:Default: ``0.3``
+
+``mon osd laggy max interval``
+
+:Description: Maximum value of ``laggy_interval`` in laggy estimations (in seconds).
+              The monitor uses an adaptive approach to evaluate the
+              ``laggy_interval`` of a certain OSD. This value will be used to
+              calculate the grace time for that OSD.
+:Type: Integer
+:Default: 300
+
``mon osd adjust heartbeat grace``
:Description: If set to ``true``, Ceph will scale based on laggy estimations.
@@ -251,38 +267,38 @@ Monitor Settings
:Default: ``true``
-``mon osd auto mark in``
+``mon osd auto mark in``
-:Description: Ceph will mark any booting Ceph OSD Daemons as ``in``
+:Description: Ceph will mark any booting Ceph OSD Daemons as ``in``
the Ceph Storage Cluster.
:Type: Boolean
:Default: ``false``
-``mon osd auto mark auto out in``
+``mon osd auto mark auto out in``
-:Description: Ceph will mark booting Ceph OSD Daemons auto marked ``out``
+:Description: Ceph will mark booting Ceph OSD Daemons auto marked ``out``
of the Ceph Storage Cluster as ``in`` the cluster.
-
+
:Type: Boolean
-:Default: ``true``
+:Default: ``true``
-``mon osd auto mark new in``
+``mon osd auto mark new in``
-:Description: Ceph will mark booting new Ceph OSD Daemons as ``in`` the
+:Description: Ceph will mark booting new Ceph OSD Daemons as ``in`` the
Ceph Storage Cluster.
-
+
:Type: Boolean
-:Default: ``true``
+:Default: ``true``
-``mon osd down out interval``
+``mon osd down out interval``
:Description: The number of seconds Ceph waits before marking a Ceph OSD Daemon
``down`` and ``out`` if it doesn't respond.
-
+
:Type: 32-bit Integer
:Default: ``600``
@@ -298,21 +314,30 @@ Monitor Settings
:Default: ``rack``
-``mon osd report timeout``
+``mon osd report timeout``
-:Description: The grace period in seconds before declaring
+:Description: The grace period in seconds before declaring
unresponsive Ceph OSD Daemons ``down``.
:Type: 32-bit Integer
:Default: ``900``
-``mon osd min down reporters``
+``mon osd min down reporters``
-:Description: The minimum number of Ceph OSD Daemons required to report a
+:Description: The minimum number of Ceph OSD Daemons required to report a
``down`` Ceph OSD Daemon.
:Type: 32-bit Integer
-:Default: ``1``
+:Default: ``2``
+
+
+``mon osd reporter subtree level``
+
+:Description: The level of the parent bucket at which the reporters are counted.
+              The OSDs send failure reports to the monitor if they find that a
+              peer is not responsive, and the monitor marks the reported OSD
+              ``down``, and later ``out``, after the respective grace periods.
+:Type: String
+:Default: ``host``
.. index:: OSD hearbeat
@@ -322,63 +347,61 @@ OSD Settings
``osd heartbeat address``
-:Description: An Ceph OSD Daemon's network address for heartbeats.
+:Description: A Ceph OSD Daemon's network address for heartbeats.
:Type: Address
:Default: The host address.
-``osd heartbeat interval``
+``osd heartbeat interval``
:Description: How often an Ceph OSD Daemon pings its peers (in seconds).
:Type: 32-bit Integer
:Default: ``6``
-``osd heartbeat grace``
+``osd heartbeat grace``
:Description: The elapsed time when a Ceph OSD Daemon hasn't shown a heartbeat
that the Ceph Storage Cluster considers it ``down``.
This setting has to be set in both the [mon] and [osd] or [global]
section so that it is read by both the MON and OSD daemons.
-
:Type: 32-bit Integer
:Default: ``20``
-``osd mon heartbeat interval``
+``osd mon heartbeat interval``
-:Description: How often the Ceph OSD Daemon pings a Ceph Monitor if it has no
+:Description: How often the Ceph OSD Daemon pings a Ceph Monitor if it has no
Ceph OSD Daemon peers.
:Type: 32-bit Integer
-:Default: ``30``
+:Default: ``30``
-``osd mon report interval max``
+``osd mon report interval max``
:Description: The maximum time in seconds that a Ceph OSD Daemon can wait before
it must report to a Ceph Monitor.
:Type: 32-bit Integer
-:Default: ``120``
+:Default: ``120``
-``osd mon report interval min``
+``osd mon report interval min``
:Description: The minimum number of seconds a Ceph OSD Daemon may wait
- from startup or another reportable event before reporting
+ from startup or another reportable event before reporting
to a Ceph Monitor.
:Type: 32-bit Integer
:Default: ``5``
-:Valid Range: Should be less than ``osd mon report interval max``
+:Valid Range: Should be less than ``osd mon report interval max``
-``osd mon ack timeout``
+``osd mon ack timeout``
-:Description: The number of seconds to wait for a Ceph Monitor to acknowledge a
+:Description: The number of seconds to wait for a Ceph Monitor to acknowledge a
request for statistics.
:Type: 32-bit Integer
-:Default: ``30``
-
+:Default: ``30``
diff --git a/doc/rados/configuration/pool-pg-config-ref.rst b/doc/rados/configuration/pool-pg-config-ref.rst
index 9cb2c8f54ff..3c871f1df85 100644
--- a/doc/rados/configuration/pool-pg-config-ref.rst
+++ b/doc/rados/configuration/pool-pg-config-ref.rst
@@ -42,15 +42,95 @@ Ceph configuration file.
:Type: 32-bit Integer
:Default: ``300``
+``mon pg min inactive``
-``osd pg bits``
+:Description: Issue a ``HEALTH_ERR`` in cluster log if the number of PGs that
+              stay inactive longer than ``mon_pg_stuck_threshold`` exceeds this
+              setting. A non-positive number disables this check.
+:Type: Integer
+:Default: ``1``
+
+
+``mon pg warn min per osd``
+
+:Description: Issue a ``HEALTH_WARN`` in cluster log if the average number
+ of PGs per (in) OSD is under this number. (a non-positive number
+ disables this)
+:Type: Integer
+:Default: ``30``
+
+
+``mon pg warn max per osd``
+
+:Description: Issue a ``HEALTH_WARN`` in cluster log if the average number
+ of PGs per (in) OSD is above this number. (a non-positive number
+ disables this)
+:Type: Integer
+:Default: ``300``
+
+
+``mon pg warn min objects``
+
+:Description: Do not warn if the total number of objects in the cluster is
+              below this number.
+:Type: Integer
+:Default: ``1000``
+
+
+``mon pg warn min pool objects``
+
+:Description: Do not warn on pools whose object number is below this number
+:Type: Integer
+:Default: ``1000``
+
+
+``mon pg check down all threshold``
+
+:Description: Percentage threshold of ``down`` OSDs after which we check all
+              PGs for stale ones.
+:Type: Float
+:Default: ``0.5``
+
+
+``mon pg warn max object skew``
+
+:Description: Issue a ``HEALTH_WARN`` in cluster log if the average object
+              count per PG of a certain pool is greater than
+              ``mon pg warn max object skew`` times the average object count
+              per PG of all pools. (a non-positive number disables this)
+:Type: Float
+:Default: ``10``
+
+
+``mon delta reset interval``
+
+:Description: Seconds of inactivity before we reset the pg delta to 0. We
+              keep track of the delta of the used space of each pool so that,
+              for example, it is easier for us to understand the progress of
+              recovery or the performance of the cache tier. But if there is
+              no activity reported for a certain pool, we just reset the
+              history of deltas of that pool.
+:Type: Integer
+:Default: ``10``
+
+
+``mon osd max op age``
+
+:Description: Maximum op age before we get concerned (make it a power of 2).
+ A ``HEALTH_WARN`` will be issued if a request has been blocked longer
+ than this limit.
+:Type: Float
+:Default: ``32.0``
+
+
+``osd pg bits``
:Description: Placement group bits per Ceph OSD Daemon.
:Type: 32-bit Integer
:Default: ``6``
-``osd pgp bits``
+``osd pgp bits``
:Description: The number of bits per Ceph OSD Daemon for PGPs.
:Type: 32-bit Integer
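+
+As a quick reference, the monitor-side PG health options documented above can
+be collected in the ``[mon]`` (or ``[global]``) section of ``ceph.conf``. The
+sketch below just restates the documented defaults to make the option names
+concrete. ::
+
+    [mon]
+    mon pg min inactive = 1
+    mon pg warn min per osd = 30
+    mon pg warn max per osd = 300
+    mon pg warn min objects = 1000
+    mon pg warn min pool objects = 1000
+    mon pg check down all threshold = .5
+    mon pg warn max object skew = 10
+    mon delta reset interval = 10
+    mon osd max op age = 32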
diff --git a/doc/rados/operations/erasure-code.rst b/doc/rados/operations/erasure-code.rst
index 6878777ac72..568f26aec16 100644
--- a/doc/rados/operations/erasure-code.rst
+++ b/doc/rados/operations/erasure-code.rst
@@ -139,7 +139,7 @@ erasure coded pool as the ``--data-pool`` during image creation::
rbd create --size 1G --data-pool ec_pool replicated_pool/image_name
For Cephfs, using an erasure coded pool means setting that pool in
-a `file layout<../../cephfs/file-layouts>`_.
+a `file layout <../../../cephfs/file-layouts>`_.
Erasure coded pool and cache tiering
diff --git a/doc/rados/troubleshooting/log-and-debug.rst b/doc/rados/troubleshooting/log-and-debug.rst
index dabe8412a84..757dfb178e9 100644
--- a/doc/rados/troubleshooting/log-and-debug.rst
+++ b/doc/rados/troubleshooting/log-and-debug.rst
@@ -88,8 +88,6 @@ particular daemons are set under the daemon section in your configuration file
[mds]
debug mds = 1
debug mds balancer = 1
- debug mds log = 1
- debug mds migrator = 1
See `Subsystem, Log and Debug Settings`_ for details.
@@ -171,7 +169,7 @@ as ``debug ms = 1/5``. For example:
debug {subsystem} = {log-level}/{memory-level}
#for example
- debug mds log = 1/20
+ debug mds balancer = 1/20
The following table provides a list of Ceph subsystems and their default log and
diff --git a/doc/radosgw/admin.rst b/doc/radosgw/admin.rst
index 3d3ef4f2519..f1916481bd9 100644
--- a/doc/radosgw/admin.rst
+++ b/doc/radosgw/admin.rst
@@ -211,61 +211,89 @@ Options include:
to the UID.
-Create a Key
-------------
+Add / Remove a Key
+------------------------
+
+Both users and subusers require a key to access the S3 or Swift interface. To
+use S3, the user needs a key pair composed of an access key and a secret key.
+To use Swift, the user typically needs a secret key (password), used together
+with the associated user ID. You may create a key and either specify or
+generate the access key and/or secret key. You may also remove a key. Options
+include:
+
+- ``--key-type=<type>`` specifies the key type. The options are: s3, swift
+- ``--access-key=<key>`` manually specifies an S3 access key.
+- ``--secret-key=<key>`` manually specifies an S3 secret key or a Swift secret key.
+- ``--gen-access-key`` automatically generates an S3 access key.
+- ``--gen-secret`` automatically generates an S3 secret key or a Swift secret key.
-To create a key for a user, you must specify ``key create``. For a user, specify
-the user ID and the ``s3`` key type. To create a key for subuser, you must
-specify the subuser ID and the ``swift`` keytype. For example::
+An example of how to add a specified S3 key pair for a user. ::
+
- radosgw-admin key create --subuser=johndoe:swift --key-type=swift --gen-secret
+ radosgw-admin key create --uid=foo --key-type=s3 --access-key fooAccessKey --secret-key fooSecretKey
.. code-block:: javascript
- { "user_id": "johndoe",
+ { "user_id": "foo",
"rados_uid": 0,
- "display_name": "John Doe",
- "email": "john@example.com",
+ "display_name": "foo",
+ "email": "foo@example.com",
"suspended": 0,
- "subusers": [
- { "id": "johndoe:swift",
- "permissions": "full-control"}],
"keys": [
- { "user": "johndoe",
- "access_key": "QFAMEDSJP5DEKJO0DDXY",
- "secret_key": "iaSFLDVvDdQt6lkNzHyW4fPLZugBAI1g17LO0+87"}],
- "swift_keys": [
- { "user": "johndoe:swift",
- "secret_key": "E9T2rUZNu2gxUjcwUBO8n\/Ev4KX6\/GprEuH4qhu1"}]}
+ { "user": "foo",
+ "access_key": "fooAccessKey",
+        "secret_key": "fooSecretKey"}]}
+
+Note that you may create multiple S3 key pairs for a user.
+
+To attach a specified Swift secret key to a subuser. ::
+
-Add / Remove Access Keys
-------------------------
+ radosgw-admin key create --subuser=foo:bar --key-type=swift --secret-key barSecret
+
+.. code-block:: javascript
-Users and subusers must have access keys to use the S3 and Swift
-interfaces. When you create a user or subuser and you do not specify
-an access key and secret, the key and secret get generated automatically.
-You may create a key and either specify or generate the access key and/or
-secret. You may also remove an access key and secret. Options include:
+ { "user_id": "foo",
+ "rados_uid": 0,
+ "display_name": "foo",
+ "email": "foo@example.com",
+ "suspended": 0,
+ "subusers": [
+ { "id": "foo:bar",
+ "permissions": "full-control"}],
+ "swift_keys": [
+ { "user": "foo:bar",
+ "secret_key": "asfghjghghmgm"}]}
+
+Note that a subuser can have only one Swift secret key.
-- ``--secret=<key>`` specifies a secret key (e.g,. manually generated).
-- ``--gen-access-key`` generates random access key (for S3 user by default).
-- ``--gen-secret`` generates a random secret key.
-- ``--key-type=<type>`` specifies a key type. The options are: swift, s3
+
+Subusers can also be used with S3 APIs if the subuser is associated with an
+S3 key pair. ::
+
+ radosgw-admin key create --subuser=foo:bar --key-type=s3 --access-key barAccessKey --secret-key barSecretKey
+
+.. code-block:: javascript
-To add a key, specify the user. ::
+ { "user_id": "foo",
+ "rados_uid": 0,
+ "display_name": "foo",
+ "email": "foo@example.com",
+ "suspended": 0,
+ "subusers": [
+ { "id": "foo:bar",
+ "permissions": "full-control"}],
+ "keys": [
+ { "user": "foo:bar",
+ "access_key": "barAccessKey",
+        "secret_key": "barSecretKey"}]}
+
- radosgw-admin key create --uid=johndoe --key-type=s3 --gen-access-key --gen-secret
-You may also specify a key and a secret.
+To remove an S3 key pair, specify the access key. ::
+
-To remove an access key, specify the user. ::
+ radosgw-admin key rm --uid=foo --key-type=s3 --access-key=fooAccessKey
- radosgw-admin key rm --uid=johndoe
+To remove the Swift secret key of a subuser, specify the subuser and the key
+type. ::
+
+    radosgw-admin key rm --subuser=foo:bar --key-type=swift
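+
+To let the gateway generate the credentials instead of supplying them,
+combine the ``--gen-access-key`` and ``--gen-secret`` options listed above in
+the same way. For example::
+
+    radosgw-admin key create --uid=foo --key-type=s3 --gen-access-key --gen-secret
+
+The generated pair then appears in the ``keys`` array of the JSON output, as
+in the examples above.
+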
Add / Remove Admin Capabilities
diff --git a/doc/radosgw/adminops.rst b/doc/radosgw/adminops.rst
index 6c69f439740..241c43d10d4 100644
--- a/doc/radosgw/adminops.rst
+++ b/doc/radosgw/adminops.rst
@@ -13,6 +13,10 @@ Get Usage
Request bandwidth usage information.
+Note: this feature is disabled by default; it can be enabled by setting
+``rgw enable usage log = true`` in the appropriate section of ceph.conf. For
+changes in ceph.conf to take effect, the radosgw process must be restarted.
+
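+For instance, the ceph.conf fragment might look like the sketch below; the
+section name ``[client.rgw.gateway]`` is only a placeholder for your actual
+gateway instance::
+
+    [client.rgw.gateway]
+    rgw enable usage log = true
+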
:caps: usage=read
Syntax
@@ -163,6 +167,10 @@ Trim Usage
Remove usage information. With no dates specified, removes all usage
information.
+Note: this feature is disabled by default; it can be enabled by setting
+``rgw enable usage log = true`` in the appropriate section of ceph.conf. For
+changes in ceph.conf to take effect, the radosgw process must be restarted.
+
:caps: usage=write
Syntax
@@ -215,8 +223,7 @@ TBD.
Get User Info
=============
-Get user information. If no user is specified returns the list of all users along with suspension
-information.
+Get user information.
:caps: users=read
diff --git a/doc/radosgw/barbican.rst b/doc/radosgw/barbican.rst
index 557c956042f..3a7fe6e5c18 100644
--- a/doc/radosgw/barbican.rst
+++ b/doc/radosgw/barbican.rst
@@ -67,7 +67,8 @@ In the response, ``d1e7ef3b-f841-4b7c-90b2-b7d90ca2d723`` is the key id that
can be used in any `SSE-KMS`_ request.
This newly created key is not accessible by user ``rgwcrypt-user``. This
-privilege must be added with an ACL.
+privilege must be added with an ACL. See `How to Set/Replace ACL`_ for more
+details.
Example request (assuming that the Keystone id of ``rgwcrypt-user`` is
``906aa90bd8a946c89cdff80d0869460f``)::
diff --git a/doc/radosgw/bucketpolicy.rst b/doc/radosgw/bucketpolicy.rst
new file mode 100644
index 00000000000..85e70556107
--- /dev/null
+++ b/doc/radosgw/bucketpolicy.rst
@@ -0,0 +1,133 @@
+===============
+Bucket Policies
+===============
+
+.. versionadded:: Luminous
+
+The Ceph Object Gateway supports a subset of the Amazon S3 policy
+language applied to buckets.
+
+
+Creation and Removal
+====================
+
+Bucket policies are managed through standard S3 operations rather than
+radosgw-admin.
+
+For example, one may use s3cmd to set or delete a policy thus::
+
+ $ cat > examplepol
+ {
+ "Version": "2012-10-17",
+ "Statement": [{
+ "Effect": "Allow",
+ "Principal": {"AWS": ["arn:aws:iam::usfolks:user/fred"]},
+        "Action": "s3:PutObjectAcl",
+ "Resource": [
+ "arn:aws:s3:::happybucket/*"
+ ]
+ }]
+ }
+
+ $ s3cmd setpolicy examplepol s3://happybucket
+ $ s3cmd delpolicy s3://happybucket
+
+
+Limitations
+===========
+
+Currently, we support only the following actions:
+
+- s3:AbortMultipartUpload
+- s3:CreateBucket
+- s3:DeleteBucketPolicy
+- s3:DeleteBucket
+- s3:DeleteBucketWebsite
+- s3:DeleteObject
+- s3:DeleteObjectVersion
+- s3:DeleteReplicationConfiguration
+- s3:GetAccelerateConfiguration
+- s3:GetBucketAcl
+- s3:GetBucketCORS
+- s3:GetBucketLocation
+- s3:GetBucketLogging
+- s3:GetBucketNotification
+- s3:GetBucketPolicy
+- s3:GetBucketRequestPayment
+- s3:GetBucketTagging
+- s3:GetBucketVersioning
+- s3:GetBucketWebsite
+- s3:GetLifecycleConfiguration
+- s3:GetObjectAcl
+- s3:GetObject
+- s3:GetObjectTorrent
+- s3:GetObjectVersionAcl
+- s3:GetObjectVersion
+- s3:GetObjectVersionTorrent
+- s3:GetReplicationConfiguration
+- s3:ListAllMyBuckets
+- s3:ListBucketMultiPartUploads
+- s3:ListBucket
+- s3:ListBucketVersions
+- s3:ListMultipartUploadParts
+- s3:PutAccelerateConfiguration
+- s3:PutBucketAcl
+- s3:PutBucketCORS
+- s3:PutBucketLogging
+- s3:PutBucketNotification
+- s3:PutBucketPolicy
+- s3:PutBucketRequestPayment
+- s3:PutBucketTagging
+- s3:PutBucketVersioning
+- s3:PutBucketWebsite
+- s3:PutLifecycleConfiguration
+- s3:PutObjectAcl
+- s3:PutObject
+- s3:PutObjectVersionAcl
+- s3:PutReplicationConfiguration
+- s3:RestoreObject
+
+We do not yet support setting policies on users, groups, or roles.
+
+We use the RGW ‘tenant’ identifier in place of the Amazon twelve-digit
+account ID. In the future we may allow you to assign an account ID to
+a tenant, but for now if you want to use policies between AWS S3 and
+RGW S3 you will have to use the Amazon account ID as the tenant ID when
+creating users.
+
+Under AWS, all tenants share a single namespace. RGW gives every
+tenant its own namespace of buckets. There may be an option to enable
+an AWS-like 'flat' bucket namespace in future versions. At present, to
+access a bucket belonging to another tenant, address it as
+"tenant:bucket" in the S3 request.
+
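+For example, with path-style addressing, a request for a bucket owned by
+another tenant might look roughly like the sketch below; the host, tenant,
+and bucket names are illustrative placeholders::
+
+    GET /tenant2:mybucket/ HTTP/1.1
+    Host: rgw.example.com
+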
+In AWS, a bucket policy can grant access to another account, and that
+account owner can then grant access to individual users with user
+permissions. Since we do not yet support user, role, and group
+permissions, account owners will currently need to grant access
+directly to individual users, and granting an entire account access to
+a bucket grants access to all users in that account.
+
+Bucket policies do not yet support string interpolation.
+
+Currently, the only condition keys we support are the following (a usage
+sketch follows the list):
+
+- aws:CurrentTime
+- aws:EpochTime
+- aws:PrincipalType
+- aws:Referer
+- aws:SecureTransport
+- aws:SourceIp
+- aws:UserAgent
+- aws:username
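+
+As a sketch only: condition keys use the standard Amazon S3 policy
+``Condition`` syntax, so limiting a statement such as the one in the earlier
+example to a source network could look like the following fragment (the CIDR
+is illustrative)::
+
+    "Condition": {
+        "IpAddress": {"aws:SourceIp": "192.0.2.0/24"}
+    }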
+
+More may be supported soon as we integrate with the recently rewritten
+Authentication/Authorization subsystem.
+
+Swift
+=====
+
+There is no way to set bucket policies under Swift, but bucket
+policies that have been set govern Swift as well as S3 operations.
+
+Swift credentials are matched against Principals specified in a policy
+in a way specific to whatever backend is being used.
diff --git a/doc/radosgw/index.rst b/doc/radosgw/index.rst
index 7bf08cdbe97..ed26e11971a 100644
--- a/doc/radosgw/index.rst
+++ b/doc/radosgw/index.rst
@@ -51,6 +51,7 @@ you may write data with one API and retrieve it with the other.
Multi-tenancy <multitenancy>
Compression <compression>
Server-Side Encryption <encryption>
+ Bucket Policy <bucketpolicy>
Data Layout in RADOS <layout>
Upgrade to Older Versions of Jewel <upgrade_to_jewel>
troubleshooting
diff --git a/doc/radosgw/multisite.rst b/doc/radosgw/multisite.rst
index e81e9014286..c030ba0d8c8 100644
--- a/doc/radosgw/multisite.rst
+++ b/doc/radosgw/multisite.rst
@@ -63,7 +63,7 @@ gateway instances, one for each Ceph storage cluster.
This guide assumes at least two Ceph storage clusters in geographically
separate locations; however, the configuration can work on the same
-site. This guide also assumes four Ceph object gateway servers named
+site. This guide also assumes two Ceph object gateway servers named
``rgw1`` and ``rgw2``.
A multi-site configuration requires a master zone group and a master
@@ -814,8 +814,8 @@ realm. Alternatively, to change which realm is the default, execute:
# radosgw-admin realm default --rgw-realm=movies
-..note:: When the realm is default, the command line assumes
- ``--rgw-realm=<realm-name>`` as an argument.
+.. note:: When the realm is default, the command line assumes
+ ``--rgw-realm=<realm-name>`` as an argument.
Delete a Realm
~~~~~~~~~~~~~~
diff --git a/doc/radosgw/swift.rst b/doc/radosgw/swift.rst
index 39a911c3272..42f21ffcfce 100644
--- a/doc/radosgw/swift.rst
+++ b/doc/radosgw/swift.rst
@@ -49,7 +49,7 @@ The following table describes the support status for current Swift functional fe
+---------------------------------+-----------------+----------------------------------------+
| **List Objects** | Supported | |
+---------------------------------+-----------------+----------------------------------------+
-| **Static Website** | Not Supported | |
+| **Static Website** | Supported | |
+---------------------------------+-----------------+----------------------------------------+
| **Create Object** | Supported | |
+---------------------------------+-----------------+----------------------------------------+
diff --git a/doc/start/quick-ceph-deploy.rst b/doc/start/quick-ceph-deploy.rst
index 560d50b12c5..e2bf659753c 100644
--- a/doc/start/quick-ceph-deploy.rst
+++ b/doc/start/quick-ceph-deploy.rst
@@ -38,15 +38,12 @@ Create a Cluster
================
If at any point you run into trouble and you want to start over, execute
-the following to purge the configuration::
+the following to purge the Ceph packages and erase all of its data and configuration::
+ ceph-deploy purge {ceph-node} [{ceph-node}]
ceph-deploy purgedata {ceph-node} [{ceph-node}]
ceph-deploy forgetkeys
-To purge the Ceph packages too, you may also execute::
-
- ceph-deploy purge {ceph-node} [{ceph-node}]
-
If you execute ``purge``, you must re-install Ceph.
On your admin node from the directory you created for holding your
diff --git a/install-deps.sh b/install-deps.sh
index 90347622e29..42c34e68f43 100755
--- a/install-deps.sh
+++ b/install-deps.sh
@@ -22,6 +22,7 @@ export LC_ALL=C # the following is vulnerable to i18n
if [ x`uname`x = xFreeBSDx ]; then
$SUDO pkg install -yq \
devel/git \
+ devel/gperf \
devel/gmake \
devel/cmake \
devel/yasm \
diff --git a/mirroring/MIRRORS b/mirroring/MIRRORS
index 24b0b520f43..408a2bb3bf1 100644
--- a/mirroring/MIRRORS
+++ b/mirroring/MIRRORS
@@ -8,3 +8,4 @@ us-east.ceph.com: Tyler Bishop <tyler.bishop@beyondhosting.net>
hk.ceph.com: Mart van Santen <mart@greenhost.nl>
fr.ceph.com: Adrien Gillard <gillard.adrien@gmail.com>
uk.ceph.com: Tim Bishop <T.D.Bishop@kent.ac.uk>
+cn.ceph.com: USTC LUG <lug@ustc.edu.cn>
diff --git a/qa/erasure-code/ec-rados-plugin=isa-k=2-m=1.yaml b/qa/erasure-code/ec-rados-plugin=isa-k=2-m=1.yaml
index 8d7c49785fa..f69963933c5 100644
--- a/qa/erasure-code/ec-rados-plugin=isa-k=2-m=1.yaml
+++ b/qa/erasure-code/ec-rados-plugin=isa-k=2-m=1.yaml
@@ -4,6 +4,7 @@ tasks:
ops: 4000
objects: 50
ec_pool: true
+ min_size: 2
write_append_excl: false
erasure_code_profile:
name: isaprofile
diff --git a/qa/releases/luminous-with-mgr.yaml b/qa/releases/luminous-with-mgr.yaml
new file mode 100644
index 00000000000..880df732b98
--- /dev/null
+++ b/qa/releases/luminous-with-mgr.yaml
@@ -0,0 +1,11 @@
+tasks:
+- exec:
+ osd.0:
+ - ceph osd set require_luminous_osds
+ - ceph osd set-require-min-compat-client luminous
+- ceph.healthy:
+overrides:
+ ceph:
+ conf:
+ mon:
+ mon warn on osd down out interval zero: false
diff --git a/qa/releases/luminous.yaml b/qa/releases/luminous.yaml
index b7ff0dc53b6..22bb78b2d67 100644
--- a/qa/releases/luminous.yaml
+++ b/qa/releases/luminous.yaml
@@ -10,6 +10,7 @@ tasks:
- exec:
osd.0:
- ceph osd set require_luminous_osds
+ - ceph osd set-require-min-compat-client luminous
- ceph.healthy:
overrides:
ceph:
diff --git a/qa/suites/fs/basic_functional/tasks/mds-full.yaml b/qa/suites/fs/basic_functional/tasks/mds-full.yaml
index 6865d83e807..53735001963 100644
--- a/qa/suites/fs/basic_functional/tasks/mds-full.yaml
+++ b/qa/suites/fs/basic_functional/tasks/mds-full.yaml
@@ -8,10 +8,15 @@ overrides:
- failsafe disengaged, no longer dropping
- is full \(reached quota
conf:
+ mon:
+ mon osd nearfull ratio: 0.6
+ mon osd backfillfull ratio: 0.6
+ mon osd full ratio: 0.7
osd:
osd mon report interval max: 5
osd objectstore: memstore
- memstore device bytes: 100000000
+ osd failsafe full ratio: 1.0
+ memstore device bytes: 200000000
client.0:
debug client: 20
debug objecter: 20
diff --git a/qa/suites/krbd/rbd-nomount/tasks/krbd_exclusive_option.yaml b/qa/suites/krbd/rbd-nomount/tasks/krbd_exclusive_option.yaml
new file mode 100644
index 00000000000..567deebfdb7
--- /dev/null
+++ b/qa/suites/krbd/rbd-nomount/tasks/krbd_exclusive_option.yaml
@@ -0,0 +1,5 @@
+tasks:
+- workunit:
+ clients:
+ all:
+ - rbd/krbd_exclusive_option.sh
diff --git a/qa/suites/mgr/basic/% b/qa/suites/krbd/wac/sysfs/%
index e69de29bb2d..e69de29bb2d 100644
--- a/qa/suites/mgr/basic/%
+++ b/qa/suites/krbd/wac/sysfs/%
diff --git a/qa/suites/rados/verify/1thrash/none.yaml b/qa/suites/krbd/wac/sysfs/ceph/ceph.yaml
index 2030acb9083..2030acb9083 100644
--- a/qa/suites/rados/verify/1thrash/none.yaml
+++ b/qa/suites/krbd/wac/sysfs/ceph/ceph.yaml
diff --git a/qa/suites/krbd/wac/sysfs/clusters/fixed-1.yaml b/qa/suites/krbd/wac/sysfs/clusters/fixed-1.yaml
new file mode 120000
index 00000000000..549e8809abe
--- /dev/null
+++ b/qa/suites/krbd/wac/sysfs/clusters/fixed-1.yaml
@@ -0,0 +1 @@
+../../../../../clusters/fixed-1.yaml \ No newline at end of file
diff --git a/qa/suites/krbd/wac/sysfs/conf.yaml b/qa/suites/krbd/wac/sysfs/conf.yaml
new file mode 100644
index 00000000000..8279674dfa7
--- /dev/null
+++ b/qa/suites/krbd/wac/sysfs/conf.yaml
@@ -0,0 +1,7 @@
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+ client:
+ rbd default features: 5
diff --git a/qa/suites/krbd/wac/sysfs/tasks/stable_pages_required.yaml b/qa/suites/krbd/wac/sysfs/tasks/stable_pages_required.yaml
new file mode 100644
index 00000000000..3d23227a022
--- /dev/null
+++ b/qa/suites/krbd/wac/sysfs/tasks/stable_pages_required.yaml
@@ -0,0 +1,5 @@
+tasks:
+- workunit:
+ clients:
+ all:
+ - rbd/krbd_stable_pages_required.sh
diff --git a/qa/suites/rados/thrash/z-require-luminous/at-mkfs.yaml b/qa/suites/krbd/wac/wac/%
index e69de29bb2d..e69de29bb2d 100644
--- a/qa/suites/rados/thrash/z-require-luminous/at-mkfs.yaml
+++ b/qa/suites/krbd/wac/wac/%
diff --git a/qa/suites/krbd/wac/wac/ceph/ceph.yaml b/qa/suites/krbd/wac/wac/ceph/ceph.yaml
new file mode 100644
index 00000000000..2030acb9083
--- /dev/null
+++ b/qa/suites/krbd/wac/wac/ceph/ceph.yaml
@@ -0,0 +1,3 @@
+tasks:
+- install:
+- ceph:
diff --git a/qa/suites/krbd/wac/wac/clusters/fixed-3.yaml b/qa/suites/krbd/wac/wac/clusters/fixed-3.yaml
new file mode 120000
index 00000000000..af987dab6a3
--- /dev/null
+++ b/qa/suites/krbd/wac/wac/clusters/fixed-3.yaml
@@ -0,0 +1 @@
+../../../../../clusters/fixed-3.yaml \ No newline at end of file
diff --git a/qa/suites/krbd/wac/wac/conf.yaml b/qa/suites/krbd/wac/wac/conf.yaml
new file mode 100644
index 00000000000..8279674dfa7
--- /dev/null
+++ b/qa/suites/krbd/wac/wac/conf.yaml
@@ -0,0 +1,7 @@
+overrides:
+ ceph:
+ conf:
+ global:
+ ms die on skipped message: false
+ client:
+ rbd default features: 5
diff --git a/qa/suites/krbd/wac/wac/tasks/wac.yaml b/qa/suites/krbd/wac/wac/tasks/wac.yaml
new file mode 100644
index 00000000000..52dabc38bfc
--- /dev/null
+++ b/qa/suites/krbd/wac/wac/tasks/wac.yaml
@@ -0,0 +1,11 @@
+tasks:
+- exec:
+ client.0:
+ - "dmesg -C"
+- rbd:
+ all:
+ fs_type: ext4
+- workunit:
+ clients:
+ all:
+ - suites/wac.sh
diff --git a/qa/suites/krbd/wac/wac/verify/many-resets.yaml b/qa/suites/krbd/wac/wac/verify/many-resets.yaml
new file mode 100644
index 00000000000..526897e9cda
--- /dev/null
+++ b/qa/suites/krbd/wac/wac/verify/many-resets.yaml
@@ -0,0 +1,10 @@
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 500
+tasks:
+- exec:
+ client.0:
+ - "dmesg | grep -q 'libceph: osd.* socket closed'"
+ - "dmesg | grep -q 'libceph: osd.* socket error on write'"
diff --git a/qa/suites/krbd/wac/wac/verify/no-resets.yaml b/qa/suites/krbd/wac/wac/verify/no-resets.yaml
new file mode 100644
index 00000000000..2728479da77
--- /dev/null
+++ b/qa/suites/krbd/wac/wac/verify/no-resets.yaml
@@ -0,0 +1,5 @@
+tasks:
+- exec:
+ client.0:
+ - "! dmesg | grep -q 'libceph: osd.* socket closed'"
+ - "! dmesg | grep -q 'libceph: osd.* socket error on write'"
diff --git a/qa/suites/mgr/basic/tasks/failover.yaml b/qa/suites/mgr/basic/tasks/failover.yaml
deleted file mode 100644
index b0c7b471ab0..00000000000
--- a/qa/suites/mgr/basic/tasks/failover.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-
-tasks:
- - install:
- - ceph:
- - cephfs_test_runner:
- modules:
- - tasks.mgr.test_failover
diff --git a/qa/suites/multimds/basic/tasks/cephfs_test_exports.yaml b/qa/suites/multimds/basic/tasks/cephfs_test_exports.yaml
new file mode 100644
index 00000000000..b5842b3ef1b
--- /dev/null
+++ b/qa/suites/multimds/basic/tasks/cephfs_test_exports.yaml
@@ -0,0 +1,4 @@
+tasks:
+- cephfs_test_runner:
+ modules:
+ - tasks.cephfs.test_exports
diff --git a/qa/suites/rados/basic/ceph.yaml b/qa/suites/rados/basic/ceph.yaml
new file mode 100644
index 00000000000..2030acb9083
--- /dev/null
+++ b/qa/suites/rados/basic/ceph.yaml
@@ -0,0 +1,3 @@
+tasks:
+- install:
+- ceph:
diff --git a/qa/suites/rados/basic/d-require-luminous b/qa/suites/rados/basic/d-require-luminous
new file mode 120000
index 00000000000..737aee82415
--- /dev/null
+++ b/qa/suites/rados/basic/d-require-luminous
@@ -0,0 +1 @@
+../thrash/d-require-luminous/ \ No newline at end of file
diff --git a/qa/suites/rados/basic/tasks/rados_api_tests.yaml b/qa/suites/rados/basic/tasks/rados_api_tests.yaml
index acfc597dec3..b66423988d7 100644
--- a/qa/suites/rados/basic/tasks/rados_api_tests.yaml
+++ b/qa/suites/rados/basic/tasks/rados_api_tests.yaml
@@ -4,8 +4,6 @@ overrides:
- reached quota
- wrongly marked me down
tasks:
-- install:
-- ceph:
- workunit:
clients:
client.0:
diff --git a/qa/suites/rados/basic/tasks/rados_cls_all.yaml b/qa/suites/rados/basic/tasks/rados_cls_all.yaml
index 34f7cbbb4a0..bbab083e9cd 100644
--- a/qa/suites/rados/basic/tasks/rados_cls_all.yaml
+++ b/qa/suites/rados/basic/tasks/rados_cls_all.yaml
@@ -1,6 +1,12 @@
+overrides:
+ ceph:
+ conf:
+ osd:
+ osd_class_load_list: "cephfs hello journal lock log numops rbd refcount
+ replica_log rgw sdk statelog timeindex user version"
+ osd_class_default_list: "cephfs hello journal lock log numops rbd refcount
+ replica_log rgw sdk statelog timeindex user version"
tasks:
-- install:
-- ceph:
- workunit:
clients:
client.0:
diff --git a/qa/suites/rados/basic/tasks/rados_python.yaml b/qa/suites/rados/basic/tasks/rados_python.yaml
index 00320538ff7..d8b332b343d 100644
--- a/qa/suites/rados/basic/tasks/rados_python.yaml
+++ b/qa/suites/rados/basic/tasks/rados_python.yaml
@@ -1,8 +1,8 @@
-tasks:
-- install:
-- ceph:
+overrides:
+ ceph:
log-whitelist:
- wrongly marked me down
+tasks:
- workunit:
clients:
client.0:
diff --git a/qa/suites/rados/basic/tasks/rados_stress_watch.yaml b/qa/suites/rados/basic/tasks/rados_stress_watch.yaml
index ae2e5fd0083..0e1ba010c5b 100644
--- a/qa/suites/rados/basic/tasks/rados_stress_watch.yaml
+++ b/qa/suites/rados/basic/tasks/rados_stress_watch.yaml
@@ -1,6 +1,4 @@
tasks:
-- install:
-- ceph:
- workunit:
clients:
client.0:
diff --git a/qa/suites/rados/basic/tasks/rados_striper.yaml b/qa/suites/rados/basic/tasks/rados_striper.yaml
index 8537fcbd4db..c19cc83a93b 100644
--- a/qa/suites/rados/basic/tasks/rados_striper.yaml
+++ b/qa/suites/rados/basic/tasks/rados_striper.yaml
@@ -1,6 +1,4 @@
tasks:
-- install:
-- ceph:
- exec:
client.0:
- ceph_test_rados_striper_api_io
diff --git a/qa/suites/rados/basic/tasks/rados_workunit_loadgen_big.yaml b/qa/suites/rados/basic/tasks/rados_workunit_loadgen_big.yaml
index 9432367e356..fa928d12063 100644
--- a/qa/suites/rados/basic/tasks/rados_workunit_loadgen_big.yaml
+++ b/qa/suites/rados/basic/tasks/rados_workunit_loadgen_big.yaml
@@ -3,8 +3,6 @@ overrides:
log-whitelist:
- wrongly marked me down
tasks:
-- install:
-- ceph:
- workunit:
clients:
all:
diff --git a/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mix.yaml b/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mix.yaml
index 7d882cac9c9..35d8fdaf11f 100644
--- a/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mix.yaml
+++ b/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mix.yaml
@@ -3,8 +3,6 @@ overrides:
log-whitelist:
- wrongly marked me down
tasks:
-- install:
-- ceph:
- workunit:
clients:
all:
diff --git a/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mostlyread.yaml b/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mostlyread.yaml
index 69c06b7b049..b3af2a2cac3 100644
--- a/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mostlyread.yaml
+++ b/qa/suites/rados/basic/tasks/rados_workunit_loadgen_mostlyread.yaml
@@ -3,8 +3,6 @@ overrides:
log-whitelist:
- wrongly marked me down
tasks:
-- install:
-- ceph:
- workunit:
clients:
all:
diff --git a/qa/suites/rados/basic/tasks/readwrite.yaml b/qa/suites/rados/basic/tasks/readwrite.yaml
index 38343f316eb..f135107c767 100644
--- a/qa/suites/rados/basic/tasks/readwrite.yaml
+++ b/qa/suites/rados/basic/tasks/readwrite.yaml
@@ -2,11 +2,11 @@ overrides:
ceph:
crush_tunables: optimal
conf:
+ mon:
+ mon osd initial require min compat client: luminous
osd:
osd_discard_disconnected_ops: false
tasks:
-- install:
-- ceph:
- rados:
clients: [client.0]
ops: 4000
diff --git a/qa/suites/rados/basic/tasks/repair_test.yaml b/qa/suites/rados/basic/tasks/repair_test.yaml
index 609f0db6211..bd98522d1e0 100644
--- a/qa/suites/rados/basic/tasks/repair_test.yaml
+++ b/qa/suites/rados/basic/tasks/repair_test.yaml
@@ -21,7 +21,5 @@ overrides:
filestore debug inject read err: true
bluestore debug inject read err: true
tasks:
-- install:
-- ceph:
- repair_test:
diff --git a/qa/suites/rados/basic/tasks/rgw_snaps.yaml b/qa/suites/rados/basic/tasks/rgw_snaps.yaml
index 3e7745aec2a..135db06b8ea 100644
--- a/qa/suites/rados/basic/tasks/rgw_snaps.yaml
+++ b/qa/suites/rados/basic/tasks/rgw_snaps.yaml
@@ -7,8 +7,6 @@ overrides:
osd:
osd_max_omap_entries_per_request: 10
tasks:
-- install:
-- ceph:
- rgw:
default_idle_timeout: 3600
client.0: null
diff --git a/qa/suites/rados/basic/tasks/scrub_test.yaml b/qa/suites/rados/basic/tasks/scrub_test.yaml
index 3b3b69dfad3..1847f80d46b 100644
--- a/qa/suites/rados/basic/tasks/scrub_test.yaml
+++ b/qa/suites/rados/basic/tasks/scrub_test.yaml
@@ -18,6 +18,4 @@ overrides:
osd:
osd deep scrub update digest min age: 0
tasks:
-- install:
-- ceph:
- scrub_test:
diff --git a/qa/suites/rados/basic/z-require-luminous b/qa/suites/rados/basic/z-require-luminous
deleted file mode 120000
index 483b23de56b..00000000000
--- a/qa/suites/rados/basic/z-require-luminous
+++ /dev/null
@@ -1 +0,0 @@
-../thrash/z-require-luminous \ No newline at end of file
diff --git a/qa/suites/rados/mgr/% b/qa/suites/rados/mgr/%
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/rados/mgr/%
diff --git a/qa/suites/mgr/basic/clusters/2-node-mgr.yaml b/qa/suites/rados/mgr/clusters/2-node-mgr.yaml
index fc9dfbdc244..bc950e5afff 100644
--- a/qa/suites/mgr/basic/clusters/2-node-mgr.yaml
+++ b/qa/suites/rados/mgr/clusters/2-node-mgr.yaml
@@ -1,5 +1,5 @@
roles:
-- [mgr.x, mon.a, mds.a, mds.c, osd.0, client.0]
+- [mgr.x, mon.a, mon.c, mds.a, mds.c, osd.0, client.0]
- [mgr.y, mon.b, mds.b, osd.1, osd.2, client.1]
log-rotate:
ceph-mds: 10G
diff --git a/qa/suites/mgr/basic/debug/mgr.yaml b/qa/suites/rados/mgr/debug/mgr.yaml
index 068021eb6c7..068021eb6c7 100644
--- a/qa/suites/mgr/basic/debug/mgr.yaml
+++ b/qa/suites/rados/mgr/debug/mgr.yaml
diff --git a/qa/suites/mgr/basic/objectstore b/qa/suites/rados/mgr/objectstore
index 4c8ebadfde8..4c8ebadfde8 120000
--- a/qa/suites/mgr/basic/objectstore
+++ b/qa/suites/rados/mgr/objectstore
diff --git a/qa/suites/rados/mgr/tasks/failover.yaml b/qa/suites/rados/mgr/tasks/failover.yaml
new file mode 100644
index 00000000000..e02b8bf2cb0
--- /dev/null
+++ b/qa/suites/rados/mgr/tasks/failover.yaml
@@ -0,0 +1,10 @@
+
+tasks:
+ - install:
+ - ceph:
+ # tests may leave mgrs broken, so don't try and call into them
+ # to invoke e.g. pg dump during teardown.
+ wait-for-scrub: false
+ - cephfs_test_runner:
+ modules:
+ - tasks.mgr.test_failover
diff --git a/qa/suites/rados/monthrash/ceph/ceph.yaml b/qa/suites/rados/monthrash/ceph.yaml
index a2c0efc7779..a2c0efc7779 100644
--- a/qa/suites/rados/monthrash/ceph/ceph.yaml
+++ b/qa/suites/rados/monthrash/ceph.yaml
diff --git a/qa/suites/rados/monthrash/d-require-luminous b/qa/suites/rados/monthrash/d-require-luminous
new file mode 120000
index 00000000000..737aee82415
--- /dev/null
+++ b/qa/suites/rados/monthrash/d-require-luminous
@@ -0,0 +1 @@
+../thrash/d-require-luminous/ \ No newline at end of file
diff --git a/qa/suites/rados/monthrash/z-require-luminous b/qa/suites/rados/monthrash/z-require-luminous
deleted file mode 120000
index 483b23de56b..00000000000
--- a/qa/suites/rados/monthrash/z-require-luminous
+++ /dev/null
@@ -1 +0,0 @@
-../thrash/z-require-luminous \ No newline at end of file
diff --git a/qa/suites/rados/multimon/z-require-luminous b/qa/suites/rados/multimon/z-require-luminous
deleted file mode 120000
index 483b23de56b..00000000000
--- a/qa/suites/rados/multimon/z-require-luminous
+++ /dev/null
@@ -1 +0,0 @@
-../thrash/z-require-luminous \ No newline at end of file
diff --git a/qa/suites/rados/singleton-bluestore/% b/qa/suites/rados/singleton-bluestore/%
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/rados/singleton-bluestore/%
diff --git a/qa/suites/rados/singleton/all/cephtool.yaml b/qa/suites/rados/singleton-bluestore/all/cephtool.yaml
index 880628f4fd2..880628f4fd2 100644
--- a/qa/suites/rados/singleton/all/cephtool.yaml
+++ b/qa/suites/rados/singleton-bluestore/all/cephtool.yaml
diff --git a/qa/suites/rados/singleton-bluestore/msgr b/qa/suites/rados/singleton-bluestore/msgr
new file mode 120000
index 00000000000..b29ecddaed7
--- /dev/null
+++ b/qa/suites/rados/singleton-bluestore/msgr
@@ -0,0 +1 @@
+../basic/msgr \ No newline at end of file
diff --git a/qa/suites/rados/singleton-bluestore/msgr-failures/few.yaml b/qa/suites/rados/singleton-bluestore/msgr-failures/few.yaml
new file mode 100644
index 00000000000..0de320d46b8
--- /dev/null
+++ b/qa/suites/rados/singleton-bluestore/msgr-failures/few.yaml
@@ -0,0 +1,5 @@
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 5000
diff --git a/qa/suites/rados/singleton-bluestore/msgr-failures/many.yaml b/qa/suites/rados/singleton-bluestore/msgr-failures/many.yaml
new file mode 100644
index 00000000000..86f8dde8a0e
--- /dev/null
+++ b/qa/suites/rados/singleton-bluestore/msgr-failures/many.yaml
@@ -0,0 +1,5 @@
+overrides:
+ ceph:
+ conf:
+ global:
+ ms inject socket failures: 500
diff --git a/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp.yaml b/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp.yaml
new file mode 120000
index 00000000000..b23b2a79273
--- /dev/null
+++ b/qa/suites/rados/singleton-bluestore/objectstore/bluestore-comp.yaml
@@ -0,0 +1 @@
+../../../../objectstore/bluestore-comp.yaml \ No newline at end of file
diff --git a/qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml b/qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml
new file mode 120000
index 00000000000..bd7d7e004a1
--- /dev/null
+++ b/qa/suites/rados/singleton-bluestore/objectstore/bluestore.yaml
@@ -0,0 +1 @@
+../../../../objectstore/bluestore.yaml \ No newline at end of file
diff --git a/qa/suites/rados/singleton-bluestore/rados.yaml b/qa/suites/rados/singleton-bluestore/rados.yaml
new file mode 120000
index 00000000000..b756e57bcf0
--- /dev/null
+++ b/qa/suites/rados/singleton-bluestore/rados.yaml
@@ -0,0 +1 @@
+../../../config/rados.yaml \ No newline at end of file
diff --git a/qa/suites/rados/singleton-nomsgr/all/health-warnings.yaml b/qa/suites/rados/singleton-nomsgr/all/health-warnings.yaml
new file mode 100644
index 00000000000..eb121161729
--- /dev/null
+++ b/qa/suites/rados/singleton-nomsgr/all/health-warnings.yaml
@@ -0,0 +1,9 @@
+roles:
+- [mon.a, mgr.x, osd.0, osd.1, osd.2, osd.3, osd.4, osd.5, osd.6, osd.7, osd.8, osd.9, client.0]
+tasks:
+- install:
+- ceph:
+- workunit:
+ clients:
+ all:
+ - rados/test_health_warnings.sh
diff --git a/qa/suites/rados/thrash-erasure-code-big/ceph.yaml b/qa/suites/rados/thrash-erasure-code-big/ceph.yaml
new file mode 120000
index 00000000000..a2fd139cbff
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code-big/ceph.yaml
@@ -0,0 +1 @@
+../thrash/ceph.yaml \ No newline at end of file
diff --git a/qa/suites/rados/thrash-erasure-code-big/d-require-luminous b/qa/suites/rados/thrash-erasure-code-big/d-require-luminous
new file mode 120000
index 00000000000..737aee82415
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code-big/d-require-luminous
@@ -0,0 +1 @@
+../thrash/d-require-luminous/ \ No newline at end of file
diff --git a/qa/suites/rados/thrash-erasure-code-big/thrashers/default.yaml b/qa/suites/rados/thrash-erasure-code-big/thrashers/default.yaml
index 5acfcc3ddd9..057a7ecdb6d 100644
--- a/qa/suites/rados/thrash-erasure-code-big/thrashers/default.yaml
+++ b/qa/suites/rados/thrash-erasure-code-big/thrashers/default.yaml
@@ -1,6 +1,5 @@
-tasks:
-- install:
-- ceph:
+overrides:
+ ceph:
log-whitelist:
- wrongly marked me down
- objects unfound and apparently lost
@@ -11,6 +10,7 @@ tasks:
osd scrub min interval: 60
osd scrub max interval: 120
osd max backfills: 6
+tasks:
- thrashosds:
timeout: 1200
chance_pgnum_grow: 1
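
After this recurring overrides/tasks refactor, a thrasher fragment like the
one above takes roughly the shape below (a reconstruction from the hunks
shown; the elided ``conf:``/``osd:`` context lines are assumed from the
sibling fragments):

    overrides:
      ceph:
        log-whitelist:
        - wrongly marked me down
        - objects unfound and apparently lost
        conf:
          osd:
            osd scrub min interval: 60
            osd scrub max interval: 120
            osd max backfills: 6
    tasks:
    - thrashosds:
        timeout: 1200
        chance_pgnum_grow: 1

The install and ceph steps no longer live in each thrasher; they come from
the suite-level ceph.yaml fragments added elsewhere in this change.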
diff --git a/qa/suites/rados/thrash-erasure-code-big/thrashers/fastread.yaml b/qa/suites/rados/thrash-erasure-code-big/thrashers/fastread.yaml
index 5ded6ba7148..ec889775e9e 100644
--- a/qa/suites/rados/thrash-erasure-code-big/thrashers/fastread.yaml
+++ b/qa/suites/rados/thrash-erasure-code-big/thrashers/fastread.yaml
@@ -1,6 +1,5 @@
-tasks:
-- install:
-- ceph:
+overrides:
+ ceph:
log-whitelist:
- wrongly marked me down
- objects unfound and apparently lost
@@ -12,6 +11,7 @@ tasks:
osd scrub min interval: 60
osd scrub max interval: 120
osd max backfills: 2
+tasks:
- thrashosds:
timeout: 1200
chance_pgnum_grow: 1
diff --git a/qa/suites/rados/thrash-erasure-code-big/thrashers/mapgap.yaml b/qa/suites/rados/thrash-erasure-code-big/thrashers/mapgap.yaml
index 67720febbcb..5e93ac08fd0 100644
--- a/qa/suites/rados/thrash-erasure-code-big/thrashers/mapgap.yaml
+++ b/qa/suites/rados/thrash-erasure-code-big/thrashers/mapgap.yaml
@@ -1,5 +1,9 @@
overrides:
ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ - osd_map_cache_size
conf:
mon:
mon min osdmap epochs: 2
@@ -8,12 +12,6 @@ overrides:
osd scrub min interval: 60
osd scrub max interval: 120
tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- - osd_map_cache_size
- thrashosds:
timeout: 1800
chance_pgnum_grow: 1
diff --git a/qa/suites/rados/thrash-erasure-code-big/thrashers/morepggrow.yaml b/qa/suites/rados/thrash-erasure-code-big/thrashers/morepggrow.yaml
index f09ec08cfa9..efda9161e43 100644
--- a/qa/suites/rados/thrash-erasure-code-big/thrashers/morepggrow.yaml
+++ b/qa/suites/rados/thrash-erasure-code-big/thrashers/morepggrow.yaml
@@ -1,6 +1,5 @@
-tasks:
-- install:
-- ceph:
+overrides:
+ ceph:
conf:
osd:
osd scrub min interval: 60
@@ -9,6 +8,7 @@ tasks:
log-whitelist:
- wrongly marked me down
- objects unfound and apparently lost
+tasks:
- thrashosds:
timeout: 1200
chance_pgnum_grow: 3
diff --git a/qa/suites/rados/thrash-erasure-code-big/thrashers/pggrow.yaml b/qa/suites/rados/thrash-erasure-code-big/thrashers/pggrow.yaml
index 1117cddfc1d..772f2093c94 100644
--- a/qa/suites/rados/thrash-erasure-code-big/thrashers/pggrow.yaml
+++ b/qa/suites/rados/thrash-erasure-code-big/thrashers/pggrow.yaml
@@ -1,6 +1,5 @@
-tasks:
-- install:
-- ceph:
+overrides:
+ ceph:
log-whitelist:
- wrongly marked me down
- objects unfound and apparently lost
@@ -8,6 +7,7 @@ tasks:
osd:
osd scrub min interval: 60
osd scrub max interval: 120
+tasks:
- thrashosds:
timeout: 1200
chance_pgnum_grow: 2
diff --git a/qa/suites/rados/thrash-erasure-code-isa/ceph.yaml b/qa/suites/rados/thrash-erasure-code-isa/ceph.yaml
new file mode 120000
index 00000000000..a2fd139cbff
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code-isa/ceph.yaml
@@ -0,0 +1 @@
+../thrash/ceph.yaml \ No newline at end of file
diff --git a/qa/suites/rados/thrash-erasure-code-isa/d-require-luminous b/qa/suites/rados/thrash-erasure-code-isa/d-require-luminous
new file mode 120000
index 00000000000..737aee82415
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code-isa/d-require-luminous
@@ -0,0 +1 @@
+../thrash/d-require-luminous/ \ No newline at end of file
diff --git a/qa/suites/rados/thrash-erasure-code-overwrites/ceph.yaml b/qa/suites/rados/thrash-erasure-code-overwrites/ceph.yaml
new file mode 120000
index 00000000000..a2fd139cbff
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code-overwrites/ceph.yaml
@@ -0,0 +1 @@
+../thrash/ceph.yaml \ No newline at end of file
diff --git a/qa/suites/rados/thrash-erasure-code-overwrites/d-require-luminous b/qa/suites/rados/thrash-erasure-code-overwrites/d-require-luminous
new file mode 120000
index 00000000000..737aee82415
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code-overwrites/d-require-luminous
@@ -0,0 +1 @@
+../thrash/d-require-luminous/ \ No newline at end of file
diff --git a/qa/suites/rados/thrash-erasure-code-shec/ceph.yaml b/qa/suites/rados/thrash-erasure-code-shec/ceph.yaml
new file mode 120000
index 00000000000..a2fd139cbff
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code-shec/ceph.yaml
@@ -0,0 +1 @@
+../thrash/ceph.yaml \ No newline at end of file
diff --git a/qa/suites/rados/thrash-erasure-code-shec/d-require-luminous b/qa/suites/rados/thrash-erasure-code-shec/d-require-luminous
new file mode 120000
index 00000000000..737aee82415
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code-shec/d-require-luminous
@@ -0,0 +1 @@
+../thrash/d-require-luminous/ \ No newline at end of file
diff --git a/qa/suites/rados/thrash-erasure-code-shec/thrashers/default.yaml b/qa/suites/rados/thrash-erasure-code-shec/thrashers/default.yaml
index 2bece1f939b..5bff561a9fd 100644
--- a/qa/suites/rados/thrash-erasure-code-shec/thrashers/default.yaml
+++ b/qa/suites/rados/thrash-erasure-code-shec/thrashers/default.yaml
@@ -1,6 +1,5 @@
-tasks:
-- install:
-- ceph:
+overrides:
+ ceph:
log-whitelist:
- wrongly marked me down
- objects unfound and apparently lost
@@ -11,6 +10,7 @@ tasks:
osd scrub min interval: 60
osd scrub max interval: 120
osd max backfills: 3
+tasks:
- thrashosds:
timeout: 1200
chance_pgnum_grow: 1
diff --git a/qa/suites/rados/thrash-erasure-code/ceph.yaml b/qa/suites/rados/thrash-erasure-code/ceph.yaml
new file mode 100644
index 00000000000..2030acb9083
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code/ceph.yaml
@@ -0,0 +1,3 @@
+tasks:
+- install:
+- ceph:
diff --git a/qa/suites/rados/thrash-erasure-code/d-require-luminous b/qa/suites/rados/thrash-erasure-code/d-require-luminous
new file mode 120000
index 00000000000..737aee82415
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code/d-require-luminous
@@ -0,0 +1 @@
+../thrash/d-require-luminous/ \ No newline at end of file
diff --git a/qa/suites/rados/thrash-erasure-code/thrashers/default.yaml b/qa/suites/rados/thrash-erasure-code/thrashers/default.yaml
index 211d99a64fb..89adf041696 100644
--- a/qa/suites/rados/thrash-erasure-code/thrashers/default.yaml
+++ b/qa/suites/rados/thrash-erasure-code/thrashers/default.yaml
@@ -1,6 +1,5 @@
-tasks:
-- install:
-- ceph:
+overrides:
+ ceph:
log-whitelist:
- wrongly marked me down
- objects unfound and apparently lost
@@ -10,6 +9,7 @@ tasks:
osd scrub min interval: 60
osd scrub max interval: 120
osd max backfills: 2
+tasks:
- thrashosds:
timeout: 1200
chance_pgnum_grow: 1
diff --git a/qa/suites/rados/thrash-erasure-code/thrashers/fastread.yaml b/qa/suites/rados/thrash-erasure-code/thrashers/fastread.yaml
index 8f7c455c693..d0ce8a319f8 100644
--- a/qa/suites/rados/thrash-erasure-code/thrashers/fastread.yaml
+++ b/qa/suites/rados/thrash-erasure-code/thrashers/fastread.yaml
@@ -1,6 +1,5 @@
-tasks:
-- install:
-- ceph:
+overrides:
+ ceph:
log-whitelist:
- wrongly marked me down
- objects unfound and apparently lost
@@ -12,6 +11,7 @@ tasks:
osd scrub min interval: 60
osd scrub max interval: 120
osd max backfills: 3
+tasks:
- thrashosds:
timeout: 1200
chance_pgnum_grow: 1
diff --git a/qa/suites/rados/thrash-erasure-code/thrashers/mapgap.yaml b/qa/suites/rados/thrash-erasure-code/thrashers/mapgap.yaml
index bd448e27285..2822eeffc8a 100644
--- a/qa/suites/rados/thrash-erasure-code/thrashers/mapgap.yaml
+++ b/qa/suites/rados/thrash-erasure-code/thrashers/mapgap.yaml
@@ -8,13 +8,11 @@ overrides:
osd scrub min interval: 60
osd scrub max interval: 120
osd max backfills: 5
-tasks:
-- install:
-- ceph:
log-whitelist:
- wrongly marked me down
- objects unfound and apparently lost
- osd_map_cache_size
+tasks:
- thrashosds:
timeout: 1800
chance_pgnum_grow: 1
diff --git a/qa/suites/rados/thrash-erasure-code/thrashers/morepggrow.yaml b/qa/suites/rados/thrash-erasure-code/thrashers/morepggrow.yaml
index 3fe730673b6..6972cfedf69 100644
--- a/qa/suites/rados/thrash-erasure-code/thrashers/morepggrow.yaml
+++ b/qa/suites/rados/thrash-erasure-code/thrashers/morepggrow.yaml
@@ -1,6 +1,5 @@
-tasks:
-- install:
-- ceph:
+overrides:
+ ceph:
conf:
osd:
osd scrub min interval: 60
@@ -9,6 +8,7 @@ tasks:
log-whitelist:
- wrongly marked me down
- objects unfound and apparently lost
+tasks:
- thrashosds:
timeout: 1200
chance_pgnum_grow: 3
diff --git a/qa/suites/rados/thrash-erasure-code/thrashers/pggrow.yaml b/qa/suites/rados/thrash-erasure-code/thrashers/pggrow.yaml
index ecb239a061e..e40d9729fc7 100644
--- a/qa/suites/rados/thrash-erasure-code/thrashers/pggrow.yaml
+++ b/qa/suites/rados/thrash-erasure-code/thrashers/pggrow.yaml
@@ -1,6 +1,5 @@
-tasks:
-- install:
-- ceph:
+overrides:
+ ceph:
log-whitelist:
- wrongly marked me down
- objects unfound and apparently lost
@@ -9,6 +8,7 @@ tasks:
osd scrub min interval: 60
osd scrub max interval: 120
osd max backfills: 4
+tasks:
- thrashosds:
timeout: 1200
chance_pgnum_grow: 2
diff --git a/qa/suites/rados/thrash-erasure-code/z-require-luminous b/qa/suites/rados/thrash-erasure-code/z-require-luminous
deleted file mode 120000
index 483b23de56b..00000000000
--- a/qa/suites/rados/thrash-erasure-code/z-require-luminous
+++ /dev/null
@@ -1 +0,0 @@
-../thrash/z-require-luminous \ No newline at end of file
diff --git a/qa/suites/rados/thrash/ceph.yaml b/qa/suites/rados/thrash/ceph.yaml
new file mode 100644
index 00000000000..2030acb9083
--- /dev/null
+++ b/qa/suites/rados/thrash/ceph.yaml
@@ -0,0 +1,3 @@
+tasks:
+- install:
+- ceph:
diff --git a/qa/suites/rados/thrash/d-require-luminous/at-end.yaml b/qa/suites/rados/thrash/d-require-luminous/at-end.yaml
new file mode 100644
index 00000000000..3cf78ca5a79
--- /dev/null
+++ b/qa/suites/rados/thrash/d-require-luminous/at-end.yaml
@@ -0,0 +1,23 @@
+# do not require luminous osds at mkfs time; only set flag at
+# the end of the test run, then do a final scrub (to convert any
+# legacy snapsets), and verify we are healthy.
+tasks:
+- full_sequential_finally:
+ - exec:
+ mon.a:
+ - ceph osd set require_luminous_osds
+# make sure osds have latest map
+ - rados -p rbd bench 5 write -b 4096
+ - ceph.osd_scrub_pgs:
+ cluster: ceph
+ - exec:
+ mon.a:
+ - ceph pg dump -f json-pretty
+ - "ceph pg dump sum -f json-pretty | grep num_legacy_snapsets | head -1 | grep ': 0'"
+overrides:
+ ceph:
+ conf:
+ global:
+ mon debug no require luminous: true
+ thrashosds:
+ chance_thrash_cluster_full: 0
diff --git a/qa/suites/rados/thrash/d-require-luminous/at-mkfs.yaml b/qa/suites/rados/thrash/d-require-luminous/at-mkfs.yaml
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/rados/thrash/d-require-luminous/at-mkfs.yaml
diff --git a/qa/suites/rados/thrash/thrashers/default.yaml b/qa/suites/rados/thrash/thrashers/default.yaml
index 5df84bedabe..e2f3c64c146 100644
--- a/qa/suites/rados/thrash/thrashers/default.yaml
+++ b/qa/suites/rados/thrash/thrashers/default.yaml
@@ -1,6 +1,5 @@
-tasks:
-- install:
-- ceph:
+overrides:
+ ceph:
log-whitelist:
- wrongly marked me down
- objects unfound and apparently lost
@@ -11,6 +10,7 @@ tasks:
osd scrub max interval: 120
osd max backfills: 3
osd snap trim sleep: 2
+tasks:
- thrashosds:
timeout: 1200
chance_pgnum_grow: 1
diff --git a/qa/suites/rados/thrash/thrashers/mapgap.yaml b/qa/suites/rados/thrash/thrashers/mapgap.yaml
index 0ac5cfc49c6..b039b619811 100644
--- a/qa/suites/rados/thrash/thrashers/mapgap.yaml
+++ b/qa/suites/rados/thrash/thrashers/mapgap.yaml
@@ -1,5 +1,9 @@
overrides:
ceph:
+ log-whitelist:
+ - wrongly marked me down
+ - objects unfound and apparently lost
+ - osd_map_cache_size
conf:
mon:
mon min osdmap epochs: 2
@@ -10,12 +14,6 @@ overrides:
osd scrub during recovery: false
osd max backfills: 6
tasks:
-- install:
-- ceph:
- log-whitelist:
- - wrongly marked me down
- - objects unfound and apparently lost
- - osd_map_cache_size
- thrashosds:
timeout: 1800
chance_pgnum_grow: 0.25
diff --git a/qa/suites/rados/thrash/thrashers/morepggrow.yaml b/qa/suites/rados/thrash/thrashers/morepggrow.yaml
index a22f80c5845..5565d701e32 100644
--- a/qa/suites/rados/thrash/thrashers/morepggrow.yaml
+++ b/qa/suites/rados/thrash/thrashers/morepggrow.yaml
@@ -1,6 +1,5 @@
-tasks:
-- install:
-- ceph:
+overrides:
+ ceph:
conf:
osd:
osd scrub min interval: 60
@@ -13,6 +12,7 @@ tasks:
log-whitelist:
- wrongly marked me down
- objects unfound and apparently lost
+tasks:
- thrashosds:
timeout: 1200
chance_pgnum_grow: 3
diff --git a/qa/suites/rados/thrash/thrashers/none.yaml b/qa/suites/rados/thrash/thrashers/none.yaml
index 2030acb9083..e69de29bb2d 100644
--- a/qa/suites/rados/thrash/thrashers/none.yaml
+++ b/qa/suites/rados/thrash/thrashers/none.yaml
@@ -1,3 +0,0 @@
-tasks:
-- install:
-- ceph:
diff --git a/qa/suites/rados/thrash/thrashers/pggrow.yaml b/qa/suites/rados/thrash/thrashers/pggrow.yaml
index 30d8957c77b..817bbd48578 100644
--- a/qa/suites/rados/thrash/thrashers/pggrow.yaml
+++ b/qa/suites/rados/thrash/thrashers/pggrow.yaml
@@ -1,6 +1,5 @@
-tasks:
-- install:
-- ceph:
+overrides:
+ ceph:
log-whitelist:
- wrongly marked me down
- objects unfound and apparently lost
@@ -11,6 +10,7 @@ tasks:
filestore odsync write: true
osd max backfills: 2
osd snap trim sleep: .5
+tasks:
- thrashosds:
timeout: 1200
chance_pgnum_grow: 2
diff --git a/qa/suites/rados/thrash/workloads/small-objects.yaml b/qa/suites/rados/thrash/workloads/small-objects.yaml
index ee20bc3c563..f5a18ae6e95 100644
--- a/qa/suites/rados/thrash/workloads/small-objects.yaml
+++ b/qa/suites/rados/thrash/workloads/small-objects.yaml
@@ -1,6 +1,9 @@
overrides:
ceph:
crush_tunables: jewel
+ conf:
+ mon:
+ mon osd initial require min compat client: jewel
tasks:
- rados:
clients: [client.0]
diff --git a/qa/suites/rados/thrash/z-require-luminous/at-end.yaml b/qa/suites/rados/thrash/z-require-luminous/at-end.yaml
deleted file mode 100644
index e3650821395..00000000000
--- a/qa/suites/rados/thrash/z-require-luminous/at-end.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-# do not require luminous osds at mkfs time; only set flag at
-# the end of the test run, then do a final scrub (to convert any
-# legacy snapsets), and verify we are healthy.
-tasks:
-- exec_on_cleanup:
- mon.a:
- - ceph osd set require_luminous_osds
-overrides:
- ceph:
- conf:
- global:
- mon debug no require luminous: true
- thrashosds:
- chance_thrash_cluster_full: 0
diff --git a/qa/suites/rados/verify/ceph.yaml b/qa/suites/rados/verify/ceph.yaml
new file mode 100644
index 00000000000..2030acb9083
--- /dev/null
+++ b/qa/suites/rados/verify/ceph.yaml
@@ -0,0 +1,3 @@
+tasks:
+- install:
+- ceph:
diff --git a/qa/suites/rados/verify/d-require-luminous b/qa/suites/rados/verify/d-require-luminous
new file mode 120000
index 00000000000..737aee82415
--- /dev/null
+++ b/qa/suites/rados/verify/d-require-luminous
@@ -0,0 +1 @@
+../thrash/d-require-luminous/ \ No newline at end of file
diff --git a/qa/suites/rados/verify/1thrash/default.yaml b/qa/suites/rados/verify/d-thrash/default.yaml
index 9435b146af6..d67ff20a693 100644
--- a/qa/suites/rados/verify/1thrash/default.yaml
+++ b/qa/suites/rados/verify/d-thrash/default.yaml
@@ -1,9 +1,9 @@
-tasks:
-- install:
-- ceph:
+overrides:
+ ceph:
log-whitelist:
- wrongly marked me down
- objects unfound and apparently lost
+tasks:
- thrashosds:
timeout: 1200
chance_pgnum_grow: 1
diff --git a/qa/suites/rados/verify/d-thrash/none.yaml b/qa/suites/rados/verify/d-thrash/none.yaml
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/qa/suites/rados/verify/d-thrash/none.yaml
diff --git a/qa/suites/rados/verify/tasks/rados_cls_all.yaml b/qa/suites/rados/verify/tasks/rados_cls_all.yaml
index 853da39ad99..bbab083e9cd 100644
--- a/qa/suites/rados/verify/tasks/rados_cls_all.yaml
+++ b/qa/suites/rados/verify/tasks/rados_cls_all.yaml
@@ -1,3 +1,11 @@
+overrides:
+ ceph:
+ conf:
+ osd:
+ osd_class_load_list: "cephfs hello journal lock log numops rbd refcount
+ replica_log rgw sdk statelog timeindex user version"
+ osd_class_default_list: "cephfs hello journal lock log numops rbd refcount
+ replica_log rgw sdk statelog timeindex user version"
tasks:
- workunit:
clients:
diff --git a/qa/suites/rados/verify/z-require-luminous b/qa/suites/rados/verify/z-require-luminous
deleted file mode 120000
index 483b23de56b..00000000000
--- a/qa/suites/rados/verify/z-require-luminous
+++ /dev/null
@@ -1 +0,0 @@
-../thrash/z-require-luminous \ No newline at end of file
diff --git a/qa/suites/smoke/1node/distros/ubuntu_latest.yaml b/qa/suites/smoke/1node/distros/ubuntu_latest.yaml
new file mode 120000
index 00000000000..21601efb361
--- /dev/null
+++ b/qa/suites/smoke/1node/distros/ubuntu_latest.yaml
@@ -0,0 +1 @@
+../../../../distros/supported/ubuntu_latest.yaml \ No newline at end of file
diff --git a/qa/suites/smoke/1node/filestore-xfs.yaml b/qa/suites/smoke/1node/filestore-xfs.yaml
deleted file mode 120000
index 59ef7e4847e..00000000000
--- a/qa/suites/smoke/1node/filestore-xfs.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore/filestore-xfs.yaml \ No newline at end of file
diff --git a/qa/suites/smoke/1node/objectstore/filestore-xfs.yaml b/qa/suites/smoke/1node/objectstore/filestore-xfs.yaml
new file mode 120000
index 00000000000..1af1dfd77be
--- /dev/null
+++ b/qa/suites/smoke/1node/objectstore/filestore-xfs.yaml
@@ -0,0 +1 @@
+../../../../objectstore/filestore-xfs.yaml \ No newline at end of file
diff --git a/qa/suites/smoke/basic/bluestore.yaml b/qa/suites/smoke/basic/bluestore.yaml
deleted file mode 120000
index e9c941b65f6..00000000000
--- a/qa/suites/smoke/basic/bluestore.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore/bluestore.yaml \ No newline at end of file
diff --git a/qa/suites/smoke/basic/objectstore b/qa/suites/smoke/basic/objectstore
deleted file mode 120000
index 4c8ebadfde8..00000000000
--- a/qa/suites/smoke/basic/objectstore
+++ /dev/null
@@ -1 +0,0 @@
-../../../objectstore \ No newline at end of file
diff --git a/qa/suites/smoke/basic/objectstore/bluestore.yaml b/qa/suites/smoke/basic/objectstore/bluestore.yaml
new file mode 120000
index 00000000000..bd7d7e004a1
--- /dev/null
+++ b/qa/suites/smoke/basic/objectstore/bluestore.yaml
@@ -0,0 +1 @@
+../../../../objectstore/bluestore.yaml \ No newline at end of file
diff --git a/qa/suites/smoke/systemd/objectstore/filestore-xfs.yaml b/qa/suites/smoke/systemd/objectstore/filestore-xfs.yaml
new file mode 120000
index 00000000000..1af1dfd77be
--- /dev/null
+++ b/qa/suites/smoke/systemd/objectstore/filestore-xfs.yaml
@@ -0,0 +1 @@
+../../../../objectstore/filestore-xfs.yaml \ No newline at end of file
diff --git a/qa/suites/upgrade/hammer-jewel-x/stress-split/4-mon b/qa/suites/upgrade/hammer-jewel-x/stress-split/4-mon
deleted file mode 120000
index da3b2aac702..00000000000
--- a/qa/suites/upgrade/hammer-jewel-x/stress-split/4-mon
+++ /dev/null
@@ -1 +0,0 @@
-../../jewel-x/stress-split/4-mon/ \ No newline at end of file
diff --git a/qa/suites/upgrade/hammer-jewel-x/stress-split/4-workload b/qa/suites/upgrade/hammer-jewel-x/stress-split/4-workload
new file mode 120000
index 00000000000..6135fb0ade6
--- /dev/null
+++ b/qa/suites/upgrade/hammer-jewel-x/stress-split/4-workload
@@ -0,0 +1 @@
+../../jewel-x/stress-split/4-workload \ No newline at end of file
diff --git a/qa/suites/upgrade/hammer-jewel-x/stress-split/5-finish-upgrade.yaml b/qa/suites/upgrade/hammer-jewel-x/stress-split/5-finish-upgrade.yaml
new file mode 120000
index 00000000000..7d39ac68eaf
--- /dev/null
+++ b/qa/suites/upgrade/hammer-jewel-x/stress-split/5-finish-upgrade.yaml
@@ -0,0 +1 @@
+../../jewel-x/stress-split/5-finish-upgrade.yaml \ No newline at end of file
diff --git a/qa/suites/upgrade/hammer-jewel-x/stress-split/5-workload b/qa/suites/upgrade/hammer-jewel-x/stress-split/5-workload
deleted file mode 120000
index 6ebbbb48ae9..00000000000
--- a/qa/suites/upgrade/hammer-jewel-x/stress-split/5-workload
+++ /dev/null
@@ -1 +0,0 @@
-../../jewel-x/stress-split/5-workload/ \ No newline at end of file
diff --git a/qa/suites/upgrade/kraken-x/stress-split/6-luminous.yaml b/qa/suites/upgrade/hammer-jewel-x/stress-split/6-luminous.yaml
index 5283ac73e1b..5283ac73e1b 120000
--- a/qa/suites/upgrade/kraken-x/stress-split/6-luminous.yaml
+++ b/qa/suites/upgrade/hammer-jewel-x/stress-split/6-luminous.yaml
diff --git a/qa/suites/upgrade/hammer-jewel-x/stress-split/6-next-mon b/qa/suites/upgrade/hammer-jewel-x/stress-split/6-next-mon
deleted file mode 120000
index 1a19a3ac716..00000000000
--- a/qa/suites/upgrade/hammer-jewel-x/stress-split/6-next-mon
+++ /dev/null
@@ -1 +0,0 @@
-../../jewel-x/stress-split/6-next-mon/ \ No newline at end of file
diff --git a/qa/suites/upgrade/hammer-jewel-x/stress-split/7-final-workload b/qa/suites/upgrade/hammer-jewel-x/stress-split/7-final-workload
new file mode 120000
index 00000000000..97adf26bce5
--- /dev/null
+++ b/qa/suites/upgrade/hammer-jewel-x/stress-split/7-final-workload
@@ -0,0 +1 @@
+../../jewel-x/stress-split/7-final-workload/ \ No newline at end of file
diff --git a/qa/suites/upgrade/hammer-jewel-x/stress-split/7-workload b/qa/suites/upgrade/hammer-jewel-x/stress-split/7-workload
deleted file mode 120000
index 06ebb5fa68d..00000000000
--- a/qa/suites/upgrade/hammer-jewel-x/stress-split/7-workload
+++ /dev/null
@@ -1 +0,0 @@
-../../jewel-x/stress-split/7-workload/ \ No newline at end of file
diff --git a/qa/suites/upgrade/hammer-jewel-x/stress-split/8-next-mon b/qa/suites/upgrade/hammer-jewel-x/stress-split/8-next-mon
deleted file mode 120000
index fa5113db15c..00000000000
--- a/qa/suites/upgrade/hammer-jewel-x/stress-split/8-next-mon
+++ /dev/null
@@ -1 +0,0 @@
-../../jewel-x/stress-split/8-next-mon/ \ No newline at end of file
diff --git a/qa/suites/upgrade/hammer-jewel-x/stress-split/9-workload b/qa/suites/upgrade/hammer-jewel-x/stress-split/9-workload
deleted file mode 120000
index eb67cf79823..00000000000
--- a/qa/suites/upgrade/hammer-jewel-x/stress-split/9-workload
+++ /dev/null
@@ -1 +0,0 @@
-../../jewel-x/stress-split/9-workload/ \ No newline at end of file
diff --git a/qa/suites/upgrade/jewel-x/parallel/1.5-final-scrub.yaml b/qa/suites/upgrade/jewel-x/parallel/1.5-final-scrub.yaml
new file mode 100644
index 00000000000..83457c0166d
--- /dev/null
+++ b/qa/suites/upgrade/jewel-x/parallel/1.5-final-scrub.yaml
@@ -0,0 +1,11 @@
+# do not require luminous osds at mkfs time; only set flag at
+# the end of the test run, then do a final scrub (to convert any
+# legacy snapsets), and verify we are healthy.
+tasks:
+- full_sequential_finally:
+ - ceph.osd_scrub_pgs:
+ cluster: ceph
+ - exec:
+ mon.a:
+ - ceph pg dump -f json-pretty
+ - "ceph pg dump sum -f json-pretty | grep num_legacy_snapsets | head -1 | grep ': 0'"
diff --git a/qa/suites/upgrade/jewel-x/parallel/2-workload/blogbench.yaml b/qa/suites/upgrade/jewel-x/parallel/2-workload/blogbench.yaml
index 021fcc68190..a8e28c52ce0 100644
--- a/qa/suites/upgrade/jewel-x/parallel/2-workload/blogbench.yaml
+++ b/qa/suites/upgrade/jewel-x/parallel/2-workload/blogbench.yaml
@@ -4,11 +4,11 @@ meta:
mount ceph-fuse on client.2 before running workunit
workload:
full_sequential:
- - sequential:
- - ceph-fuse:
- - print: "**** done ceph-fuse 2-workload"
- - workunit:
- clients:
- client.2:
- - suites/blogbench.sh
- - print: "**** done suites/blogbench.sh 2-workload"
+ - sequential:
+ - ceph-fuse:
+ - print: "**** done ceph-fuse 2-workload"
+ - workunit:
+ clients:
+ client.2:
+ - suites/blogbench.sh
+ - print: "**** done suites/blogbench.sh 2-workload"
diff --git a/qa/suites/upgrade/jewel-x/parallel/2-workload/cache-pool-snaps.yaml b/qa/suites/upgrade/jewel-x/parallel/2-workload/cache-pool-snaps.yaml
new file mode 100644
index 00000000000..8e633e69033
--- /dev/null
+++ b/qa/suites/upgrade/jewel-x/parallel/2-workload/cache-pool-snaps.yaml
@@ -0,0 +1,41 @@
+overrides:
+ ceph:
+ log-whitelist:
+ - must scrub before tier agent can activate
+tasks:
+workload:
+ full_sequential:
+ - sequential:
+ - exec:
+ client.0:
+ - sudo ceph osd pool create base 4
+ - sudo ceph osd pool create cache 4
+ - sudo ceph osd tier add base cache
+ - sudo ceph osd tier cache-mode cache writeback
+ - sudo ceph osd tier set-overlay base cache
+ - sudo ceph osd pool set cache hit_set_type bloom
+ - sudo ceph osd pool set cache hit_set_count 8
+ - sudo ceph osd pool set cache hit_set_period 3600
+ - sudo ceph osd pool set cache target_max_objects 250
+ - sudo ceph osd pool set cache min_read_recency_for_promote 0
+ - sudo ceph osd pool set cache min_write_recency_for_promote 0
+ - rados:
+ clients: [client.0]
+ pools: [base]
+ ops: 4000
+ objects: 500
+ pool_snaps: true
+ op_weights:
+ read: 100
+ write: 100
+ delete: 50
+ copy_from: 50
+ flush: 50
+ try_flush: 50
+ evict: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+openstack:
+ - machine:
+ ram: 15000 # MB
diff --git a/qa/suites/upgrade/jewel-x/parallel/2-workload/ec-rados-default.yaml b/qa/suites/upgrade/jewel-x/parallel/2-workload/ec-rados-default.yaml
index 5c5a958804e..fb9d30f12f5 100644
--- a/qa/suites/upgrade/jewel-x/parallel/2-workload/ec-rados-default.yaml
+++ b/qa/suites/upgrade/jewel-x/parallel/2-workload/ec-rados-default.yaml
@@ -4,21 +4,21 @@ meta:
on an erasure-coded pool
workload:
full_sequential:
- - rados:
- clients: [client.0]
- ops: 4000
- objects: 50
- ec_pool: true
- write_append_excl: false
- op_weights:
- read: 100
- write: 0
- append: 100
- delete: 50
- snap_create: 50
- snap_remove: 50
- rollback: 50
- copy_from: 50
- setattr: 25
- rmattr: 25
- - print: "**** done rados ec task"
+ - rados:
+ clients: [client.0]
+ ops: 4000
+ objects: 50
+ ec_pool: true
+ write_append_excl: false
+ op_weights:
+ read: 100
+ write: 0
+ append: 100
+ delete: 50
+ snap_create: 50
+ snap_remove: 50
+ rollback: 50
+ copy_from: 50
+ setattr: 25
+ rmattr: 25
+ - print: "**** done rados ec task"
diff --git a/qa/suites/upgrade/jewel-x/parallel/kraken.yaml b/qa/suites/upgrade/jewel-x/parallel/kraken.yaml
deleted file mode 120000
index a890722e139..00000000000
--- a/qa/suites/upgrade/jewel-x/parallel/kraken.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../../../releases/kraken.yaml \ No newline at end of file
diff --git a/qa/suites/upgrade/jewel-x/point-to-point-x/point-to-point-upgrade.yaml b/qa/suites/upgrade/jewel-x/point-to-point-x/point-to-point-upgrade.yaml
index 23261585076..736ba19e623 100644
--- a/qa/suites/upgrade/jewel-x/point-to-point-x/point-to-point-upgrade.yaml
+++ b/qa/suites/upgrade/jewel-x/point-to-point-x/point-to-point-upgrade.yaml
@@ -172,6 +172,7 @@ workload_x:
client.1:
force-branch: ceph-jewel
rgw_server: client.1
+ scan_for_encryption_keys: false
- print: "**** done s3tests workload_x"
upgrade-sequence_x:
sequential:
diff --git a/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.5-final-scrub.yaml b/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.5-final-scrub.yaml
new file mode 120000
index 00000000000..522db1b80a6
--- /dev/null
+++ b/qa/suites/upgrade/jewel-x/stress-split-erasure-code/1.5-final-scrub.yaml
@@ -0,0 +1 @@
+../parallel/1.5-final-scrub.yaml \ No newline at end of file
diff --git a/qa/suites/upgrade/jewel-x/stress-split/1.5-final-scrub.yaml b/qa/suites/upgrade/jewel-x/stress-split/1.5-final-scrub.yaml
new file mode 120000
index 00000000000..522db1b80a6
--- /dev/null
+++ b/qa/suites/upgrade/jewel-x/stress-split/1.5-final-scrub.yaml
@@ -0,0 +1 @@
+../parallel/1.5-final-scrub.yaml \ No newline at end of file
diff --git a/qa/suites/upgrade/kraken-x/parallel/0-cluster/start.yaml b/qa/suites/upgrade/kraken-x/parallel/0-cluster/start.yaml
index 3b39878fba4..eb5d185e5cc 100644
--- a/qa/suites/upgrade/kraken-x/parallel/0-cluster/start.yaml
+++ b/qa/suites/upgrade/kraken-x/parallel/0-cluster/start.yaml
@@ -6,6 +6,7 @@ meta:
CephFS tests running on client 2,3
roles:
- - mon.a
+ - mgr.x
- mds.a
- osd.0
- osd.1
diff --git a/qa/suites/upgrade/kraken-x/parallel/3-upgrade-sequence/upgrade-all.yaml b/qa/suites/upgrade/kraken-x/parallel/3-upgrade-sequence/upgrade-all.yaml
index 7612ec531fd..cff3a68366d 100644
--- a/qa/suites/upgrade/kraken-x/parallel/3-upgrade-sequence/upgrade-all.yaml
+++ b/qa/suites/upgrade/kraken-x/parallel/3-upgrade-sequence/upgrade-all.yaml
@@ -4,7 +4,7 @@ meta:
upgrade-sequence:
sequential:
- ceph.restart:
- daemons: [mon.a, mon.b, mon.c]
+ daemons: [mon.a, mon.b, mon.c, mgr.x]
- ceph.restart:
daemons: [osd.0, osd.1, osd.2, osd.3]
wait-for-healthy: false
diff --git a/qa/suites/upgrade/kraken-x/parallel/3-upgrade-sequence/upgrade-mon-osd-mds.yaml b/qa/suites/upgrade/kraken-x/parallel/3-upgrade-sequence/upgrade-mon-osd-mds.yaml
index 6db61d26f74..f197de67945 100644
--- a/qa/suites/upgrade/kraken-x/parallel/3-upgrade-sequence/upgrade-mon-osd-mds.yaml
+++ b/qa/suites/upgrade/kraken-x/parallel/3-upgrade-sequence/upgrade-mon-osd-mds.yaml
@@ -13,7 +13,7 @@ upgrade-sequence:
- sleep:
duration: 60
- ceph.restart:
- daemons: [mon.b, mon.c]
+ daemons: [mon.b, mon.c, mgr.x]
wait-for-healthy: true
- sleep:
duration: 60
diff --git a/qa/suites/upgrade/kraken-x/parallel/4-luminous-with-mgr.yaml b/qa/suites/upgrade/kraken-x/parallel/4-luminous-with-mgr.yaml
new file mode 120000
index 00000000000..5c72153e1f4
--- /dev/null
+++ b/qa/suites/upgrade/kraken-x/parallel/4-luminous-with-mgr.yaml
@@ -0,0 +1 @@
+../../../../releases/luminous-with-mgr.yaml \ No newline at end of file
diff --git a/qa/suites/upgrade/kraken-x/parallel/4-luminous.yaml b/qa/suites/upgrade/kraken-x/parallel/4-luminous.yaml
deleted file mode 120000
index 5283ac73e1b..00000000000
--- a/qa/suites/upgrade/kraken-x/parallel/4-luminous.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../../../releases/luminous.yaml \ No newline at end of file
diff --git a/qa/suites/upgrade/kraken-x/parallel/objectstore b/qa/suites/upgrade/kraken-x/parallel/objectstore
index 58b9a18c28e..016cbf9677d 120000
--- a/qa/suites/upgrade/kraken-x/parallel/objectstore
+++ b/qa/suites/upgrade/kraken-x/parallel/objectstore
@@ -1 +1 @@
-../../../../objectstore \ No newline at end of file
+../stress-split/objectstore/ \ No newline at end of file
diff --git a/qa/suites/upgrade/kraken-x/stress-split-erasure-code/6-luminous-with-mgr.yaml b/qa/suites/upgrade/kraken-x/stress-split-erasure-code/6-luminous-with-mgr.yaml
new file mode 120000
index 00000000000..01d44cc0488
--- /dev/null
+++ b/qa/suites/upgrade/kraken-x/stress-split-erasure-code/6-luminous-with-mgr.yaml
@@ -0,0 +1 @@
+../stress-split/6-luminous-with-mgr.yaml \ No newline at end of file
diff --git a/qa/suites/upgrade/kraken-x/stress-split-erasure-code/6-luminous.yaml b/qa/suites/upgrade/kraken-x/stress-split-erasure-code/6-luminous.yaml
deleted file mode 120000
index 2b99d5c36b9..00000000000
--- a/qa/suites/upgrade/kraken-x/stress-split-erasure-code/6-luminous.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../stress-split/6-luminous.yaml \ No newline at end of file
diff --git a/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore b/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore
index 58b9a18c28e..016cbf9677d 120000
--- a/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore
+++ b/qa/suites/upgrade/kraken-x/stress-split-erasure-code/objectstore
@@ -1 +1 @@
-../../../../objectstore \ No newline at end of file
+../stress-split/objectstore/ \ No newline at end of file
diff --git a/qa/suites/upgrade/kraken-x/stress-split/0-cluster/start.yaml b/qa/suites/upgrade/kraken-x/stress-split/0-cluster/start.yaml
index 9aef0e718a6..4f40219b526 100644
--- a/qa/suites/upgrade/kraken-x/stress-split/0-cluster/start.yaml
+++ b/qa/suites/upgrade/kraken-x/stress-split/0-cluster/start.yaml
@@ -10,6 +10,7 @@ roles:
- - mon.a
- mon.b
- mon.c
+ - mgr.x
- osd.0
- osd.1
- osd.2
diff --git a/qa/suites/upgrade/kraken-x/stress-split/2-partial-upgrade/firsthalf.yaml b/qa/suites/upgrade/kraken-x/stress-split/2-partial-upgrade/firsthalf.yaml
index 442dcf105a7..87fa1d5fa2c 100644
--- a/qa/suites/upgrade/kraken-x/stress-split/2-partial-upgrade/firsthalf.yaml
+++ b/qa/suites/upgrade/kraken-x/stress-split/2-partial-upgrade/firsthalf.yaml
@@ -8,5 +8,5 @@ tasks:
osd.0:
- print: "**** done install.upgrade osd.0"
- ceph.restart:
- daemons: [mon.a,mon.b,mon.c,osd.0, osd.1, osd.2]
+ daemons: [mon.a,mon.b,mon.c,mgr.x,osd.0,osd.1,osd.2]
- print: "**** done ceph.restart 1st half"
diff --git a/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml b/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml
index 955a855b00c..73449101449 100644
--- a/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml
+++ b/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml
@@ -19,4 +19,5 @@ stress-tasks:
chance_thrash_cluster_full: 0
chance_thrash_pg_upmap: 0
chance_thrash_pg_upmap_items: 0
+ disable_objectstore_tool_tests: true
- print: "**** done thrashosds 3-thrash"
diff --git a/qa/suites/upgrade/kraken-x/stress-split/5-finish-upgrade.yaml b/qa/suites/upgrade/kraken-x/stress-split/5-finish-upgrade.yaml
index 9693b680d03..1d528cd5de7 100644
--- a/qa/suites/upgrade/kraken-x/stress-split/5-finish-upgrade.yaml
+++ b/qa/suites/upgrade/kraken-x/stress-split/5-finish-upgrade.yaml
@@ -1,6 +1,7 @@
tasks:
- install.upgrade:
osd.3:
+ client.0:
- ceph.restart:
daemons: [osd.3, osd.4, osd.5]
wait-for-healthy: false
diff --git a/qa/suites/upgrade/kraken-x/stress-split/6-luminous-with-mgr.yaml b/qa/suites/upgrade/kraken-x/stress-split/6-luminous-with-mgr.yaml
new file mode 120000
index 00000000000..5c72153e1f4
--- /dev/null
+++ b/qa/suites/upgrade/kraken-x/stress-split/6-luminous-with-mgr.yaml
@@ -0,0 +1 @@
+../../../../releases/luminous-with-mgr.yaml \ No newline at end of file
diff --git a/qa/suites/upgrade/kraken-x/stress-split/objectstore b/qa/suites/upgrade/kraken-x/stress-split/objectstore
deleted file mode 120000
index 58b9a18c28e..00000000000
--- a/qa/suites/upgrade/kraken-x/stress-split/objectstore
+++ /dev/null
@@ -1 +0,0 @@
-../../../../objectstore \ No newline at end of file
diff --git a/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml b/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml
new file mode 120000
index 00000000000..d6445987d07
--- /dev/null
+++ b/qa/suites/upgrade/kraken-x/stress-split/objectstore/bluestore.yaml
@@ -0,0 +1 @@
+../../../../../objectstore/bluestore.yaml \ No newline at end of file
diff --git a/qa/suites/upgrade/kraken-x/stress-split/objectstore/filestore-xfs.yaml b/qa/suites/upgrade/kraken-x/stress-split/objectstore/filestore-xfs.yaml
new file mode 120000
index 00000000000..03750e5adeb
--- /dev/null
+++ b/qa/suites/upgrade/kraken-x/stress-split/objectstore/filestore-xfs.yaml
@@ -0,0 +1 @@
+../../../../../objectstore/filestore-xfs.yaml \ No newline at end of file
diff --git a/qa/tasks/ceph_deploy.py b/qa/tasks/ceph_deploy.py
index 44fb6c19415..3e5d2aba573 100644
--- a/qa/tasks/ceph_deploy.py
+++ b/qa/tasks/ceph_deploy.py
@@ -567,14 +567,20 @@ def cli_test(ctx, config):
sudo=True)
new_mon_install = 'install {branch} --mon '.format(
branch=test_branch) + nodename
+ new_mgr_install = 'install {branch} --mgr '.format(
+ branch=test_branch) + nodename
new_osd_install = 'install {branch} --osd '.format(
branch=test_branch) + nodename
new_admin = 'install {branch} --cli '.format(branch=test_branch) + nodename
create_initial = 'mon create-initial '
+ # push the admin keyring to the node (either create-keys or this push command works)
+ push_keys = 'admin ' + nodename
execute_cdeploy(admin, new_mon_install, path)
+ execute_cdeploy(admin, new_mgr_install, path)
execute_cdeploy(admin, new_osd_install, path)
execute_cdeploy(admin, new_admin, path)
execute_cdeploy(admin, create_initial, path)
+ execute_cdeploy(admin, push_keys, path)
for i in range(3):
zap_disk = 'disk zap ' + "{n}:{d}".format(n=nodename, d=devs[i])
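
The cli_test changes above add a mgr install step and an explicit admin-key push after mon create-initial. A rough, hypothetical sketch of the underlying ceph-deploy invocations (node name and branch flag are examples, not taken from the test):

    ceph-deploy install --dev=master --mgr node1   # new: install mgr packages
    ceph-deploy mon create-initial
    ceph-deploy admin node1                        # new: push the admin keyring
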
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index b7762b866c9..a40bd6c7bca 100644
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -649,8 +649,11 @@ class Thrasher:
"""
if pool is None:
pool = self.ceph_manager.get_pool()
+ force = False
+ else:
+ force = True
self.log("fixing pg num pool %s" % (pool,))
- if self.ceph_manager.set_pool_pgpnum(pool):
+ if self.ceph_manager.set_pool_pgpnum(pool, force):
self.pools_to_fix_pgp_num.discard(pool)
def test_pool_min_size(self):
@@ -1671,14 +1674,14 @@ class CephManager:
self.pools[pool_name] = new_pg_num
return True
- def set_pool_pgpnum(self, pool_name):
+ def set_pool_pgpnum(self, pool_name, force):
"""
Set pgpnum property of pool_name pool.
"""
with self.lock:
assert isinstance(pool_name, basestring)
assert pool_name in self.pools
- if self.get_num_creating() > 0:
+ if not force and self.get_num_creating() > 0:
return False
self.set_pool_property(pool_name, 'pgp_num', self.pools[pool_name])
return True
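
With the new force flag, the thrasher fixes pgp_num for an explicitly requested pool even while PGs are still being created; only the pick-a-random-pool case still defers. set_pool_property ultimately issues the plain CLI call, roughly (pool name and value are hypothetical):

    ceph osd pool set rbd pgp_num 64    # align pgp_num with the pool's pg_num
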
diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
index 273bb39ebe4..01e1ca588c1 100644
--- a/qa/tasks/cephfs/filesystem.py
+++ b/qa/tasks/cephfs/filesystem.py
@@ -456,6 +456,15 @@ class Filesystem(MDSCluster):
data_pool_name, pgs_per_fs_pool.__str__())
self.mon_manager.raw_cluster_cmd('fs', 'new',
self.name, self.metadata_pool_name, data_pool_name)
+ # Turn off spurious standby count warnings from modifying max_mds in tests.
+ try:
+ self.mon_manager.raw_cluster_cmd('fs', 'set', self.name, 'standby_count_wanted', '0')
+ except CommandFailedError as e:
+ if e.exitstatus == 22:
+ # standby_count_wanted not available prior to luminous (upgrade tests would fail otherwise)
+ pass
+ else:
+ raise
self.getinfo(refresh = True)
@@ -788,7 +797,7 @@ class Filesystem(MDSCluster):
return result
- def wait_for_state(self, goal_state, reject=None, timeout=None, mds_id=None):
+ def wait_for_state(self, goal_state, reject=None, timeout=None, mds_id=None, rank=None):
"""
Block until the MDS reaches a particular state, or a failure condition
is met.
@@ -805,7 +814,11 @@ class Filesystem(MDSCluster):
started_at = time.time()
while True:
status = self.status()
- if mds_id is not None:
+ if rank is not None:
+ mds_info = status.get_rank(self.id, rank)
+ current_state = mds_info['state'] if mds_info else None
+ log.info("Looked up MDS state for mds.{0}: {1}".format(rank, current_state))
+ elif mds_id is not None:
# mds_info is None if no daemon with this ID exists in the map
mds_info = status.get_mds(mds_id)
current_state = mds_info['state'] if mds_info else None
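
Two additions here: newly created test filesystems set standby_count_wanted to 0, tolerating EINVAL (22) from pre-luminous monitors, and wait_for_state can now poll a rank rather than a daemon id. The monitor command behind the first change is simply (filesystem name assumed to be cephfs):

    ceph fs set cephfs standby_count_wanted 0
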
diff --git a/qa/tasks/cephfs/mount.py b/qa/tasks/cephfs/mount.py
index 5bef25ca662..a86640ebef1 100644
--- a/qa/tasks/cephfs/mount.py
+++ b/qa/tasks/cephfs/mount.py
@@ -436,36 +436,6 @@ class CephFSMount(object):
self._kill_background(p)
self.background_procs.remove(p)
- def spam_dir_background(self, path):
- """
- Create directory `path` and do lots of metadata operations
- in it until further notice.
- """
- assert(self.is_mounted())
- abs_path = os.path.join(self.mountpoint, path)
-
- pyscript = dedent("""
- import sys
- import time
- import os
-
- abs_path = "{abs_path}"
-
- if not os.path.exists(abs_path):
- os.makedirs(abs_path)
-
- n = 0
- while True:
- file_path = os.path.join(abs_path, "tmp%d" % n)
- f = open(file_path, 'w')
- f.close()
- n = n + 1
- """).format(abs_path=abs_path)
-
- rproc = self._run_python(pyscript)
- self.background_procs.append(rproc)
- return rproc
-
def get_global_id(self):
raise NotImplementedError()
diff --git a/qa/tasks/cephfs/test_data_scan.py b/qa/tasks/cephfs/test_data_scan.py
index 9b92216b984..c30f5fa53ef 100644
--- a/qa/tasks/cephfs/test_data_scan.py
+++ b/qa/tasks/cephfs/test_data_scan.py
@@ -425,20 +425,11 @@ class TestDataScan(CephFSTestCase):
self.fs.wait_for_daemons()
if other_pool:
for mds_id in self.fs.mds_ids:
- self.wait_until_equal(
- lambda: get_state(mds_id),
- "up:active",
- timeout=60)
- self.fs.mon_manager.raw_cluster_cmd('tell', 'mds.a',
- 'injectargs', '--debug-mds=20')
- self.fs.mon_manager.raw_cluster_cmd('tell', 'mds.b',
- 'injectargs', '--debug-mds=20')
- self.fs.mon_manager.raw_cluster_cmd('daemon', 'mds.a',
- 'scrub_path', '/',
- 'recursive', 'repair')
- self.fs.mon_manager.raw_cluster_cmd('daemon', 'mds.b',
- 'scrub_path', '/',
- 'recursive', 'repair')
+ self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + mds_id,
+ 'injectargs', '--debug-mds=20')
+ self.fs.mon_manager.raw_cluster_cmd('daemon', "mds." + mds_id,
+ 'scrub_path', '/',
+ 'recursive', 'repair')
log.info(str(self.mds_cluster.status()))
# Mount a client
diff --git a/qa/tasks/cephfs/test_exports.py b/qa/tasks/cephfs/test_exports.py
new file mode 100644
index 00000000000..7cbf464a953
--- /dev/null
+++ b/qa/tasks/cephfs/test_exports.py
@@ -0,0 +1,103 @@
+import logging
+import time
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+class TestExports(CephFSTestCase):
+ def _wait_subtrees(self, status, rank, test):
+ timeout = 30
+ pause = 2
+ test = sorted(test)
+ for i in range(timeout/pause):
+ subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, rank)['name'])
+ subtrees = filter(lambda s: s['dir']['path'].startswith('/'), subtrees)
+ log.info(subtrees)
+ filtered = sorted([(s['dir']['path'], s['auth_first']) for s in subtrees])
+ log.info(filtered)
+ if filtered == test:
+ return subtrees
+ time.sleep(pause)
+ raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank))
+
+ def test_export_pin(self):
+ self.fs.set_allow_multimds(True)
+ self.fs.set_max_mds(2)
+
+ status = self.fs.status()
+
+ self.mount_a.run_shell(["mkdir", "-p", "1/2/3"])
+ self._wait_subtrees(status, 0, [])
+
+ # NOP
+ self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "-1", "1"])
+ self._wait_subtrees(status, 0, [])
+
+ # NOP (rank < -1)
+ self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "-2341", "1"])
+ self._wait_subtrees(status, 0, [])
+
+ # pin /1 to rank 1
+ self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", "1"])
+ self._wait_subtrees(status, 1, [('/1', 1)])
+
+ # Check export_targets is set properly
+ status = self.fs.status()
+ log.info(status)
+ r0 = status.get_rank(self.fs.id, 0)
+ self.assertTrue(sorted(r0['export_targets']) == [1])
+
+ # redundant pin /1/2 to rank 1
+ self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", "1/2"])
+ self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1)])
+
+ # change pin /1/2 to rank 0
+ self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "0", "1/2"])
+ self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 0)])
+ self._wait_subtrees(status, 0, [('/1', 1), ('/1/2', 0)])
+
+ # change pin /1/2/3 to (presently) non-existent rank 2
+ self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "2", "1/2/3"])
+ self._wait_subtrees(status, 0, [('/1', 1), ('/1/2', 0)])
+ self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 0)])
+
+ # change pin /1/2 back to rank 1
+ self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", "1/2"])
+ self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1)])
+
+ # add another directory pinned to 1
+ self.mount_a.run_shell(["mkdir", "-p", "1/4/5"])
+ self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", "1/4/5"])
+ self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1), ('/1/4/5', 1)])
+
+ # change pin /1 to 0
+ self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "0", "1"])
+ self._wait_subtrees(status, 0, [('/1', 0), ('/1/2', 1), ('/1/4/5', 1)])
+
+ # change pin /1/2 to default (-1); does the subtree root properly respect its parent pin?
+ self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "-1", "1/2"])
+ self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1)])
+
+ if len(list(status.get_standbys())):
+ self.fs.set_max_mds(3)
+ self.fs.wait_for_state('up:active', rank=2)
+ self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2)])
+
+ # Check export_targets is set properly
+ status = self.fs.status()
+ log.info(status)
+ r0 = status.get_rank(self.fs.id, 0)
+ self.assertTrue(sorted(r0['export_targets']) == [1,2])
+ r1 = status.get_rank(self.fs.id, 1)
+ self.assertTrue(sorted(r1['export_targets']) == [0])
+ r2 = status.get_rank(self.fs.id, 2)
+ self.assertTrue(sorted(r2['export_targets']) == [])
+
+ # Test rename
+ self.mount_a.run_shell(["mkdir", "-p", "a/b", "aa/bb"])
+ self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", "a"])
+ self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "0", "aa/bb"])
+ self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2), ('/a', 1), ('/aa/bb', 0)])
+ self.mount_a.run_shell(["mv", "aa", "a/b/"])
+ self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2), ('/a', 1), ('/a/b/aa/bb', 0)])
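
The new TestExports cases drive subtree export pinning entirely through the ceph.dir.pin virtual xattr. The same operations from any CephFS mount look roughly like this (mountpoint hypothetical):

    cd /mnt/cephfs
    mkdir -p 1/2/3
    setfattr -n ceph.dir.pin -v 1 1       # pin /1 and everything under it to rank 1
    setfattr -n ceph.dir.pin -v 0 1/2     # override: pin /1/2 to rank 0
    setfattr -n ceph.dir.pin -v -1 1/2    # -1 reverts to inheriting the parent's pin
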
diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py
index 85ef2b509fb..faefec458d6 100644
--- a/qa/tasks/cephfs/test_failover.py
+++ b/qa/tasks/cephfs/test_failover.py
@@ -94,6 +94,47 @@ class TestFailover(CephFSTestCase):
with self.assertRaises(CommandFailedError):
self.mounts[0].mount()
+ def test_standby_count_wanted(self):
+ """
+ That cluster health warnings are generated when insufficient standby daemons are available.
+ """
+
+ # Need all my standbys up as well as the active daemons
+ self.wait_for_daemon_start()
+
+ grace = int(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+ standbys = self.mds_cluster.get_standby_daemons()
+ self.assertGreaterEqual(len(standbys), 1)
+ self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)))
+
+ # Kill a standby and check for warning
+ victim = standbys.pop()
+ self.fs.mds_stop(victim)
+ log.info("waiting for insufficient standby daemon warning")
+ self.wait_for_health("insufficient standby daemons available", grace*2)
+
+ # restart the victim, see that it becomes a standby again, check that the health warning clears
+ self.fs.mds_restart(victim)
+ self.wait_until_true(
+ lambda: victim in self.mds_cluster.get_standby_daemons(),
+ timeout=60 # Approximately long enough for MDS to start and mon to notice
+ )
+ self.wait_for_health_clear(timeout=30)
+
+ # Set it to one more than the number of standbys ever seen
+ standbys = self.mds_cluster.get_standby_daemons()
+ self.assertGreaterEqual(len(standbys), 1)
+ self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1))
+ log.info("waiting for insufficient standby daemon warning")
+ self.wait_for_health("insufficient standby daemons available", grace*2)
+
+ # Set it to 0
+ self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
+ self.wait_for_health_clear(timeout=30)
+
+
+
class TestStandbyReplay(CephFSTestCase):
MDSS_REQUIRED = 4
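
The warning exercised above can be reproduced by hand: raise standby_count_wanted above the number of live standbys, watch ceph health, then set it back to 0 to clear it (filesystem name assumed):

    ceph fs set cephfs standby_count_wanted 2
    ceph health    # expect: insufficient standby daemons available
    ceph fs set cephfs standby_count_wanted 0
    ceph health    # HEALTH_OK again
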
diff --git a/qa/tasks/cephfs/test_full.py b/qa/tasks/cephfs/test_full.py
index 3e599566744..e69ccb373b9 100644
--- a/qa/tasks/cephfs/test_full.py
+++ b/qa/tasks/cephfs/test_full.py
@@ -405,8 +405,7 @@ class TestClusterFull(FullnessTestCase):
# `max_avail` attribute of pools that sometimes occurs in between
# tests (reason as yet unclear, but this dodges the issue)
TestClusterFull.pool_capacity = self.fs.get_pool_df(self._data_pool_name())['max_avail']
- mon_osd_full_ratio = float(self.fs.get_config("mon_osd_full_ratio"))
- TestClusterFull.fill_mb = int(1.05 * mon_osd_full_ratio * (self.pool_capacity / (1024.0 * 1024.0)))
+ TestClusterFull.fill_mb = int(1.05 * (self.pool_capacity / (1024.0 * 1024.0)))
def is_full(self):
return self.fs.is_full()
diff --git a/qa/tasks/cephfs/test_journal_repair.py b/qa/tasks/cephfs/test_journal_repair.py
index 8496b144e1e..1b03afc0fc4 100644
--- a/qa/tasks/cephfs/test_journal_repair.py
+++ b/qa/tasks/cephfs/test_journal_repair.py
@@ -173,26 +173,33 @@ class TestJournalRepair(CephFSTestCase):
self.mds_cluster.mds_stop(unneeded_mds)
self.mds_cluster.mds_fail(unneeded_mds)
- # Do a bunch of I/O such that at least some will hit the second MDS: create
- # lots of directories so that the balancer should find it easy to make a decision
- # to allocate some of them to the second mds.
- spammers = []
- for n in range(0, 16):
- dir_name = "spam_{0}".format(n)
- spammers.append(self.mount_a.spam_dir_background(dir_name))
+ # Create a dir on each rank
+ self.mount_a.run_shell(["mkdir", "alpha"])
+ self.mount_a.run_shell(["mkdir", "bravo"])
+ self.mount_a.setfattr("alpha/", "ceph.dir.pin", "0")
+ self.mount_a.setfattr("bravo/", "ceph.dir.pin", "1")
def subtrees_assigned():
got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])
- rank_1_count = len([s for s in got_subtrees if s['auth_first'] == 1])
- # Greater than 1, because there is typically 1 for ~mds1, and once it
- # has been assigned something in addition to that it means it has been
- # assigned a "real" subtree.
- return rank_1_count > 1
+ for s in got_subtrees:
+ if s['dir']['path'] == '/bravo':
+ if s['auth_first'] == 1:
+ return True
+ else:
+ # Should not happen
+ raise RuntimeError("/bravo is subtree but not rank 1!")
- # We are waiting for the MDS to respond to hot directories, which
- # is not guaranteed to happen at a particular time, so a lengthy timeout here.
- self.wait_until_true(subtrees_assigned, 600)
+ return False
+
+ # Ensure the pinning has taken effect and the /bravo dir is now
+ # migrated to rank 1.
+ self.wait_until_true(subtrees_assigned, 30)
+
+ # Do some IO (this should be split across ranks according to
+ # the rank-pinned dirs)
+ self.mount_a.create_n_files("alpha/file", 1000)
+ self.mount_a.create_n_files("bravo/file", 1000)
# Flush the journals so that we have some backing store data
# belonging to one MDS, and some to the other MDS.
@@ -229,16 +236,6 @@ class TestJournalRepair(CephFSTestCase):
# killing the mount also means killing the node.
pass
- log.info("Terminating spammer processes...")
- for spammer_proc in spammers:
- spammer_proc.stdin.close()
- try:
- spammer_proc.wait()
- except (CommandFailedError, ConnectionLostError):
- # The ConnectionLostError case is for kernel client, where
- # killing the mount also means killing the node.
- pass
-
# See that the second MDS will crash when it starts and tries to
# acquire rank 1
damaged_id = active_mds_names[1]
diff --git a/qa/tasks/rados.py b/qa/tasks/rados.py
index 2ef542fd278..eceff90381c 100644
--- a/qa/tasks/rados.py
+++ b/qa/tasks/rados.py
@@ -29,6 +29,8 @@ def task(ctx, config):
runs: <number of times to run> - the pool is remade between runs
ec_pool: use an ec pool
erasure_code_profile: profile to use with the erasure coded pool
+ fast_read: enable ec_pool's fast_read
+ min_size: set the min_size of the created pool
pool_snaps: use pool snapshots instead of selfmanaged snapshots
    write_fadvise_dontneed: write behavior as with LIBRADOS_OP_FLAG_FADVISE_DONTNEED.
        This means the data will not be accessed again in the near future.
@@ -233,6 +235,10 @@ def task(ctx, config):
if config.get('fast_read', False):
manager.raw_cluster_cmd(
'osd', 'pool', 'set', pool, 'fast_read', 'true')
+ min_size = config.get('min_size', None)
+ if min_size is not None:
+ manager.raw_cluster_cmd(
+ 'osd', 'pool', 'set', pool, 'min_size', str(min_size))
(remote,) = ctx.cluster.only(role).remotes.iterkeys()
proc = remote.run(
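
The new min_size knob, like the existing fast_read one, is applied to the pool before the workload starts; both reduce to plain pool-property calls (pool name hypothetical):

    ceph osd pool set unique_pool_0 min_size 2
    ceph osd pool set unique_pool_0 fast_read true
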
diff --git a/qa/tasks/radosgw_admin.py b/qa/tasks/radosgw_admin.py
index a08a5ddd44d..033d4d29820 100644
--- a/qa/tasks/radosgw_admin.py
+++ b/qa/tasks/radosgw_admin.py
@@ -69,13 +69,13 @@ def task(ctx, config):
global log
# regions and config found from rgw task
- assert ctx.rgw.regions
- "tasks radosgw_admin needs region(s) declared from the rgw task"
+ assert ctx.rgw.regions is not None, \
+ "radosgw_admin task needs region(s) declared from the rgw task"
regions = ctx.rgw.regions
log.debug('regions are: %r', regions)
- assert ctx.rgw.config
- "tasks radosgw_admin needs a config passed from the rgw task"
+ assert ctx.rgw.config, \
+ "radosgw_admin task needs a config passed from the rgw task"
config = ctx.rgw.config
log.debug('config is: %r', config)
diff --git a/qa/tasks/s3tests.py b/qa/tasks/s3tests.py
index 4fbea2f64c8..762eff26758 100644
--- a/qa/tasks/s3tests.py
+++ b/qa/tasks/s3tests.py
@@ -381,13 +381,15 @@ def scan_for_leaked_encryption_keys(ctx, config):
for client, client_config in config.iteritems():
if not client_config.get('scan_for_encryption_keys', True):
continue
+ cluster_name, daemon_type, client_id = teuthology.split_role(client)
+ client_with_cluster = '.'.join((cluster_name, daemon_type, client_id))
(remote,) = ctx.cluster.only(client).remotes.keys()
proc = remote.run(
args=[
'grep',
'--binary-files=text',
s3test_customer_key,
- '/var/log/ceph/rgw.{client}.log'.format(client=client),
+ '/var/log/ceph/rgw.{client}.log'.format(client=client_with_cluster),
],
wait=False,
check_status=False,
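
With split_role in play the scanned log path gains the cluster name, so for role client.0 in the default 'ceph' cluster the scan now greps rgw.ceph.client.0.log instead of rgw.client.0.log; roughly (the key variable is a stand-in for the test's s3test_customer_key):

    grep --binary-files=text "$S3TEST_CUSTOMER_KEY" /var/log/ceph/rgw.ceph.client.0.log
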
diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh
index 7f539a54498..92a8db1f3c5 100755
--- a/qa/workunits/cephtool/test.sh
+++ b/qa/workunits/cephtool/test.sh
@@ -394,6 +394,7 @@ function test_tiering()
# make sure we can't create an ec pool tier
ceph osd pool create eccache 2 2 erasure
+ expect_false ceph osd set-require-min-compat-client bobtail
ceph osd pool create repbase 2
expect_false ceph osd tier add repbase eccache
ceph osd pool delete repbase repbase --yes-i-really-really-mean-it
@@ -1133,6 +1134,12 @@ function test_mon_osd()
ceph osd crush get-tunable straw_calc_version | grep 1
#
+ # require-min-compat-client
+ expect_false ceph osd set-require-min-compat-client dumpling # firefly tunables
+ ceph osd set-require-min-compat-client luminous
+ ceph osd dump | grep 'require_min_compat_client luminous'
+
+ #
# osd scrub
#
# how do I tell when these are done?
diff --git a/qa/workunits/cls/test_cls_sdk.sh b/qa/workunits/cls/test_cls_sdk.sh
new file mode 100755
index 00000000000..f1ccdc3b479
--- /dev/null
+++ b/qa/workunits/cls/test_cls_sdk.sh
@@ -0,0 +1,5 @@
+#!/bin/sh -e
+
+ceph_test_cls_sdk
+
+exit 0
diff --git a/qa/workunits/mon/test_mon_config_key.py b/qa/workunits/mon/test_mon_config_key.py
index 9e7537a743c..168f6db168c 100755
--- a/qa/workunits/mon/test_mon_config_key.py
+++ b/qa/workunits/mon/test_mon_config_key.py
@@ -12,6 +12,7 @@
import argparse
import base64
import errno
+import json
import logging
import os
import random
@@ -47,11 +48,20 @@ SIZES = [
(8192, -errno.EFBIG)
]
+# tests will be randomly selected from the keys here, and the test
+# suboperation will be randomly selected from the list in the values
+# here. i.e. 'exists/existing' would test that a key the test put into
+# the store earlier actually does still exist in the config store,
+# and that's a separate test case from 'exists/enoent', which tests
+# nonexistence of a key known to not be present.
+
OPS = {
'put': ['existing', 'new'],
'del': ['existing', 'enoent'],
'exists': ['existing', 'enoent'],
- 'get': ['existing', 'enoent']
+ 'get': ['existing', 'enoent'],
+ 'list': ['existing', 'enoent'],
+ 'dump': ['existing', 'enoent'],
}
CONFIG_PUT = [] # list: keys
@@ -386,6 +396,67 @@ def main():
k=key, sz=cnt, es=CONFIG_EXISTING[key])
destroy_tmp_file(file_path)
continue
+
+ elif op == 'list' or op == 'dump':
+ expected = 0
+ cmd = [op]
+ key = None
+
+ if sop == 'existing':
+ if len(CONFIG_EXISTING) == 0:
+ op_log.debug('no existing keys; continue')
+ continue
+ key = rnd.choice(CONFIG_PUT)
+ assert key in CONFIG_EXISTING, \
+ "key '{k_}' not in CONFIG_EXISTING".format(k_=key)
+
+ if sop == 'enoent':
+ for x in range(0, 10):
+ key = base64.b64encode(os.urandom(20)).decode()
+ if key not in CONFIG_EXISTING:
+ break
+ key = None
+ if key is None:
+ op_log.error('unable to generate a unique key -- try again later.')
+ continue
+ assert key not in CONFIG_PUT and key not in CONFIG_EXISTING, \
+ 'key {k} was not supposed to exist!'.format(k=key)
+
+ assert key is not None, \
+ 'key must be != None'
+
+ file_path = gen_tmp_file_path(rnd)
+ cmd += ['-o', file_path]
+ op_log.debug('key: {k}'.format(k=key))
+ run_cmd(cmd, expects=expected)
+ try:
+ temp_file = open(file_path, 'r+')
+ except IOError as err:
+ if err.errno == errno.ENOENT:
+ assert CONFIG_EXISTING[key] == 0, \
+ "error opening '{fp}': {e}".format(fp=file_path, e=err)
+ continue
+ else:
+ assert False, \
+ 'some error occurred: {e}'.format(e=err)
+ cnt = 0
+ try:
+ read_data = json.load(temp_file)
+ except ValueError:
+ temp_file.seek(0)
+ assert False, "{op} output was not valid JSON:\n{filedata}".format(op=op, filedata=temp_file.readlines())
+
+ if sop == 'existing':
+ assert key in read_data, "key '{k}' not found in list/dump output".format(k=key)
+ if op == 'dump':
+ cnt = len(read_data[key])
+ assert cnt == CONFIG_EXISTING[key], \
+ "wrong size from list for key '{k}': {sz}, expected {es}".format(
+ k=key, sz=cnt, es=CONFIG_EXISTING[key])
+ elif sop == 'enoent':
+ assert key not in read_data, "key '{k}' found in list/dump output".format(k=key)
+ destroy_tmp_file(file_path)
+ continue
else:
assert False, 'unknown op {o}'.format(o=op)
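
The two new ops mirror the config-key CLI: list returns a JSON array of key names and dump a JSON object of key/value pairs. Assuming a luminous-era ceph, both are easy to check by hand (-o is the generic output-file option):

    ceph config-key put mykey myvalue
    ceph config-key list -o keys.json
    ceph config-key dump -o kv.json
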
diff --git a/qa/workunits/rados/test_health_warnings.sh b/qa/workunits/rados/test_health_warnings.sh
new file mode 100755
index 00000000000..d9ddbbe7f8c
--- /dev/null
+++ b/qa/workunits/rados/test_health_warnings.sh
@@ -0,0 +1,36 @@
+#!/bin/bash -ex
+
+set -u
+
+# number of osds = 10
+crushtool -o crushmap --build --num_osds 10 host straw 2 rack straw 2 row straw 2 root straw 0
+ceph osd setcrushmap -i crushmap
+ceph osd tree
+
+test_mark_two_osds_same_host_down() {
+ ceph osd down osd.0 osd.1
+ ceph health detail
+ ceph health | grep "host"
+ ceph health detail | grep "osd.0"
+ ceph health detail | grep "osd.1"
+}
+
+test_mark_two_osds_same_rack_down() {
+ ceph osd down osd.8 osd.9
+ ceph health detail
+ ceph health | grep "rack"
+ ceph health detail | grep "osd.8"
+ ceph health detail | grep "osd.9"
+}
+
+test_mark_all_osds_down() {
+ ceph osd down `ceph osd ls`
+ ceph health detail
+ ceph health | grep "row"
+}
+
+test_mark_two_osds_same_host_down
+test_mark_two_osds_same_rack_down
+test_mark_all_osds_down
+
+exit 0
diff --git a/qa/workunits/rbd/cli_generic.sh b/qa/workunits/rbd/cli_generic.sh
index e85ae619bec..24f2439a590 100755
--- a/qa/workunits/rbd/cli_generic.sh
+++ b/qa/workunits/rbd/cli_generic.sh
@@ -290,7 +290,7 @@ test_pool_image_args() {
ceph osd pool delete test test --yes-i-really-really-mean-it || true
ceph osd pool create test 100
- truncate -s 1 /tmp/empty
+ truncate -s 1 /tmp/empty /tmp/empty@snap
rbd ls | wc -l | grep 0
rbd create -s 1 test1
@@ -302,6 +302,12 @@ test_pool_image_args() {
rbd import /tmp/empty foo
rbd ls | grep -q foo
+ # should fail due to "destination snapname specified"
+ rbd import --dest test/empty@snap /tmp/empty && exit 1 || true
+ rbd import /tmp/empty test/empty@snap && exit 1 || true
+ rbd import --image test/empty@snap /tmp/empty && exit 1 || true
+ rbd import /tmp/empty@snap && exit 1 || true
+
rbd ls test | wc -l | grep 0
rbd import /tmp/empty test/test1
rbd ls test | grep -q test1
@@ -336,7 +342,7 @@ test_pool_image_args() {
rbd ls | grep test12
rbd ls test | grep -qv test12
- rm -f /tmp/empty
+ rm -f /tmp/empty /tmp/empty@snap
ceph osd pool delete test test --yes-i-really-really-mean-it
for f in foo test1 test10 test12 test2 test3 ; do
@@ -408,7 +414,7 @@ test_trash() {
rbd ls | wc -l | grep 1
rbd ls -l | grep 'test2.*2.*'
- rbd trash mv test2 --delay 10
+ rbd trash mv test2 --delay 3600
rbd trash ls | grep test2
rbd trash ls | wc -l | grep 1
rbd trash ls -l | grep 'test2.*USER.*protected until'
diff --git a/qa/workunits/rbd/krbd_exclusive_option.sh b/qa/workunits/rbd/krbd_exclusive_option.sh
new file mode 100755
index 00000000000..958aecf7325
--- /dev/null
+++ b/qa/workunits/rbd/krbd_exclusive_option.sh
@@ -0,0 +1,165 @@
+#!/bin/bash
+
+set -ex
+
+function expect_false() {
+ if "$@"; then return 1; else return 0; fi
+}
+
+function assert_locked() {
+ local dev_id="${1#/dev/rbd}"
+
+ local client_addr
+ client_addr="$(< $SYSFS_DIR/$dev_id/client_addr)"
+
+ local client_id
+ client_id="$(< $SYSFS_DIR/$dev_id/client_id)"
+ # client4324 -> client.4324
+ client_id="client.${client_id#client}"
+
+ local watch_cookie
+ watch_cookie="$(rados -p rbd listwatchers rbd_header.$IMAGE_ID |
+ grep $client_id | cut -d ' ' -f 3 | cut -d '=' -f 2)"
+ [[ $(echo -n "$watch_cookie" | grep -c '^') -eq 1 ]]
+
+ local actual
+ actual="$(rados -p rbd --format=json lock info rbd_header.$IMAGE_ID rbd_lock |
+ python -m json.tool)"
+
+ local expected
+ expected="$(cat <<EOF | python -m json.tool
+{
+ "lockers": [
+ {
+ "addr": "$client_addr",
+ "cookie": "auto $watch_cookie",
+ "description": "",
+ "expiration": "0.000000",
+ "name": "$client_id"
+ }
+ ],
+ "name": "rbd_lock",
+ "tag": "internal",
+ "type": "exclusive"
+}
+EOF
+ )"
+
+ [ "$actual" = "$expected" ]
+}
+
+function assert_unlocked() {
+ rados -p rbd --format=json lock info rbd_header.$IMAGE_ID rbd_lock |
+ grep '"lockers":\[\]'
+}
+
+SYSFS_DIR="/sys/bus/rbd/devices"
+IMAGE_NAME="exclusive-option-test"
+
+rbd create --size 1 --image-feature '' $IMAGE_NAME
+
+IMAGE_ID="$(rbd info --format=json $IMAGE_NAME |
+ python -c "import sys, json; print json.load(sys.stdin)['block_name_prefix'].split('.')[1]")"
+
+DEV=$(sudo rbd map $IMAGE_NAME)
+assert_unlocked
+sudo rbd unmap $DEV
+assert_unlocked
+
+expect_false sudo rbd map -o exclusive $IMAGE_NAME
+assert_unlocked
+
+rbd feature enable $IMAGE_NAME exclusive-lock
+rbd snap create $IMAGE_NAME@snap
+
+DEV=$(sudo rbd map $IMAGE_NAME)
+assert_unlocked
+sudo rbd unmap $DEV
+assert_unlocked
+
+DEV=$(sudo rbd map -o exclusive $IMAGE_NAME)
+assert_locked $DEV
+[[ $(blockdev --getro $DEV) -eq 0 ]]
+sudo rbd unmap $DEV
+assert_unlocked
+
+DEV=$(sudo rbd map -o exclusive $IMAGE_NAME@snap)
+assert_locked $DEV
+[[ $(blockdev --getro $DEV) -eq 1 ]]
+sudo rbd unmap $DEV
+assert_unlocked
+
+DEV=$(sudo rbd map -o exclusive,ro $IMAGE_NAME)
+assert_locked $DEV
+[[ $(blockdev --getro $DEV) -eq 1 ]]
+sudo rbd unmap $DEV
+assert_unlocked
+
+# alternate syntax
+DEV=$(sudo rbd map --exclusive --read-only $IMAGE_NAME)
+assert_locked $DEV
+[[ $(blockdev --getro $DEV) -eq 1 ]]
+sudo rbd unmap $DEV
+assert_unlocked
+
+DEV=$(sudo rbd map $IMAGE_NAME)
+assert_unlocked
+dd if=/dev/urandom of=$DEV bs=4k count=10 oflag=direct
+assert_locked $DEV
+OTHER_DEV=$(sudo rbd map -o noshare,exclusive $IMAGE_NAME)
+assert_locked $OTHER_DEV
+sudo rbd unmap $DEV
+sudo rbd unmap $OTHER_DEV
+assert_unlocked
+
+DEV=$(sudo rbd map -o exclusive $IMAGE_NAME)
+assert_locked $DEV
+expect_false sudo rbd map -o noshare,exclusive $IMAGE_NAME
+assert_locked $DEV
+sudo rbd unmap $DEV
+assert_unlocked
+
+DEV=$(sudo rbd map -o exclusive $IMAGE_NAME)
+assert_locked $DEV
+OTHER_DEV=$(sudo rbd map -o noshare $IMAGE_NAME)
+dd if=/dev/urandom of=$OTHER_DEV bs=4k count=10 oflag=direct &
+PID=$!
+sleep 20
+assert_locked $DEV
+[ "$(ps -o stat= $PID)" = "D" ]
+sudo rbd unmap $DEV
+wait $PID
+assert_locked $OTHER_DEV
+sudo rbd unmap $OTHER_DEV
+assert_unlocked
+
+DEV=$(sudo rbd map -o exclusive $IMAGE_NAME)
+assert_locked $DEV
+sudo rbd map -o noshare,lock_on_read $IMAGE_NAME &
+SUDO_PID=$!
+sleep 20
+assert_locked $DEV
+PID="$(ps -o pid= --ppid $SUDO_PID)"
+[ "$(ps -o stat= $PID)" = "Dl" ]
+sudo rbd unmap $DEV
+wait $SUDO_PID
+assert_locked $OTHER_DEV
+sudo rbd unmap $OTHER_DEV
+assert_unlocked
+
+# induce a watch error after 30 seconds
+DEV=$(sudo rbd map -o exclusive,osdkeepalive=60 $IMAGE_NAME)
+assert_locked $DEV
+OLD_WATCHER="$(rados -p rbd listwatchers rbd_header.$IMAGE_ID)"
+sleep 40
+assert_locked $DEV
+NEW_WATCHER="$(rados -p rbd listwatchers rbd_header.$IMAGE_ID)"
+# same client_id, old cookie < new cookie
+[ "$(echo "$OLD_WATCHER" | cut -d ' ' -f 2)" = \
+ "$(echo "$NEW_WATCHER" | cut -d ' ' -f 2)" ]
+[[ $(echo "$OLD_WATCHER" | cut -d ' ' -f 3 | cut -d '=' -f 2) -lt \
+ $(echo "$NEW_WATCHER" | cut -d ' ' -f 3 | cut -d '=' -f 2) ]]
+sudo rbd unmap $DEV
+assert_unlocked
+
+echo OK
diff --git a/qa/workunits/rbd/krbd_stable_pages_required.sh b/qa/workunits/rbd/krbd_stable_pages_required.sh
new file mode 100755
index 00000000000..a7c44c8f42f
--- /dev/null
+++ b/qa/workunits/rbd/krbd_stable_pages_required.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+set -ex
+
+IMAGE_NAME="stable-pages-required-test"
+
+rbd create --size 1 $IMAGE_NAME
+DEV=$(sudo rbd map $IMAGE_NAME)
+[[ $(blockdev --getsize64 $DEV) -eq 1048576 ]]
+grep -q 1 /sys/block/${DEV#/dev/}/bdi/stable_pages_required
+
+rbd resize --size 2 $IMAGE_NAME
+[[ $(blockdev --getsize64 $DEV) -eq 2097152 ]]
+grep -q 1 /sys/block/${DEV#/dev/}/bdi/stable_pages_required
+sudo rbd unmap $DEV
+
+echo OK
diff --git a/qa/workunits/rbd/rbd_mirror.sh b/qa/workunits/rbd/rbd_mirror.sh
index b1e4df33c80..f4935941205 100755
--- a/qa/workunits/rbd/rbd_mirror.sh
+++ b/qa/workunits/rbd/rbd_mirror.sh
@@ -127,6 +127,25 @@ wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
compare_images ${POOL} ${image}
+# failover (unmodified)
+demote_image ${CLUSTER2} ${POOL} ${image}
+wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image}
+wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
+wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
+promote_image ${CLUSTER1} ${POOL} ${image}
+wait_for_image_replay_started ${CLUSTER2} ${POOL} ${image}
+
+# failback (unmodified)
+demote_image ${CLUSTER1} ${POOL} ${image}
+wait_for_image_replay_stopped ${CLUSTER2} ${POOL} ${image}
+wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
+promote_image ${CLUSTER2} ${POOL} ${image}
+wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
+wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
+wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
+wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
+compare_images ${POOL} ${image}
+
# failover
demote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image}
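
The added pass exercises demotion and promotion before any further writes land on the image. The helpers used above wrap the plain mirroring commands, roughly (cluster, pool, and image names follow the test harness variables):

    rbd --cluster ${CLUSTER2} mirror image demote ${POOL}/${image}
    rbd --cluster ${CLUSTER1} mirror image promote ${POOL}/${image}
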
diff --git a/qa/workunits/rbd/run_devstack_tempest.sh b/qa/workunits/rbd/run_devstack_tempest.sh
index f87f95c713c..0e2c9687478 100755
--- a/qa/workunits/rbd/run_devstack_tempest.sh
+++ b/qa/workunits/rbd/run_devstack_tempest.sh
@@ -1,5 +1,7 @@
#!/bin/bash -ex
+STACK_BRANCH=stable/ocata
+
STACK_USER=${STACK_USER:-stack}
STACK_GROUP=${STACK_GROUP:-stack}
TEMPEST_USER=${TEMPEST_USER:-tempest}
@@ -42,7 +44,7 @@ SERVICE_PASSWORD=secretservice
SERVICE_TOKEN=111222333444
SWIFT_HASH=1234123412341234
ROOTSLEEP=0
-ENABLED_SERVICES=c-api,c-bak,c-sch,c-vol,ceilometer-acentral,ceilometer-acompute,ceilometer-alarm-evaluator,ceilometer-alarm-notifier,ceilometer-anotification,ceilometer-api,ceilometer-collector,cinder,dstat,g-api,g-reg,horizon,key,mysql,n-api,n-cond,n-cpu,n-crt,n-obj,n-sch,q-agt,q-dhcp,q-l3,q-meta,q-metering,q-svc,quantum,rabbit,s-account,s-container,s-object,s-proxy,tempest
+ENABLED_SERVICES=c-api,c-bak,c-sch,c-vol,ceilometer-acentral,ceilometer-acompute,ceilometer-alarm-evaluator,ceilometer-alarm-notifier,ceilometer-anotification,ceilometer-api,ceilometer-collector,cinder,dstat,g-api,g-reg,horizon,key,mysql,n-api,n-cauth,n-cond,n-cpu,n-novnc,n-obj,n-sch,peakmem_tracker,placement-api,q-agt,q-dhcp,q-l3,q-meta,q-metering,q-svc,rabbit,s-account,s-container,s-object,s-proxy,tempest
SKIP_EXERCISES=boot_from_volume,bundle,client-env,euca
SYSLOG=False
SCREEN_LOGDIR=${STACK_LOG_PATH}/screen-logs
@@ -81,10 +83,13 @@ EOF
cat<<EOF > ${STACK_HOME_PATH}/start.sh
#!/bin/bash -ex
cd ${STACK_OPT_PATH}
-git clone https://git.openstack.org/openstack-dev/devstack
-cd devstack
-git checkout stable/newton
+git clone https://git.openstack.org/openstack-dev/devstack -b ${STACK_BRANCH}
+
+# TODO workaround for https://github.com/pypa/setuptools/issues/951
+git clone https://git.openstack.org/openstack/requirements.git -b ${STACK_BRANCH}
+sed -i 's/appdirs===1.4.0/appdirs===1.4.3/' requirements/upper-constraints.txt
+cd devstack
cp ${STACK_HOME_PATH}/local.conf .
export PYTHONUNBUFFERED=true
@@ -93,7 +98,6 @@ export PROJECTS="openstack/devstack-plugin-ceph"
./stack.sh
EOF
-
# execute devstack
chmod 0755 ${STACK_HOME_PATH}/start.sh
sudo -H -u ${STACK_USER} ${STACK_HOME_PATH}/start.sh
diff --git a/qa/workunits/rgw/run-s3tests.sh b/qa/workunits/rgw/run-s3tests.sh
new file mode 100755
index 00000000000..a18a5f05178
--- /dev/null
+++ b/qa/workunits/rgw/run-s3tests.sh
@@ -0,0 +1,74 @@
+#!/bin/bash -ex
+
+# run s3-tests from current directory. assume working
+# ceph environment (radosgw-admin in path) and rgw on localhost:8000
+# (the vstart default).
+
+branch=$1
+[ -z "$1" ] && branch=master
+port=$2
+[ -z "$2" ] && port=8000 # this is vstart's default
+
+##
+
+dir=tmp.s3-tests.$$
+
+# clone and bootstrap
+mkdir $dir
+cd $dir
+git clone https://github.com/ceph/s3-tests
+cd s3-tests
+git checkout ceph-$branch
+./bootstrap
+cd ../..
+
+# users
+akey1=access1
+skey1=secret1
+radosgw-admin user create --uid=s3test1 --display-name='tester1' \
+ --access-key=$akey1 --secret=$skey1 --email=tester1@ceph.com
+
+akey2=access2
+skey2=secret2
+radosgw-admin user create --uid=s3test2 --display-name='tester2' \
+ --access-key=$akey2 --secret=$skey2 --email=tester2@ceph.com
+
+cat <<EOF > s3.conf
+[DEFAULT]
+## replace with e.g. "localhost" to run against local software
+host = 127.0.0.1
+## uncomment the port to use something other than 80
+port = $port
+## say "no" to disable TLS
+is_secure = no
+[fixtures]
+## all the buckets created will start with this prefix;
+## {random} will be filled with random characters to pad
+## the prefix to 30 characters long, and avoid collisions
+bucket prefix = s3testbucket-{random}-
+[s3 main]
+## the tests assume two accounts are defined, "main" and "alt".
+## user_id is a 64-character hexstring
+user_id = s3test1
+## display name typically looks more like a unix login, "jdoe" etc
+display_name = tester1
+## replace these with your access keys
+access_key = $akey1
+secret_key = $skey1
+email = tester1@ceph.com
+[s3 alt]
+## another user account, used for ACL-related tests
+user_id = s3test2
+display_name = tester2
+## the "alt" user needs to have email set, too
+email = tester2@ceph.com
+access_key = $akey2
+secret_key = $skey2
+EOF
+
+S3TEST_CONF=`pwd`/s3.conf $dir/s3-tests/virtualenv/bin/nosetests -a '!fails_on_rgw' -v
+
+rm -rf $dir
+
+echo OK.
+
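
Usage follows the two optional positional arguments parsed at the top of the script, for example:

    ./run-s3tests.sh                 # ceph-master s3-tests branch, vstart's port 8000
    ./run-s3tests.sh luminous 8000   # ceph-luminous branch explicitly
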
diff --git a/qa/workunits/suites/wac.sh b/qa/workunits/suites/wac.sh
new file mode 100755
index 00000000000..49b4f1464e1
--- /dev/null
+++ b/qa/workunits/suites/wac.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -ex
+
+wget http://download.ceph.com/qa/wac.c
+gcc -o wac wac.c
+set +e
+timeout 5m ./wac -l 65536 -n 64 -r wac-test
+RET=$?
+set -e
+[[ $RET -eq 124 ]]
+echo OK
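
wac is expected to still be running when the 5-minute limit fires; coreutils timeout exits with status 124 in that case, so the test asserts exactly that. A quick illustration:

    timeout 5m sleep 600; echo $?   # prints 124 once the timeout fires
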
diff --git a/src/Beast b/src/Beast
-Subproject 999e2fa0318b5982736d3ea01a418770ea80267
+Subproject d8db5f1a0d607aa78e6e857daa0410b0d532697
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 927a2044b3b..6c5375bfdbd 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -90,7 +90,7 @@ if(HAVE_INTEL)
endif()
execute_process(
- COMMAND yasm -f elf64 ${CMAKE_SOURCE_DIR}/src/common/crc32c_intel_fast_asm.S -o /dev/null
+ COMMAND yasm -f elf64 ${CMAKE_SOURCE_DIR}/src/common/crc32c_intel_fast_asm.s -o /dev/null
RESULT_VARIABLE no_yasm
OUTPUT_QUIET)
if(no_yasm)
@@ -286,6 +286,10 @@ endif()
set(heap_profiler_files ${TCMALLOC_srcs})
add_library(heap_profiler_objs OBJECT ${heap_profiler_files})
+if (WITH_BLKIN)
+ add_subdirectory(blkin/blkin-lib)
+endif(WITH_BLKIN)
+
# Common infrastructure
configure_file(
${CMAKE_SOURCE_DIR}/src/ceph_ver.h.in.cmake
@@ -331,6 +335,18 @@ add_library(crush_objs OBJECT ${crush_srcs})
add_subdirectory(json_spirit)
include_directories("${CMAKE_SOURCE_DIR}/src/xxHash")
+include_directories(SYSTEM "${CMAKE_SOURCE_DIR}/src/rapidjson/include")
+
+set(GMOCK_INCLUDE_DIRS
+ "${CMAKE_SOURCE_DIR}/src/googletest/googletest/include/gmock")
+set(GTEST_INCLUDE_DIRS
+ "${CMAKE_SOURCE_DIR}/src/googletest/googletest/include/gtest")
+
+include_directories("${CMAKE_SOURCE_DIR}/src/dmclock/src")
+include_directories("${CMAKE_SOURCE_DIR}/src/dmclock/support/src")
+
+# needed for source files that friend unit tests (e.g., using FRIEND_TEST)
+include_directories("${CMAKE_SOURCE_DIR}/src/googletest/googletest/include")
set(xio_common_srcs)
if(HAVE_XIO)
@@ -424,6 +440,7 @@ set(libcommon_files
common/TrackedOp.cc
common/SloppyCRCMap.cc
common/types.cc
+ common/iso_8601.cc
log/Log.cc
log/SubsystemMap.cc
mon/MonCap.cc
@@ -519,13 +536,13 @@ if(HAVE_INTEL)
common/crc32c_intel_fast.c)
if(HAVE_GOOD_YASM_ELF64)
list(APPEND libcommon_files
- common/crc32c_intel_fast_asm.S
- common/crc32c_intel_fast_zero_asm.S)
+ common/crc32c_intel_fast_asm.s
+ common/crc32c_intel_fast_zero_asm.s)
endif(HAVE_GOOD_YASM_ELF64)
elseif(HAVE_POWER8)
list(APPEND libcommon_files
common/crc32c_ppc.c
- common/crc32c_ppc_asm.S)
+ common/crc32c_ppc_asm.s)
endif(HAVE_INTEL)
if(LINUX)
@@ -581,8 +598,12 @@ set(ceph_common_deps
${Boost_PROGRAM_OPTIONS_LIBRARY}
${Boost_DATE_TIME_LIBRARY}
${Boost_IOSTREAMS_LIBRARY}
- ${BLKID_LIBRARIES} ${Backtrace_LIBRARIES}
- ${CRYPTO_LIBS} ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS})
+ ${BLKID_LIBRARIES}
+ ${Backtrace_LIBRARIES}
+ ${BLKIN_LIBRARIES}
+ ${CRYPTO_LIBS}
+ ${CMAKE_THREAD_LIBS_INIT}
+ ${CMAKE_DL_LIBS})
if(HAVE_RDMA)
list(APPEND ceph_common_deps ${RDMA_LIBRARY})
endif()
@@ -614,6 +635,7 @@ install(TARGETS ceph-common DESTINATION ${CMAKE_INSTALL_PKGLIBDIR})
add_library(common_utf8 STATIC common/utf8.c)
+target_link_libraries(common json_spirit common_utf8 erasure_code rt uuid resolv ${CRYPTO_LIBS} ${Boost_LIBRARIES} ${BLKID_LIBRARIES} ${EXECINFO_LIBRARIES} ${BLKIN_LIBRARIES})
if(${WITH_LTTNG})
add_subdirectory(tracing)
add_dependencies(common-objs oprequest-tp)
@@ -651,7 +673,7 @@ if (WITH_MGR)
add_executable(ceph-mgr ${mgr_srcs}
$<TARGET_OBJECTS:heap_profiler_objs>)
target_include_directories(ceph-mgr PRIVATE "${PYTHON_INCLUDE_DIRS}")
- target_link_libraries(ceph-mgr mds osdc global-static common
+ target_link_libraries(ceph-mgr osdc client global-static common
${Boost_PYTHON_LIBRARY} ${PYTHON_LIBRARIES} ${BLKID_LIBRARIES} ${CMAKE_DL_LIBS} ${ALLOC_LIBS})
install(TARGETS ceph-mgr DESTINATION bin)
endif (WITH_MGR)
@@ -768,6 +790,8 @@ if (NOT WITH_SYSTEM_ROCKSDB)
# with SSE 4.2. For details refer to rocksdb/util/crc32c.cc.
if (HAVE_INTEL_SSE4_2)
list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_CXX_FLAGS=${SIMD_COMPILE_FLAGS})
+ else()
+ list(APPEND ROCKSDB_CMAKE_ARGS -DWITH_SSE42=OFF)
endif()
list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_AR=${CMAKE_AR})
list(APPEND ROCKSDB_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
@@ -802,6 +826,12 @@ if (NOT WITH_SYSTEM_ROCKSDB)
endif(NOT WITH_SYSTEM_ROCKSDB)
+include(TestBigEndian)
+test_big_endian(CEPH_BIG_ENDIAN)
+if(NOT CEPH_BIG_ENDIAN)
+ set(CEPH_LITTLE_ENDIAN 1)
+endif()
+
add_subdirectory(kv)
add_subdirectory(os)
@@ -809,6 +839,7 @@ set(cls_references_files objclass/class_api.cc)
add_library(cls_references_objs OBJECT ${cls_references_files})
add_subdirectory(osd)
+
set(ceph_osd_srcs
ceph_osd.cc)
add_executable(ceph-osd ${ceph_osd_srcs}
@@ -842,6 +873,18 @@ add_subdirectory(compressor)
add_subdirectory(tools)
+# dmClock
+
+add_subdirectory(dmclock) # after gmock
+add_dependencies(tests dmclock-tests dmclock-data-struct-tests)
+
+if(WITH_TESTS)
+ install(PROGRAMS
+ ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/dmclock-tests
+ ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/dmclock-data-struct-tests
+ DESTINATION bin)
+endif(WITH_TESTS)
+
if(HAVE_INTEL)
add_subdirectory(crypto/isa-l)
endif(HAVE_INTEL)
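
Blkin tracing and the dmClock scheduler sources are now wired into the build behind the options referenced above. A hypothetical configure invocation (the WITH_BLKIN and WITH_TESTS options appear in this file; other details are assumptions):

    mkdir build && cd build
    cmake -DWITH_BLKIN=ON -DWITH_TESTS=ON ..
    make ceph-osd
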
diff --git a/src/arch/intel.c b/src/arch/intel.c
index a45bc1debbe..5c483dccbdd 100644
--- a/src/arch/intel.c
+++ b/src/arch/intel.c
@@ -25,24 +25,7 @@ int ceph_arch_intel_sse2 = 0;
int ceph_arch_intel_aesni = 0;
#ifdef __x86_64__
-
-/* Note: valgrind redefines cpuid : it is different from the native processor. */
-/* intel cpu? */
-static void do_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
- unsigned int *edx)
-{
- int id = *eax;
-
- asm("movl %4, %%eax;"
- "cpuid;"
- "movl %%eax, %0;"
- "movl %%ebx, %1;"
- "movl %%ecx, %2;"
- "movl %%edx, %3;"
- : "=r" (*eax), "=r" (*ebx), "=r" (*ecx), "=r" (*edx)
- : "r" (id)
- : "eax", "ebx", "ecx", "edx");
-}
+#include <cpuid.h>
/* http://en.wikipedia.org/wiki/CPUID#EAX.3D1:_Processor_Info_and_Feature_Bits */
@@ -57,8 +40,10 @@ static void do_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
int ceph_arch_intel_probe(void)
{
/* i know how to check this on x86_64... */
- unsigned int eax = 1, ebx, ecx, edx;
- do_cpuid(&eax, &ebx, &ecx, &edx);
+ unsigned int eax, ebx, ecx = 0, edx = 0;
+ if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
+ return 1;
+ }
if ((ecx & CPUID_PCLMUL) != 0) {
ceph_arch_intel_pclmul = 1;
}
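
The hand-rolled cpuid assembly (which valgrind had to intercept) is replaced by the compiler-provided __get_cpuid from <cpuid.h>. The feature bits it reads can be sanity-checked from a shell:

    grep -oE 'pclmulqdq|sse4_2|aes' /proc/cpuinfo | sort -u
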
diff --git a/src/auth/AuthServiceHandler.h b/src/auth/AuthServiceHandler.h
index 6bd8bd97634..4d8a649374f 100644
--- a/src/auth/AuthServiceHandler.h
+++ b/src/auth/AuthServiceHandler.h
@@ -15,11 +15,14 @@
#ifndef CEPH_AUTHSERVICEHANDLER_H
#define CEPH_AUTHSERVICEHANDLER_H
-#include "include/types.h"
-#include "Auth.h"
+#include <stddef.h> // for NULL
+#include <stdint.h> // for uint64_t
+#include "common/entity_name.h" // for EntityName
+#include "include/buffer_fwd.h" // for bufferlist
class CephContext;
class KeyServer;
+struct AuthCapsInfo;
struct AuthServiceHandler {
protected:
diff --git a/src/auth/cephx/CephxProtocol.cc b/src/auth/cephx/CephxProtocol.cc
index ae61d09dde9..5836a33bd53 100644
--- a/src/auth/cephx/CephxProtocol.cc
+++ b/src/auth/cephx/CephxProtocol.cc
@@ -38,7 +38,7 @@ void cephx_calc_client_server_challenge(CephContext *cct, CryptoKey& secret, uin
uint64_t k = 0;
const uint64_t *p = (const uint64_t *)enc.c_str();
for (int pos = 0; pos + sizeof(k) <= enc.length(); pos+=sizeof(k), p++)
- k ^= mswab64(*p);
+ k ^= mswab(*p);
*key = k;
}
diff --git a/src/auth/cephx/CephxProtocol.h b/src/auth/cephx/CephxProtocol.h
index ebb25dc9be7..c8220698990 100644
--- a/src/auth/cephx/CephxProtocol.h
+++ b/src/auth/cephx/CephxProtocol.h
@@ -421,7 +421,7 @@ extern bool cephx_verify_authorizer(CephContext *cct, KeyStore *keys,
/*
* encode+encrypt macros
*/
-#define AUTH_ENC_MAGIC 0xff009cad8826aa55ull
+static constexpr uint64_t AUTH_ENC_MAGIC = 0xff009cad8826aa55ull;
template <typename T>
void decode_decrypt_enc_bl(CephContext *cct, T& t, CryptoKey key, bufferlist& bl_enc,
diff --git a/src/auth/cephx/CephxSessionHandler.cc b/src/auth/cephx/CephxSessionHandler.cc
index 943b80afbd3..087d6c54aa2 100644
--- a/src/auth/cephx/CephxSessionHandler.cc
+++ b/src/auth/cephx/CephxSessionHandler.cc
@@ -41,9 +41,9 @@ int CephxSessionHandler::_calc_signature(Message *m, uint64_t *psig)
__le32 middle_crc;
__le32 data_crc;
} __attribute__ ((packed)) sigblock = {
- 1, mswab64(AUTH_ENC_MAGIC), mswab32(4*4),
- mswab32(header.crc), mswab32(footer.front_crc),
- mswab32(footer.middle_crc), mswab32(footer.data_crc)
+ 1, mswab(AUTH_ENC_MAGIC), mswab<uint32_t>(4*4),
+ mswab<uint32_t>(header.crc), mswab<uint32_t>(footer.front_crc),
+ mswab<uint32_t>(footer.middle_crc), mswab<uint32_t>(footer.data_crc)
};
bufferlist bl_plaintext;
bl_plaintext.append(buffer::create_static(sizeof(sigblock), (char*)&sigblock));
diff --git a/src/blkin b/src/blkin
new file mode 160000
+Subproject f24ceec055ea236a093988237a9821d145f5f7c
diff --git a/src/ceph-create-keys b/src/ceph-create-keys
index c7846f85d50..dda58e62e11 100755
--- a/src/ceph-create-keys
+++ b/src/ceph-create-keys
@@ -132,6 +132,20 @@ def get_key(cluster, mon_id):
'mon', 'allow *',
'osd', 'allow *',
'mds', 'allow *',
+ 'mgr', 'allow *',
+ ],
+ stdout=f,
+ )
+ else:
+ returncode = subprocess.call(
+ args=args_prefix + [
+ 'auth',
+ 'caps',
+ 'client.admin',
+ 'mon', 'allow *',
+ 'osd', 'allow *',
+ 'mds', 'allow *',
+ 'mgr', 'allow *',
],
stdout=f,
)
diff --git a/src/ceph-detect-init/ceph_detect_init/__init__.py b/src/ceph-detect-init/ceph_detect_init/__init__.py
index 78374bef9fd..fe0a2a787f4 100644
--- a/src/ceph-detect-init/ceph_detect_init/__init__.py
+++ b/src/ceph-detect-init/ceph_detect_init/__init__.py
@@ -141,23 +141,7 @@ def platform_information():
distro_lower = distro.lower()
# this could be an empty string in Debian
if not codename and 'debian' in distro_lower:
- debian_codenames = {
- '8': 'jessie',
- '7': 'wheezy',
- '6': 'squeeze',
- }
- major_version = release.split('.')[0]
- codename = debian_codenames.get(major_version, '')
-
- # In order to support newer jessie/sid or wheezy/sid strings
- # we test this if sid is buried in the minor, we should use
- # sid anyway.
- if not codename and '/' in release:
- major, minor = release.split('/')
- if minor == 'sid':
- codename = minor
- else:
- codename = major
+ pass
# this is an empty string in Oracle
elif distro_lower.startswith('oracle linux'):
codename = 'OL' + release
diff --git a/src/ceph-detect-init/ceph_detect_init/debian/__init__.py b/src/ceph-detect-init/ceph_detect_init/debian/__init__.py
index 73a7851a3fc..94217cdfa26 100644
--- a/src/ceph-detect-init/ceph_detect_init/debian/__init__.py
+++ b/src/ceph-detect-init/ceph_detect_init/debian/__init__.py
@@ -1,3 +1,6 @@
+import os
+import subprocess
+
distro = None
release = None
codename = None
@@ -8,14 +11,11 @@ def choose_init():
Returns the name of a init system (upstart, sysvinit ...).
"""
- assert(distro and codename)
- if distro.lower() in ('ubuntu', 'linuxmint'):
- if codename >= 'vivid':
- return 'systemd'
- else:
- return 'upstart'
- if distro.lower() == 'debian':
- if codename in ('squeeze', 'wheezy'):
- return 'sysvinit'
- else:
- return 'systemd'
+    # yes, these are heuristics
+ if os.path.isdir('/run/systemd/system'):
+ return 'systemd'
+ if not subprocess.call('. /lib/lsb/init-functions ; init_is_upstart',
+ shell=True):
+ return 'upstart'
+ if os.path.isfile('/sbin/init') and not os.path.islink('/sbin/init'):
+ return 'sysvinit'
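The new choose_init() infers the init system from the running host instead of distro/codename tables. A quick, hypothetical smoke test of the heuristic (assumes the package is importable on the target host):

```python
from ceph_detect_init import debian

# Prints 'systemd' when /run/systemd/system exists, 'upstart' when the LSB
# init_is_upstart helper succeeds, 'sysvinit' when /sbin/init is a real
# binary rather than a symlink, and None otherwise.
print(debian.choose_init())
```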
diff --git a/src/ceph-detect-init/tests/test_all.py b/src/ceph-detect-init/tests/test_all.py
index 263cd9a289b..18451bf4636 100644
--- a/src/ceph-detect-init/tests/test_all.py
+++ b/src/ceph-detect-init/tests/test_all.py
@@ -65,30 +65,35 @@ class TestCephDetectInit(testtools.TestCase):
self.assertEqual('sysvinit', centos.choose_init())
def test_debian(self):
- with mock.patch.multiple('ceph_detect_init.debian',
- distro='debian',
- codename='wheezy'):
- self.assertEqual('sysvinit', debian.choose_init())
- with mock.patch.multiple('ceph_detect_init.debian',
- distro='debian',
- codename='squeeze'):
- self.assertEqual('sysvinit', debian.choose_init())
- with mock.patch.multiple('ceph_detect_init.debian',
- distro='debian',
- codename='jessie'):
+ with mock.patch.multiple('os.path',
+ isdir=lambda x: x == '/run/systemd/system'):
self.assertEqual('systemd', debian.choose_init())
- with mock.patch.multiple('ceph_detect_init.debian',
- distro='ubuntu',
- codename='trusty'):
- self.assertEqual('upstart', debian.choose_init())
- with mock.patch.multiple('ceph_detect_init.debian',
- distro='ubuntu',
- codename='vivid'):
- self.assertEqual('systemd', debian.choose_init())
- with mock.patch.multiple('ceph_detect_init.debian',
- distro='not-debian',
- codename='andy'):
- self.assertIs(None, debian.choose_init())
+
+ def mock_call_with_upstart(*args, **kwargs):
+ if args[0] == '. /lib/lsb/init-functions ; init_is_upstart' and \
+ kwargs['shell']:
+ return 0
+ else:
+ return 1
+ with mock.patch.multiple('os.path',
+ isdir=lambda x: False,
+ isfile=lambda x: False):
+ with mock.patch.multiple('subprocess',
+ call=mock_call_with_upstart):
+ self.assertEqual('upstart', debian.choose_init())
+ with mock.patch.multiple('os.path',
+ isdir=lambda x: False,
+ isfile=lambda x: x == '/sbin/init',
+ islink=lambda x: x != '/sbin/init'):
+ with mock.patch.multiple('subprocess',
+ call=lambda *args, **kwargs: 1):
+ self.assertEqual('sysvinit', debian.choose_init())
+ with mock.patch.multiple('os.path',
+ isdir=lambda x: False,
+ isfile=lambda x: False):
+ with mock.patch.multiple('subprocess',
+ call=lambda *args, **kwargs: 1):
+ self.assertIs(None, debian.choose_init())
def test_fedora(self):
with mock.patch('ceph_detect_init.fedora.release',
@@ -183,8 +188,6 @@ class TestCephDetectInit(testtools.TestCase):
self.assertEqual('debian', distro.distro)
self.assertEqual(False, distro.is_el)
self.assertEqual('6.0', distro.release)
- self.assertEqual('squeeze', distro.codename)
- self.assertEqual('sysvinit', distro.init)
with mock.patch.multiple('platform',
system=lambda: 'FreeBSD',
@@ -252,29 +255,9 @@ class TestCephDetectInit(testtools.TestCase):
@mock.patch('platform.system', lambda: 'Linux')
def test_platform_information_linux(self):
with mock.patch('platform.linux_distribution',
- lambda **kwargs: (('debian', '6.0', ''))):
- self.assertEqual(('debian', '6.0', 'squeeze'),
- ceph_detect_init.platform_information())
-
- with mock.patch('platform.linux_distribution',
- lambda **kwargs: (('debian', '7.0', ''))):
- self.assertEqual(('debian', '7.0', 'wheezy'),
- ceph_detect_init.platform_information())
-
- with mock.patch('platform.linux_distribution',
lambda **kwargs: (('debian', '8.0', ''))):
- self.assertEqual(('debian', '8.0', 'jessie'),
- ceph_detect_init.platform_information())
-
- with mock.patch('platform.linux_distribution',
- lambda **kwargs: (('debian', 'jessie/sid', ''))):
- self.assertEqual(('debian', 'jessie/sid', 'sid'),
- ceph_detect_init.platform_information())
-
- with mock.patch('platform.linux_distribution',
- lambda **kwargs: (('debian', 'sid/jessie', ''))):
- self.assertEqual(('debian', 'sid/jessie', 'sid'),
- ceph_detect_init.platform_information())
+ self.assertEqual(('debian', '8.0'),
+ ceph_detect_init.platform_information()[:-1])
with mock.patch('platform.linux_distribution',
lambda **kwargs: (('Oracle Linux Server', '7.3', ''))):
diff --git a/src/ceph-disk/ceph_disk/main.py b/src/ceph-disk/ceph_disk/main.py
index a9d21351506..0869b3c07c9 100755
--- a/src/ceph-disk/ceph_disk/main.py
+++ b/src/ceph-disk/ceph_disk/main.py
@@ -206,15 +206,19 @@ class Ptype(object):
return False
-DEFAULT_FS_TYPE = 'xfs'
SYSFS = '/sys'
if platform.system() == 'FreeBSD':
FREEBSD = True
+ DEFAULT_FS_TYPE = 'zfs'
PROCDIR = '/compat/linux/proc'
+    # FreeBSD does not have block devices anymore
+ BLOCKDIR = '/dev'
else:
FREEBSD = False
+ DEFAULT_FS_TYPE = 'xfs'
PROCDIR = '/proc'
+ BLOCKDIR = '/sys/block'
"""
OSD STATUS Definition
@@ -232,7 +236,6 @@ MOUNT_OPTIONS = dict(
# that user_xattr helped
ext4='noatime,user_xattr',
xfs='noatime,inode64',
- zfs='atime=off',
)
MKFS_ARGS = dict(
@@ -250,6 +253,9 @@ MKFS_ARGS = dict(
'-f',
'-i', 'size=2048',
],
+ zfs=[
+ '-o', 'atime=off'
+ ],
)
INIT_SYSTEMS = [
@@ -456,7 +462,7 @@ def command(arguments, **kwargs):
executables *will* be found and will error nicely otherwise.
This returns the output of the command and the return code of the
- process in a tuple: (output, returncode).
+ process in a tuple: (stdout, stderr, returncode).
"""
arguments = list(map(_bytes2str, _get_command_executable(arguments)))
@@ -540,53 +546,6 @@ def command_check_call(arguments, exit=False):
raise
-def platform_distro():
- """
- Returns a normalized, lower case string without any leading nor trailing
- whitespace that represents the distribution name of the current machine.
- """
- distro = platform_information()[0] or ''
- return distro.strip().lower()
-
-
-def platform_information():
- if FREEBSD:
- distro = platform.system()
- release = platform.version().split()[1]
- codename = platform.version().split()[3]
- version = platform.version().split('-')[0]
- major_version = version.split('.')[0]
- major, minor = release.split('.')
- else:
- distro, release, codename = platform.linux_distribution()
- # this could be an empty string in Debian
- if not codename and 'debian' in distro.lower():
- debian_codenames = {
- '8': 'jessie',
- '7': 'wheezy',
- '6': 'squeeze',
- }
- major_version = release.split('.')[0]
- codename = debian_codenames.get(major_version, '')
-
- # In order to support newer jessie/sid, wheezy/sid strings we test
- # this if sid is buried in the minor, we should use sid anyway.
- if not codename and '/' in release:
- major, minor = release.split('/')
- if minor == 'sid':
- codename = minor
- else:
- codename = major
- # this could be an empty string in Virtuozzo linux
- if not codename and 'virtuozzo linux' in distro.lower():
- codename = 'virtuozzo'
-
- return (
- str(distro).strip(),
- str(release).strip(),
- str(codename).strip()
- )
-
#
# An alternative block_path implementation would be
#
@@ -640,7 +599,7 @@ def is_mpath(dev):
True if the path is managed by multipath
"""
if FREEBSD:
- return True
+ return False
uuid = get_dm_uuid(dev)
return (uuid and
(re.match('part\d+-mpath-', uuid) or
@@ -651,7 +610,7 @@ def get_dev_name(path):
"""
get device name from path. e.g.::
- /dev/sda -> sdas, /dev/cciss/c0d1 -> cciss!c0d1
+ /dev/sda -> sda, /dev/cciss/c0d1 -> cciss!c0d1
a device "name" is something like::
@@ -734,7 +693,7 @@ def get_partition_dev(dev, pnum):
partname = get_partition_mpath(dev, pnum)
else:
name = get_dev_name(os.path.realpath(dev))
- sys_entry = os.path.join('/sys/block', name)
+ sys_entry = os.path.join(BLOCKDIR, name)
error_msg = " in %s" % sys_entry
for f in os.listdir(sys_entry):
if f.startswith(name) and f.endswith(str(pnum)):
@@ -763,7 +722,7 @@ def list_all_partitions():
Return a list of devices and partitions
"""
if not FREEBSD:
- names = os.listdir('/sys/block')
+ names = os.listdir(BLOCKDIR)
dev_part_list = {}
for name in names:
# /dev/fd0 may hang http://tracker.ceph.com/issues/6827
@@ -864,7 +823,7 @@ def is_partition(dev):
raise Error('not a block device', dev)
name = get_dev_name(dev)
- if os.path.exists(os.path.join('/sys/block', name)):
+ if os.path.exists(os.path.join(BLOCKDIR, name)):
return False
# make sure it is a partition of something else
@@ -1514,6 +1473,7 @@ def check_journal_reqs(args):
_, _, allows_journal = command([
'ceph-osd', '--check-allows-journal',
'-i', '0',
+ '--log-file', '$run_dir/$cluster-osd-check.log',
'--cluster', args.cluster,
'--setuser', get_ceph_user(),
'--setgroup', get_ceph_group(),
@@ -1521,6 +1481,7 @@ def check_journal_reqs(args):
_, _, wants_journal = command([
'ceph-osd', '--check-wants-journal',
'-i', '0',
+ '--log-file', '$run_dir/$cluster-osd-check.log',
'--cluster', args.cluster,
'--setuser', get_ceph_user(),
'--setgroup', get_ceph_group(),
@@ -1528,6 +1489,7 @@ def check_journal_reqs(args):
_, _, needs_journal = command([
'ceph-osd', '--check-needs-journal',
'-i', '0',
+ '--log-file', '$run_dir/$cluster-osd-check.log',
'--cluster', args.cluster,
'--setuser', get_ceph_user(),
'--setgroup', get_ceph_group(),
@@ -3374,20 +3336,27 @@ def stop_daemon(
raise Error('ceph osd stop failed', e)
-def detect_fstype(
- dev,
-):
- fstype = _check_output(
- args=[
- '/sbin/blkid',
- # we don't want stale cached results
- '-p',
- '-s', 'TYPE',
- '-o', 'value',
- '--',
- dev,
- ],
- )
+def detect_fstype(dev):
+ if FREEBSD:
+ fstype = _check_output(
+ args=[
+ 'fstyp',
+ '-u',
+ dev,
+ ],
+ )
+ else:
+ fstype = _check_output(
+ args=[
+ '/sbin/blkid',
+ # we don't want stale cached results
+ '-p',
+ '-s', 'TYPE',
+ '-o', 'value',
+ '--',
+ dev,
+ ],
+ )
fstype = must_be_one_line(fstype)
return fstype
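detect_fstype() now branches on platform: FreeBSD probes with fstyp, while Linux keeps blkid with -p to bypass the stale cache. A self-contained sketch of the same probe for interactive use (probe_fstype is a hypothetical name; the flags mirror the branches above):

```python
import platform
import subprocess

def probe_fstype(dev):
    # Mirror detect_fstype() above: fstyp on FreeBSD, cache-bypassing
    # blkid everywhere else.
    if platform.system() == 'FreeBSD':
        args = ['fstyp', '-u', dev]
    else:
        args = ['/sbin/blkid', '-p', '-s', 'TYPE', '-o', 'value', '--', dev]
    return subprocess.check_output(args).decode().strip()

# e.g. probe_fstype('/dev/sda1') -> 'xfs'
```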
@@ -4228,19 +4197,29 @@ def get_oneliner(base, name):
def get_dev_fs(dev):
- fscheck, _, _ = command(
- [
- 'blkid',
- '-s',
- 'TYPE',
- dev,
- ],
- )
- if 'TYPE' in fscheck:
- fstype = fscheck.split()[1].split('"')[1]
- return fstype
+ if FREEBSD:
+ fstype, _, ret = command(
+ [
+ 'fstyp',
+ '-u',
+ dev,
+ ],
+ )
+ if ret == 0:
+ return fstype
else:
- return None
+ fscheck, _, _ = command(
+ [
+ 'blkid',
+ '-s',
+ 'TYPE',
+ dev,
+ ],
+ )
+ if 'TYPE' in fscheck:
+ fstype = fscheck.split()[1].split('"')[1]
+ return fstype
+ return None
def split_dev_base_partnum(dev):
@@ -4586,11 +4565,11 @@ def list_zfs():
'fails.\n (Error: %s)' % e)
raise
lines = out.splitlines()
- for line in lines[2:]:
+ for line in lines[1:]:
vdevline = line.split()
if os.path.exists(os.path.join(vdevline[1], 'active')):
elems = os.path.split(vdevline[1])
- print(vdevline[0], "ceph data, active, cluster ceph,", elems[5],
+ print(vdevline[0], "ceph data, active, cluster ceph,", elems[1],
"mounted on:", vdevline[1])
else:
print(vdevline[0] + " other, zfs, mounted on: " + vdevline[1])
diff --git a/src/ceph-disk/tests/test_main.py b/src/ceph-disk/tests/test_main.py
index 69a7dfd9493..45fe48a1690 100644
--- a/src/ceph-disk/tests/test_main.py
+++ b/src/ceph-disk/tests/test_main.py
@@ -465,6 +465,9 @@ class TestCephDisk(object):
main.PTYPE['regular']['journal']['ready'])
def test_list_bluestore(self):
+ if platform.system() == "FreeBSD":
+ return
+
self.list(main.PTYPE['plain']['osd']['ready'],
main.PTYPE['plain']['block']['ready'])
self.list(main.PTYPE['luks']['osd']['ready'],
diff --git a/src/ceph.in b/src/ceph.in
index 3ab1c222a3c..89b52cecc5b 100755
--- a/src/ceph.in
+++ b/src/ceph.in
@@ -38,6 +38,15 @@ FLAG_NOFORWARD = (1 << 0)
FLAG_OBSOLETE = (1 << 1)
FLAG_DEPRECATED = (1 << 2)
+# priorities from src/common/perf_counters.h
+PRIO_CRITICAL = 10
+PRIO_INTERESTING = 8
+PRIO_USEFUL = 5
+PRIO_UNINTERESTING = 2
+PRIO_DEBUGONLY = 0
+
+PRIO_DEFAULT = PRIO_USEFUL
+
# Make life easier on developers:
# If in src/, and .libs and pybind exist here, assume we're running
# from a Ceph source dir and tweak PYTHONPATH and LD_LIBRARY_PATH
@@ -266,10 +275,15 @@ ping <mon.id> Send simple presence/life test to a mon
<mon.id> may be 'mon.*' for all mons
daemon {type.id|path} <cmd>
Same as --admin-daemon, but auto-find admin socket
-daemonperf {type.id | path} [<interval>] [<count>]
+daemonperf {type.id | path} [stat-pats] [priority] [<interval>] [<count>]
+daemonperf {type.id | path} list|ls [stat-pats] [priority]
Get selected perf stats from daemon/admin socket
+    Optional stat-pats: comma-delimited list of shell-glob patterns to match stat names
+    Optional selection priority (name may be abbreviated):
+     critical, interesting, useful, uninteresting, debugonly
+ List shows a table of all available stats
Run <count> times (default forever),
- once per <interval> seconds (default 1)
+ once per <interval> seconds (default 1)
""", file=sys.stdout)
@@ -565,6 +579,134 @@ def ping_monitor(cluster_handle, name, timeout):
print(s)
return 0
+
+def maybe_daemon_command(parsed_args, childargs):
+ """
+    Check whether this is an --admin-socket, daemon, or daemonperf command;
+    if it is, returns (True, return code); otherwise returns (False, 0)
+ """
+
+ daemon_perf = False
+ sockpath = None
+ if parsed_args.admin_socket:
+ sockpath = parsed_args.admin_socket
+ elif len(childargs) > 0 and childargs[0] in ["daemon", "daemonperf"]:
+ daemon_perf = (childargs[0] == "daemonperf")
+ # Treat "daemon <path>" or "daemon <name>" like --admin_daemon <path>
+        # Handle "daemonperf <path>" the same way, but it requires no trailing args
+ require_args = 2 if daemon_perf else 3
+ if len(childargs) >= require_args:
+ if childargs[1].find('/') >= 0:
+ sockpath = childargs[1]
+ else:
+ # try resolve daemon name
+ try:
+ sockpath = ceph_conf(parsed_args, 'admin_socket',
+ childargs[1])
+ except Exception as e:
+ print('Can\'t get admin socket path: ' + str(e), file=sys.stderr)
+ return True, errno.EINVAL
+ # for both:
+ childargs = childargs[2:]
+ else:
+ print('{0} requires at least {1} arguments'.format(childargs[0], require_args),
+ file=sys.stderr)
+ return True, errno.EINVAL
+
+ if sockpath and daemon_perf:
+ return True, daemonperf(childargs, sockpath)
+ elif sockpath:
+ try:
+ raw_write(admin_socket(sockpath, childargs, parsed_args.output_format))
+ except Exception as e:
+ print('admin_socket: {0}'.format(e), file=sys.stderr)
+ return True, errno.EINVAL
+ return True, 0
+
+ return False, 0
+
+
+def isnum(s):
+ try:
+ float(s)
+ return True
+ except ValueError:
+ return False
+
+def daemonperf(childargs, sockpath):
+ """
+ Handle daemonperf command; returns errno or 0
+
+ daemonperf <daemon> [priority string] [statpats] [interval] [count]
+ daemonperf <daemon> list|ls [statpats]
+ """
+
+ interval = 1
+ count = None
+ statpats = None
+ priority = None
+ do_list = False
+
+ def prio_from_name(arg):
+
+ PRIOMAP = {
+ 'critical': PRIO_CRITICAL,
+ 'interesting': PRIO_INTERESTING,
+ 'useful': PRIO_USEFUL,
+ 'uninteresting': PRIO_UNINTERESTING,
+ 'debugonly': PRIO_DEBUGONLY,
+ }
+
+ if arg in PRIOMAP:
+ return PRIOMAP[arg]
+ # allow abbreviation
+ for name, val in PRIOMAP.items():
+ if name.startswith(arg):
+ return val
+ return None
+
+ # consume and analyze non-numeric args
+ while len(childargs) and not isnum(childargs[0]):
+ arg = childargs.pop(0)
+ # 'list'?
+ if arg in ['list', 'ls']:
+            do_list = True
+ continue
+ # prio?
+ prio = prio_from_name(arg)
+ if prio is not None:
+ priority = prio
+ continue
+ # statpats
+ statpats = arg.split(',')
+
+ if priority is None:
+ priority = PRIO_DEFAULT
+
+ if len(childargs) > 0:
+ try:
+ interval = float(childargs.pop(0))
+ if interval < 0:
+ raise ValueError
+ except ValueError:
+ print('daemonperf: interval should be a positive number', file=sys.stderr)
+ return errno.EINVAL
+
+    if len(childargs) > 0:
+        arg = childargs.pop(0)
+        # isnum() accepts floats, but int() would then raise; isdigit()
+        # restricts count to a non-negative integer in one check
+        if not arg.isdigit():
+            print('daemonperf: count should be a non-negative integer', file=sys.stderr)
+            return errno.EINVAL
+        count = int(arg)
+
+ watcher = DaemonWatcher(sockpath, statpats, priority)
+ if do_list:
+ watcher.list()
+ else:
+ watcher.run(interval, count)
+
+ return 0
+
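Because prio_from_name() is nested inside daemonperf(), here is a self-contained sketch of its prefix matching for experimentation; the numeric values are copied from the PRIO_* constants added at the top of ceph.in:

```python
PRIOMAP = {
    'critical': 10, 'interesting': 8, 'useful': 5,
    'uninteresting': 2, 'debugonly': 0,
}

def prio_from_name(arg):
    if arg in PRIOMAP:
        return PRIOMAP[arg]
    for name, val in PRIOMAP.items():
        if name.startswith(arg):  # first prefix match wins
            return val
    return None

assert prio_from_name('crit') == 10
assert prio_from_name('debug') == 0
# 'u' prefixes both 'useful' and 'uninteresting'; the winner depends on dict
# iteration order, so ambiguous abbreviations are best spelled out in full.
```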
###
# main
###
@@ -610,58 +752,9 @@ def main():
format = parsed_args.output_format
- daemon_perf = False
- sockpath = None
- if parsed_args.admin_socket:
- sockpath = parsed_args.admin_socket
- elif len(childargs) > 0 and childargs[0] in ["daemon", "daemonperf"]:
- daemon_perf = (childargs[0] == "daemonperf")
- # Treat "daemon <path>" or "daemon <name>" like --admin_daemon <path>
- # Handle "daemonperf <path>" the same but requires no trailing args
- require_args = 2 if daemon_perf else 3
- if len(childargs) >= require_args:
- if childargs[1].find('/') >= 0:
- sockpath = childargs[1]
- else:
- # try resolve daemon name
- try:
- sockpath = ceph_conf(parsed_args, 'admin_socket',
- childargs[1])
- except Exception as e:
- print('Can\'t get admin socket path: ' + str(e), file=sys.stderr)
- return errno.EINVAL
- # for both:
- childargs = childargs[2:]
- else:
- print('{0} requires at least {1} arguments'.format(childargs[0], require_args),
- file=sys.stderr)
- return errno.EINVAL
-
- if sockpath and daemon_perf:
- interval = 1
- count = None
- if len(childargs) > 0:
- try:
- interval = float(childargs[0])
- if interval < 0:
- raise ValueError
- except ValueError:
- print('daemonperf: interval should be a positive number', file=sys.stderr)
- return errno.EINVAL
- if len(childargs) > 1:
- if not childargs[1].isdigit():
- print('daemonperf: count should be a positive integer', file=sys.stderr)
- return errno.EINVAL
- count = int(childargs[1])
- DaemonWatcher(sockpath).run(interval, count)
- return 0
- elif sockpath:
- try:
- raw_write(admin_socket(sockpath, childargs, format))
- except Exception as e:
- print('admin_socket: {0}'.format(e), file=sys.stderr)
- return errno.EINVAL
- return 0
+ done, ret = maybe_daemon_command(parsed_args, childargs)
+ if done:
+ return ret
timeout = None
if parsed_args.cluster_timeout:
diff --git a/src/ceph_fuse.cc b/src/ceph_fuse.cc
index c8018256246..faa949f9609 100644
--- a/src/ceph_fuse.cc
+++ b/src/ceph_fuse.cc
@@ -194,7 +194,7 @@ int main(int argc, const char **argv, const char *envp[]) {
// get monmap
Messenger *messenger = NULL;
- Client *client;
+ StandaloneClient *client;
CephFuse *cfuse;
UserPerm perms;
int tester_r = 0;
@@ -213,7 +213,7 @@ int main(int argc, const char **argv, const char *envp[]) {
messenger->set_policy(entity_name_t::TYPE_MDS,
Messenger::Policy::lossless_client(0));
- client = new Client(messenger, mc);
+ client = new StandaloneClient(messenger, mc);
if (filer_flags) {
client->set_filer_flags(filer_flags);
}
diff --git a/src/ceph_syn.cc b/src/ceph_syn.cc
index d3ed16a1295..a8cbba4de2e 100644
--- a/src/ceph_syn.cc
+++ b/src/ceph_syn.cc
@@ -67,7 +67,7 @@ int main(int argc, const char **argv, char *envp[])
messengers[i]->bind(g_conf->public_addr);
mclients[i] = new MonClient(g_ceph_context);
mclients[i]->build_initial_monmap();
- Client *client = new Client(messengers[i], mclients[i]);
+ auto client = new StandaloneClient(messengers[i], mclients[i]);
client->set_filer_flags(syn_filer_flags);
SyntheticClient *syn = new SyntheticClient(client);
clients.push_back(client);
diff --git a/src/ceph_ver.c b/src/ceph_ver.c
index b983b552701..efa2268185c 100644
--- a/src/ceph_ver.c
+++ b/src/ceph_ver.c
@@ -1,5 +1,4 @@
-#include "acconfig.h"
#include "ceph_ver.h"
#define CONCAT_VER_SYMBOL(x) ceph_ver__##x
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 0cd9d66d284..98c987b00e9 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -226,9 +226,8 @@ vinodeno_t Client::map_faked_ino(ino_t ino)
// cons/des
-Client::Client(Messenger *m, MonClient *mc)
+Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
: Dispatcher(m->cct),
- logger(NULL),
m_command_hook(this),
timer(m->cct, client_lock),
callback_handle(NULL),
@@ -246,17 +245,16 @@ Client::Client(Messenger *m, MonClient *mc)
remount_finisher(m->cct),
objecter_finisher(m->cct),
tick_event(NULL),
- monclient(mc), messenger(m), whoami(mc->get_global_id()),
- cap_epoch_barrier(0),
+ messenger(m), monclient(mc),
+ objecter(objecter_),
+ whoami(mc->get_global_id()), cap_epoch_barrier(0),
last_tid(0), oldest_tid(0), last_flush_tid(1),
- initialized(false), authenticated(false),
+ initialized(false),
mounted(false), unmounting(false),
local_osd(-1), local_osd_epoch(0),
unsafe_sync_write(0),
client_lock("Client::client_lock")
{
- monclient->set_messenger(m);
-
_reset_faked_inos();
//
root = 0;
@@ -279,14 +277,12 @@ Client::Client(Messenger *m, MonClient *mc)
// file handles
free_fd_set.insert(10, 1<<30);
- // osd interfaces
mdsmap.reset(new MDSMap);
- objecter = new Objecter(cct, messenger, monclient, NULL,
- 0, 0);
- objecter->set_client_incarnation(0); // client always 0, for now.
- writeback_handler = new ObjecterWriteback(objecter, &objecter_finisher,
- &client_lock);
- objectcacher = new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
+
+ // osd interfaces
+ writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
+ &client_lock));
+ objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
client_flush_set_callback, // all commit callback
(void*)this,
cct->_conf->client_oc_size,
@@ -294,9 +290,9 @@ Client::Client(Messenger *m, MonClient *mc)
cct->_conf->client_oc_max_dirty,
cct->_conf->client_oc_target_dirty,
cct->_conf->client_oc_max_dirty_age,
- true);
+ true));
objecter_finisher.start();
- filer = new Filer(objecter, &objecter_finisher);
+ filer.reset(new Filer(objecter, &objecter_finisher));
}
@@ -305,14 +301,6 @@ Client::~Client()
assert(!client_lock.is_locked());
tear_down_cache();
-
- delete objectcacher;
- delete writeback_handler;
-
- delete filer;
- delete objecter;
-
- delete logger;
}
void Client::tear_down_cache()
@@ -467,35 +455,27 @@ int Client::init()
{
timer.init();
objectcacher->start();
- objecter->init();
client_lock.Lock();
assert(!initialized);
- // ok!
- messenger->add_dispatcher_tail(objecter);
messenger->add_dispatcher_tail(this);
+ client_lock.Unlock();
- monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
- int r = monclient->init();
- if (r < 0) {
- // need to do cleanup because we're in an intermediate init state
- timer.shutdown();
- client_lock.Unlock();
- objecter->shutdown();
- objectcacher->stop();
- monclient->shutdown();
- return r;
- }
- objecter->start();
+ _finish_init();
+ return 0;
+}
+void Client::_finish_init()
+{
+ client_lock.Lock();
// logger
PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
- logger = plb.create_perf_counters();
- cct->get_perfcounters_collection()->add(logger);
+ logger.reset(plb.create_perf_counters());
+ cct->get_perfcounters_collection()->add(logger.get());
client_lock.Unlock();
@@ -546,7 +526,6 @@ int Client::init()
client_lock.Lock();
initialized = true;
client_lock.Unlock();
- return r;
}
void Client::shutdown()
@@ -600,22 +579,16 @@ void Client::shutdown()
timer.shutdown();
client_lock.Unlock();
- objecter->shutdown();
objecter_finisher.wait_for_empty();
objecter_finisher.stop();
- monclient->shutdown();
-
if (logger) {
- cct->get_perfcounters_collection()->remove(logger);
- delete logger;
- logger = NULL;
+ cct->get_perfcounters_collection()->remove(logger.get());
+ logger.reset();
}
}
-
-
// ===================
// metadata cache stuff
@@ -5436,7 +5409,7 @@ int Client::authenticate()
{
assert(client_lock.is_locked_by_me());
- if (authenticated) {
+ if (monclient->is_authenticated()) {
return 0;
}
@@ -5449,7 +5422,6 @@ int Client::authenticate()
whoami = monclient->get_global_id();
messenger->set_myname(entity_name_t::CLIENT(whoami.v));
- authenticated = true;
return 0;
}
@@ -7908,11 +7880,11 @@ int Client::open(const char *relpath, int flags, const UserPerm& perms,
mode_t mode, int stripe_unit, int stripe_count,
int object_size, const char *data_pool)
{
- ldout(cct, 3) << "open enter(" << relpath << ", " << flags << "," << mode << ")" << dendl;
+ ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
Mutex::Locker lock(client_lock);
tout(cct) << "open" << std::endl;
tout(cct) << relpath << std::endl;
- tout(cct) << flags << std::endl;
+ tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
Fh *fh = NULL;
@@ -7982,7 +7954,7 @@ int Client::open(const char *relpath, int flags, const UserPerm& perms,
out:
tout(cct) << r << std::endl;
- ldout(cct, 3) << "open exit(" << path << ", " << flags << ") = " << r << dendl;
+ ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
return r;
}
@@ -8198,7 +8170,8 @@ int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
return -EROFS;
}
- int cmode = ceph_flags_to_mode(flags);
+ // use normalized flags to generate cmode
+ int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
if (cmode < 0)
return -EINVAL;
int want = ceph_caps_for_mode(cmode);
@@ -8214,8 +8187,8 @@ int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
filepath path;
in->make_nosnap_relative_path(path);
- req->set_filepath(path);
- req->head.args.open.flags = flags & ~O_CREAT;
+ req->set_filepath(path);
+ req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
req->head.args.open.mode = mode;
req->head.args.open.pool = -1;
if (cct->_conf->client_debug_getattr_caps)
@@ -9363,9 +9336,9 @@ int Client::statfs(const char *path, struct statvfs *stbuf,
assert(root != nullptr);
Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
- // get_quota_root should always give us something if client quotas are
- // enabled
- assert(cct->_conf->client_quota == false || quota_root != nullptr);
+ // get_quota_root should always give us something
+ // because client quotas are always enabled
+ assert(quota_root != nullptr);
if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
@@ -11170,7 +11143,8 @@ int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
return -EDQUOT;
}
- int cmode = ceph_flags_to_mode(flags);
+ // use normalized flags to generate cmode
+ int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
if (cmode < 0)
return -EINVAL;
@@ -11191,7 +11165,7 @@ int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
path.push_dentry(name);
req->set_filepath(path);
req->set_inode(dir);
- req->head.args.open.flags = flags | O_CREAT;
+ req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);
req->head.args.open.stripe_unit = stripe_unit;
req->head.args.open.stripe_count = stripe_count;
@@ -11633,8 +11607,7 @@ int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const ch
else
return -EROFS;
}
- if (cct->_conf->client_quota &&
- fromdir != todir) {
+ if (fromdir != todir) {
Inode *fromdir_root =
fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
Inode *todir_root =
@@ -11963,10 +11936,10 @@ int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
vinodeno_t vino = _get_vino(in);
- ldout(cct, 3) << "ll_open " << vino << " " << flags << dendl;
+ ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
tout(cct) << "ll_open" << std::endl;
tout(cct) << vino.ino.val << std::endl;
- tout(cct) << flags << std::endl;
+ tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
int r;
if (!cct->_conf->fuse_default_permissions) {
@@ -11983,8 +11956,8 @@ int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
ll_unclosed_fh_set.insert(fhptr);
}
tout(cct) << (unsigned long)fhptr << std::endl;
- ldout(cct, 3) << "ll_open " << vino << " " << flags << " = " << r << " (" <<
- fhptr << ")" << dendl;
+ ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
+ " = " << r << " (" << fhptr << ")" << dendl;
return r;
}
@@ -11997,13 +11970,13 @@ int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
vinodeno_t vparent = _get_vino(parent);
ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
- mode << dec << " " << flags << ", uid " << perms.uid()
+ mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
<< ", gid " << perms.gid() << dendl;
tout(cct) << "ll_create" << std::endl;
tout(cct) << vparent.ino.val << std::endl;
tout(cct) << name << std::endl;
tout(cct) << mode << std::endl;
- tout(cct) << flags << std::endl;
+ tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
bool created = false;
int r = _lookup(parent, name, caps, in, perms);
@@ -12064,8 +12037,8 @@ out:
tout(cct) << (unsigned long)*fhp << std::endl;
tout(cct) << ino << std::endl;
ldout(cct, 3) << "_ll_create " << parent << " " << name << " 0" << oct <<
- mode << dec << " " << flags << " = " << r << " (" << *fhp << " " <<
- hex << ino << dec << ")" << dendl;
+ mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
+ *fhp << " " << hex << ino << dec << ")" << dendl;
return r;
}
@@ -12104,6 +12077,7 @@ int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
Mutex::Locker lock(client_lock);
InodeRef in;
+
int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
if (r >= 0) {
assert(in);
@@ -12841,9 +12815,6 @@ bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
- if (!cct->_conf->client_quota)
- return NULL;
-
Inode *cur = in;
utime_t now = ceph_clock_now();
@@ -12915,9 +12886,6 @@ Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
std::function<bool (const Inode &in)> test)
{
- if (!cct->_conf->client_quota)
- return false;
-
while (true) {
assert(in != NULL);
if (test(*in)) {
@@ -13257,3 +13225,55 @@ mds_rank_t Client::_get_random_up_mds() const
return *p;
}
+
+StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
+ : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
+{
+ monclient->set_messenger(m);
+ objecter->set_client_incarnation(0);
+}
+
+StandaloneClient::~StandaloneClient()
+{
+ delete objecter;
+ objecter = nullptr;
+}
+
+int StandaloneClient::init()
+{
+ timer.init();
+ objectcacher->start();
+ objecter->init();
+
+ client_lock.Lock();
+ assert(!initialized);
+
+ messenger->add_dispatcher_tail(objecter);
+ messenger->add_dispatcher_tail(this);
+
+ monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
+ int r = monclient->init();
+ if (r < 0) {
+ // need to do cleanup because we're in an intermediate init state
+ timer.shutdown();
+ client_lock.Unlock();
+ objecter->shutdown();
+ objectcacher->stop();
+ monclient->shutdown();
+ return r;
+ }
+ objecter->start();
+
+ client_lock.Unlock();
+ _finish_init();
+
+ return 0;
+}
+
+void StandaloneClient::shutdown()
+{
+ Client::shutdown();
+ objecter->shutdown();
+ monclient->shutdown();
+}
+
diff --git a/src/client/Client.h b/src/client/Client.h
index 236ca90d995..7d2b73eb35d 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -252,7 +252,7 @@ class Client : public Dispatcher, public md_config_obs_t {
public:
using Dispatcher::cct;
- PerfCounters *logger;
+ std::unique_ptr<PerfCounters> logger;
class CommandHook : public AdminSocketHook {
Client *m_client;
@@ -304,8 +304,10 @@ public:
return UserPerm(uid, gid);
}
protected:
- MonClient *monclient;
Messenger *messenger;
+ MonClient *monclient;
+ Objecter *objecter;
+
client_t whoami;
int user_id, group_id;
@@ -384,7 +386,6 @@ protected:
bool is_dir_operation(MetaRequest *request);
bool initialized;
- bool authenticated;
bool mounted;
bool unmounting;
@@ -404,10 +405,9 @@ public:
void _sync_write_commit(Inode *in);
protected:
- Filer *filer;
- ObjectCacher *objectcacher;
- Objecter *objecter; // (non-blocking) osd interface
- WritebackHandler *writeback_handler;
+ std::unique_ptr<Filer> filer;
+ std::unique_ptr<ObjectCacher> objectcacher;
+ std::unique_ptr<WritebackHandler> writeback_handler;
// cache
ceph::unordered_map<vinodeno_t, Inode*> inode_map;
@@ -586,11 +586,18 @@ protected:
void _close_sessions();
+ /**
+ * The basic housekeeping parts of init (perf counters, admin socket)
+ * that is independent of how objecters/monclient/messengers are
+ * being set up.
+ */
+ void _finish_init();
+
public:
void set_filer_flags(int flags);
void clear_filer_flags(int flags);
- Client(Messenger *m, MonClient *mc);
+ Client(Messenger *m, MonClient *mc, Objecter *objecter_);
~Client() override;
void tear_down_cache();
@@ -601,8 +608,8 @@ protected:
inodeno_t get_root_ino();
Inode *get_root();
- int init() WARN_UNUSED_RESULT;
- void shutdown();
+ virtual int init();
+ virtual void shutdown();
// messaging
void handle_mds_map(class MMDSMap *m);
@@ -1225,4 +1232,19 @@ public:
const std::set <std::string> &changed) override;
};
+/**
+ * Specialization of Client that manages its own Objecter instance
+ * and handles init/shutdown of messenger/monclient
+ */
+class StandaloneClient : public Client
+{
+ public:
+ StandaloneClient(Messenger *m, MonClient *mc);
+
+ ~StandaloneClient() override;
+
+ int init() override;
+ void shutdown() override;
+};
+
#endif
diff --git a/src/client/ObjecterWriteback.h b/src/client/ObjecterWriteback.h
index 97cb228d5b8..8928437646c 100644
--- a/src/client/ObjecterWriteback.h
+++ b/src/client/ObjecterWriteback.h
@@ -17,7 +17,9 @@ class ObjecterWriteback : public WritebackHandler {
void read(const object_t& oid, uint64_t object_no,
const object_locator_t& oloc, uint64_t off, uint64_t len,
snapid_t snapid, bufferlist *pbl, uint64_t trunc_size,
- __u32 trunc_seq, int op_flags, Context *onfinish) override {
+ __u32 trunc_seq, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *onfinish) override {
m_objecter->read_trunc(oid, oloc, off, len, snapid, pbl, 0,
trunc_size, trunc_seq,
new C_OnFinisher(new C_Lock(m_lock, onfinish),
@@ -34,6 +36,7 @@ class ObjecterWriteback : public WritebackHandler {
const SnapContext& snapc, const bufferlist &bl,
ceph::real_time mtime, uint64_t trunc_size,
__u32 trunc_seq, ceph_tid_t journal_tid,
+ const ZTracer::Trace &parent_trace,
Context *oncommit) override {
return m_objecter->write_trunc(oid, oloc, off, len, snapc, bl, mtime, 0,
trunc_size, trunc_seq,
diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc
index 1c1b33aa3a2..54c3ddbff44 100644
--- a/src/client/SyntheticClient.cc
+++ b/src/client/SyntheticClient.cc
@@ -3354,7 +3354,6 @@ int SyntheticClient::chunk_file(string &filename)
uint64_t size = st.st_size;
dout(0) << "file " << filename << " size is " << size << dendl;
- Filer *filer = client->filer;
inode_t inode;
memset(&inode, 0, sizeof(inode));
@@ -3374,7 +3373,8 @@ int SyntheticClient::chunk_file(string &filename)
flock.Lock();
Context *onfinish = new C_SafeCond(&flock, &cond, &done);
- filer->read(inode.ino, &inode.layout, CEPH_NOSNAP, pos, get, &bl, 0, onfinish);
+ client->filer->read(inode.ino, &inode.layout, CEPH_NOSNAP, pos, get, &bl, 0,
+ onfinish);
while (!done)
cond.Wait(flock);
flock.Unlock();
diff --git a/src/client/UserPerm.h b/src/client/UserPerm.h
index 1895c15c2d4..a2d6ccbe888 100644
--- a/src/client/UserPerm.h
+++ b/src/client/UserPerm.h
@@ -68,10 +68,10 @@ public:
uid_t uid() const { return m_uid != (uid_t)-1 ? m_uid : ::geteuid(); }
gid_t gid() const { return m_gid != (gid_t)-1 ? m_gid : ::getegid(); }
- bool gid_in_groups(gid_t gid) const {
- if (gid == m_gid) return true;
+ bool gid_in_groups(gid_t id) const {
+ if (id == gid()) return true;
for (int i = 0; i < gid_count; ++i) {
- if (gid == gids[i]) return true;
+ if (id == gids[i]) return true;
}
return false;
}
diff --git a/src/cls/CMakeLists.txt b/src/cls/CMakeLists.txt
index 0b916907619..1c36c1fd0b5 100644
--- a/src/cls/CMakeLists.txt
+++ b/src/cls/CMakeLists.txt
@@ -3,6 +3,11 @@
set(cls_dir ${CMAKE_INSTALL_LIBDIR}/rados-classes)
set(cls_embedded_srcs)
+# cls_sdk
+add_library(cls_sdk SHARED sdk/cls_sdk.cc)
+set_target_properties(cls_sdk PROPERTIES VERSION "1.0.0" SOVERSION "1")
+install(TARGETS cls_sdk DESTINATION ${cls_dir})
+
# cls_hello
set(cls_hello_srcs hello/cls_hello.cc)
add_library(cls_hello SHARED ${cls_hello_srcs})
diff --git a/src/cls/journal/cls_journal_types.h b/src/cls/journal/cls_journal_types.h
index 4e1f2d7fd6e..f2cb841973f 100644
--- a/src/cls/journal/cls_journal_types.h
+++ b/src/cls/journal/cls_journal_types.h
@@ -9,7 +9,6 @@
#include "include/encoding.h"
#include <iosfwd>
#include <list>
-#include <set>
#include <string>
namespace ceph {
diff --git a/src/cls/log/cls_log.cc b/src/cls/log/cls_log.cc
index a53149ce3b3..70b9eb85165 100644
--- a/src/cls/log/cls_log.cc
+++ b/src/cls/log/cls_log.cc
@@ -126,7 +126,7 @@ static int cls_log_add(cls_method_context_t hctx, bufferlist *in, bufferlist *ou
index = entry.id;
}
- CLS_LOG(0, "storing entry at %s", index.c_str());
+ CLS_LOG(20, "storing entry at %s", index.c_str());
if (index > header.max_marker)
diff --git a/src/cls/lua/cls_lua_client.cc b/src/cls/lua/cls_lua_client.cc
index 95dd2bb6043..44348270c80 100644
--- a/src/cls/lua/cls_lua_client.cc
+++ b/src/cls/lua/cls_lua_client.cc
@@ -1,10 +1,7 @@
-#include <errno.h>
#include <string>
#include <vector>
#include "include/encoding.h"
-#include "include/rados.h"
-#include "include/rados/librados.h"
-#include "include/types.h"
+#include "include/rados/librados.hpp" // for IoCtx
#include "cls_lua_client.h"
#include "cls_lua_ops.h"
diff --git a/src/cls/lua/cls_lua_client.h b/src/cls/lua/cls_lua_client.h
index fd049247add..e37906993c9 100644
--- a/src/cls/lua/cls_lua_client.h
+++ b/src/cls/lua/cls_lua_client.h
@@ -1,8 +1,12 @@
#ifndef CLS_LUA_CLIENT_HPP
#define CLS_LUA_CLIENT_HPP
#include <string>
-#include <vector>
-#include "include/rados/librados.hpp"
+
+#include "include/buffer_fwd.h" // for bufferlist
+
+namespace librados {
+ class IoCtx;
+}
namespace cls_lua_client {
int exec(librados::IoCtx& ioctx, const std::string& oid,
diff --git a/src/cls/lua/cls_lua_ops.h b/src/cls/lua/cls_lua_ops.h
index ed297922044..2bf95051d34 100644
--- a/src/cls/lua/cls_lua_ops.h
+++ b/src/cls/lua/cls_lua_ops.h
@@ -2,8 +2,8 @@
#define CEPH_CLS_LUA_OPS_H
struct cls_lua_eval_op {
- string script;
- string handler;
+ std::string script;
+ std::string handler;
bufferlist input;
void encode(bufferlist &bl) const {
diff --git a/src/cls/rbd/cls_rbd.cc b/src/cls/rbd/cls_rbd.cc
index 5dc53a08126..c5dc0cf45eb 100644
--- a/src/cls/rbd/cls_rbd.cc
+++ b/src/cls/rbd/cls_rbd.cc
@@ -1554,6 +1554,12 @@ int snapshot_add(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
return -EINVAL;
}
+ if (boost::get<cls::rbd::UnknownSnapshotNamespace>(
+ &snap_meta.snapshot_namespace.snapshot_namespace) != nullptr) {
+ CLS_ERR("Unknown snapshot namespace provided");
+ return -EINVAL;
+ }
+
CLS_LOG(20, "snapshot_add name=%s id=%llu", snap_meta.name.c_str(),
(unsigned long long)snap_meta.id.val);
@@ -1610,7 +1616,7 @@ int snapshot_add(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
total_read += vals.size();
if (total_read >= snap_limit) {
- CLS_ERR("Attempt to create snapshot over limit of %lu", snap_limit);
+ CLS_ERR("Attempt to create snapshot over limit of %" PRIu64, snap_limit);
return -EDQUOT;
}
@@ -2752,7 +2758,7 @@ int snapshot_get_limit(cls_method_context_t hctx, bufferlist *in,
return r;
}
- CLS_LOG(20, "read snapshot limit %lu", snap_limit);
+ CLS_LOG(20, "read snapshot limit %" PRIu64, snap_limit);
::encode(snap_limit, *out);
return 0;
@@ -2776,7 +2782,7 @@ int snapshot_set_limit(cls_method_context_t hctx, bufferlist *in,
CLS_LOG(20, "remove snapshot limit\n");
rc = cls_cxx_map_remove_key(hctx, "snap_limit");
} else {
- CLS_LOG(20, "set snapshot limit to %lu\n", new_limit);
+ CLS_LOG(20, "set snapshot limit to %" PRIu64 "\n", new_limit);
::encode(new_limit, bl);
rc = cls_cxx_map_set_val(hctx, "snap_limit", &bl);
}
diff --git a/src/cls/rbd/cls_rbd.h b/src/cls/rbd/cls_rbd.h
index c1f17b845e5..f08fc1b1f0a 100644
--- a/src/cls/rbd/cls_rbd.h
+++ b/src/cls/rbd/cls_rbd.h
@@ -66,7 +66,8 @@ struct cls_rbd_snap {
cls_rbd_parent parent;
uint64_t flags;
utime_t timestamp;
- cls::rbd::SnapshotNamespaceOnDisk snapshot_namespace;
+ cls::rbd::SnapshotNamespaceOnDisk snapshot_namespace = {
+ cls::rbd::UserSnapshotNamespace{}};
/// true if we have a parent
bool has_parent() const {
@@ -107,8 +108,6 @@ struct cls_rbd_snap {
}
if (struct_v >= 5) {
::decode(snapshot_namespace, p);
- } else {
- snapshot_namespace = cls::rbd::SnapshotNamespaceOnDisk(cls::rbd::UserSnapshotNamespace());
}
if (struct_v >= 6) {
::decode(timestamp, p);
diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc
index e57d9de136a..aad9b4f8ac6 100644
--- a/src/cls/rgw/cls_rgw.cc
+++ b/src/cls/rgw/cls_rgw.cc
@@ -95,11 +95,15 @@ static void get_index_ver_key(cls_method_context_t hctx, uint64_t index_ver, str
*key = buf;
}
-static void bi_log_index_key(cls_method_context_t hctx, string& key, string& id, uint64_t index_ver)
+static void bi_log_prefix(string& key)
{
key = BI_PREFIX_CHAR;
key.append(bucket_index_prefixes[BI_BUCKET_LOG_INDEX]);
+}
+static void bi_log_index_key(cls_method_context_t hctx, string& key, string& id, uint64_t index_ver)
+{
+ bi_log_prefix(key);
get_index_ver_key(hctx, index_ver, &id);
key.append(id);
}
@@ -2257,9 +2261,8 @@ static int list_plain_entries(cls_method_context_t hctx, const string& name, con
string filter = name;
string start_key = marker;
- string first_instance_idx;
- encode_obj_versioned_data_key(string(), &first_instance_idx);
- string end_key = first_instance_idx;
+ string end_key; // stop listing at bi_log_prefix
+ bi_log_prefix(end_key);
int count = 0;
map<string, bufferlist> keys;
@@ -2507,7 +2510,7 @@ static int rgw_bi_list_op(cls_method_context_t hctx, bufferlist *in, bufferlist
ret = list_olh_entries(hctx, op.name, op.marker, max - count, &op_ret.entries);
if (ret < 0) {
- CLS_LOG(0, "ERROR: %s(): list_instance_entries retured ret=%d", __func__, ret);
+      CLS_LOG(0, "ERROR: %s(): list_olh_entries returned ret=%d", __func__, ret);
return ret;
}
@@ -3370,7 +3373,7 @@ static int rgw_cls_lc_get_next_entry(cls_method_context_t hctx, bufferlist *in,
try {
::decode(op, in_iter);
} catch (buffer::error& err) {
- CLS_LOG(1, "ERROR: rgw_cls_lc_rm_entry(): failed to decode entry\n");
+    CLS_LOG(1, "ERROR: rgw_cls_lc_get_next_entry(): failed to decode op\n");
return -EINVAL;
}
@@ -3403,10 +3406,11 @@ static int rgw_cls_lc_list_entries(cls_method_context_t hctx, bufferlist *in, bu
try {
::decode(op, in_iter);
} catch (buffer::error& err) {
- CLS_LOG(1, "ERROR: rgw_cls_lc_rm_entry(): failed to decode entry\n");
+ CLS_LOG(1, "ERROR: rgw_cls_lc_list_entries(): failed to decode op\n");
return -EINVAL;
}
- cls_rgw_lc_list_entries_ret op_ret;
+
+ cls_rgw_lc_list_entries_ret op_ret;
bufferlist::iterator iter;
map<string, bufferlist> vals;
string filter_prefix;
@@ -3437,7 +3441,7 @@ static int rgw_cls_lc_put_head(cls_method_context_t hctx, bufferlist *in, buffer
try {
::decode(op, in_iter);
} catch (buffer::error& err) {
- CLS_LOG(1, "ERROR: rgw_cls_lc_set_entry(): failed to decode entry\n");
+    CLS_LOG(1, "ERROR: rgw_cls_lc_put_head(): failed to decode op\n");
return -EINVAL;
}
diff --git a/src/cls/rgw/cls_rgw_types.cc b/src/cls/rgw/cls_rgw_types.cc
index 1b8edbb667d..7d9c83bfad8 100644
--- a/src/cls/rgw/cls_rgw_types.cc
+++ b/src/cls/rgw/cls_rgw_types.cc
@@ -61,6 +61,7 @@ void rgw_bucket_dir_entry_meta::dump(Formatter *f) const
encode_json("owner_display_name", owner_display_name, f);
encode_json("content_type", content_type, f);
encode_json("accounted_size", accounted_size, f);
+ encode_json("user_data", user_data, f);
}
void rgw_bucket_dir_entry_meta::decode_json(JSONObj *obj) {
@@ -75,6 +76,7 @@ void rgw_bucket_dir_entry_meta::decode_json(JSONObj *obj) {
JSONDecoder::decode_json("owner_display_name", owner_display_name, obj);
JSONDecoder::decode_json("content_type", content_type, obj);
JSONDecoder::decode_json("accounted_size", accounted_size, obj);
+ JSONDecoder::decode_json("user_data", user_data, obj);
}
void rgw_bucket_dir_entry::generate_test_instances(list<rgw_bucket_dir_entry*>& o)
diff --git a/src/cls/rgw/cls_rgw_types.h b/src/cls/rgw/cls_rgw_types.h
index 8e48e60affb..2b8dae327f1 100644
--- a/src/cls/rgw/cls_rgw_types.h
+++ b/src/cls/rgw/cls_rgw_types.h
@@ -95,12 +95,13 @@ struct rgw_bucket_dir_entry_meta {
string owner_display_name;
string content_type;
uint64_t accounted_size;
+ string user_data;
rgw_bucket_dir_entry_meta() :
category(0), size(0), accounted_size(0) { }
void encode(bufferlist &bl) const {
- ENCODE_START(4, 3, bl);
+ ENCODE_START(5, 3, bl);
::encode(category, bl);
::encode(size, bl);
::encode(mtime, bl);
@@ -109,10 +110,11 @@ struct rgw_bucket_dir_entry_meta {
::encode(owner_display_name, bl);
::encode(content_type, bl);
::encode(accounted_size, bl);
+ ::encode(user_data, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator &bl) {
- DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
::decode(category, bl);
::decode(size, bl);
::decode(mtime, bl);
@@ -125,6 +127,8 @@ struct rgw_bucket_dir_entry_meta {
::decode(accounted_size, bl);
else
accounted_size = size;
+ if (struct_v >= 5)
+ ::decode(user_data, bl);
DECODE_FINISH(bl);
}
void dump(Formatter *f) const;
diff --git a/src/cls/sdk/cls_sdk.cc b/src/cls/sdk/cls_sdk.cc
new file mode 100644
index 00000000000..1f907b49802
--- /dev/null
+++ b/src/cls/sdk/cls_sdk.cc
@@ -0,0 +1,131 @@
+/*
+ * This is an example RADOS object class built using only the Ceph SDK interface.
+ */
+#include "include/rados/objclass.h"
+
+CLS_VER(1,0)
+CLS_NAME(sdk)
+
+cls_handle_t h_class;
+cls_method_handle_t h_test_coverage_write;
+cls_method_handle_t h_test_coverage_replay;
+
+/**
+ * test_coverage_write - a "write" method that creates an object
+ *
+ * This method modifies the object by making multiple write calls (write,
+ * setxattr and set_val).
+ */
+static int test_coverage_write(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+ // create the object
+ int ret = cls_cxx_create(hctx, false);
+ if (ret < 0) {
+ CLS_LOG(0, "ERROR: %s(): cls_cxx_create returned %d", __func__, ret);
+ return ret;
+ }
+
+ uint64_t size;
+ // get the size of the object
+ ret = cls_cxx_stat(hctx, &size, NULL);
+ if (ret < 0)
+ return ret;
+
+ std::string c = "test";
+ bufferlist bl;
+ bl.append(c);
+
+ // write to the object
+ ret = cls_cxx_write(hctx, 0, bl.length(), &bl);
+ if (ret < 0)
+ return ret;
+
+ uint64_t new_size;
+ // get the new size of the object
+ ret = cls_cxx_stat(hctx, &new_size, NULL);
+ if (ret < 0)
+ return ret;
+
+ // make some change to the xattr
+ ret = cls_cxx_setxattr(hctx, "foo", &bl);
+ if (ret < 0)
+ return ret;
+
+ // make some change to the omap
+ ret = cls_cxx_map_set_val(hctx, "foo", &bl);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+/**
+ * test_coverage_replay - a "read" method to retrieve previously written data
+ *
+ * This method reads the object by making multiple read calls (read, getxattr
+ * and get_val). It also removes the object after reading.
+ */
+
+static int test_coverage_replay(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+ CLS_LOG(0, "reading already written object");
+ uint64_t size;
+ // get the size of the object
+ int ret = cls_cxx_stat(hctx, &size, NULL);
+ if (ret < 0)
+ return ret;
+
+ bufferlist bl;
+ // read the object entry
+ ret = cls_cxx_read(hctx, 0, size, &bl);
+ if (ret < 0)
+ return ret;
+
+ // if the size is incorrect
+ if (bl.length() != size)
+ return -EIO;
+
+ bl.clear();
+
+ // read xattr entry
+ ret = cls_cxx_getxattr(hctx, "foo", &bl);
+ if (ret < 0)
+ return ret;
+
+ // if the size is incorrect
+ if (bl.length() != size)
+ return -EIO;
+
+ bl.clear();
+
+ // read omap entry
+ ret = cls_cxx_map_get_val(hctx, "foo", &bl);
+ if (ret < 0)
+ return ret;
+
+ // if the size is incorrect
+ if (bl.length() != size)
+ return -EIO;
+
+ // remove the object
+ ret = cls_cxx_remove(hctx);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+void __cls_init()
+{
+ CLS_LOG(0, "loading cls_sdk");
+
+ cls_register("sdk", &h_class);
+
+ cls_register_cxx_method(h_class, "test_coverage_write",
+ CLS_METHOD_RD|CLS_METHOD_WR,
+ test_coverage_write, &h_test_coverage_write);
+
+ cls_register_cxx_method(h_class, "test_coverage_replay",
+ CLS_METHOD_RD|CLS_METHOD_WR,
+ test_coverage_replay, &h_test_coverage_replay);
+}
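To exercise the new class from a client, its methods can be invoked through the RADOS exec path. A hedged sketch using the python-rados binding (assumes a reachable cluster, a pool named 'rbd', and OSDs that have loaded cls_sdk; Ioctx.execute() is the python-rados entry point for class method calls):

```python
import rados

cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
cluster.connect()
ioctx = cluster.open_ioctx('rbd')
try:
    # execute(obj, class, method, input) runs the method on the OSD
    ioctx.execute('sdk-test-obj', 'sdk', 'test_coverage_write', b'')
    ioctx.execute('sdk-test-obj', 'sdk', 'test_coverage_replay', b'')
finally:
    ioctx.close()
    cluster.shutdown()
```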
diff --git a/src/cls/statelog/cls_statelog.cc b/src/cls/statelog/cls_statelog.cc
index 5a43da1a6a6..74739c28972 100644
--- a/src/cls/statelog/cls_statelog.cc
+++ b/src/cls/statelog/cls_statelog.cc
@@ -273,19 +273,10 @@ static int cls_statelog_check_state(cls_method_context_t hctx, bufferlist *in, b
return -EINVAL;
}
- string obj_index;
- get_index_by_object(op.object, op.op_id, obj_index);
-
- bufferlist bl;
- int rc = cls_cxx_map_get_val(hctx, obj_index, &bl);
- if (rc < 0) {
- CLS_LOG(0, "could not find entry %s", obj_index.c_str());
- return rc;
- }
cls_statelog_entry entry;
- rc = get_existing_entry(hctx, op.client_id, op.op_id, op.object, entry);
+ int rc = get_existing_entry(hctx, op.client_id, op.op_id, op.object, entry);
if (rc < 0)
return rc;
diff --git a/src/cls/statelog/cls_statelog_client.cc b/src/cls/statelog/cls_statelog_client.cc
index 1b68c6f3fb3..3ec31dac3ed 100644
--- a/src/cls/statelog/cls_statelog_client.cc
+++ b/src/cls/statelog/cls_statelog_client.cc
@@ -114,7 +114,6 @@ void cls_statelog_list(librados::ObjectReadOperation& op,
void cls_statelog_check_state(librados::ObjectOperation& op, const string& client_id, const string& op_id, const string& object, uint32_t state)
{
bufferlist inbl;
- bufferlist outbl;
cls_statelog_check_state_op call;
call.client_id = client_id;
call.op_id = op_id;
diff --git a/src/common/BackTrace.cc b/src/common/BackTrace.cc
index 5fc6311d528..d68f43bd21d 100644
--- a/src/common/BackTrace.cc
+++ b/src/common/BackTrace.cc
@@ -7,7 +7,6 @@
#include "BackTrace.h"
#include "common/version.h"
-#include "acconfig.h"
#define _STR(x) #x
#define STRINGIFY(x) _STR(x)
diff --git a/src/common/ConfUtils.cc b/src/common/ConfUtils.cc
index b8e8edbce02..135e7da17ab 100644
--- a/src/common/ConfUtils.cc
+++ b/src/common/ConfUtils.cc
@@ -14,7 +14,6 @@
#include <algorithm>
#include <errno.h>
-#include <list>
#include <map>
#include <sstream>
#include <stdio.h>
@@ -22,8 +21,6 @@
#include <string.h>
#include <string>
#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
#include <iostream>
#include "include/buffer.h"
diff --git a/src/common/DecayCounter.h b/src/common/DecayCounter.h
index 918e338aa6c..6a3ac4bcad4 100644
--- a/src/common/DecayCounter.h
+++ b/src/common/DecayCounter.h
@@ -35,6 +35,8 @@ class DecayRate {
public:
DecayRate() : k(0) {}
+ DecayRate(const DecayRate &dr) : k(dr.k) {}
+
// cppcheck-suppress noExplicitConstructor
DecayRate(double hl) { set_halflife(hl); }
void set_halflife(double hl) {
@@ -43,14 +45,12 @@ public:
};
class DecayCounter {
- protected:
public:
double val; // value
double delta; // delta since last decay
double vel; // recent velocity
utime_t last_decay; // time of last decay
-
- public:
+ DecayRate rate;
void encode(bufferlist& bl) const;
void decode(const utime_t &t, bufferlist::iterator& p);
@@ -62,6 +62,11 @@ public:
{
}
+ explicit DecayCounter(const utime_t &now, const DecayRate &rate)
+ : val(0), delta(0), vel(0), last_decay(now), rate(rate)
+ {
+ }
+
// these two functions are for the use of our dencoder testing infrastructure
DecayCounter() : val(0), delta(0), vel(0), last_decay() {}
@@ -76,7 +81,11 @@ public:
double get(utime_t now, const DecayRate& rate) {
decay(now, rate);
- return val;
+ return val+delta;
+ }
+ double get(utime_t now) {
+ decay(now, rate);
+ return val+delta;
}
double get_last() {
@@ -100,6 +109,11 @@ public:
delta += v;
return val+delta;
}
+ double hit(utime_t now, double v = 1.0) {
+ decay(now, rate);
+ delta += v;
+ return val+delta;
+ }
void adjust(double a) {
val += a;
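For reference, the counter models exponential half-life decay: assuming set_halflife() stores k = ln(0.5)/halflife as in the upstream source, decay multiplies val by exp(k*dt), and the new get(now)/hit(now) overloads simply reuse the DecayRate now stored in the counter. A numeric sketch of that math:

```python
import math

def decay(val, halflife, dt):
    # val * exp(k*dt) with k = ln(0.5)/halflife halves every halflife seconds
    k = math.log(0.5) / halflife
    return val * math.exp(k * dt)

assert abs(decay(100.0, 5.0, 5.0) - 50.0) < 1e-9   # one half-life
assert abs(decay(100.0, 5.0, 10.0) - 25.0) < 1e-9  # two half-lives
```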
diff --git a/src/common/Finisher.cc b/src/common/Finisher.cc
index 16e283e7deb..d7220d10cd0 100644
--- a/src/common/Finisher.cc
+++ b/src/common/Finisher.cc
@@ -33,9 +33,11 @@ void Finisher::wait_for_empty()
finisher_lock.Lock();
while (!finisher_queue.empty() || finisher_running) {
ldout(cct, 10) << "wait_for_empty waiting" << dendl;
+ finisher_empty_wait = true;
finisher_empty_cond.Wait(finisher_lock);
}
ldout(cct, 10) << "wait_for_empty empty" << dendl;
+ finisher_empty_wait = false;
finisher_lock.Unlock();
}
@@ -44,7 +46,8 @@ void *Finisher::finisher_thread_entry()
finisher_lock.Lock();
ldout(cct, 10) << "finisher_thread start" << dendl;
- utime_t start, end;
+ utime_t start;
+ uint64_t count = 0;
while (!finisher_stop) {
/// Every time we are woken up, we process the queue until it is empty.
while (!finisher_queue.empty()) {
@@ -58,8 +61,10 @@ void *Finisher::finisher_thread_entry()
finisher_lock.Unlock();
ldout(cct, 10) << "finisher_thread doing " << ls << dendl;
- if (logger)
+ if (logger) {
start = ceph_clock_now();
+ count = ls.size();
+ }
// Now actually process the contexts.
for (vector<Context*>::iterator p = ls.begin();
@@ -77,21 +82,20 @@ void *Finisher::finisher_thread_entry()
c->complete(ls_rval.front().second);
ls_rval.pop_front();
}
- if (logger) {
- logger->dec(l_finisher_queue_len);
- end = ceph_clock_now();
- logger->tinc(l_finisher_complete_lat, end - start);
- start = end;
- }
}
ldout(cct, 10) << "finisher_thread done with " << ls << dendl;
ls.clear();
+ if (logger) {
+ logger->dec(l_finisher_queue_len, count);
+ logger->tinc(l_finisher_complete_lat, ceph_clock_now() - start);
+ }
finisher_lock.Lock();
finisher_running = false;
}
ldout(cct, 10) << "finisher_thread empty" << dendl;
- finisher_empty_cond.Signal();
+ if (unlikely(finisher_empty_wait))
+ finisher_empty_cond.Signal();
if (finisher_stop)
break;
diff --git a/src/common/Finisher.h b/src/common/Finisher.h
index 5947fa77df1..1629d915d63 100644
--- a/src/common/Finisher.h
+++ b/src/common/Finisher.h
@@ -43,6 +43,7 @@ class Finisher {
Cond finisher_empty_cond; ///< Signaled when the finisher has nothing more to process.
bool finisher_stop; ///< Set when the finisher should stop.
bool finisher_running; ///< True when the finisher is currently executing contexts.
+  bool finisher_empty_wait; ///< True when someone is waiting for the finisher to drain.
/// Queue for contexts for which complete(0) will be called.
/// NULLs in this queue indicate that an item from finisher_queue_rval
/// should be completed in that place instead.
@@ -136,14 +137,14 @@ class Finisher {
/// Anonymous finishers do not log their queue length.
explicit Finisher(CephContext *cct_) :
cct(cct_), finisher_lock("Finisher::finisher_lock"),
- finisher_stop(false), finisher_running(false),
+ finisher_stop(false), finisher_running(false), finisher_empty_wait(false),
thread_name("fn_anonymous"), logger(0),
finisher_thread(this) {}
/// Construct a named Finisher that logs its queue length.
Finisher(CephContext *cct_, string name, string tn) :
- cct(cct_), finisher_lock("Finisher::finisher_lock"),
- finisher_stop(false), finisher_running(false),
+ cct(cct_), finisher_lock("Finisher::" + name),
+ finisher_stop(false), finisher_running(false), finisher_empty_wait(false),
thread_name(tn), logger(0),
finisher_thread(this) {
PerfCountersBuilder b(cct, string("finisher-") + name,
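
The finisher_empty_wait flag above implements a common pattern: signal the condition variable only when a waiter has announced itself, so the hot loop skips the notify in the usual no-waiter case. A self-contained sketch of the same pattern with the standard library (all names here are illustrative, not Ceph code):

// Self-contained sketch; Drainer and its members are illustrative names.
#include <condition_variable>
#include <deque>
#include <mutex>

struct Drainer {
  std::mutex m;
  std::condition_variable empty_cond;
  std::deque<int> q;
  bool empty_wait = false;          // plays the role of finisher_empty_wait

  void wait_for_empty() {
    std::unique_lock<std::mutex> l(m);
    empty_wait = true;
    empty_cond.wait(l, [this] { return q.empty(); });
    empty_wait = false;
  }

  void on_drained() {               // worker calls this when the queue empties
    std::lock_guard<std::mutex> l(m);
    if (empty_wait)                 // skip the notify when nobody is waiting
      empty_cond.notify_all();
  }
};
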
diff --git a/src/common/Formatter.cc b/src/common/Formatter.cc
index 0bbbdd361ec..00a9b3849ca 100644
--- a/src/common/Formatter.cc
+++ b/src/common/Formatter.cc
@@ -14,8 +14,6 @@
#define LARGE_SIZE 1024
-#include "include/int_types.h"
-
#include "assert.h"
#include "Formatter.h"
#include "HTMLFormatter.h"
@@ -23,11 +21,9 @@
#include "include/buffer.h"
#include <algorithm>
-#include <iostream>
#include <sstream>
#include <stdarg.h>
#include <stdio.h>
-#include <stdlib.h>
#include <vector>
#include <string>
#include <set>
diff --git a/src/common/Formatter.h b/src/common/Formatter.h
index a4cf3e13cd5..a23266ac4ad 100644
--- a/src/common/Formatter.h
+++ b/src/common/Formatter.h
@@ -9,8 +9,10 @@
#include <iosfwd>
#include <list>
#include <vector>
-#include <sstream>
#include <stdarg.h>
+#include <stddef.h>
+#include <utility>
+#include <sstream>
#include <string>
#include <map>
diff --git a/src/common/Graylog.cc b/src/common/Graylog.cc
index ff2935151a3..a8fd8a99d29 100644
--- a/src/common/Graylog.cc
+++ b/src/common/Graylog.cc
@@ -4,9 +4,6 @@
#include "Graylog.h"
#include <iostream>
-#include <sstream>
-
-#include <arpa/inet.h>
#include "common/Formatter.h"
#include "common/LogEntry.h"
diff --git a/src/common/HTMLFormatter.cc b/src/common/HTMLFormatter.cc
index 63474adbc1a..11004cc4879 100644
--- a/src/common/HTMLFormatter.cc
+++ b/src/common/HTMLFormatter.cc
@@ -14,22 +14,15 @@
#define LARGE_SIZE 1024
-#include "include/int_types.h"
-
-#include "assert.h"
#include "Formatter.h"
#include "HTMLFormatter.h"
-#include "common/escape.h"
-#include <iostream>
#include <sstream>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
-#include <vector>
#include <string>
-#include <set>
-#include <boost/format.hpp>
+#include <string.h> // for strdup
// -----------------------
namespace ceph {
diff --git a/src/common/HTMLFormatter.h b/src/common/HTMLFormatter.h
index 23fc8765599..19d056f76af 100644
--- a/src/common/HTMLFormatter.h
+++ b/src/common/HTMLFormatter.h
@@ -3,18 +3,10 @@
#ifndef CEPH_HTML_FORMATTER_H
#define CEPH_HTML_FORMATTER_H
-#include "include/int_types.h"
-
-#include <deque>
#include <iosfwd>
-#include <list>
-#include <vector>
-#include <sstream>
#include <stdarg.h>
#include <string>
-#include <map>
-#include "include/buffer.h"
#include "Formatter.h"
namespace ceph {
diff --git a/src/common/LogClient.cc b/src/common/LogClient.cc
index c0e4a7bf5ae..b99a8a2462c 100644
--- a/src/common/LogClient.cc
+++ b/src/common/LogClient.cc
@@ -268,8 +268,10 @@ Message *LogClient::get_mon_log_message(bool flush)
{
Mutex::Locker l(log_lock);
if (flush) {
+ if (log_queue.empty())
+ return nullptr;
// reset session
- last_log_sent = last_log - log_queue.size();
+ last_log_sent = log_queue.front().seq;
}
return _get_mon_log_message();
}
diff --git a/src/common/LogEntry.cc b/src/common/LogEntry.cc
index ece56951cf5..bc887e1b3ad 100644
--- a/src/common/LogEntry.cc
+++ b/src/common/LogEntry.cc
@@ -1,7 +1,7 @@
#include <syslog.h>
-#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/predicate.hpp>
#include "LogEntry.h"
#include "Formatter.h"
diff --git a/src/common/PrebufferedStreambuf.h b/src/common/PrebufferedStreambuf.h
index 55012d12ea4..d56c9909fe8 100644
--- a/src/common/PrebufferedStreambuf.h
+++ b/src/common/PrebufferedStreambuf.h
@@ -1,7 +1,6 @@
#ifndef CEPH_COMMON_PREBUFFEREDSTREAMBUF_H
#define CEPH_COMMON_PREBUFFEREDSTREAMBUF_H
-#include <iosfwd>
#include <string>
#include <streambuf>
diff --git a/src/common/RefCountedObj.h b/src/common/RefCountedObj.h
index 2da83d1a760..6099bfff717 100644
--- a/src/common/RefCountedObj.h
+++ b/src/common/RefCountedObj.h
@@ -21,6 +21,11 @@
#include "common/ceph_context.h"
#include "common/valgrind.h"
+#include <atomic>
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/assert.h"
+
struct RefCountedObject {
private:
mutable atomic_t nref;
diff --git a/src/common/SloppyCRCMap.cc b/src/common/SloppyCRCMap.cc
index 7924ae6e8a7..c637f7a257a 100644
--- a/src/common/SloppyCRCMap.cc
+++ b/src/common/SloppyCRCMap.cc
@@ -2,7 +2,8 @@
// vim: ts=8 sw=2 smarttab
#include "common/SloppyCRCMap.h"
-#include "common/Formatter.h"
+
+using namespace std;
void SloppyCRCMap::write(uint64_t offset, uint64_t len, const bufferlist& bl,
std::ostream *out)
diff --git a/src/common/SloppyCRCMap.h b/src/common/SloppyCRCMap.h
index 34642a32539..78c470c1887 100644
--- a/src/common/SloppyCRCMap.h
+++ b/src/common/SloppyCRCMap.h
@@ -4,9 +4,10 @@
#ifndef CEPH_COMMON_SLOPPYCRCMAP_H
#define CEPH_COMMON_SLOPPYCRCMAP_H
-#include "include/types.h"
#include "include/encoding.h"
+#include "common/Formatter.h"
+#include <list>
#include <map>
#include <ostream>
@@ -69,7 +70,7 @@ public:
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bl);
void dump(Formatter *f) const;
- static void generate_test_instances(list<SloppyCRCMap*>& ls);
+ static void generate_test_instances(std::list<SloppyCRCMap*>& ls);
};
WRITE_CLASS_ENCODER(SloppyCRCMap)
diff --git a/src/common/SubProcess.h b/src/common/SubProcess.h
index 2295aade48a..c35ae1fd31c 100644
--- a/src/common/SubProcess.h
+++ b/src/common/SubProcess.h
@@ -17,21 +17,11 @@
#ifndef SUB_PROCESS_H
#define SUB_PROCESS_H
-#include <sys/types.h>
#include <sys/wait.h>
-
-#include <signal.h>
-#include <errno.h>
#include <stdarg.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <signal.h>
-
#include <sstream>
#include <vector>
#include <iostream>
-
#include <include/assert.h>
#include <common/errno.h>
diff --git a/src/common/TextTable.h b/src/common/TextTable.h
index 60c982a7e5f..12e8ca1f5a4 100644
--- a/src/common/TextTable.h
+++ b/src/common/TextTable.h
@@ -17,8 +17,6 @@
#include <vector>
#include <sstream>
-#include <iomanip>
-#include <string>
#include "include/assert.h"
/**
diff --git a/src/common/Thread.cc b/src/common/Thread.cc
index 3e8d3434624..a58c7ba252c 100644
--- a/src/common/Thread.cc
+++ b/src/common/Thread.cc
@@ -19,16 +19,6 @@
#include "common/signal.h"
#include "common/io_priority.h"
-#include <dirent.h>
-#include <errno.h>
-#include <iostream>
-#include <pthread.h>
-
-#include <signal.h>
-#include <sstream>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/types.h>
#ifdef HAVE_SCHED
#include <sched.h>
#endif
diff --git a/src/common/Throttle.cc b/src/common/Throttle.cc
index 56579ca8dd1..ca5701d8f73 100644
--- a/src/common/Throttle.cc
+++ b/src/common/Throttle.cc
@@ -1,12 +1,7 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-#include <errno.h>
-#include <thread>
-
#include "common/Throttle.h"
-#include "common/dout.h"
-#include "common/ceph_context.h"
#include "common/perf_counters.h"
#define dout_subsys ceph_subsys_throttle
diff --git a/src/common/Throttle.h b/src/common/Throttle.h
index e4119a3b97c..f6bcd078b45 100644
--- a/src/common/Throttle.h
+++ b/src/common/Throttle.h
@@ -4,15 +4,8 @@
#ifndef CEPH_THROTTLE_H
#define CEPH_THROTTLE_H
-#include "Mutex.h"
#include "Cond.h"
-#include <list>
-#include <map>
-#include <iostream>
#include <condition_variable>
-#include <chrono>
-#include "include/atomic.h"
-#include "include/Context.h"
class CephContext;
class PerfCounters;
diff --git a/src/common/Timer.cc b/src/common/Timer.cc
index fb7a1bfbc03..ef76a9e41b7 100644
--- a/src/common/Timer.cc
+++ b/src/common/Timer.cc
@@ -13,21 +13,13 @@
*/
#include "Cond.h"
-#include "Mutex.h"
-#include "Thread.h"
#include "Timer.h"
-#include "common/config.h"
-#include "include/Context.h"
#define dout_subsys ceph_subsys_timer
#undef dout_prefix
#define dout_prefix *_dout << "timer(" << this << ")."
-#include <sstream>
-#include <signal.h>
-#include <sys/time.h>
-#include <math.h>
class SafeTimerThread : public Thread {
diff --git a/src/common/Timer.h b/src/common/Timer.h
index 6ac916d9e2e..078a172b25b 100644
--- a/src/common/Timer.h
+++ b/src/common/Timer.h
@@ -17,9 +17,6 @@
#include "Cond.h"
#include "Mutex.h"
-#include "RWLock.h"
-
-#include <map>
class CephContext;
class Context;
diff --git a/src/common/TracepointProvider.h b/src/common/TracepointProvider.h
index 9efc482c6ef..97d3a0f7637 100644
--- a/src/common/TracepointProvider.h
+++ b/src/common/TracepointProvider.h
@@ -4,14 +4,9 @@
#ifndef CEPH_TRACEPOINT_PROVIDER_H
#define CEPH_TRACEPOINT_PROVIDER_H
-#include "include/int_types.h"
-#include "common/ceph_context.h"
#include "common/config_obs.h"
#include "common/Mutex.h"
#include <dlfcn.h>
-#include <set>
-#include <string>
-#include <boost/noncopyable.hpp>
struct md_config_t;
diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc
index 58df253262d..3181fe1fc22 100644
--- a/src/common/TrackedOp.cc
+++ b/src/common/TrackedOp.cc
@@ -11,13 +11,6 @@
*/
#include "TrackedOp.h"
-#include "common/Formatter.h"
-#include <iostream>
-#include <vector>
-#include "common/debug.h"
-#include "common/config.h"
-#include "msg/Message.h"
-#include "include/assert.h"
#define dout_context cct
#define dout_subsys ceph_subsys_optracker
diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h
index c5f6827ec83..621831da917 100644
--- a/src/common/TrackedOp.h
+++ b/src/common/TrackedOp.h
@@ -13,16 +13,9 @@
#ifndef TRACKEDREQUEST_H_
#define TRACKEDREQUEST_H_
-#include <sstream>
-#include <stdint.h>
-#include <boost/intrusive/list.hpp>
-#include <atomic>
-#include "include/utime.h"
-#include "common/Mutex.h"
#include "common/histogram.h"
#include "msg/Message.h"
-#include "include/memory.h"
#include "common/RWLock.h"
#define OPTRACKER_PREALLOC_EVENTS 20
@@ -222,6 +215,11 @@ protected:
virtual void _unregistered() {};
public:
+ ZTracer::Trace osd_trace;
+ ZTracer::Trace pg_trace;
+ ZTracer::Trace store_trace;
+ ZTracer::Trace journal_trace;
+
virtual ~TrackedOp() {}
void get() {
diff --git a/src/common/WorkQueue.cc b/src/common/WorkQueue.cc
index ec5f8a1476d..b077b813cc3 100644
--- a/src/common/WorkQueue.cc
+++ b/src/common/WorkQueue.cc
@@ -12,17 +12,9 @@
*
*/
+#include "WorkQueue.h"
#include "include/compat.h"
-
-#include <sstream>
-
-#include "include/types.h"
-#include "include/utime.h"
#include "common/errno.h"
-#include "WorkQueue.h"
-
-#include "common/config.h"
-#include "common/HeartbeatMap.h"
#define dout_subsys ceph_subsys_tp
#undef dout_prefix
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
index 6e29da6fd38..33c6c780d35 100644
--- a/src/common/WorkQueue.h
+++ b/src/common/WorkQueue.h
@@ -15,11 +15,8 @@
#ifndef CEPH_WORKQUEUE_H
#define CEPH_WORKQUEUE_H
-#include "Mutex.h"
#include "Cond.h"
-#include "Thread.h"
#include "include/unordered_map.h"
-#include "common/config_obs.h"
#include "common/HeartbeatMap.h"
class CephContext;
diff --git a/src/common/addr_parsing.c b/src/common/addr_parsing.c
index ae755576500..50694402980 100644
--- a/src/common/addr_parsing.c
+++ b/src/common/addr_parsing.c
@@ -12,7 +12,6 @@
*
*/
-#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
diff --git a/src/common/backport14.h b/src/common/backport14.h
new file mode 100644
index 00000000000..a574cd06f93
--- /dev/null
+++ b/src/common/backport14.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <memory>
+#include <type_traits>
+
+#ifndef CEPH_COMMON_BACKPORT14_H
+#define CEPH_COMMON_BACKPORT14_H
+
+// Library code from C++14 that can be implemented in C++11.
+
+namespace ceph {
+template<typename T>
+using remove_extent_t = typename std::remove_extent<T>::type;
+template<typename T>
+using remove_reference_t = typename std::remove_reference<T>::type;
+template<typename T>
+using result_of_t = typename std::result_of<T>::type;
+
+namespace _backport14 {
+template<typename T>
+struct uniquity {
+ using datum = std::unique_ptr<T>;
+};
+
+template<typename T>
+struct uniquity<T[]> {
+ using array = std::unique_ptr<T[]>;
+};
+
+template<typename T, std::size_t N>
+struct uniquity<T[N]> {
+ using verboten = void;
+};
+
+template<typename T, typename... Args>
+inline typename uniquity<T>::datum make_unique(Args&&... args) {
+ return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+template<typename T>
+inline typename uniquity<T>::array make_unique(std::size_t n) {
+ return std::unique_ptr<T>(new remove_extent_t<T>[n]());
+}
+
+template<typename T, class... Args>
+typename uniquity<T>::verboten
+make_unique(Args&&...) = delete;
+} // namespace _backport14
+using _backport14::make_unique;
+} // namespace ceph
+
+#endif // CEPH_COMMON_BACKPORT14_H
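
Usage sketch for the backported ceph::make_unique above: single objects forward constructor arguments, unbounded arrays take an element count and value-initialize, and bounded arrays are rejected at compile time. C++11-only; assumes the header path common/backport14.h:

// C++11 usage sketch for the header above.
#include "common/backport14.h"
#include <string>

void make_unique_example() {
  auto s = ceph::make_unique<std::string>(3, 'x');  // single object: "xxx"
  auto a = ceph::make_unique<int[]>(16);            // value-initialized int[16]
  // auto b = ceph::make_unique<int[16]>();         // bounded array: deleted
  a[0] = static_cast<int>(*s->begin());
}
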
diff --git a/src/common/bit_vector.hpp b/src/common/bit_vector.hpp
index 62d924d984f..c15b0927e37 100644
--- a/src/common/bit_vector.hpp
+++ b/src/common/bit_vector.hpp
@@ -231,8 +231,11 @@ void BitVector<_b>::decode_data(bufferlist::iterator& it, uint64_t byte_offset)
while (byte_offset < end_offset) {
uint64_t len = MIN(BLOCK_SIZE, end_offset - byte_offset);
+ bufferptr ptr;
+ it.copy_deep(len, ptr);
+
bufferlist bit;
- it.copy(len, bit);
+ bit.append(ptr);
if (m_crc_enabled &&
m_data_crcs[byte_offset / BLOCK_SIZE] != bit.crc32c(0)) {
throw buffer::malformed_input("invalid data block CRC");
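
The decode change above swaps a shallow copy() for copy_deep(), so each decoded block owns one contiguous buffer instead of referencing, and thereby pinning, the source bufferlist. A minimal sketch of the idiom, assuming a Ceph tree:

// Minimal sketch; len must fit within the iterator's remaining bytes,
// as guaranteed by the decode loop above.
#include "include/buffer.h"

void deep_copy_block(ceph::buffer::list::iterator& it, unsigned len,
                     ceph::buffer::list* out) {
  ceph::buffer::ptr ptr;
  it.copy_deep(len, ptr);   // one contiguous, privately owned allocation
  out->append(ptr);         // decoded data no longer pins the source buffers
}
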
diff --git a/src/common/blkdev.cc b/src/common/blkdev.cc
index 361a7738bd4..11bd954103b 100644
--- a/src/common/blkdev.cc
+++ b/src/common/blkdev.cc
@@ -18,7 +18,6 @@
#include <ctype.h>
#include <dirent.h>
#include <stdlib.h>
-#include "include/int_types.h"
#include "include/uuid.h"
#ifdef __linux__
diff --git a/src/common/bloom_filter.cc b/src/common/bloom_filter.cc
index 5a7a3ac18c3..6bc54bc7762 100644
--- a/src/common/bloom_filter.cc
+++ b/src/common/bloom_filter.cc
@@ -1,8 +1,9 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
-#include "include/types.h"
#include "common/bloom_filter.hpp"
+#include "common/Formatter.h"
+#include "include/buffer.h"
MEMPOOL_DEFINE_FACTORY(unsigned char, byte, bloom_filter);
@@ -66,7 +67,7 @@ void bloom_filter::dump(Formatter *f) const
f->close_section();
}
-void bloom_filter::generate_test_instances(list<bloom_filter*>& ls)
+void bloom_filter::generate_test_instances(std::list<bloom_filter*>& ls)
{
ls.push_back(new bloom_filter(10, .5, 1));
ls.push_back(new bloom_filter(10, .5, 1));
@@ -88,7 +89,7 @@ void compressible_bloom_filter::encode(bufferlist& bl) const
uint32_t s = size_list.size();
::encode(s, bl);
- for (vector<size_t>::const_iterator p = size_list.begin();
+ for (std::vector<size_t>::const_iterator p = size_list.begin();
p != size_list.end(); ++p)
::encode((uint64_t)*p, bl);
@@ -117,13 +118,13 @@ void compressible_bloom_filter::dump(Formatter *f) const
bloom_filter::dump(f);
f->open_array_section("table_sizes");
- for (vector<size_t>::const_iterator p = size_list.begin();
+ for (std::vector<size_t>::const_iterator p = size_list.begin();
p != size_list.end(); ++p)
f->dump_unsigned("size", (uint64_t)*p);
f->close_section();
}
-void compressible_bloom_filter::generate_test_instances(list<compressible_bloom_filter*>& ls)
+void compressible_bloom_filter::generate_test_instances(std::list<compressible_bloom_filter*>& ls)
{
ls.push_back(new compressible_bloom_filter(10, .5, 1));
ls.push_back(new compressible_bloom_filter(10, .5, 1));
diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp
index 286a96589c6..9007f111cd7 100644
--- a/src/common/bloom_filter.hpp
+++ b/src/common/bloom_filter.hpp
@@ -32,7 +32,6 @@
#include "include/mempool.h"
#include "include/encoding.h"
-#include "common/Formatter.h"
static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
static const unsigned char bit_mask[bits_per_char] = {
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index ae70a21c94f..5d8dcf5c27d 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -26,7 +26,6 @@
#include "include/atomic.h"
#include "common/RWLock.h"
#include "include/types.h"
-#include "include/compat.h"
#include "include/inline_memory.h"
#include "include/scope_guard.h"
#if defined(HAVE_XIO)
@@ -34,13 +33,10 @@
#endif
#include <errno.h>
-#include <fstream>
-#include <sstream>
#include <sys/uio.h>
#include <limits.h>
#include <atomic>
-#include <ostream>
#define CEPH_BUFFER_ALLOC_UNIT (MIN(CEPH_PAGE_SIZE, 4096))
#define CEPH_BUFFER_APPEND_SIZE (CEPH_BUFFER_ALLOC_UNIT - sizeof(raw_combined))
@@ -91,6 +87,7 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
static atomic_t buffer_cached_crc;
static atomic_t buffer_cached_crc_adjusted;
+ static atomic_t buffer_missed_crc;
static bool buffer_track_crc = get_env_bool("CEPH_BUFFER_TRACK");
void buffer::track_cached_crc(bool b) {
@@ -103,6 +100,10 @@ static std::atomic_flag buffer_debug_lock = ATOMIC_FLAG_INIT;
return buffer_cached_crc_adjusted.read();
}
+ int buffer::get_missed_crc() {
+ return buffer_missed_crc.read();
+ }
+
static atomic_t buffer_c_str_accesses;
static bool buffer_track_c_str = get_env_bool("CEPH_BUFFER_TRACK");
@@ -2338,19 +2339,6 @@ int buffer::list::write_fd(int fd, uint64_t offset) const
return 0;
}
-void buffer::list::prepare_iov(std::vector<iovec> *piov) const
-{
- assert(_buffers.size() <= IOV_MAX);
- piov->resize(_buffers.size());
- unsigned n = 0;
- for (std::list<buffer::ptr>::const_iterator p = _buffers.begin();
- p != _buffers.end();
- ++p, ++n) {
- (*piov)[n].iov_base = (void *)p->c_str();
- (*piov)[n].iov_len = p->length();
- }
-}
-
int buffer::list::write_fd_zero_copy(int fd) const
{
if (!can_zero_copy())
@@ -2404,6 +2392,8 @@ __u32 buffer::list::crc32c(__u32 crc) const
buffer_cached_crc_adjusted.inc();
}
} else {
+ if (buffer_track_crc)
+ buffer_missed_crc.inc();
uint32_t base = crc;
crc = ceph_crc32c(crc, (unsigned char*)it->c_str(), it->length());
r->set_crc(ofs, make_pair(base, crc));
@@ -2508,6 +2498,25 @@ void buffer::list::hexdump(std::ostream &out, bool trailing_newline) const
out.flags(original_flags);
}
+
+buffer::list buffer::list::static_from_mem(char* c, size_t l) {
+ list bl;
+ bl.push_back(ptr(create_static(l, c)));
+ return bl;
+}
+
+buffer::list buffer::list::static_from_cstring(char* c) {
+ return static_from_mem(c, std::strlen(c));
+}
+
+buffer::list buffer::list::static_from_string(string& s) {
+ // C++14 just has string::data return a char* from a non-const
+ // string.
+ return static_from_mem(const_cast<char*>(s.data()), s.length());
+ // But the way buffer::list mostly doesn't work in a sane way with
+ // const makes me generally sad.
+}
+
std::ostream& buffer::operator<<(std::ostream& out, const buffer::raw &r) {
return out << "buffer::raw(" << (void*)r.data << " len " << r.len << " nref " << r.nref.read() << ")";
}
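
Usage sketch for the new static_from_* helpers above. create_static copies nothing, so the backing memory must outlive the bufferlist; all values here are illustrative:

// Usage sketch; the backing storage must outlive each bufferlist.
#include "include/buffer.h"
#include <string>

void static_bl_example() {
  static char greeting[] = "hello";   // outlives the lists below
  auto bl1 = ceph::buffer::list::static_from_cstring(greeting);
  auto bl2 = ceph::buffer::list::static_from_mem(greeting, 5);

  std::string s("world");             // the same lifetime caveat applies to s
  auto bl3 = ceph::buffer::list::static_from_string(s);
}
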
diff --git a/src/common/ceph_context.cc b/src/common/ceph_context.cc
index 9334a94b1fa..e259b28feed 100644
--- a/src/common/ceph_context.cc
+++ b/src/common/ceph_context.cc
@@ -21,6 +21,7 @@
#include "common/admin_socket.h"
#include "common/perf_counters.h"
#include "common/Thread.h"
+#include "common/code_environment.h"
#include "common/ceph_context.h"
#include "common/ceph_crypto.h"
#include "common/config.h"
@@ -495,6 +496,35 @@ void CephContext::do_command(std::string command, cmdmap_t& cmdmap,
f->dump_string("option", *p);
}
f->close_section(); // unknown
+ } else if (command == "config diff get") {
+ std::string setting;
+ if (!cmd_getval(this, cmdmap, "var", setting)) {
+ f->dump_string("error", "syntax error: 'config diff get <var>'");
+ } else {
+ md_config_t def_conf;
+ def_conf.set_val("cluster", _conf->cluster);
+ def_conf.name = _conf->name;
+ def_conf.set_val("host", _conf->host);
+ def_conf.apply_changes(NULL);
+
+ map<string, pair<string, string>> diff;
+ set<string> unknown;
+ def_conf.diff(_conf, &diff, &unknown, setting);
+ f->open_object_section("diff");
+ f->open_object_section("current");
+
+ for (const auto& p : diff) {
+ f->dump_string(p.first.c_str(), p.second.second);
+ }
+ f->close_section(); //-- current
+
+ f->open_object_section("defaults");
+ for (const auto& p : diff) {
+ f->dump_string(p.first.c_str(), p.second.first);
+ }
+ f->close_section(); //-- defaults
+ f->close_section(); //-- diff
+ }
} else if (command == "log flush") {
_log->flush();
}
@@ -581,6 +611,9 @@ CephContext::CephContext(uint32_t module_type_, int init_flags_)
_admin_socket->register_command("config diff",
"config diff", _admin_hook,
"dump diff of current config and default config");
+ _admin_socket->register_command("config diff get",
+ "config diff get name=var,type=CephString", _admin_hook,
+ "dump diff get <field>: dump diff of current and default config setting <field>");
_admin_socket->register_command("log flush", "log flush", _admin_hook, "flush log entries to log file");
_admin_socket->register_command("log dump", "log dump", _admin_hook, "dump recent log entries to log file");
_admin_socket->register_command("log reopen", "log reopen", _admin_hook, "reopen log file");
@@ -619,6 +652,7 @@ CephContext::~CephContext()
_admin_socket->unregister_command("config set");
_admin_socket->unregister_command("config get");
_admin_socket->unregister_command("config diff");
+ _admin_socket->unregister_command("config diff get");
_admin_socket->unregister_command("log flush");
_admin_socket->unregister_command("log dump");
_admin_socket->unregister_command("log reopen");
@@ -659,7 +693,7 @@ CephContext::~CephContext()
delete _crypto_none;
delete _crypto_aes;
if (_crypto_inited)
- ceph::crypto::shutdown();
+ ceph::crypto::shutdown(g_code_env == CODE_ENVIRONMENT_LIBRARY);
}
void CephContext::put() {
diff --git a/src/common/ceph_crypto.cc b/src/common/ceph_crypto.cc
index 6da3232b1dc..67db503149b 100644
--- a/src/common/ceph_crypto.cc
+++ b/src/common/ceph_crypto.cc
@@ -27,7 +27,7 @@ void ceph::crypto::init(CephContext *cct)
{
}
-void ceph::crypto::shutdown()
+void ceph::crypto::shutdown(bool)
{
}
@@ -44,6 +44,7 @@ ceph::crypto::HMACSHA256::~HMACSHA256()
// for SECMOD_RestartModules()
#include <secmod.h>
+#include <nspr.h>
static pthread_mutex_t crypto_init_mutex = PTHREAD_MUTEX_INITIALIZER;
static uint32_t crypto_refs = 0;
@@ -77,12 +78,15 @@ void ceph::crypto::init(CephContext *cct)
assert(crypto_context != NULL);
}
-void ceph::crypto::shutdown()
+void ceph::crypto::shutdown(bool shared)
{
pthread_mutex_lock(&crypto_init_mutex);
assert(crypto_refs > 0);
if (--crypto_refs == 0) {
NSS_ShutdownContext(crypto_context);
+ if (!shared) {
+ PR_Cleanup();
+ }
crypto_context = NULL;
crypto_init_pid = 0;
}
diff --git a/src/common/ceph_crypto.h b/src/common/ceph_crypto.h
index f88542adb35..9c302392923 100644
--- a/src/common/ceph_crypto.h
+++ b/src/common/ceph_crypto.h
@@ -23,7 +23,11 @@ namespace ceph {
namespace crypto {
void assert_init();
void init(CephContext *cct);
- void shutdown();
+ // @param shared true if the underlying crypto library could be shared
+ // with the application linked against the Ceph library.
+ // @note we do extra global cleanup specific to the underlying crypto
+ // library, if @c shared is @c false.
+ void shutdown(bool shared=true);
using CryptoPP::Weak::MD5;
using CryptoPP::SHA1;
@@ -67,7 +71,7 @@ namespace ceph {
namespace crypto {
void assert_init();
void init(CephContext *cct);
- void shutdown();
+ void shutdown(bool shared=true);
class Digest {
private:
PK11Context *ctx;
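
The shared flag threads through from CephContext's destructor (shutdown(g_code_env == CODE_ENVIRONMENT_LIBRARY) above): a daemon owns the process and may run the global NSPR cleanup, a library must not. A minimal sketch of the contract:

// Sketch of the shutdown contract introduced above.
#include "common/ceph_crypto.h"

void crypto_teardown(bool linked_as_library) {
  // Only a process that fully owns the crypto library (a daemon) may run the
  // process-global cleanup; a library consumer must leave it to the host app.
  ceph::crypto::shutdown(/*shared=*/linked_as_library);
}
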
diff --git a/src/common/ceph_fs.cc b/src/common/ceph_fs.cc
index a4f71be943a..ce9873121c2 100644
--- a/src/common/ceph_fs.cc
+++ b/src/common/ceph_fs.cc
@@ -4,8 +4,6 @@
*
*/
-#include <errno.h>
-
/*
* Some non-inline ceph helpers
*/
@@ -17,18 +15,18 @@ int ceph_flags_to_mode(int flags)
int mode = -1;
#ifdef O_DIRECTORY /* fixme */
- if ((flags & O_DIRECTORY) == O_DIRECTORY)
+ if ((flags & CEPH_O_DIRECTORY) == CEPH_O_DIRECTORY)
return CEPH_FILE_MODE_PIN;
#endif
switch (flags & O_ACCMODE) {
- case O_WRONLY:
+ case CEPH_O_WRONLY:
mode = CEPH_FILE_MODE_WR;
break;
- case O_RDONLY:
+ case CEPH_O_RDONLY:
mode = CEPH_FILE_MODE_RD;
break;
- case O_RDWR:
+ case CEPH_O_RDWR:
case O_ACCMODE: /* this is what the VFS does */
mode = CEPH_FILE_MODE_RDWR;
break;
@@ -54,3 +52,33 @@ int ceph_caps_for_mode(int mode)
return caps;
}
+
+int ceph_flags_sys2wire(int flags)
+{
+ int wire_flags = 0;
+
+ switch (flags & O_ACCMODE) {
+ case O_RDONLY:
+ wire_flags |= CEPH_O_RDONLY;
+ break;
+ case O_WRONLY:
+ wire_flags |= CEPH_O_WRONLY;
+ break;
+ case O_RDWR:
+ wire_flags |= CEPH_O_RDWR;
+ break;
+ }
+ flags &= ~O_ACCMODE;
+
+#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; }
+
+ ceph_sys2wire(O_CREAT);
+ ceph_sys2wire(O_EXCL);
+ ceph_sys2wire(O_TRUNC);
+ ceph_sys2wire(O_DIRECTORY);
+ ceph_sys2wire(O_NOFOLLOW);
+
+#undef ceph_sys2wire
+
+ return wire_flags;
+}
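
A worked call for ceph_flags_sys2wire above: the CEPH_O_* wire values are platform-independent, while the O_* inputs are whatever the local libc defines:

// Worked example; relies only on the function defined above.
#include <fcntl.h>

int ceph_flags_sys2wire(int flags);   // defined in ceph_fs.cc above

int wire_flags_for_create() {
  // On Linux this yields CEPH_O_WRONLY | CEPH_O_CREAT | CEPH_O_TRUNC.
  return ceph_flags_sys2wire(O_WRONLY | O_CREAT | O_TRUNC);
}
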
diff --git a/src/common/cmdparse.cc b/src/common/cmdparse.cc
index 3fdf94c7367..bf1a327b031 100644
--- a/src/common/cmdparse.cc
+++ b/src/common/cmdparse.cc
@@ -319,7 +319,7 @@ cmd_vartype_stringify(const cmd_vartype &v)
void
-handle_bad_get(CephContext *cct, string k, const char *tname)
+handle_bad_get(CephContext *cct, const string& k, const char *tname)
{
ostringstream errstr;
int status;
diff --git a/src/common/cmdparse.h b/src/common/cmdparse.h
index 104848dc263..788b7097d46 100644
--- a/src/common/cmdparse.h
+++ b/src/common/cmdparse.h
@@ -42,13 +42,13 @@ void dump_cmddesc_to_json(ceph::Formatter *jf,
bool cmdmap_from_json(std::vector<std::string> cmd, cmdmap_t *mapp,
std::stringstream &ss);
void cmdmap_dump(const cmdmap_t &cmdmap, ceph::Formatter *f);
-void handle_bad_get(CephContext *cct, std::string k, const char *name);
+void handle_bad_get(CephContext *cct, const std::string& k, const char *name);
std::string cmd_vartype_stringify(const cmd_vartype& v);
template <typename T>
bool
-cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, std::string k, T& val)
+cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, const std::string& k, T& val)
{
if (cmdmap.count(k)) {
try {
@@ -65,7 +65,7 @@ cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, std::string k, T& val)
template <typename T>
void
-cmd_getval(CephContext *cct, cmdmap_t& cmdmap, std::string k, T& val, T defval)
+cmd_getval(CephContext *cct, const cmdmap_t& cmdmap, const std::string& k, T& val, const T& defval)
{
if (!cmd_getval(cct, cmdmap, k, val))
val = defval;
@@ -73,7 +73,7 @@ cmd_getval(CephContext *cct, cmdmap_t& cmdmap, std::string k, T& val, T defval)
template <typename T>
void
-cmd_putval(CephContext *cct, cmdmap_t& cmdmap, std::string k, T val)
+cmd_putval(CephContext *cct, cmdmap_t& cmdmap, const std::string& k, const T& val)
{
cmdmap[k] = val;
}
diff --git a/src/common/common_init.cc b/src/common/common_init.cc
index ffe4baec797..d796e933c4b 100644
--- a/src/common/common_init.cc
+++ b/src/common/common_init.cc
@@ -25,6 +25,7 @@
#include "common/valgrind.h"
#include "common/version.h"
#include "common/strtol.h"
+#include "common/zipkin_trace.h"
#include "include/color.h"
#include <errno.h>
@@ -124,6 +125,7 @@ void complain_about_parse_errors(CephContext *cct,
void common_init_finish(CephContext *cct)
{
cct->init_crypto();
+ ZTracer::ztrace_init();
int flags = cct->get_init_flags();
if (!(flags & CINIT_FLAG_NO_DAEMON_ACTIONS))
diff --git a/src/common/config.cc b/src/common/config.cc
index 9fef25d209c..32a817deb8c 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -63,7 +63,11 @@ using std::pair;
using std::set;
using std::string;
-const char *CEPH_CONF_FILE_DEFAULT = "$data_dir/config, /etc/ceph/$cluster.conf, ~/.ceph/$cluster.conf, $cluster.conf";
+const char *CEPH_CONF_FILE_DEFAULT = "$data_dir/config, /etc/ceph/$cluster.conf, ~/.ceph/$cluster.conf, $cluster.conf"
+#if defined(__FreeBSD__)
+ ", /usr/local/etc/ceph/$cluster.conf"
+#endif
+ ;
#define _STR(x) #x
#define STRINGIFY(x) _STR(x)
@@ -1338,15 +1342,35 @@ bool md_config_t::expand_meta(std::string &origval,
}
void md_config_t::diff(
+ const md_config_t *other,
+ map<string, pair<string, string> > *diff,
+ set<string> *unknown)
+{
+ diff_helper(other, diff, unknown);
+}
+void md_config_t::diff(
+ const md_config_t *other,
+ map<string, pair<string, string> > *diff,
+ set<string> *unknown, const string& setting)
+{
+ diff_helper(other, diff, unknown, setting);
+}
+
+void md_config_t::diff_helper(
const md_config_t *other,
map<string,pair<string,string> > *diff,
- set<string> *unknown)
+ set<string> *unknown, const string& setting)
{
Mutex::Locker l(lock);
char local_buf[4096];
char other_buf[4096];
- for (auto& opt: *config_options) {
+ for (auto& opt : *config_options) {
+ if (!setting.empty()) {
+ if (setting != opt.name) {
+ continue;
+ }
+ }
memset(local_buf, 0, sizeof(local_buf));
memset(other_buf, 0, sizeof(other_buf));
@@ -1366,6 +1390,10 @@ void md_config_t::diff(
if (strcmp(local_val, other_val))
diff->insert(make_pair(opt.name, make_pair(local_val, other_val)));
+ else if (!setting.empty()) {
+ diff->insert(make_pair(opt.name, make_pair(local_val, other_val)));
+ break;
+ }
}
}
diff --git a/src/common/config.h b/src/common/config.h
index bbabc14a887..887b2f7e939 100644
--- a/src/common/config.h
+++ b/src/common/config.h
@@ -239,6 +239,12 @@ public:
void diff(const md_config_t *other,
map<string,pair<string,string> > *diff, set<string> *unknown);
+ /// obtain a diff between this config's values and another md_config_t's
+ /// values for a specific setting.
+ void diff(const md_config_t *other,
+ map<string,pair<string,string>> *diff, set<string> *unknown,
+ const string& setting);
+
/// print/log warnings/errors from parsing the config
void complain_about_parse_errors(CephContext *cct);
@@ -270,6 +276,11 @@ private:
bool expand_meta(std::string &val,
std::ostream *oss) const;
+
+ void diff_helper(const md_config_t* other,
+ map<string, pair<string, string>>* diff,
+ set<string>* unknown, const string& setting = string{});
+
public: // for global_init
bool early_expand_meta(std::string &val,
std::ostream *oss) const {
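
Sketch of how the new single-setting overload is driven, mirroring the "config diff get" handler earlier in this patch: construct a default md_config_t and diff exactly one option against the live config. Assumes a Ceph tree:

// Sketch of the single-setting diff; function name is illustrative.
#include "common/config.h"
#include <map>
#include <set>
#include <string>

void diff_one_setting(const md_config_t* current, const std::string& setting) {
  md_config_t defaults;   // a fresh md_config_t carries the default values
  std::map<std::string, std::pair<std::string, std::string>> d;
  std::set<std::string> unknown;
  defaults.diff(current, &d, &unknown, setting);
  // d[setting] now holds {default_value, current_value}; the helper inserts
  // the pair even when both sides match, since a named setting was requested.
}
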
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index cb7ed27908d..13e1a558441 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -172,7 +172,14 @@ SUBSYS(eventtrace, 1, 5)
OPTION(key, OPT_STR, "")
OPTION(keyfile, OPT_STR, "")
-OPTION(keyring, OPT_STR, "/etc/ceph/$cluster.$name.keyring,/etc/ceph/$cluster.keyring,/etc/ceph/keyring,/etc/ceph/keyring.bin") // default changed by common_preinit() for mds and osd
+OPTION(keyring, OPT_STR,
+ // default changed by common_preinit() for mds and osd
+ "/etc/ceph/$cluster.$name.keyring,/etc/ceph/$cluster.keyring,/etc/ceph/keyring,/etc/ceph/keyring.bin,"
+#if defined(__FreeBSD__)
+ "/usr/local/etc/ceph/$cluster.$name.keyring,/usr/local/etc/ceph/$cluster.keyring,"
+ "/usr/local/etc/ceph/keyring,/usr/local/etc/ceph/keyring.bin,"
+#endif
+ )
OPTION(heartbeat_interval, OPT_INT, 5)
OPTION(heartbeat_file, OPT_STR, "")
OPTION(heartbeat_inject_failure, OPT_INT, 0) // force an unhealthy heartbeat for N seconds
@@ -285,7 +292,6 @@ OPTION(mon_osd_max_op_age, OPT_DOUBLE, 32) // max op age before we get conce
OPTION(mon_osd_max_split_count, OPT_INT, 32) // largest number of PGs per "involved" OSD to let split create
OPTION(mon_osd_allow_primary_temp, OPT_BOOL, false) // allow primary_temp to be set in the osdmap
OPTION(mon_osd_allow_primary_affinity, OPT_BOOL, false) // allow primary_affinity to be set in the osdmap
-OPTION(mon_osd_allow_pg_upmap, OPT_BOOL, false) // allow pg upmap to be set in the osdmap
OPTION(mon_osd_prime_pg_temp, OPT_BOOL, true) // prime osdmap with pg mapping changes
OPTION(mon_osd_prime_pg_temp_max_time, OPT_FLOAT, .5) // max time to spend priming
OPTION(mon_osd_prime_pg_temp_max_estimate, OPT_FLOAT, .25) // max estimate of pg total before we do all pgs in parallel
@@ -313,7 +319,9 @@ OPTION(mon_cache_target_full_warn_ratio, OPT_FLOAT, .66) // position between poo
OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full"
OPTION(mon_osd_backfillfull_ratio, OPT_FLOAT, .90) // what % full makes an OSD backfill full (backfill halted)
OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full
+OPTION(mon_osd_initial_require_min_compat_client, OPT_STR, "hammer")
OPTION(mon_allow_pool_delete, OPT_BOOL, false) // allow pool deletion
+OPTION(mon_fake_pool_delete, OPT_BOOL, false) // fake pool deletion (add _DELETED suffix)
OPTION(mon_globalid_prealloc, OPT_U32, 10000) // how many globalids to prealloc
OPTION(mon_osd_report_timeout, OPT_INT, 900) // grace period before declaring unresponsive OSDs dead
OPTION(mon_force_standby_active, OPT_BOOL, true) // should mons force standby-replay mds to be active
@@ -329,8 +337,6 @@ OPTION(mon_max_log_epochs, OPT_INT, 500)
OPTION(mon_max_mdsmap_epochs, OPT_INT, 500)
OPTION(mon_max_osd, OPT_INT, 10000)
OPTION(mon_probe_timeout, OPT_DOUBLE, 2.0)
-OPTION(mon_slurp_timeout, OPT_DOUBLE, 10.0)
-OPTION(mon_slurp_bytes, OPT_INT, 256*1024) // limit size of slurp messages
OPTION(mon_client_bytes, OPT_U64, 100ul << 20) // client msg data allowed in memory (in bytes)
OPTION(mon_mgr_proxy_client_bytes_ratio, OPT_FLOAT, .3) // ratio of mon_client_bytes that can be consumed by proxied mgr commands before we error out to client
OPTION(mon_daemon_bytes, OPT_U64, 400ul << 20) // mds, osd message memory cap (in bytes)
@@ -357,9 +363,6 @@ OPTION(mon_config_key_max_entry_size, OPT_INT, 4096) // max num bytes per config
OPTION(mon_sync_timeout, OPT_DOUBLE, 60.0)
OPTION(mon_sync_max_payload_size, OPT_U32, 1048576) // max size for a sync chunk payload (say, 1MB)
OPTION(mon_sync_debug, OPT_BOOL, false) // enable sync-specific debug
-OPTION(mon_sync_debug_leader, OPT_INT, -1) // monitor to be used as the sync leader
-OPTION(mon_sync_debug_provider, OPT_INT, -1) // monitor to be used as the sync provider
-OPTION(mon_sync_debug_provider_fallback, OPT_INT, -1) // monitor to be used as fallback if sync provider fails
OPTION(mon_inject_sync_get_chunk_delay, OPT_DOUBLE, 0) // inject N second delay on each get_chunk request
OPTION(mon_osd_min_down_reporters, OPT_INT, 2) // number of OSDs from different subtrees who need to report a down OSD for it to count
OPTION(mon_osd_reporter_subtree_level , OPT_STR, "host") // in which level of parent bucket the reporters are counted
@@ -375,6 +378,7 @@ OPTION(mon_debug_dump_transactions, OPT_BOOL, false)
OPTION(mon_debug_dump_json, OPT_BOOL, false)
OPTION(mon_debug_dump_location, OPT_STR, "/var/log/ceph/$cluster-$name.tdump")
OPTION(mon_debug_no_require_luminous, OPT_BOOL, false)
+OPTION(mon_debug_no_require_bluestore_for_ec_overwrites, OPT_BOOL, false)
OPTION(mon_inject_transaction_delay_max, OPT_DOUBLE, 10.0) // seconds
OPTION(mon_inject_transaction_delay_probability, OPT_DOUBLE, 0) // range [0, 1]
@@ -435,7 +439,6 @@ OPTION(client_mount_gid, OPT_INT, -1)
OPTION(client_notify_timeout, OPT_INT, 10) // in seconds
OPTION(osd_client_watch_timeout, OPT_INT, 30) // in seconds
OPTION(client_caps_release_delay, OPT_INT, 5) // in seconds
-OPTION(client_quota, OPT_BOOL, true)
OPTION(client_quota_df, OPT_BOOL, true) // use quota for df on subdir mounts
OPTION(client_oc, OPT_BOOL, true)
OPTION(client_oc_size, OPT_INT, 1024*1024* 200) // MB * n
@@ -458,7 +461,7 @@ OPTION(client_dirsize_rbytes, OPT_BOOL, true)
OPTION(fuse_use_invalidate_cb, OPT_BOOL, true) // use fuse 2.8+ invalidate callback to keep page cache consistent
OPTION(fuse_disable_pagecache, OPT_BOOL, false)
OPTION(fuse_allow_other, OPT_BOOL, true)
-OPTION(fuse_default_permissions, OPT_BOOL, true)
+OPTION(fuse_default_permissions, OPT_BOOL, false)
OPTION(fuse_big_writes, OPT_BOOL, true)
OPTION(fuse_atomic_o_trunc, OPT_BOOL, true)
OPTION(fuse_debug, OPT_BOOL, false)
@@ -488,6 +491,8 @@ OPTION(objecter_debug_inject_relock_delay, OPT_BOOL, false)
// Max number of deletes at once in a single Filer::purge call
OPTION(filer_max_purge_ops, OPT_U32, 10)
+// Max number of truncates at once in a single Filer::truncate call
+OPTION(filer_max_truncate_ops, OPT_U32, 128)
OPTION(journaler_write_head_interval, OPT_INT, 15)
OPTION(journaler_prefetch_periods, OPT_INT, 10) // * journal object size
@@ -529,6 +534,7 @@ OPTION(mds_log_events_per_segment, OPT_INT, 1024)
OPTION(mds_log_segment_size, OPT_INT, 0) // segment size for mds log, default to default file_layout_t
OPTION(mds_log_max_segments, OPT_U32, 30)
OPTION(mds_log_max_expiring, OPT_INT, 20)
+OPTION(mds_bal_export_pin, OPT_BOOL, true) // allow clients to pin directory trees to ranks
OPTION(mds_bal_sample_interval, OPT_DOUBLE, 3.0) // every 3 seconds
OPTION(mds_bal_replicate_threshold, OPT_FLOAT, 8000)
OPTION(mds_bal_unreplicate_threshold, OPT_FLOAT, 0)
@@ -552,8 +558,7 @@ OPTION(mds_bal_need_min, OPT_FLOAT, .8) // take within this range of what
OPTION(mds_bal_need_max, OPT_FLOAT, 1.2)
OPTION(mds_bal_midchunk, OPT_FLOAT, .3) // any sub bigger than this taken in full
OPTION(mds_bal_minchunk, OPT_FLOAT, .001) // never take anything smaller than this
-OPTION(mds_bal_target_removal_min, OPT_INT, 5) // min balance iterations before old target is removed
-OPTION(mds_bal_target_removal_max, OPT_INT, 10) // max balance iterations before old target is removed
+OPTION(mds_bal_target_decay, OPT_DOUBLE, 10.0) // target decay half-life in MDSMap (2x larger is approx. 2x slower)
OPTION(mds_replay_interval, OPT_FLOAT, 1.0) // time to wait before starting replay again
OPTION(mds_shutdown_check, OPT_INT, 0)
OPTION(mds_thrash_exports, OPT_INT, 0)
@@ -850,6 +855,7 @@ OPTION(osd_pg_epoch_persisted_max_stale, OPT_U32, 150) // make this < map_cache_
OPTION(osd_min_pg_log_entries, OPT_U32, 3000) // number of entries to keep in the pg log when trimming it
OPTION(osd_max_pg_log_entries, OPT_U32, 10000) // max entries, say when degraded, before we trim
+OPTION(osd_force_recovery_pg_log_entries_factor, OPT_FLOAT, 1.3) // max entries factor before force recovery
OPTION(osd_pg_log_trim_min, OPT_U32, 100)
OPTION(osd_op_complaint_time, OPT_FLOAT, 30) // how many seconds old makes an op complaint-worthy
OPTION(osd_command_max_records, OPT_INT, 256)
@@ -980,9 +986,6 @@ OPTION(osd_max_omap_bytes_per_request, OPT_U64, 1<<30)
OPTION(osd_objectstore, OPT_STR, "filestore") // ObjectStore backend type
OPTION(osd_objectstore_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled
-// Override maintaining compatibility with older OSDs
-// Set to true for testing. Users should NOT set this.
-OPTION(osd_debug_override_acting_compat, OPT_BOOL, false)
OPTION(osd_objectstore_fuse, OPT_BOOL, false)
OPTION(osd_bench_small_size_max_iops, OPT_U32, 100) // 100 IOPS
@@ -990,6 +993,9 @@ OPTION(osd_bench_large_size_max_throughput, OPT_U64, 100 << 20) // 100 MB/s
OPTION(osd_bench_max_block_size, OPT_U64, 64 << 20) // cap the block size at 64MB
OPTION(osd_bench_duration, OPT_U32, 30) // duration of 'osd bench', capped at 30s to avoid triggering timeouts
+OPTION(osd_blkin_trace_all, OPT_BOOL, false) // create a blkin trace for all osd requests
+OPTION(osdc_blkin_trace_all, OPT_BOOL, false) // create a blkin trace for all objecter requests
+
OPTION(osd_discard_disconnected_ops, OPT_BOOL, true)
OPTION(memstore_device_bytes, OPT_U64, 1024*1024*1024)
@@ -1023,6 +1029,7 @@ OPTION(bluefs_log_compact_min_size, OPT_U64, 16*1048576) // before we consider
OPTION(bluefs_min_flush_size, OPT_U64, 524288) // ignore flush until its this big
OPTION(bluefs_compact_log_sync, OPT_BOOL, false) // sync or async log compaction?
OPTION(bluefs_buffered_io, OPT_BOOL, false)
+OPTION(bluefs_sync_write, OPT_BOOL, false)
OPTION(bluefs_allocator, OPT_STR, "bitmap") // stupid | bitmap
OPTION(bluefs_preextend_wal_files, OPT_BOOL, false) // this *requires* that rocksdb has recycling enabled
@@ -1116,7 +1123,7 @@ OPTION(bluestore_allocator, OPT_STR, "bitmap") // stupid | bitmap
OPTION(bluestore_freelist_blocks_per_key, OPT_INT, 128)
OPTION(bluestore_bitmapallocator_blocks_per_zone, OPT_INT, 1024) // must be power of 2 aligned, e.g., 512, 1024, 2048...
OPTION(bluestore_bitmapallocator_span_size, OPT_INT, 1024) // must be power of 2 aligned, e.g., 512, 1024, 2048...
-OPTION(bluestore_rocksdb_options, OPT_STR, "compression=kNoCompression,max_write_buffer_number=4,min_write_buffer_number_to_merge=1,recycle_log_file_num=4,write_buffer_size=268435456,writable_file_max_buffer_size=0")
+OPTION(bluestore_rocksdb_options, OPT_STR, "compression=kNoCompression,max_write_buffer_number=4,min_write_buffer_number_to_merge=1,recycle_log_file_num=4,write_buffer_size=268435456,writable_file_max_buffer_size=0,compaction_readahead_size=2097152")
OPTION(bluestore_fsck_on_mount, OPT_BOOL, false)
OPTION(bluestore_fsck_on_mount_deep, OPT_BOOL, true)
OPTION(bluestore_fsck_on_umount, OPT_BOOL, false)
@@ -1124,13 +1131,11 @@ OPTION(bluestore_fsck_on_umount_deep, OPT_BOOL, true)
OPTION(bluestore_fsck_on_mkfs, OPT_BOOL, true)
OPTION(bluestore_fsck_on_mkfs_deep, OPT_BOOL, false)
OPTION(bluestore_sync_submit_transaction, OPT_BOOL, false) // submit kv txn in queueing thread (not kv_sync_thread)
-OPTION(bluestore_max_ops, OPT_U64, 512)
-OPTION(bluestore_max_bytes, OPT_U64, 64*1024*1024)
-OPTION(bluestore_throttle_cost_per_io_hdd, OPT_U64, 200000)
+OPTION(bluestore_throttle_bytes, OPT_U64, 64*1024*1024)
+OPTION(bluestore_throttle_deferred_bytes, OPT_U64, 128*1024*1024)
+OPTION(bluestore_throttle_cost_per_io_hdd, OPT_U64, 1500000)
OPTION(bluestore_throttle_cost_per_io_ssd, OPT_U64, 4000)
OPTION(bluestore_throttle_cost_per_io, OPT_U64, 0)
-OPTION(bluestore_deferred_max_ops, OPT_U64, 512)
-OPTION(bluestore_deferred_max_bytes, OPT_U64, 128*1024*1024)
OPTION(bluestore_deferred_batch_ops, OPT_U64, 0)
OPTION(bluestore_deferred_batch_ops_hdd, OPT_U64, 64)
OPTION(bluestore_deferred_batch_ops_ssd, OPT_U64, 16)
@@ -1154,6 +1159,7 @@ OPTION(kstore_max_ops, OPT_U64, 512)
OPTION(kstore_max_bytes, OPT_U64, 64*1024*1024)
OPTION(kstore_backend, OPT_STR, "rocksdb")
OPTION(kstore_rocksdb_options, OPT_STR, "compression=kNoCompression")
+OPTION(kstore_rocksdb_bloom_bits_per_key, OPT_INT, 0)
OPTION(kstore_fsck_on_mount, OPT_BOOL, false)
OPTION(kstore_fsck_on_mount_deep, OPT_BOOL, true)
OPTION(kstore_nid_prealloc, OPT_U64, 1024)
@@ -1163,7 +1169,7 @@ OPTION(kstore_onode_map_size, OPT_U64, 1024)
OPTION(kstore_cache_tails, OPT_BOOL, true)
OPTION(kstore_default_stripe_size, OPT_INT, 65536)
-OPTION(filestore_omap_backend, OPT_STR, "leveldb")
+OPTION(filestore_omap_backend, OPT_STR, "rocksdb")
OPTION(filestore_omap_backend_path, OPT_STR, "")
/// filestore wb throttle limits
@@ -1347,6 +1353,7 @@ OPTION(rbd_request_timed_out_seconds, OPT_INT, 30) // number of seconds before m
OPTION(rbd_skip_partial_discard, OPT_BOOL, false) // when trying to discard a range inside an object, set to true to skip zeroing the range.
OPTION(rbd_enable_alloc_hint, OPT_BOOL, true) // when writing a object, it will issue a hint to osd backend to indicate the expected size object need
OPTION(rbd_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled
+OPTION(rbd_blkin_trace_all, OPT_BOOL, false) // create a blkin trace for all RBD requests
OPTION(rbd_validate_pool, OPT_BOOL, true) // true if empty pools should be validated for RBD compatibility
OPTION(rbd_validate_names, OPT_BOOL, true) // true if image specs should be validated
OPTION(rbd_auto_exclusive_lock_until_manual_request, OPT_BOOL, true) // whether to automatically acquire/release exclusive lock until it is explicitly requested, i.e. before we know the user of librbd is properly using the lock API
@@ -1656,7 +1663,6 @@ OPTION(rgw_sync_data_inject_err_probability, OPT_DOUBLE, 0) // range [0, 1]
OPTION(rgw_sync_meta_inject_err_probability, OPT_DOUBLE, 0) // range [0, 1]
-OPTION(rgw_realm_reconfigure_delay, OPT_DOUBLE, 2) // seconds to wait before reloading realm configuration
OPTION(rgw_period_push_interval, OPT_DOUBLE, 2) // seconds to wait before retrying "period push"
OPTION(rgw_period_push_interval_max, OPT_DOUBLE, 30) // maximum interval after exponential backoff
@@ -1713,3 +1719,5 @@ OPTION(event_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should b
OPTION(internal_safe_to_start_threads, OPT_BOOL, false)
OPTION(debug_deliberately_leak_memory, OPT_BOOL, false)
+
+OPTION(rgw_swift_custom_header, OPT_STR, "") // option to enable swift custom headers
diff --git a/src/common/crc32c_intel_baseline.c b/src/common/crc32c_intel_baseline.c
index 390898171df..2862f627246 100644
--- a/src/common/crc32c_intel_baseline.c
+++ b/src/common/crc32c_intel_baseline.c
@@ -36,8 +36,6 @@
#include "include/int_types.h"
-#include <stdlib.h>
-
#define MAX_ITER 8
unsigned long crc32_table_iscsi_base[256] = {
diff --git a/src/common/crc32c_intel_fast_asm.S b/src/common/crc32c_intel_fast_asm.s
index 2189684b4c7..2189684b4c7 100644
--- a/src/common/crc32c_intel_fast_asm.S
+++ b/src/common/crc32c_intel_fast_asm.s
diff --git a/src/common/crc32c_intel_fast_zero_asm.S b/src/common/crc32c_intel_fast_zero_asm.s
index 34b7f489016..34b7f489016 100644
--- a/src/common/crc32c_intel_fast_zero_asm.S
+++ b/src/common/crc32c_intel_fast_zero_asm.s
diff --git a/src/common/crc32c_ppc_asm.S b/src/common/crc32c_ppc_asm.s
index 1dc6dd1cf31..1dc6dd1cf31 100644
--- a/src/common/crc32c_ppc_asm.S
+++ b/src/common/crc32c_ppc_asm.s
diff --git a/src/common/dout.h b/src/common/dout.h
index 9f715bf501b..d2340c9364c 100644
--- a/src/common/dout.h
+++ b/src/common/dout.h
@@ -77,9 +77,11 @@ public:
// NOTE: depend on magic value in _ASSERT_H so that we detect when
// /usr/include/assert.h clobbers our fancier version.
-#define dendl std::flush; \
+#define dendl_impl std::flush; \
_ASSERT_H->_log->submit_entry(_dout_e); \
} \
} while (0)
+#define dendl dendl_impl
+
#endif
diff --git a/src/common/entity_name.h b/src/common/entity_name.h
index d1c6cbcdd7b..66576cc1af2 100644
--- a/src/common/entity_name.h
+++ b/src/common/entity_name.h
@@ -15,6 +15,8 @@
#ifndef CEPH_COMMON_ENTITY_NAME_H
#define CEPH_COMMON_ENTITY_NAME_H
+#include <ifaddrs.h>
+#include <netinet/in.h>
#include <iosfwd>
#include <stdint.h>
#include <string>
diff --git a/src/common/hobject.cc b/src/common/hobject.cc
index a965c87d622..dffd471b7f9 100644
--- a/src/common/hobject.cc
+++ b/src/common/hobject.cc
@@ -75,7 +75,7 @@ string hobject_t::to_str() const
char snap_with_hash[1000];
char *t = snap_with_hash;
- char *end = t + sizeof(snap_with_hash);
+ const char *end = t + sizeof(snap_with_hash);
uint64_t poolid(pool);
t += snprintf(t, end - t, "%.*llX", 16, (long long unsigned)poolid);
@@ -90,7 +90,7 @@ string hobject_t::to_str() const
else
t += snprintf(t, end - t, ".%llx", (long long unsigned)snap);
- out += string(snap_with_hash);
+ out.append(snap_with_hash, t);
out.push_back('.');
append_escaped(oid.name, &out);
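
The hobject_t::to_str() change above keeps the snprintf write cursor and appends by iterator range, avoiding the strlen and temporary std::string that string(snap_with_hash) incurred. A standalone sketch of the idiom:

// Standalone sketch of the append-by-cursor idiom.
#include <cstdio>
#include <string>

void append_hex(std::string& out, unsigned long long v) {
  char buf[32];
  char* t = buf;
  const char* end = buf + sizeof(buf);
  t += snprintf(t, end - t, "%.*llX", 16, v);
  out.append(buf, t);   // append exactly the bytes written, no strlen
}
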
diff --git a/src/common/io_priority.cc b/src/common/io_priority.cc
index 579307f4071..687faa3bd7f 100644
--- a/src/common/io_priority.cc
+++ b/src/common/io_priority.cc
@@ -12,7 +12,6 @@
*
*/
-#include <sys/types.h>
#include <unistd.h>
#ifdef __linux__
#include <sys/syscall.h> /* For SYS_xxx definitions */
@@ -20,7 +19,6 @@
#include <algorithm>
#include <errno.h>
-#include "common/errno.h"
#include "io_priority.h"
pid_t ceph_gettid(void)
diff --git a/src/common/ipaddr.cc b/src/common/ipaddr.cc
index a7dedf25709..55ac722d666 100644
--- a/src/common/ipaddr.cc
+++ b/src/common/ipaddr.cc
@@ -1,10 +1,13 @@
#include "include/ipaddr.h"
-#include <sys/socket.h>
+#include <alloca.h>
#include <arpa/inet.h>
+#include <ifaddrs.h>
+#include <netinet/in.h>
+#include <stdint.h>
#include <stdlib.h>
#include <string.h>
-
+#include <sys/socket.h>
static void netmask_ipv4(const struct in_addr *addr,
unsigned int prefix_len,
diff --git a/src/common/iso_8601.cc b/src/common/iso_8601.cc
new file mode 100644
index 00000000000..88828dac0ca
--- /dev/null
+++ b/src/common/iso_8601.cc
@@ -0,0 +1,209 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <cctype>
+#include <chrono>
+#include <ctime>
+#include <cstdint>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <sstream>
+#include <utility>
+
+#include "include/timegm.h"
+#include "iso_8601.h"
+
+
+namespace ceph {
+using std::chrono::duration_cast;
+using std::chrono::nanoseconds;
+using std::chrono::seconds;
+using std::setfill;
+using std::setw;
+using std::size_t;
+using std::stringstream;
+using std::string;
+using std::uint16_t;
+
+using boost::none;
+using boost::optional;
+using boost::string_ref;
+
+using ceph::real_clock;
+using ceph::real_time;
+
+using sriter = string_ref::const_iterator;
+
+namespace {
+// This assumes a contiguous block of numbers in the correct order.
+uint16_t digit(char c) {
+ if (!(c >= '0' && c <= '9')) {
+ throw std::invalid_argument("Not a digit.");
+ }
+ return static_cast<uint16_t>(c - '0');
+}
+
+optional<real_time> calculate(const tm& t, uint32_t n = 0) {
+ ceph_assert(n < 1000000000);
+ time_t tt = internal_timegm(&t);
+ if (tt == static_cast<time_t>(-1)) {
+ return none;
+ }
+
+ return real_clock::from_time_t(tt) + nanoseconds(n);
+}
+}
+
+optional<real_time> from_iso_8601(const string_ref s,
+ const bool ws_terminates) noexcept {
+ auto end = s.cend();
+ auto read_digit = [end](sriter& c) mutable {
+ if (c == end) {
+ throw std::invalid_argument("End of input.");
+ }
+ auto f = digit(*c);
+ ++c;
+ return f;
+ };
+
+ auto read_digits = [end, &read_digit](sriter& c, std::size_t n) {
+ auto v = 0ULL;
+ for (auto i = 0U; i < n; ++i) {
+ auto d = read_digit(c);
+ v = (10ULL * v) + d;
+ }
+ return v;
+ };
+ auto partial_date = [end, ws_terminates](sriter& c) {
+ return (c == end || (ws_terminates && std::isspace(*c)));
+ };
+ auto time_end = [end, ws_terminates](sriter& c) {
+ return (c != end && *c == 'Z' &&
+ ((c + 1) == end ||
+ (ws_terminates && std::isspace(*(c + 1)))));
+ };
+ auto consume_delimiter = [end](sriter& c, char q) {
+ if (c == end || *c != q) {
+ throw std::invalid_argument("Expected delimiter not found.");
+ } else {
+ ++c;
+ }
+ };
+
+ tm t = { 0, // tm_sec
+ 0, // tm_min
+ 0, // tm_hour
+ 1, // tm_mday
+ 0, // tm_mon
+ 70, // tm_year
+ 0, // tm_wday
+ 0, // tm_yday
+ 0, // tm_isdst
+ };
+ try {
+ auto c = s.cbegin();
+ {
+ auto y = read_digits(c, 4);
+ if (y < 1970) {
+ return none;
+ }
+ t.tm_year = y - 1900;
+ }
+ if (partial_date(c)) {
+ return calculate(t, 0);
+ }
+
+ consume_delimiter(c, '-');
+ t.tm_mon = (read_digits(c, 2) - 1);
+ if (partial_date(c)) {
+ return calculate(t);
+ }
+ consume_delimiter(c, '-');
+ t.tm_mday = read_digits(c, 2);
+ if (partial_date(c)) {
+ return calculate(t);
+ }
+ consume_delimiter(c, 'T');
+ t.tm_hour = read_digits(c, 2);
+ if (time_end(c)) {
+ return calculate(t);
+ }
+ consume_delimiter(c, ':');
+ t.tm_min = read_digits(c, 2);
+ if (time_end(c)) {
+ return calculate(t);
+ }
+ consume_delimiter(c, ':');
+ t.tm_sec = read_digits(c, 2);
+ if (time_end(c)) {
+ return calculate(t);
+ }
+ consume_delimiter(c, '.');
+
+ auto n = 0UL;
+ auto multiplier = 100000000UL;
+ for (auto i = 0U; i < 9U; ++i) {
+ auto d = read_digit(c);
+ n += d * multiplier;
+ multiplier /= 10;
+ if (time_end(c)) {
+ return calculate(t, n);
+ }
+ }
+ } catch (std::invalid_argument& e) {
+ // fallthrough
+ }
+ return none;
+}
+
+string to_iso_8601(const real_time t,
+ const iso_8601_format f) noexcept {
+ ceph_assert(f >= iso_8601_format::Y &&
+ f <= iso_8601_format::YMDhmsn);
+ stringstream out(std::ios_base::out);
+
+ auto sec = real_clock::to_time_t(t);
+ auto nsec = duration_cast<nanoseconds>(t.time_since_epoch() %
+ seconds(1)).count();
+
+ struct tm bt;
+ gmtime_r(&sec, &bt);
+ out.fill('0');
+
+ out << 1900 + bt.tm_year;
+ if (f == iso_8601_format::Y) {
+ return out.str();
+ }
+
+ out << '-' << setw(2) << bt.tm_mon + 1;
+ if (f == iso_8601_format::YM) {
+ return out.str();
+ }
+
+ out << '-' << setw(2) << bt.tm_mday;
+ if (f == iso_8601_format::YMD) {
+ return out.str();
+ }
+
+ out << 'T' << setw(2) << bt.tm_hour;
+ if (f == iso_8601_format::YMDh) {
+ out << 'Z';
+ return out.str();
+ }
+
+ out << ':' << setw(2) << bt.tm_min;
+ if (f == iso_8601_format::YMDhm) {
+ out << 'Z';
+ return out.str();
+ }
+
+ out << ':' << setw(2) << bt.tm_sec;
+ if (f == iso_8601_format::YMDhms) {
+ out << 'Z';
+ return out.str();
+ }
+ out << '.' << setw(9) << nsec << 'Z';
+ return out.str();
+}
+}
diff --git a/src/common/iso_8601.h b/src/common/iso_8601.h
new file mode 100644
index 00000000000..5aa63983386
--- /dev/null
+++ b/src/common/iso_8601.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_COMMON_ISO_8601_H
+#define CEPH_COMMON_ISO_8601_H
+
+#include <boost/optional.hpp>
+#include <boost/utility/string_ref.hpp>
+
+#include "common/ceph_time.h"
+
+namespace ceph {
+
+// Here, we support the W3C profile of ISO 8601 with the following
+// restrictions:
+// - Subsecond resolution is supported to nanosecond
+// granularity. Any number of digits between 1 and 9 may be
+// specified after the decimal point.
+// - All times must be UTC.
+// - All times must be representable as a sixty-four bit count of
+// nanoseconds since the epoch.
+// - Partial times are handled thus:
+// * If there are no subseconds, they are assumed to be zero.
+// * If there are no seconds, they are assumed to be zero.
+// * If there are no minutes, they are assumed to be zero.
+// * If there is no time, it is assumed to be midnight.
+// * If there is no day, it is assumed to be the first.
+// * If there is no month, it is assumed to be January.
+//
+// If a date is invalid, boost::none is returned.
+
+boost::optional<ceph::real_time> from_iso_8601(
+ boost::string_ref s, const bool ws_terminates = true) noexcept;
+
+enum class iso_8601_format {
+ Y, YM, YMD, YMDh, YMDhm, YMDhms, YMDhmsn
+};
+
+std::string to_iso_8601(const ceph::real_time t,
+ const iso_8601_format f = iso_8601_format::YMDhmsn)
+ noexcept;
+}
+
+#endif
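
Round-trip sketch for the new ISO 8601 helpers declared above; the timestamp is illustrative:

// Round-trip sketch; assumes the header above.
#include "common/iso_8601.h"
#include <iostream>

void iso_8601_example() {
  auto t = ceph::from_iso_8601("2017-04-11T13:22:31.123456789Z");
  if (t) {
    // Prints the timestamp back in full YMDhmsn form.
    std::cout << ceph::to_iso_8601(*t) << std::endl;
    // Coarser formats truncate: "2017-04-11".
    std::cout << ceph::to_iso_8601(*t, ceph::iso_8601_format::YMD) << std::endl;
  }
}
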
diff --git a/src/common/mime.c b/src/common/mime.c
index 90c19a7684e..fe45123ccc9 100644
--- a/src/common/mime.c
+++ b/src/common/mime.c
@@ -15,7 +15,6 @@
#include <errno.h>
#include <stdio.h>
-#include <string.h>
int mime_encode_as_qp(const char *input, char *output, int outlen)
{
diff --git a/src/common/module.c b/src/common/module.c
index b251274a5f1..06b32ed7d3e 100644
--- a/src/common/module.c
+++ b/src/common/module.c
@@ -15,8 +15,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <sys/wait.h>
-#include <unistd.h>
/*
* TODO: Switch to libkmod when we abandon older platforms. The APIs
diff --git a/src/common/obj_bencher.cc b/src/common/obj_bencher.cc
index 701a4b5ad0e..32b0abe9235 100644
--- a/src/common/obj_bencher.cc
+++ b/src/common/obj_bencher.cc
@@ -1176,7 +1176,9 @@ int ObjBencher::clean_up(int num_objects, int prevPid, int concurrentios) {
lock.Unlock();
// don't start more completions than files
- if (num_objects < concurrentios) {
+ if (num_objects == 0) {
+ return 0;
+ } else if (num_objects < concurrentios) {
concurrentios = num_objects;
}
diff --git a/src/common/perf_counters.cc b/src/common/perf_counters.cc
index c869880a1e2..2cdf7de25c2 100644
--- a/src/common/perf_counters.cc
+++ b/src/common/perf_counters.cc
@@ -241,7 +241,7 @@ uint64_t PerfCounters::get(int idx) const
return data.u64.read();
}
-void PerfCounters::tinc(int idx, utime_t amt)
+void PerfCounters::tinc(int idx, utime_t amt, uint32_t avgcount)
{
if (!m_cct->_conf->perf)
return;
@@ -252,15 +252,15 @@ void PerfCounters::tinc(int idx, utime_t amt)
if (!(data.type & PERFCOUNTER_TIME))
return;
if (data.type & PERFCOUNTER_LONGRUNAVG) {
- data.avgcount.inc();
+ data.avgcount.add(avgcount);
data.u64.add(amt.to_nsec());
- data.avgcount2.inc();
+ data.avgcount2.add(avgcount);
} else {
data.u64.add(amt.to_nsec());
}
}
-void PerfCounters::tinc(int idx, ceph::timespan amt)
+void PerfCounters::tinc(int idx, ceph::timespan amt, uint32_t avgcount)
{
if (!m_cct->_conf->perf)
return;
@@ -271,9 +271,9 @@ void PerfCounters::tinc(int idx, ceph::timespan amt)
if (!(data.type & PERFCOUNTER_TIME))
return;
if (data.type & PERFCOUNTER_LONGRUNAVG) {
- data.avgcount.inc();
+ data.avgcount.add(avgcount);
data.u64.add(amt.count());
- data.avgcount2.inc();
+ data.avgcount2.add(avgcount);
} else {
data.u64.add(amt.count());
}
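
The new avgcount parameter lets a caller record a batch of samples with a single atomic update instead of calling tinc() once per sample. A sketch with illustrative counter names:

// Sketch; l_batch_lat stands in for a real counter id.
#include "common/Clock.h"
#include "common/perf_counters.h"

void account_batch(PerfCounters* logger, int l_batch_lat,
                   utime_t start, uint32_t items) {
  // Records `items` samples totalling the elapsed time in one call; the
  // long-running average then reflects the mean per-item latency.
  logger->tinc(l_batch_lat, ceph_clock_now() - start, items);
}
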
@@ -317,7 +317,7 @@ void PerfCounters::hinc(int idx, int64_t x, int64_t y)
assert(idx < m_upper_bound);
perf_counter_data_any_d& data(m_data[idx - m_lower_bound - 1]);
- assert(data.type == (PERFCOUNTER_HISTOGRAM | PERFCOUNTER_U64));
+ assert(data.type == (PERFCOUNTER_HISTOGRAM | PERFCOUNTER_COUNTER | PERFCOUNTER_U64));
assert(data.histogram);
data.histogram->inc(x, y);
@@ -370,19 +370,49 @@ void PerfCounters::dump_formatted_generic(Formatter *f, bool schema,
if (schema) {
f->open_object_section(d->name);
+ // we probably should not have exposed this raw field (with bit
+ // values), but existing plugins rely on it so we're stuck with
+ // it.
f->dump_int("type", d->type);
- if (d->description) {
- f->dump_string("description", d->description);
+ if (d->type & PERFCOUNTER_COUNTER) {
+ f->dump_string("metric_type", "counter");
+ } else {
+ f->dump_string("metric_type", "gauge");
+ }
+
+ if (d->type & PERFCOUNTER_LONGRUNAVG) {
+ if (d->type & PERFCOUNTER_TIME) {
+ f->dump_string("value_type", "real-integer-pair");
+ } else {
+ f->dump_string("value_type", "integer-integer-pair");
+ }
+ } else if (d->type & PERFCOUNTER_HISTOGRAM) {
+ if (d->type & PERFCOUNTER_TIME) {
+ f->dump_string("value_type", "real-2d-histogram");
+ } else {
+ f->dump_string("value_type", "integer-2d-histogram");
+ }
} else {
- f->dump_string("description", "");
+ if (d->type & PERFCOUNTER_TIME) {
+ f->dump_string("value_type", "real");
+ } else {
+ f->dump_string("value_type", "integer");
+ }
}
- if (d->nick != NULL && !suppress_nicks) {
+ f->dump_string("description", d->description ? d->description : "");
+ if (d->nick != NULL) {
f->dump_string("nick", d->nick);
} else {
f->dump_string("nick", "");
}
+ if (d->prio) {
+ int p = std::max(std::min(d->prio + prio_adjust,
+ (int)PerfCountersBuilder::PRIO_CRITICAL),
+ 0);
+ f->dump_int("priority", p);
+ }
f->close_section();
} else {
if (d->type & PERFCOUNTER_LONGRUNAVG) {
@@ -401,7 +431,7 @@ void PerfCounters::dump_formatted_generic(Formatter *f, bool schema,
}
f->close_section();
} else if (d->type & PERFCOUNTER_HISTOGRAM) {
- assert(d->type == (PERFCOUNTER_HISTOGRAM | PERFCOUNTER_U64));
+ assert(d->type == (PERFCOUNTER_HISTOGRAM | PERFCOUNTER_COUNTER | PERFCOUNTER_U64));
assert(d->histogram);
f->open_object_section(d->name);
d->histogram->dump_formatted(f);
@@ -453,48 +483,59 @@ PerfCountersBuilder::~PerfCountersBuilder()
m_perf_counters = NULL;
}
-void PerfCountersBuilder::add_u64_counter(int idx, const char *name,
- const char *description, const char *nick)
+void PerfCountersBuilder::add_u64_counter(
+ int idx, const char *name,
+ const char *description, const char *nick, int prio)
{
- add_impl(idx, name, description, nick, PERFCOUNTER_U64 | PERFCOUNTER_COUNTER);
+ add_impl(idx, name, description, nick, prio,
+ PERFCOUNTER_U64 | PERFCOUNTER_COUNTER);
}
-void PerfCountersBuilder::add_u64(int idx, const char *name,
- const char *description, const char *nick)
+void PerfCountersBuilder::add_u64(
+ int idx, const char *name,
+ const char *description, const char *nick, int prio)
{
- add_impl(idx, name, description, nick, PERFCOUNTER_U64);
+ add_impl(idx, name, description, nick, prio, PERFCOUNTER_U64);
}
-void PerfCountersBuilder::add_u64_avg(int idx, const char *name,
- const char *description, const char *nick)
+void PerfCountersBuilder::add_u64_avg(
+ int idx, const char *name,
+ const char *description, const char *nick, int prio)
{
- add_impl(idx, name, description, nick, PERFCOUNTER_U64 | PERFCOUNTER_LONGRUNAVG);
+ add_impl(idx, name, description, nick, prio,
+ PERFCOUNTER_U64 | PERFCOUNTER_LONGRUNAVG);
}
-void PerfCountersBuilder::add_time(int idx, const char *name,
- const char *description, const char *nick)
+void PerfCountersBuilder::add_time(
+ int idx, const char *name,
+ const char *description, const char *nick, int prio)
{
- add_impl(idx, name, description, nick, PERFCOUNTER_TIME);
+ add_impl(idx, name, description, nick, prio, PERFCOUNTER_TIME);
}
-void PerfCountersBuilder::add_time_avg(int idx, const char *name,
- const char *description, const char *nick)
+void PerfCountersBuilder::add_time_avg(
+ int idx, const char *name,
+ const char *description, const char *nick, int prio)
{
- add_impl(idx, name, description, nick, PERFCOUNTER_TIME | PERFCOUNTER_LONGRUNAVG);
+ add_impl(idx, name, description, nick, prio,
+ PERFCOUNTER_TIME | PERFCOUNTER_LONGRUNAVG);
}
-void PerfCountersBuilder::add_histogram(int idx, const char *name,
- PerfHistogramCommon::axis_config_d x_axis_config,
- PerfHistogramCommon::axis_config_d y_axis_config,
- const char *description, const char *nick)
+void PerfCountersBuilder::add_u64_counter_histogram(
+ int idx, const char *name,
+ PerfHistogramCommon::axis_config_d x_axis_config,
+ PerfHistogramCommon::axis_config_d y_axis_config,
+ const char *description, const char *nick, int prio)
{
- add_impl(idx, name, description, nick, PERFCOUNTER_U64 | PERFCOUNTER_HISTOGRAM,
+ add_impl(idx, name, description, nick, prio,
+ PERFCOUNTER_U64 | PERFCOUNTER_HISTOGRAM | PERFCOUNTER_COUNTER,
unique_ptr<PerfHistogram<>>{new PerfHistogram<>{x_axis_config, y_axis_config}});
}
-void PerfCountersBuilder::add_impl(int idx, const char *name,
- const char *description, const char *nick, int ty,
- unique_ptr<PerfHistogram<>> histogram)
+void PerfCountersBuilder::add_impl(
+ int idx, const char *name,
+ const char *description, const char *nick, int prio, int ty,
+ unique_ptr<PerfHistogram<>> histogram)
{
assert(idx > m_perf_counters->m_lower_bound);
assert(idx < m_perf_counters->m_upper_bound);
@@ -504,7 +545,12 @@ void PerfCountersBuilder::add_impl(int idx, const char *name,
assert(data.type == PERFCOUNTER_NONE);
data.name = name;
data.description = description;
+ // nick must be <= 4 chars
+ if (nick) {
+ assert(strlen(nick) <= 4);
+ }
data.nick = nick;
+ data.prio = prio;
data.type = (enum perfcounter_type_d)ty;
data.histogram = std::move(histogram);
}
@@ -513,8 +559,10 @@ PerfCounters *PerfCountersBuilder::create_perf_counters()
{
PerfCounters::perf_counter_data_vec_t::const_iterator d = m_perf_counters->m_data.begin();
PerfCounters::perf_counter_data_vec_t::const_iterator d_end = m_perf_counters->m_data.end();
- for (; d != d_end; ++d)
+ for (; d != d_end; ++d) {
assert(d->type != PERFCOUNTER_NONE);
+ assert(d->type & (PERFCOUNTER_U64 | PERFCOUNTER_TIME));
+ }
PerfCounters *ret = m_perf_counters;
m_perf_counters = NULL;
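
The widened tinc() signature lets a caller fold a whole batch of timed events into the long-running average with one call instead of looping. A minimal sketch (the logger pointer, counter index, and batch size are hypothetical):

    // Record the summed elapsed time for 16 operations in one call:
    // u64 grows by the total nanoseconds and avgcount by 16, so the
    // derived average (sum / avgcount) remains per-operation.
    utime_t batch_elapsed = end_stamp - start_stamp;
    logger->tinc(l_batch_lat, batch_elapsed, 16);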
diff --git a/src/common/perf_counters.h b/src/common/perf_counters.h
index 11b616c353d..95426af5142 100644
--- a/src/common/perf_counters.h
+++ b/src/common/perf_counters.h
@@ -37,11 +37,11 @@ class PerfCountersBuilder;
enum perfcounter_type_d : uint8_t
{
PERFCOUNTER_NONE = 0,
- PERFCOUNTER_TIME = 0x1,
- PERFCOUNTER_U64 = 0x2,
- PERFCOUNTER_LONGRUNAVG = 0x4,
- PERFCOUNTER_COUNTER = 0x8,
- PERFCOUNTER_HISTOGRAM = 0x10,
+ PERFCOUNTER_TIME = 0x1, // float (measuring seconds)
+ PERFCOUNTER_U64 = 0x2, // integer (note: either TIME or U64 *must* be set)
+ PERFCOUNTER_LONGRUNAVG = 0x4, // paired counter + sum (time)
+ PERFCOUNTER_COUNTER = 0x8, // counter (vs gauge)
+ PERFCOUNTER_HISTOGRAM = 0x10, // histogram (vector) of values
};
@@ -102,6 +102,7 @@ public:
const char *name;
const char *description;
const char *nick;
+ int prio = 0;
enum perfcounter_type_d type;
atomic64_t u64;
atomic64_t avgcount;
@@ -159,8 +160,8 @@ public:
uint64_t get(int idx) const;
void tset(int idx, utime_t v);
- void tinc(int idx, utime_t v);
- void tinc(int idx, ceph::timespan v);
+ void tinc(int idx, utime_t v, uint32_t avgcount = 1);
+ void tinc(int idx, ceph::timespan v, uint32_t avgcount = 1);
utime_t tget(int idx) const;
void hinc(int idx, int64_t x, int64_t y);
@@ -181,8 +182,9 @@ public:
m_name = s;
}
- void set_suppress_nicks(bool b) {
- suppress_nicks = b;
+ /// adjust priority values by some value
+ void set_prio_adjust(int p) {
+ prio_adjust = p;
}
private:
@@ -201,7 +203,7 @@ private:
std::string m_name;
const std::string m_lock_name;
- bool suppress_nicks = false;
+ int prio_adjust = 0;
/** Protects m_data */
mutable Mutex m_lock;
@@ -282,26 +284,49 @@ public:
PerfCountersBuilder(CephContext *cct, const std::string &name,
int first, int last);
~PerfCountersBuilder();
+
+ // prio values: higher is better, and higher values get included in
+ // 'ceph daemonperf' (and similar) results.
+ enum {
+ PRIO_CRITICAL = 10,
+ PRIO_INTERESTING = 8,
+ PRIO_USEFUL = 5,
+ PRIO_UNINTERESTING = 2,
+ PRIO_DEBUGONLY = 0,
+ };
void add_u64(int key, const char *name,
- const char *description=NULL, const char *nick = NULL);
+ const char *description=NULL, const char *nick = NULL,
+ int prio=0);
void add_u64_counter(int key, const char *name,
- const char *description=NULL, const char *nick = NULL);
+ const char *description=NULL,
+ const char *nick = NULL,
+ int prio=0);
void add_u64_avg(int key, const char *name,
- const char *description=NULL, const char *nick = NULL);
+ const char *description=NULL,
+ const char *nick = NULL,
+ int prio=0);
void add_time(int key, const char *name,
- const char *description=NULL, const char *nick = NULL);
+ const char *description=NULL,
+ const char *nick = NULL,
+ int prio=0);
void add_time_avg(int key, const char *name,
- const char *description=NULL, const char *nick = NULL);
- void add_histogram(int key, const char* name,
- PerfHistogramCommon::axis_config_d x_axis_config,
- PerfHistogramCommon::axis_config_d y_axis_config,
- const char *description=NULL, const char* nick = NULL);
+ const char *description=NULL,
+ const char *nick = NULL,
+ int prio=0);
+ void add_u64_counter_histogram(
+ int key, const char* name,
+ PerfHistogramCommon::axis_config_d x_axis_config,
+ PerfHistogramCommon::axis_config_d y_axis_config,
+ const char *description=NULL,
+ const char* nick = NULL,
+ int prio=0);
+
PerfCounters* create_perf_counters();
private:
PerfCountersBuilder(const PerfCountersBuilder &rhs);
PerfCountersBuilder& operator=(const PerfCountersBuilder &rhs);
void add_impl(int idx, const char *name,
- const char *description, const char *nick, int ty,
+ const char *description, const char *nick, int prio, int ty,
unique_ptr<PerfHistogram<>> histogram = nullptr);
PerfCounters *m_perf_counters;
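
A minimal sketch of the builder with the new priority argument (the counter names, indices, and surrounding CephContext wiring are illustrative, not part of this patch):

    enum { l_demo_first, l_demo_ops, l_demo_lat, l_demo_last };
    PerfCountersBuilder b(cct, "demo", l_demo_first, l_demo_last);
    b.add_u64_counter(l_demo_ops, "ops", "Total operations", "ops",
                      PerfCountersBuilder::PRIO_INTERESTING);
    b.add_time_avg(l_demo_lat, "lat", "Operation latency", "lat",
                   PerfCountersBuilder::PRIO_USEFUL);
    PerfCounters *logger = b.create_perf_counters();
    cct->get_perfcounters_collection()->add(logger);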
diff --git a/src/common/perf_histogram.h b/src/common/perf_histogram.h
index bcd51a48ed4..ee726d394e4 100644
--- a/src/common/perf_histogram.h
+++ b/src/common/perf_histogram.h
@@ -17,7 +17,6 @@
#include "common/Formatter.h"
#include "include/atomic.h"
-#include "include/int_types.h"
#include <array>
#include <memory>
diff --git a/src/common/pick_address.cc b/src/common/pick_address.cc
index 5f910d541e6..4291cce8cbc 100644
--- a/src/common/pick_address.cc
+++ b/src/common/pick_address.cc
@@ -16,6 +16,7 @@
#include <netdb.h>
#include <errno.h>
+#include <ifaddrs.h>
#include "include/ipaddr.h"
#include "include/str_list.h"
diff --git a/src/common/pipe.c b/src/common/pipe.c
index 4d22f2458dc..913aa4fe89a 100644
--- a/src/common/pipe.c
+++ b/src/common/pipe.c
@@ -18,6 +18,8 @@
#include <errno.h>
#include <fcntl.h>
+#include <limits.h>
+#include <stdint.h>
#include <unistd.h>
int pipe_cloexec(int pipefd[2])
diff --git a/src/common/run_cmd.cc b/src/common/run_cmd.cc
index 81e41712b9b..396676d7b45 100644
--- a/src/common/run_cmd.cc
+++ b/src/common/run_cmd.cc
@@ -18,7 +18,6 @@
#include <sstream>
#include <stdarg.h>
#include <stdlib.h>
-#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <vector>
diff --git a/src/common/sctp_crc32.c b/src/common/sctp_crc32.c
index 2fa26afecc5..a8deb07fe5f 100644
--- a/src/common/sctp_crc32.c
+++ b/src/common/sctp_crc32.c
@@ -43,7 +43,7 @@ __FBSDID("$FreeBSD: src/sys/netinet/sctp_crc32.c,v 1.8 2007/05/08 17:01:10 rrs E
#include <stdint.h>
-#include "include/byteorder.h"
+#include "acconfig.h"
#ifndef SCTP_USE_ADLER32
diff --git a/src/common/secret.c b/src/common/secret.c
index 82150930eab..b6f26ec3ece 100644
--- a/src/common/secret.c
+++ b/src/common/secret.c
@@ -14,12 +14,10 @@
#include <string.h>
#include <stdio.h>
-#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <keyutils.h>
-#include <sys/types.h>
#include "common/armor.h"
#include "common/safe_io.h"
diff --git a/src/common/str_map.cc b/src/common/str_map.cc
index 4605302e53a..c3f6a2b6519 100644
--- a/src/common/str_map.cc
+++ b/src/common/str_map.cc
@@ -14,8 +14,6 @@
*
*/
-#include <errno.h>
-
#include "include/str_map.h"
#include "include/str_list.h"
diff --git a/src/common/strtol.cc b/src/common/strtol.cc
index 863ef08cf46..4997a1242af 100644
--- a/src/common/strtol.cc
+++ b/src/common/strtol.cc
@@ -14,9 +14,8 @@
#include "strtol.h"
-#include <cerrno>
#include <climits>
-#include <cstdlib>
+#include <limits>
#include <sstream>
using std::ostringstream;
diff --git a/src/common/strtol.h b/src/common/strtol.h
index 2f4c4ce2756..810273ebd23 100644
--- a/src/common/strtol.h
+++ b/src/common/strtol.h
@@ -16,7 +16,6 @@
#define CEPH_COMMON_STRTOL_H
#include <string>
-#include <limits>
extern "C" {
#include <stdint.h>
}
diff --git a/src/common/tracked_int_ptr.hpp b/src/common/tracked_int_ptr.hpp
index e0e423835e7..dfc74934724 100644
--- a/src/common/tracked_int_ptr.hpp
+++ b/src/common/tracked_int_ptr.hpp
@@ -15,12 +15,6 @@
#ifndef CEPH_TRACKEDINTPTR_H
#define CEPH_TRACKEDINTPTR_H
-#include <map>
-#include <list>
-#include <memory>
-#include <utility>
-#include "common/Mutex.h"
-#include "common/Cond.h"
template <class T>
class TrackedIntPtr {
diff --git a/src/common/utf8.c b/src/common/utf8.c
index 3bc77c32cde..5a8592f1cc5 100644
--- a/src/common/utf8.c
+++ b/src/common/utf8.c
@@ -13,7 +13,6 @@
*/
#include "common/utf8.h"
-#include <stdio.h>
#include <string.h>
static int high_bits_set(int c)
diff --git a/src/common/util.cc b/src/common/util.cc
index 3cc1c49e78b..762fe374bb5 100644
--- a/src/common/util.cc
+++ b/src/common/util.cc
@@ -12,7 +12,6 @@
*
*/
-#include <errno.h>
#include <sys/utsname.h>
#include <boost/lexical_cast.hpp>
@@ -20,7 +19,6 @@
#include "include/util.h"
#include "common/debug.h"
#include "common/errno.h"
-#include "common/strtol.h"
#include "common/version.h"
#ifdef HAVE_SYS_VFS_H
diff --git a/src/common/version.cc b/src/common/version.cc
index 0ca569e9c56..d5ce748ddd9 100644
--- a/src/common/version.cc
+++ b/src/common/version.cc
@@ -12,12 +12,10 @@
*
*/
-#include "acconfig.h"
#include "ceph_ver.h"
#include "common/version.h"
#include <sstream>
-#include <string>
#define _STR(x) #x
#define STRINGIFY(x) _STR(x)
diff --git a/src/common/xattr.c b/src/common/xattr.c
index caa31d52704..080bebcaafd 100644
--- a/src/common/xattr.c
+++ b/src/common/xattr.c
@@ -9,7 +9,6 @@
* Foundation. See file COPYING.
*/
-#include "acconfig.h"
#if defined(__FreeBSD__)
#include <errno.h>
#include <stdint.h>
diff --git a/src/common/zipkin_trace.h b/src/common/zipkin_trace.h
new file mode 100644
index 00000000000..86624715c29
--- /dev/null
+++ b/src/common/zipkin_trace.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef COMMON_ZIPKIN_TRACE_H
+#define COMMON_ZIPKIN_TRACE_H
+
+#include "acconfig.h"
+#include "include/encoding.h"
+
+#ifdef WITH_BLKIN
+
+#include <ztracer.hpp>
+
+#else // !WITH_BLKIN
+
+// add stubs for noop Trace and Endpoint
+
+// match the "real" struct
+struct blkin_trace_info {
+ int64_t trace_id;
+ int64_t span_id;
+ int64_t parent_span_id;
+};
+
+namespace ZTracer
+{
+static inline int ztrace_init() { return 0; }
+
+class Endpoint {
+ public:
+ Endpoint(const char *name) {}
+ Endpoint(const char *ip, int port, const char *name) {}
+
+ void copy_ip(const std::string &newip) {}
+ void copy_name(const std::string &newname) {}
+ void copy_address_from(const Endpoint *endpoint) {}
+ void share_address_from(const Endpoint *endpoint) {}
+ void set_port(int p) {}
+};
+
+class Trace {
+ public:
+ Trace() {}
+ Trace(const char *name, const Endpoint *ep, const Trace *parent = NULL) {}
+ Trace(const char *name, const Endpoint *ep,
+ const blkin_trace_info *i, bool child=false) {}
+
+ bool valid() const { return false; }
+ operator bool() const { return false; }
+
+ int init(const char *name, const Endpoint *ep, const Trace *parent = NULL) {
+ return 0;
+ }
+ int init(const char *name, const Endpoint *ep,
+ const blkin_trace_info *i, bool child=false) {
+ return 0;
+ }
+
+ void copy_name(const std::string &newname) {}
+
+ const blkin_trace_info* get_info() const { return NULL; }
+ void set_info(const blkin_trace_info *i) {}
+
+ void keyval(const char *key, const char *val) const {}
+ void keyval(const char *key, int64_t val) const {}
+ void keyval(const char *key, const char *val, const Endpoint *ep) const {}
+ void keyval(const char *key, int64_t val, const Endpoint *ep) const {}
+
+ void event(const char *event) const {}
+ void event(const char *event, const Endpoint *ep) const {}
+};
+} // namespace ZTracer
+
+#endif // !WITH_BLKIN
+
+static inline void encode(const blkin_trace_info& b, bufferlist& bl)
+{
+ ::encode(b.trace_id, bl);
+ ::encode(b.span_id, bl);
+ ::encode(b.parent_span_id, bl);
+}
+
+static inline void decode(blkin_trace_info& b, bufferlist::iterator& p)
+{
+ ::decode(b.trace_id, p);
+ ::decode(b.span_id, p);
+ ::decode(b.parent_span_id, p);
+}
+
+
+
+#endif // COMMON_ZIPKIN_TRACE_H
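
With these stubs, tracing call sites can be written unconditionally and compile down to no-ops when WITH_BLKIN is off. A hypothetical call site (endpoint address and names are illustrative):

    ZTracer::Endpoint ep("0.0.0.0", 0, "osd.0");
    ZTracer::Trace trace("op", &ep);
    if (trace) {  // always false for the stub Trace
      trace.event("queued");
      trace.keyval("bytes", static_cast<int64_t>(4096));
    }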
diff --git a/src/compressor/zlib/CompressionPluginZlib.h b/src/compressor/zlib/CompressionPluginZlib.h
index 5716e417d22..0aa641f8bb6 100644
--- a/src/compressor/zlib/CompressionPluginZlib.h
+++ b/src/compressor/zlib/CompressionPluginZlib.h
@@ -35,19 +35,18 @@ public:
int factory(CompressorRef *cs,
std::ostream *ss) override
{
+ bool isal = false;
#if defined(__i386__) || defined(__x86_64__)
- bool isal;
+ // other arches or lack of support result in isal = false
if (cct->_conf->compressor_zlib_isal) {
ceph_arch_probe();
isal = (ceph_arch_intel_pclmul && ceph_arch_intel_sse41);
- } else {
- isal = false;
}
+#endif
if (compressor == 0 || has_isal != isal) {
compressor = std::make_shared<ZlibCompressor>(isal);
has_isal = isal;
}
-#endif
*cs = compressor;
return 0;
}
diff --git a/src/compressor/zstd/CMakeLists.txt b/src/compressor/zstd/CMakeLists.txt
index 0332834e80c..d9d2b6e560d 100644
--- a/src/compressor/zstd/CMakeLists.txt
+++ b/src/compressor/zstd/CMakeLists.txt
@@ -6,8 +6,10 @@ set(ZSTD_C_FLAGS -fPIC -Wno-unused-variable -O3)
include(ExternalProject)
ExternalProject_Add(zstd_ext
SOURCE_DIR ${CMAKE_SOURCE_DIR}/src/zstd/build/cmake
- CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+ CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+ -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${ZSTD_C_FLAGS}
+ -DCMAKE_AR=${CMAKE_AR}
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/libzstd
BUILD_COMMAND $(MAKE) libzstd_static
INSTALL_COMMAND "true")
diff --git a/src/crush/CrushCompiler.cc b/src/crush/CrushCompiler.cc
index 4e0df956b65..ac5599250ca 100644
--- a/src/crush/CrushCompiler.cc
+++ b/src/crush/CrushCompiler.cc
@@ -287,7 +287,7 @@ int CrushCompiler::decompile_choose_arg_map(crush_choose_arg_map arg_map,
return 0;
}
-int CrushCompiler::decompile_choose_args(std::pair<const long unsigned int, crush_choose_arg_map> &i,
+int CrushCompiler::decompile_choose_args(const std::pair<const long unsigned int, crush_choose_arg_map> &i,
ostream &out)
{
out << "choose_args " << i.first << " {\n";
diff --git a/src/crush/CrushCompiler.h b/src/crush/CrushCompiler.h
index 3a93085597c..5146480683b 100644
--- a/src/crush/CrushCompiler.h
+++ b/src/crush/CrushCompiler.h
@@ -36,7 +36,7 @@ class CrushCompiler {
ostream &out);
int decompile_choose_arg_map(crush_choose_arg_map arg_map,
ostream &out);
- int decompile_choose_args(std::pair<const long unsigned int, crush_choose_arg_map> &i,
+ int decompile_choose_args(const std::pair<const long unsigned int, crush_choose_arg_map> &i,
ostream &out);
int decompile_bucket_impl(int i, ostream &out);
int decompile_bucket(int cur,
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index 7974b9a7501..b2c143c91b1 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -105,6 +105,29 @@ bool CrushWrapper::is_v5_rule(unsigned ruleid) const
return false;
}
+bool CrushWrapper::has_choose_args() const
+{
+ return !choose_args.empty();
+}
+
+bool CrushWrapper::has_incompat_choose_args() const
+{
+ if (choose_args.size() != 1)
+ return true;
+ crush_choose_arg_map arg_map = choose_args.begin()->second;
+ for (__u32 i = 0; i < arg_map.size; i++) {
+ crush_choose_arg *arg = &arg_map.args[i];
+ if (arg->weight_set_size == 0 &&
+ arg->ids_size == 0)
+ continue;
+ if (arg->weight_set_size != 1)
+ return true;
+ if (arg->ids_size != 0)
+ return true;
+ }
+ return false;
+}
+
int CrushWrapper::split_id_class(int i, int *idout, int *classout) const
{
if (!item_exists(i))
@@ -587,6 +610,20 @@ int CrushWrapper::get_full_location_ordered(int id, vector<pair<string, string>
return 0;
}
+string CrushWrapper::get_full_location_ordered_string(int id)
+{
+ vector<pair<string, string> > full_location_ordered;
+ string full_location;
+ get_full_location_ordered(id, full_location_ordered);
+ reverse(begin(full_location_ordered), end(full_location_ordered));
+ for(auto i = full_location_ordered.begin(); i != full_location_ordered.end(); i++) {
+ full_location = full_location + i->first + "=" + i->second;
+ if (i != full_location_ordered.end() - 1) {
+ full_location = full_location + ",";
+ }
+ }
+ return full_location;
+}
map<int, string> CrushWrapper::get_parent_hierarchy(int id)
{
@@ -777,6 +814,52 @@ int CrushWrapper::move_bucket(CephContext *cct, int id, const map<string,string>
return insert_item(cct, id, bucket_weight / (float)0x10000, id_name, loc);
}
+int CrushWrapper::swap_bucket(CephContext *cct, int src, int dst)
+{
+ if (src >= 0 || dst >= 0)
+ return -EINVAL;
+ if (!item_exists(src) || !item_exists(dst))
+ return -EINVAL;
+ crush_bucket *a = get_bucket(src);
+ crush_bucket *b = get_bucket(dst);
+ unsigned aw = a->weight;
+ unsigned bw = b->weight;
+
+ // swap weights
+ adjust_item_weight(cct, a->id, bw);
+ adjust_item_weight(cct, b->id, aw);
+
+ // swap items
+ map<int,unsigned> tmp;
+ unsigned as = a->size;
+ unsigned bs = b->size;
+ for (unsigned i = 0; i < as; ++i) {
+ int item = a->items[0];
+ int itemw = crush_get_bucket_item_weight(a, 0);
+ tmp[item] = itemw;
+ crush_bucket_remove_item(crush, a, item);
+ }
+ assert(a->size == 0);
+ assert(b->size == bs);
+ for (unsigned i = 0; i < bs; ++i) {
+ int item = b->items[0];
+ int itemw = crush_get_bucket_item_weight(b, 0);
+ crush_bucket_remove_item(crush, b, item);
+ crush_bucket_add_item(crush, a, item, itemw);
+ }
+ assert(a->size == bs);
+ assert(b->size == 0);
+ for (auto t : tmp) {
+ crush_bucket_add_item(crush, b, t.first, t.second);
+ }
+ assert(a->size == bs);
+ assert(b->size == as);
+
+ // swap names
+ swap_names(src, dst);
+ return 0;
+}
+
int CrushWrapper::link_bucket(CephContext *cct, int id, const map<string,string>& loc)
{
if (choose_args.size() > 0) {
@@ -1044,6 +1127,17 @@ int CrushWrapper::get_immediate_parent_id(int id, int *parent) const
return -ENOENT;
}
+int CrushWrapper::get_parent_of_type(int item, int type) const
+{
+ do {
+ int r = get_immediate_parent_id(item, &item);
+ if (r < 0) {
+ return 0;
+ }
+ } while (get_bucket_type(item) != type);
+ return item;
+}
+
bool CrushWrapper::class_is_in_use(int class_id)
{
for (auto &i : class_bucket)
@@ -1218,6 +1312,11 @@ int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
crush_rule *rule = crush->rules[ruleno];
// build a weight map for each TAKE in the rule, and then merge them
+
+ // FIXME: if there are multiple takes that place a different number of
+ // objects, we do not take that into account. (Note that doing this
+ // right is also a function of the pool, since the crush rule
+ // might choose 2 + choose 2 but the pool size may only be 3.)
for (unsigned i=0; i<rule->len; ++i) {
map<int,float> m;
float sum = 0;
@@ -1381,6 +1480,16 @@ void CrushWrapper::encode(bufferlist& bl, uint64_t features) const
::encode(crush->max_rules, bl);
::encode(crush->max_devices, bl);
+ bool encode_compat_choose_args = false;
+ crush_choose_arg_map arg_map;
+ memset(&arg_map, '\0', sizeof(arg_map));
+ if (has_choose_args() &&
+ !HAVE_FEATURE(features, CRUSH_CHOOSE_ARGS)) {
+ assert(!has_incompat_choose_args());
+ encode_compat_choose_args = true;
+ arg_map = choose_args.begin()->second;
+ }
+
// buckets
for (int i=0; i<crush->max_buckets; i++) {
__u32 alg = 0;
@@ -1424,8 +1533,17 @@ void CrushWrapper::encode(bufferlist& bl, uint64_t features) const
break;
case CRUSH_BUCKET_STRAW2:
- for (unsigned j=0; j<crush->buckets[i]->size; j++) {
- ::encode((reinterpret_cast<crush_bucket_straw2*>(crush->buckets[i]))->item_weights[j], bl);
+ {
+ __u32 *weights;
+ if (encode_compat_choose_args &&
+ arg_map.args[i].weight_set_size > 0) {
+ weights = arg_map.args[i].weight_set[0].weights;
+ } else {
+ weights = (reinterpret_cast<crush_bucket_straw2*>(crush->buckets[i]))->item_weights;
+ }
+ for (unsigned j=0; j<crush->buckets[i]->size; j++) {
+ ::encode(weights[j], bl);
+ }
}
break;
@@ -2208,6 +2326,29 @@ int CrushWrapper::_choose_type_stack(
ldout(cct, 10) << __func__ << " cumulative_fanout " << cumulative_fanout
<< dendl;
+ // identify underfull targets for each intermediate level.
+ // this serves two purposes:
+ // 1. we can tell when we are selecting a bucket that does not have any underfull
+ // devices beneath it. that means that if the current input includes an overfull
+ // device, we won't be able to find an underfull device with this parent to
+ // swap for it.
+ // 2. when we decide we should reject a bucket due to the above, this list gives us
+ // a list of peers to consider that *do* have underfull devices available. (we
+ // are careful to pick one that has the same parent.)
+ vector<set<int>> underfull_buckets; // level -> set of buckets with >0 underfull item(s)
+ underfull_buckets.resize(stack.size() - 1);
+ for (auto osd : underfull) {
+ int item = osd;
+ for (int j = (int)stack.size() - 2; j >= 0; --j) {
+ int type = stack[j].first;
+ item = get_parent_of_type(item, type);
+ ldout(cct, 10) << __func__ << " underfull " << osd << " type " << type
+ << " is " << item << dendl;
+ underfull_buckets[j].insert(item);
+ }
+ }
+ ldout(cct, 20) << __func__ << " underfull_buckets " << underfull_buckets << dendl;
+
for (unsigned j = 0; j < stack.size(); ++j) {
int type = stack[j].first;
int fanout = stack[j].second;
@@ -2219,25 +2360,22 @@ int CrushWrapper::_choose_type_stack(
auto tmpi = i;
for (auto from : w) {
ldout(cct, 10) << " from " << from << dendl;
-
+ // identify leaves under each choice. we use this to check whether any of these
+ // leaves are overfull. (if so, we need to make sure there are underfull candidates
+ // to swap for them.)
+ vector<set<int>> leaves;
+ leaves.resize(fanout);
for (int pos = 0; pos < fanout; ++pos) {
if (type > 0) {
// non-leaf
- int item = *tmpi;
- do {
- int r = get_immediate_parent_id(item, &item);
- if (r < 0) {
- ldout(cct, 10) << __func__ << " parent of " << item << " got "
- << cpp_strerror(r) << dendl;
- return -EINVAL;
- }
- } while (get_bucket_type(item) != type);
+ int item = get_parent_of_type(*tmpi, type);
o.push_back(item);
- ldout(cct, 10) << __func__ << " from " << *tmpi << " got " << item
- << " of type " << type << dendl;
int n = cum_fanout;
- while (n-- && tmpi != orig.end())
- ++tmpi;
+ while (n-- && tmpi != orig.end()) {
+ leaves[pos].insert(*tmpi++);
+ }
+ ldout(cct, 10) << __func__ << " from " << *tmpi << " got " << item
+ << " of type " << type << " over leaves " << leaves[pos] << dendl;
} else {
// leaf
bool replaced = false;
@@ -2279,6 +2417,50 @@ int CrushWrapper::_choose_type_stack(
}
}
}
+ if (j + 1 < stack.size()) {
+ // check if any buckets have overfull leaves but no underfull candidates
+ for (int pos = 0; pos < fanout; ++pos) {
+ if (underfull_buckets[j].count(o[pos]) == 0) {
+ // are any leaves overfull?
+ bool any_overfull = false;
+ for (auto osd : leaves[pos]) {
+ if (overfull.count(osd)) {
+ any_overfull = true;
+ }
+ }
+ if (any_overfull) {
+ ldout(cct, 10) << " bucket " << o[pos] << " has no underfull targets and "
+ << ">0 leaves " << leaves[pos] << " is overfull; alts "
+ << underfull_buckets[j]
+ << dendl;
+ for (auto alt : underfull_buckets[j]) {
+ if (std::find(o.begin(), o.end(), alt) == o.end()) {
+ // see if alt has the same parent
+ if (j == 0 ||
+ get_parent_of_type(o[pos], stack[j-1].first) ==
+ get_parent_of_type(alt, stack[j-1].first)) {
+ if (j)
+ ldout(cct, 10) << " replacing " << o[pos]
+ << " (which has no underfull leaves) with " << alt
+ << " (same parent "
+ << get_parent_of_type(alt, stack[j-1].first) << " type "
+ << type << ")" << dendl;
+ else
+ ldout(cct, 10) << " replacing " << o[pos]
+ << " (which has no underfull leaves) with " << alt
+ << " (first level)" << dendl;
+ o[pos] = alt;
+ break;
+ } else {
+ ldout(cct, 30) << " alt " << alt << " for " << o[pos]
+ << " has different parent, skipping" << dendl;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
if (i == orig.end()) {
ldout(cct, 10) << __func__ << " end of orig, break 2" << dendl;
break;
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
index 3d829954d77..57aa001cfb5 100644
--- a/src/crush/CrushWrapper.h
+++ b/src/crush/CrushWrapper.h
@@ -169,7 +169,7 @@ public:
crush->straw_calc_version = 1;
}
void set_tunables_default() {
- set_tunables_firefly();
+ set_tunables_hammer();
crush->straw_calc_version = 1;
}
@@ -316,6 +316,8 @@ public:
bool has_v3_rules() const;
bool has_v4_buckets() const;
bool has_v5_rules() const;
+ bool has_choose_args() const; // any choose_args
+ bool has_incompat_choose_args() const; // choose_args that can't be made compat
bool is_v2_rule(unsigned ruleid) const;
bool is_v3_rule(unsigned ruleid) const;
@@ -400,6 +402,16 @@ public:
name_rmap[name] = i;
return 0;
}
+ void swap_names(int a, int b) {
+ string an = name_map[a];
+ string bn = name_map[b];
+ name_map[a] = bn;
+ name_map[b] = an;
+ if (have_rmaps) {
+ name_rmap[an] = b;
+ name_rmap[bn] = a;
+ }
+ }
bool id_has_class(int i) {
int idout;
int classout;
@@ -572,6 +584,12 @@ public:
int get_immediate_parent_id(int id, int *parent) const;
/**
+ * return ancestor of the given type, or 0 if none
+ * (parent is always a bucket and thus <0)
+ */
+ int get_parent_of_type(int id, int type) const;
+
+ /**
* get the fully qualified location of a device by successively finding
* parents beginning at ID and ending at highest type number specified in
* the CRUSH map which assumes that if device foo is under device bar, the
@@ -590,6 +608,15 @@ public:
*/
int get_full_location_ordered(int id, vector<pair<string, string> >& path);
+ /*
+ * identical to get_full_location_ordered(int id, vector<pair<string, string> >& path),
+ * except that it returns a concatenated string of the type/name pairs in descending
+ * hierarchical order, in the format key1=val1,key2=val2.
+ *
+ * returns the location in descending hierarchy as a string.
+ */
+ string get_full_location_ordered_string(int id);
+
/**
* returns (type_id, type) of all parent buckets between id and
* default, can be used to check for anomalous CRUSH maps
@@ -650,6 +677,16 @@ public:
int move_bucket(CephContext *cct, int id, const map<string,string>& loc);
/**
+ * swap bucket contents of two buckets without touching bucket ids
+ *
+ * @param cct cct
+ * @param src bucket a
+ * @param dst bucket b
+ * @return 0 for success, negative on error
+ */
+ int swap_bucket(CephContext *cct, int src, int dst);
+
+ /**
* add a link to an existing bucket in the hierarchy to the new location
*
* This has the same location and ancestor creation behavior as
@@ -1208,8 +1245,9 @@ public:
choose_args.clear();
}
+ template<typename WeightVector>
void do_rule(int rule, int x, vector<int>& out, int maxout,
- const vector<__u32>& weight,
+ const WeightVector& weight,
uint64_t choose_args_index) const {
int rawout[maxout];
char work[crush_work_size(crush, maxout)];
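
A sketch of the new CrushWrapper helpers in use (the ids and type number below are hypothetical; bucket ids are always negative, device ids non-negative):

    // Swap the contents and names of two buckets without changing ids.
    int r = crush.swap_bucket(cct, -2, -7);
    // Walk up from osd.0 to its enclosing bucket of a given type
    // (type ids are map-specific; 3 stands in for "rack" here).
    int rack = crush.get_parent_of_type(0, 3);
    // Render the full location, e.g. "root=default,host=myhost".
    string loc = crush.get_full_location_ordered_string(0);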
diff --git a/src/crush/builder.c b/src/crush/builder.c
index 00f4b6c2ee9..dc342d28a01 100644
--- a/src/crush/builder.c
+++ b/src/crush/builder.c
@@ -1,15 +1,12 @@
#include <string.h>
-#include <limits.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <errno.h>
-#include "include/int_types.h"
-
+#include "crush/crush.h"
#include "builder.h"
-#include "hash.h"
#define dprintk(args...) /* printf(args) */
@@ -1497,6 +1494,7 @@ void set_legacy_crush_map(struct crush_map *map) {
map->choose_total_tries = 19;
map->chooseleaf_descend_once = 0;
map->chooseleaf_vary_r = 0;
+ map->chooseleaf_stable = 0;
map->straw_calc_version = 0;
// by default, use legacy types, and also exclude tree,
diff --git a/src/crush/builder.h b/src/crush/builder.h
index 651729ff0e0..ffb5eaf22db 100644
--- a/src/crush/builder.h
+++ b/src/crush/builder.h
@@ -1,7 +1,12 @@
#ifndef CEPH_CRUSH_BUILDER_H
#define CEPH_CRUSH_BUILDER_H
-#include "crush.h"
+#include "include/int_types.h"
+
+struct crush_bucket;
+struct crush_choose_arg;
+struct crush_map;
+struct crush_rule;
/** @ingroup API
*
@@ -9,22 +14,10 @@
* caller is responsible for deallocating the crush_map with
* crush_destroy().
*
- * The content of the allocated crush_map is undefined and
- * it is the responsibility of the caller to set meaningful values.
- *
- * The recommended values are:
- *
- * struct crush_map *m = crush_create();
- * m->choose_local_tries = 0;
- * m->choose_local_fallback_tries = 0;
- * m->choose_total_tries = 50;
- * m->chooseleaf_descend_once = 1;
- * m->chooseleaf_vary_r = 1;
- * m->chooseleaf_stable = 1;
- * m->allowed_bucket_algs =
- * (1 << CRUSH_BUCKET_UNIFORM) |
- * (1 << CRUSH_BUCKET_LIST) |
- * (1 << CRUSH_BUCKET_STRAW2);
+ * The content of the allocated crush_map is set with
+ * set_optimal_crush_map(). The caller is responsible for setting each
+ * tunable in the __crush_map__ for backward compatibility or mapping
+ * stability.
*
* @returns a pointer to the newly created crush_map or NULL
*/
@@ -87,10 +80,10 @@ extern struct crush_rule *crush_make_rule(int len, int ruleset, int type, int mi
* __arg1__ leaves within all the buckets of type __arg2__ and
* select them.
*
- * In all __CHOOSE__ steps, if __arg1__ is zero, the number of items
- * to select is determined by the __max_result__ argument of
- * crush_do_rule(), i.e. __arg1__ is __max_result__ minus the number of
- * items already in the result.
+ * In all __CHOOSE__ steps, if __arg1__ is less than or equal to zero,
+ * the number of items to select is equal to the __max_result__ argument
+ * of crush_do_rule() plus __arg1__ (i.e. __max_result__ reduced by the
+ * absolute value of __arg1__). It is common to set __arg1__ to zero
+ * to select as many items as requested by __max_result__.
*
* - __CRUSH_RULE_SET_CHOOSE_TRIES__ and __CRUSH_RULE_SET_CHOOSELEAF_TRIES__
*
@@ -307,7 +300,45 @@ crush_make_straw_bucket(struct crush_map *map,
extern int crush_addition_is_unsafe(__u32 a, __u32 b);
extern int crush_multiplication_is_unsafe(__u32 a, __u32 b);
+/** @ingroup API
+ *
+ * Set the __map__ tunables to implement the most ancient behavior,
+ * for backward compatibility purposes only.
+ *
+ * - choose_local_tries == 2
+ * - choose_local_fallback_tries == 5
+ * - choose_total_tries == 19
+ * - chooseleaf_descend_once == 0
+ * - chooseleaf_vary_r == 0
+ * - straw_calc_version == 0
+ * - chooseleaf_stable == 0
+ *
+ * See the __crush_map__ documentation for more information about
+ * each tunable.
+ *
+ * @param map a crush_map
+ */
extern void set_legacy_crush_map(struct crush_map *map);
+/** @ingroup API
+ *
+ * Set the __map__ tunables to implement the optimal behavior. These
+ * are the values set by crush_create(). Using them does not guarantee
+ * a stable mapping after an upgrade.
+ *
+ * For instance when a bug is fixed it may significantly change the
+ * mapping. In that case a new tunable (say tunable_new) is added so
+ * the caller can control when the bug fix is activated. The
+ * set_optimal_crush_map() function will always set all tunables,
+ * including tunable_new, to fix all bugs even if it means changing
+ * the mapping. If the caller needs fine grained control on the
+ * tunables to upgrade to a new version without changing the mapping,
+ * it needs to set the __crush_map__ tunables individually.
+ *
+ * See the __crush_map__ documentation for more information about
+ * each tunable.
+ *
+ * @param map a crush_map
+ */
extern void set_optimal_crush_map(struct crush_map *map);
#endif
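
For illustration, the intended calling pattern (a sketch; crush_create() already applies the optimal values, so set_legacy_crush_map() is only needed to emulate old behavior):

    struct crush_map *m = crush_create();  // starts with optimal tunables
    set_legacy_crush_map(m);               // revert to the most ancient behavior
    m->chooseleaf_stable = 1;              // then re-enable tunables selectively
    crush_destroy(m);                      // caller is responsible for freeing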
diff --git a/src/crush/crush.h b/src/crush/crush.h
index 83963ef9cc0..31fb94deff4 100644
--- a/src/crush/crush.h
+++ b/src/crush/crush.h
@@ -506,6 +506,24 @@ static inline int crush_calc_tree_node(int i)
return ((i+1) << 1)-1;
}
+static inline const char *crush_alg_name(int alg)
+{
+ switch (alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ return "uniform";
+ case CRUSH_BUCKET_LIST:
+ return "list";
+ case CRUSH_BUCKET_TREE:
+ return "tree";
+ case CRUSH_BUCKET_STRAW:
+ return "straw";
+ case CRUSH_BUCKET_STRAW2:
+ return "straw2";
+ default:
+ return "unknown";
+ }
+}
+
/* ---------------------------------------------------------------------
Private
--------------------------------------------------------------------- */
diff --git a/src/crush/mapper.c b/src/crush/mapper.c
index 9fc06e89a31..321e7a7d549 100644
--- a/src/crush/mapper.c
+++ b/src/crush/mapper.c
@@ -849,7 +849,7 @@ void crush_init_workspace(const struct crush_map *m, void *v) {
struct crush_work *w = (struct crush_work *)v;
char *point = (char *)v;
__s32 b;
- point += sizeof(struct crush_work *);
+ point += sizeof(struct crush_work);
w->work = (struct crush_work_bucket **)point;
point += m->max_buckets * sizeof(struct crush_work_bucket *);
for (b = 0; b < m->max_buckets; ++b) {
@@ -994,11 +994,6 @@ int crush_do_rule(const struct crush_map *map,
for (i = 0; i < wsize; i++) {
int bno;
- /*
- * see CRUSH_N, CRUSH_N_MINUS macros.
- * basically, numrep <= 0 means relative to
- * the provided result_max
- */
numrep = curstep->arg1;
if (numrep <= 0) {
numrep += result_max;
diff --git a/src/dmclock/.gitignore b/src/dmclock/.gitignore
new file mode 100644
index 00000000000..c6ddef2752b
--- /dev/null
+++ b/src/dmclock/.gitignore
@@ -0,0 +1,4 @@
+*~
+*.dSYM
+*.o
+build*
diff --git a/src/dmclock/CMakeLists.txt b/src/dmclock/CMakeLists.txt
new file mode 100644
index 00000000000..428863dc496
--- /dev/null
+++ b/src/dmclock/CMakeLists.txt
@@ -0,0 +1,32 @@
+cmake_minimum_required(VERSION 2.8.11)
+
+set(CMAKE_CXX_FLAGS "-std=c++11 -Wno-write-strings ${CMAKE_CXX_FLAGS}")
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/modules/")
+
+if(DO_NOT_DELAY_TAG_CALC)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDO_NOT_DELAY_TAG_CALC")
+endif()
+
+if(K_WAY_HEAP)
+ if(K_WAY_HEAP LESS 2)
+ message(FATAL_ERROR "K_WAY_HEAP value should be at least 2")
+ else()
+ set(CMAKE_CXX_SIM_FLAGS "-DK_WAY_HEAP=${K_WAY_HEAP}")
+ endif()
+endif()
+
+if (NOT(TARGET gtest AND TARGET gtest_main))
+ find_package(gtest REQUIRED)
+ include_directories(${GTEST_INCLUDE_DIRS})
+endif()
+
+find_package(Boost REQUIRED)
+include_directories(${Boost_INCLUDE_DIRS})
+
+add_subdirectory(src)
+add_subdirectory(sim)
+add_subdirectory(support)
+
+enable_testing()
+add_subdirectory(test)
diff --git a/src/dmclock/README.md b/src/dmclock/README.md
new file mode 100644
index 00000000000..ab67295b153
--- /dev/null
+++ b/src/dmclock/README.md
@@ -0,0 +1,45 @@
+# dmclock
+
+This repository contains C++11 code that implements the dmclock
+distributed quality of service algorithm. See __mClock: Handling
+Throughput Variability for Hypervisor IO Scheduling__ by Gulati,
+Merchant, and Varman for a description of the algorithm.
+
+## Running cmake
+
+When running cmake, set the build type with either:
+
+ -DCMAKE_BUILD_TYPE=Debug
+ -DCMAKE_BUILD_TYPE=Release
+
+To turn on profiling, run cmake with an additional:
+
+ -DPROFILE=yes
+
+An optimization/fix to the published algorithm has been added and is
+on by default. To disable this optimization/fix run cmake with:
+
+ -DDO_NOT_DELAY_TAG_CALC=yes
+
+## Running make
+
+### Building the dmclock library
+
+The `make` command builds the library libdmclock.a. That library, plus
+the header files in the src directory, allows one to use the
+implementation in one's own code.
+
+### Building unit tests
+
+The `make dmclock-tests` command builds unit tests.
+
+### Building simulations
+
+The `make dmclock-sims` command builds two simulations -- *dmc_sim*
+and *ssched_sim* -- which incorporate, respectively, the dmclock
+priority queue and a very simple scheduler for comparison. Other
+priority queue implementations could be added in the future.
+
+## dmclock API
+
+To be written....
diff --git a/src/dmclock/benchmark/README.md b/src/dmclock/benchmark/README.md
new file mode 100644
index 00000000000..d945e986fc1
--- /dev/null
+++ b/src/dmclock/benchmark/README.md
@@ -0,0 +1,42 @@
+# dmclock benchmarking
+
+**IMPORTANT**: now that K_WAY_HEAP is no longer allowed to have the
+value 1, the shell and Python scripts that generate the PDFs no longer
+work entirely correctly; some debugging effort is necessary.
+
+This directory contains scripts to evaluate the effects of different
+branching factors (k=1 to k=11) in the IndirectIntrusiveHeap
+data structure. IndirectIntrusiveHeap is now a k-way heap, so finding
+an ideal value of k (e.g., k=2 or k=3) for a particular workload is
+important. Also, it is well documented that the right choice of
+k-value improves caching behaviour [Syed -- citation needed
+here]. As a result, the overall performance of an application using
+a k-way heap increases significantly [Syed -- citation needed here].
+
+A rule of thumb is the following:
+ if the number of elements is <= 6, use k=1;
+ otherwise, use k=3.
+
+## Prerequisites
+
+Requires Python 2.7, gnuplot, and awk.
+
+## Running benchmark
+
+./run.sh [name_of_the_output] [k_way] [repeat] # [Syed -- last two command line args do not work]
+
+The "run.sh" script looks for config files in the "configs" directory,
+and the final output is generated as
+"name_of_the_output.pdf". Internally, "run.sh" calls other scripts
+such as data_gen.sh, data_parser.py, and plot_gen.sh.
+
+## Modifying parameters
+
+To modify k-value and/or the amount of times each simulation is
+repeated, modify the following two variables in "run.sh" file:
+
+ k_way=[your_value]
+ repeat=[your_value]
+
+For example, k_way=3 means that the benchmark will compare simulations
+using 1-way, 2-way, and 3-way heaps.
diff --git a/src/dmclock/benchmark/configs/dmc_sim_100_100.conf b/src/dmclock/benchmark/configs/dmc_sim_100_100.conf
new file mode 100644
index 00000000000..c93d4c71f6d
--- /dev/null
+++ b/src/dmclock/benchmark/configs/dmc_sim_100_100.conf
@@ -0,0 +1,31 @@
+[global]
+server_groups = 1
+client_groups = 2
+server_random_selection = true
+server_soft_limit = true
+
+[server.0]
+server_count = 100
+server_iops = 160
+
+[client.0]
+client_count = 99
+client_wait = 0
+client_total_ops = 10000
+client_server_select_range = 100
+client_iops_goal = 200
+client_outstanding_ops = 32
+client_reservation = 100.0
+client_limit = 0.0
+client_weight = 1.0
+
+[client.1]
+client_count = 1
+client_wait = 10
+client_total_ops = 10000
+client_server_select_range = 100
+client_iops_goal = 200
+client_outstanding_ops = 32
+client_reservation = 100.0
+client_limit = 0.0
+client_weight = 1.0
diff --git a/src/dmclock/benchmark/configs/dmc_sim_8_6.conf b/src/dmclock/benchmark/configs/dmc_sim_8_6.conf
new file mode 100644
index 00000000000..28aeb401d44
--- /dev/null
+++ b/src/dmclock/benchmark/configs/dmc_sim_8_6.conf
@@ -0,0 +1,43 @@
+[global]
+server_groups = 1
+client_groups = 3
+server_random_selection = true
+server_soft_limit = true
+
+[client.0]
+client_count = 2
+client_wait = 0
+client_total_ops = 1000
+client_server_select_range = 8
+client_iops_goal = 200
+client_outstanding_ops = 32
+client_reservation = 0.0
+client_limit = 0.0
+client_weight = 1.0
+
+[client.1]
+client_count = 2
+client_wait = 5
+client_total_ops = 1000
+client_server_select_range = 8
+client_iops_goal = 200
+client_outstanding_ops = 32
+client_reservation = 20.0
+client_limit = 40.0
+client_weight = 1.0
+
+[client.2]
+client_count = 2
+client_wait = 10
+client_total_ops = 1000
+client_server_select_range = 8
+client_iops_goal = 200
+client_outstanding_ops = 32
+client_reservation = 0.0
+client_limit = 50.0
+client_weight = 2.0
+
+
+[server.0]
+server_count = 8
+server_iops = 160
diff --git a/src/dmclock/benchmark/data_gen.sh b/src/dmclock/benchmark/data_gen.sh
new file mode 100755
index 00000000000..80a77bd9a1a
--- /dev/null
+++ b/src/dmclock/benchmark/data_gen.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+config_dir="configs"
+repeat=2 #5
+
+# parameter check -- output_file name
+if [ "$1" != "" ]; then
+ output_file="$1"
+else
+ echo "Please provide the name of the output file"
+ exit
+fi
+
+# parameter check -- k-value
+if [ "$2" != "" ]; then
+ k_way="$2"
+else
+ echo "Please provide the maximum K_WAY value"
+ exit
+fi
+
+# parameter check --repeat
+if [ "$3" != "" ]; then
+ repeat="$3"
+fi
+
+echo "k-way:$k_way, num_repeat:$repeat"
+
+# create simulators in different directories
+k=2
+while [ $k -le $k_way ]
+do
+ mkdir "build_$k"
+ cd "build_$k"
+ rm -rf *
+ cmake -DCMAKE_BUILD_TYPE=Release -DK_WAY_HEAP=$k ../../.
+ make dmclock-sims
+ cd ..
+
+ k=$(( $k + 1 ))
+done
+
+# run simulators
+echo '' > $output_file
+for config in "$config_dir"/*.conf
+do
+ k=2
+ while [ $k -le $k_way ]
+ do
+ cd "build_$k"
+
+ # repeat same experiment
+ i=0
+ while [ $i -lt $repeat ]
+ do
+ i=$(( $i + 1 ))
+
+ # clear cache first
+ sync
+ #sudo sh -c 'echo 1 >/proc/sys/vm/drop_caches'
+ #sudo sh -c 'echo 2 >/proc/sys/vm/drop_caches'
+ #sudo sh -c 'echo 3 >/proc/sys/vm/drop_caches'
+
+ # run with heap
+ msg="file_name:$k:$config"
+ echo $msg >> ../$output_file
+ echo "running $msg ..."
+ ./sim/dmc_sim -c ../$config | awk '(/average/)' >> ../$output_file
+ done # end repeat
+ cd ..
+ k=$(( $k + 1 ))
+ done # end k_way
+done # end config
+
diff --git a/src/dmclock/benchmark/data_parser.py b/src/dmclock/benchmark/data_parser.py
new file mode 100755
index 00000000000..c90d85fd9ab
--- /dev/null
+++ b/src/dmclock/benchmark/data_parser.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+
+class DataPoint:
+ def __init__(self):
+ self.nserver = 0;
+ self.nclient = 0;
+ self.heap_type = 0;
+ self.total_time_to_add_req = 0;
+ self.total_time_to_complete_req = 0;
+ self.config = ''
+
+ def set_name(self, config, heap_type):
+ self.config = config;
+ self.heap_type = heap_type
+
+ def get_config(self):
+ import re
+ return re.split(r"/|\.", self.config)[1]
+
+ def __str__(self):
+ return "s:%d, c:%d,h:%d,config:%s"%(self.nserver, self.nclient, self.heap_type, self.config);
+# end DataPoint
+
+
+def isFloat(elem):
+ try:
+ float(elem)
+ return True
+ except ValueError:
+ return False
+#end isFloat
+
+
+def parse_config_params(fname):
+ nclient = 0;
+ nserver = 0;
+ # read config file property
+ with open(fname, 'r') as f:
+ for line in f:
+ line = line.strip('\n \t')
+ if not line: continue;
+ if line.startswith("client_count"):
+ nclient += int(line.split('=')[-1]);
+ if line.startswith("server_count"):
+ nserver += int(line.split('=')[-1]);
+ # end of file
+ return [nserver, nclient];
+# parse_config_params
+
+def make_aggregate_data_point(dps, config, heap_type):
+ # create new aggregate point
+ dp = DataPoint();
+ # set set and k_way_heap property
+ dp.set_name(config, heap_type);
+
+ num_run = 0
+ for _dp in dps:
+ if _dp.config == config and _dp.heap_type == heap_type:
+ # print _dp, config, heap_type
+ dp.nserver =_dp.nserver
+ dp.nclient = _dp.nclient
+ num_run += 1
+ dp.total_time_to_add_req += _dp.total_time_to_add_req
+ dp.total_time_to_complete_req += _dp.total_time_to_complete_req
+
+ # average
+ dp.total_time_to_add_req /= num_run;
+ dp.total_time_to_complete_req /= num_run
+ #print dp
+ return dp;
+
+def parse_data_points(filename):
+ dps = []; #data-points
+ dp = None;
+ state = 0;
+ configs = {}
+ k_ways = {}
+
+ with open(filename, 'r') as f:
+ for line in f:
+ line = line.strip('\n \t')
+ if not line: continue;
+
+ # file_name:1:configs/dmc_sim_8_6.conf
+ if line.startswith("file_name"):
+ if dp:
+ dps.append(dp);
+ state = 0;
+
+ # new data-point
+ dp = DataPoint();
+ parts = line.split(':')
+ fname = parts[-1];
+ dp.heap_type = int(parts[1]);
+ if dp.heap_type not in k_ways:
+ k_ways[dp.heap_type] = 1;
+
+ # add to the dictionary
+ configs[fname] = 1;
+
+ dp.config = fname;
+ params = parse_config_params(fname)
+ dp.nserver = params[0];
+ dp.nclient = params[-1];
+
+ elif line.startswith("average"): # take last 2 averages
+ r = [float(s) for s in line.split(' ') if isFloat(s)]
+ state +=1;
+ #print r, dp #if isFloat(s)
+ if state == 3:
+ dp.total_time_to_add_req = r[0]
+ elif state == 4:
+ dp.total_time_to_complete_req = r[0]
+ else: pass
+
+ else:
+ pass;
+ # final entry
+ dps.append(dp)
+
+ # compute average of multiple runs
+ dps_avg = []
+ for config in configs:
+ data_per_config = []
+ for k in k_ways:
+ aggr_dp = make_aggregate_data_point(dps, config , k);
+ data_per_config.append(aggr_dp);
+ dps_avg.append(data_per_config);
+ # end for
+ return dps_avg;
+# end parse_data_points
+
+
+def create_header(num_cols):
+ fields = ['nserver_nclient(config_file)','add_req', 'complete_req'];
+ header = fields[0]
+ #write add_req_{1, ...}
+ for i in range(num_cols):
+ header = '%s %s_%i'%(header, fields[1], i+2)
+ #write complete_req_{1, ...}
+ for i in range(num_cols):
+ header = '%s %s_%i'%(header, fields[2], i+2)
+ # new-line
+ header = '%s\n'%(header)
+ return header
+# end create_header
+
+
+def create_data_line(aggr_dp):
+ # get common info
+ dp = aggr_dp[0]
+ data_line = "s:%d_c:%d "%(dp.nserver, dp.nclient);
+ # get the point-count
+ num_cols = len(aggr_dp);
+ # write add_req_{1, ...}
+ for i in range(num_cols):
+ data_line = '%s %f'%(data_line, aggr_dp[i].total_time_to_add_req)
+ # write complete_req_{1, ...}
+ for i in range(num_cols):
+ data_line = '%s %f'%(data_line, aggr_dp[i].total_time_to_complete_req)
+ # new-line
+ data_line = '%s\n'%(data_line)
+ return data_line
+# end create_data_line
+
+
+def make_data(filename):
+ # write the aggregated point in space separated file
+ dps = parse_data_points(filename);
+ if not len(dps) : return
+ print "total points: ", len(dps)
+ # open file
+ with open('%s.dat'%(filename), 'w+') as f:
+ # write header
+ f.write(create_header(len(dps[0])));
+ # write data-line
+ for aggr_dp in dps:
+ f.write(create_data_line(aggr_dp));
+
+
+def main(output_file):
+ print output_file
+ make_data(output_file);
+
+import sys
+if __name__ == "__main__":
+ file_name="result"
+ if len(sys.argv) > 1:
+ file_name=sys.argv[1].strip()
+ main(file_name)
+
diff --git a/src/dmclock/benchmark/plot_gen.sh b/src/dmclock/benchmark/plot_gen.sh
new file mode 100755
index 00000000000..d90bde1921a
--- /dev/null
+++ b/src/dmclock/benchmark/plot_gen.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+if [ "$1" != "" ]; then
+ output_file="$1"
+else
+ echo "Please provide the name of the output file"
+ exit
+fi
+
+# parameter check -- k-value
+if [ "$2" != "" ]; then
+ k_way="$2"
+else
+ echo "Please provide the maximum K_WAY value"
+ exit
+fi
+#echo "k-way: $k_way"
+#exit
+
+gnuplot << EOF
+
+# Note you need gnuplot 4.4 for the pdfcairo terminal.
+clear
+reset
+
+set terminal pdfcairo size 7in,5in font "Gill Sans,5" linewidth 1 rounded fontscale .8 noenhanced
+set output "${output_file}.pdf"
+
+# starts multiplot
+set multiplot layout 2,1
+
+# Line style for axes
+set style line 80 lt rgb "#808080"
+
+# Line style for grid
+set style line 81 lt 0 # dashed
+set style line 81 lt rgb "#808080" # grey
+
+set grid back linestyle 81
+set border 3 back linestyle 80
+
+#set xtics rotate out
+set style data histogram
+set style histogram clustered
+
+set style fill solid border
+set xlabel 'Heap Timing for different K values'
+set ylabel 'Time (nanosec)'
+set key top right
+
+set yrange [0:*]
+
+# plot 1
+set title 'Request Addition Time'
+plot for [COL=2:($k_way + 1)] '${output_file}.dat' using COL:xticlabels(1) title columnheader
+
+# plot 2
+set title 'Request Completion Time'
+plot for [COL=($k_way + 2):(2 * $k_way + 1)] '${output_file}.dat' using COL:xticlabels(1) title columnheader
+EOF
diff --git a/src/dmclock/benchmark/run.sh b/src/dmclock/benchmark/run.sh
new file mode 100755
index 00000000000..11432b53008
--- /dev/null
+++ b/src/dmclock/benchmark/run.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# default value
+k_way=3 #11
+repeat=2 #5
+
+output_file=""
+if [ "$1" != "" ]; then
+ output_file="$1"
+else
+ echo "Please provide the name of the output file"
+ exit
+fi
+
+echo "generating file ${output_file}"
+sh data_gen.sh ${output_file} ${k_way} ${repeat}
+
+echo "converting ${output_file} to ${output_file}.dat"
+python data_parser.py ${output_file}
+
+echo "now generating bar-chart"
+#gnuplot -e 'output_file=value' plot_gen.gnuplot
+sh plot_gen.sh ${output_file} ${k_way}
+echo "done! check ${output_file}.pdf"
diff --git a/src/dmclock/cmake/modules/Findboost.cmake b/src/dmclock/cmake/modules/Findboost.cmake
new file mode 100644
index 00000000000..4f0dfd052f0
--- /dev/null
+++ b/src/dmclock/cmake/modules/Findboost.cmake
@@ -0,0 +1,15 @@
+# - Find boost
+
+find_path(BOOST_INCLUDE_DIR NAMES boost/variant.hpp
+ PATHS /usr/include /usr/local/include ${BOOST_DIR}/include)
+
+include(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(boost
+ REQUIRED_VARS BOOST_INCLUDE_DIR)
+
+if(boost_FOUND)
+ set(BOOST_FOUND 1)
+endif()
+if(BOOST_FOUND)
+ set(BOOST_INCLUDES ${BOOST_INCLUDE_DIR})
+endif()
diff --git a/src/dmclock/cmake/modules/Findgtest.cmake b/src/dmclock/cmake/modules/Findgtest.cmake
new file mode 100644
index 00000000000..bfe0980e4ed
--- /dev/null
+++ b/src/dmclock/cmake/modules/Findgtest.cmake
@@ -0,0 +1,48 @@
+# - Find gtest
+#
+# GTEST_INCLUDE_DIRS - where to find gtest/gtest.h, etc.
+# GTEST_LIBRARIES - List of libraries when using gtest.
+# GTEST_FOUND - True if gtest found.
+#
+# GMOCK_INCLUDE_DIRS - where to find gmock/gmock.h, etc.
+# GMOCK_LIBRARIES - List of libraries when using gmock.
+# GMOCK_FOUND - True if gmock found.
+
+
+## GTEST
+
+find_path(GTEST_INCLUDE_DIRS NAMES gtest/gtest.h
+ PATHS /usr/include /usr/local/include)
+
+find_library(GTEST_LIBRARY gtest
+ PATHS /usr/local/lib /usr/lib64)
+
+find_library(GTEST_MAIN_LIBRARY gtest_main
+ PATHS /usr/local/lib /usr/lib64)
+
+include(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(gtest
+ REQUIRED_VARS GTEST_LIBRARY GTEST_MAIN_LIBRARY GTEST_INCLUDE_DIRS)
+
+if(gtest_FOUND)
+ set(GTEST_FOUND 1)
+endif()
+
+## GMOCK
+
+find_path(GMOCK_INCLUDE_DIRS NAMES gmock/gmock.h
+ PATHS /usr/include /usr/local/include)
+
+find_library(GMOCK_LIBRARY gmock
+ PATHS /usr/local/lib /usr/lib64)
+
+find_library(GMOCK_MAIN_LIBRARY gmock_main
+ PATHS /usr/local/lib /usr/lib64)
+
+include(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(gmock
+ REQUIRED_VARS GMOCK_LIBRARY GMOCK_MAIN_LIBRARY GMOCK_INCLUDE_DIRS)
+
+if(gmock_FOUND)
+ set(GMOCK_FOUND 1)
+endif()
diff --git a/src/dmclock/dmclock-config.cmake.in b/src/dmclock/dmclock-config.cmake.in
new file mode 100644
index 00000000000..01636532c1d
--- /dev/null
+++ b/src/dmclock/dmclock-config.cmake.in
@@ -0,0 +1,17 @@
+# - Config file for the dmclock package
+# It defines the following variables:
+# DMCLOCK_INCLUDE_DIRS - include directories for dmclock
+# DMCLOCK_LIBRARIES - libraries to link against
+
+# Compute paths
+get_filename_component(DMCLOCK_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+set(DMCLOCK_INCLUDE_DIRS "${DMCLOCK_CMAKE_DIR}/src")
+# set(DMCLOCK_INCLUDE_DIRS "@CONF_INCLUDE_DIRS@")
+
+# Our library dependencies (contains definitions for IMPORTED targets)
+if(NOT TARGET dmclock AND NOT dmclock_BINARY_DIR)
+ include("${DMCLOCK_CMAKE_DIR}/dmclock-targets.cmake")
+endif()
+
+# These are IMPORTED targets created by dmclock-targets.cmake
+set(DMCLOCK_LIBRARIES dmclock)
diff --git a/src/dmclock/dmclock-targets.cmake b/src/dmclock/dmclock-targets.cmake
new file mode 100644
index 00000000000..2c84f34a142
--- /dev/null
+++ b/src/dmclock/dmclock-targets.cmake
@@ -0,0 +1 @@
+export(PACKAGE dmclock)
diff --git a/src/dmclock/sim/CMakeLists.txt b/src/dmclock/sim/CMakeLists.txt
new file mode 100644
index 00000000000..febd4f0ab6f
--- /dev/null
+++ b/src/dmclock/sim/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(src)
diff --git a/src/dmclock/sim/dmc_sim_100th.conf b/src/dmclock/sim/dmc_sim_100th.conf
new file mode 100644
index 00000000000..17d0043548e
--- /dev/null
+++ b/src/dmclock/sim/dmc_sim_100th.conf
@@ -0,0 +1,32 @@
+[global]
+server_groups = 1
+client_groups = 2
+server_random_selection = true
+server_soft_limit = true
+
+[client.0]
+client_count = 99
+client_wait = 0
+client_total_ops = 1000
+client_server_select_range = 10
+client_iops_goal = 50
+client_outstanding_ops = 100
+client_reservation = 20.0
+client_limit = 60.0
+client_weight = 1.0
+
+[client.1]
+client_count = 1
+client_wait = 10
+client_total_ops = 1000
+client_server_select_range = 10
+client_iops_goal = 50
+client_outstanding_ops = 100
+client_reservation = 20.0
+client_limit = 60.0
+client_weight = 1.0
+
+[server.0]
+server_count = 100
+server_iops = 40
+server_threads = 1
diff --git a/src/dmclock/sim/dmc_sim_example.conf b/src/dmclock/sim/dmc_sim_example.conf
new file mode 100644
index 00000000000..989f2f08281
--- /dev/null
+++ b/src/dmclock/sim/dmc_sim_example.conf
@@ -0,0 +1,43 @@
+[global]
+server_groups = 1
+client_groups = 3
+server_random_selection = false
+server_soft_limit = false
+
+[client.0]
+client_count = 1
+client_wait = 0
+client_total_ops = 2000
+client_server_select_range = 1
+client_iops_goal = 200
+client_outstanding_ops = 32
+client_reservation = 0.0
+client_limit = 0.0
+client_weight = 1.0
+
+[client.1]
+client_count = 1
+client_wait = 5
+client_total_ops = 2000
+client_server_select_range = 1
+client_iops_goal = 200
+client_outstanding_ops = 32
+client_reservation = 0.0
+client_limit = 40.0
+client_weight = 1.0
+
+[client.2]
+client_count = 1
+client_wait = 10
+client_total_ops = 2000
+client_server_select_range = 1
+client_iops_goal = 200
+client_outstanding_ops = 32
+client_reservation = 0.0
+client_limit = 50.0
+client_weight = 2.0
+
+[server.0]
+server_count = 1
+server_iops = 160
+server_threads = 1
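Here demand exceeds supply: three clients each aim for 200 IOPS against a single 160-IOPS server, so the scheduler must apportion. With all reservations at zero, dmClock falls back to weights; the sketch below computes only the raw weight proportions, ignoring the per-client limits (40.0 and 50.0) that would cap clients 1 and 2 in practice, so treat it as a rough reading rather than a prediction of simulator output.

    #include <iostream>

    int main() {
      const double server_iops = 160.0;
      const double weights[] = {1.0, 1.0, 2.0};  // client_weight values above
      const double weight_sum = weights[0] + weights[1] + weights[2];
      for (int i = 0; i < 3; ++i) {
        // client.2's weight of 2.0 earns roughly twice the share: ~80 IOPS
        // versus ~40 IOPS each for clients 0 and 1.
        std::cout << "client." << i << " weight share ~ "
                  << server_iops * weights[i] / weight_sum << " IOPS\n";
      }
      return 0;
    }
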
diff --git a/src/dmclock/sim/src/CMakeLists.txt b/src/dmclock/sim/src/CMakeLists.txt
new file mode 100644
index 00000000000..426827b03f2
--- /dev/null
+++ b/src/dmclock/sim/src/CMakeLists.txt
@@ -0,0 +1,42 @@
+include_directories(ssched) # ssched code
+include_directories(../../src) # dmclock code
+include_directories(../../support/src)
+include_directories(${BOOST_INCLUDE_DIR})
+
+set(local_flags "-Wall -pthread ${CMAKE_CXX_SIM_FLAGS}")
+
+set(ssched_sim_srcs test_ssched.cc test_ssched_main.cc)
+set(dmc_sim_srcs test_dmclock.cc test_dmclock_main.cc)
+set(config_srcs config.cc str_list.cc ConfUtils.cc)
+
+set_source_files_properties(${ssched_sim_srcs} ${dmc_sim_srcs} ${dmc_srcs} ${config_srcs}
+ PROPERTIES
+ COMPILE_FLAGS "${local_flags}"
+ )
+
+if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+ set(warnings_off " -Wno-unused-variable -Wno-unused-function")
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+ set(warnings_off " -Wno-unused-but-set-variable -Wno-unused-function")
+endif()
+
+# append warning flags to certain source files
+set_property(
+ SOURCE ${ssched_sim_srcs} ${dmc_sim_srcs} ${config_srcs}
+ APPEND_STRING
+ PROPERTY COMPILE_FLAGS "${warnings_off}"
+ )
+
+add_executable(ssched_sim EXCLUDE_FROM_ALL ${ssched_sim_srcs})
+add_executable(dmc_sim EXCLUDE_FROM_ALL ${dmc_sim_srcs} ${config_srcs})
+
+set_target_properties(ssched_sim dmc_sim
+ PROPERTIES
+ RUNTIME_OUTPUT_DIRECTORY ..)
+
+add_dependencies(dmc_sim dmclock)
+
+target_link_libraries(ssched_sim LINK_PRIVATE pthread)
+target_link_libraries(dmc_sim LINK_PRIVATE pthread $<TARGET_FILE:dmclock>)
+
+add_custom_target(dmclock-sims DEPENDS ssched_sim dmc_sim)
diff --git a/src/dmclock/sim/src/ConfUtils.cc b/src/dmclock/sim/src/ConfUtils.cc
new file mode 100644
index 00000000000..74ddb06ee29
--- /dev/null
+++ b/src/dmclock/sim/src/ConfUtils.cc
@@ -0,0 +1,574 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <algorithm>
+#include <errno.h>
+#include <list>
+#include <map>
+#include <sstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <iostream>
+
+#include <assert.h>
+#include "ConfUtils.h"
+
+using std::cerr;
+using std::ostringstream;
+using std::pair;
+using std::string;
+
+#define MAX_CONFIG_FILE_SZ 0x40000000
+
+////////////////////////////// ConfLine //////////////////////////////
+ConfLine::
+ConfLine(const std::string &key_, const std::string val_,
+ const std::string newsection_, const std::string comment_, int line_no_)
+ : key(key_), val(val_), newsection(newsection_)
+{
+ // If you want to implement writable ConfFile support, you'll need to save
+ // the comment and line_no arguments here.
+}
+
+bool ConfLine::
+operator<(const ConfLine &rhs) const
+{
+ // We only compare keys.
+ // If you have more than one line with the same key in a given section, the
+ // last one wins.
+ if (key < rhs.key)
+ return true;
+ else
+ return false;
+}
+
+std::ostream &operator<<(std::ostream& oss, const ConfLine &l)
+{
+ oss << "ConfLine(key = '" << l.key << "', val='"
+ << l.val << "', newsection='" << l.newsection << "')";
+ return oss;
+}
+///////////////////////// ConfFile //////////////////////////
+ConfFile::
+ConfFile()
+{
+}
+
+ConfFile::
+~ConfFile()
+{
+}
+
+void ConfFile::
+clear()
+{
+ sections.clear();
+}
+
+/* We load the whole file into memory and then parse it. Although this is not
+ * the optimal approach, it does mean that most of this code can be shared with
+ * the bufferlist loading function. Since bufferlists are always in-memory, the
+ * load_from_buffer interface works well for them.
+ * In general, configuration files should be a few kilobytes at maximum, so
+ * loading the whole configuration into memory shouldn't be a problem.
+ */
+int ConfFile::
+parse_file(const std::string &fname, std::deque<std::string> *errors,
+ std::ostream *warnings)
+{
+ clear();
+
+ int ret = 0;
+ size_t sz;
+ char *buf = NULL;
+ char buf2[128];
+ FILE *fp = fopen(fname.c_str(), "r");
+ if (!fp) {
+ ret = -errno;
+ return ret;
+ }
+
+ struct stat st_buf;
+ if (fstat(fileno(fp), &st_buf)) {
+ ret = -errno;
+ ostringstream oss;
+ oss << "read_conf: failed to fstat '" << fname << "': " << strerror_r(ret, buf2, sizeof(buf2));
+ errors->push_back(oss.str());
+ goto done;
+ }
+
+ if (st_buf.st_size > MAX_CONFIG_FILE_SZ) {
+ ostringstream oss;
+ oss << "read_conf: config file '" << fname << "' is " << st_buf.st_size
+ << " bytes, but the maximum is " << MAX_CONFIG_FILE_SZ;
+ errors->push_back(oss.str());
+ ret = -EINVAL;
+ goto done;
+ }
+
+ sz = (size_t)st_buf.st_size;
+ buf = (char*)malloc(sz);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ if (fread(buf, 1, sz, fp) != sz) {
+ if (ferror(fp)) {
+ ret = -errno;
+ ostringstream oss;
+ oss << "read_conf: fread error while reading '" << fname << "': "
+	  << strerror_r(-ret, buf2, sizeof(buf2));
+ errors->push_back(oss.str());
+ goto done;
+ }
+ else {
+ ostringstream oss;
+ oss << "read_conf: unexpected EOF while reading '" << fname << "': "
+ << "possible concurrent modification?";
+ errors->push_back(oss.str());
+ ret = -EIO;
+ goto done;
+ }
+ }
+
+ load_from_buffer(buf, sz, errors, warnings);
+ ret = 0;
+
+done:
+ free(buf);
+ fclose(fp);
+ return ret;
+}
+
+int ConfFile::
+read(const std::string &section, const std::string &key, std::string &val) const
+{
+ string k(normalize_key_name(key));
+
+ const_section_iter_t s = sections.find(section);
+ if (s == sections.end())
+ return -ENOENT;
+ ConfLine exemplar(k, "", "", "", 0);
+ ConfSection::const_line_iter_t l = s->second.lines.find(exemplar);
+ if (l == s->second.lines.end())
+ return -ENOENT;
+ val = l->val;
+ return 0;
+}
+
+ConfFile::const_section_iter_t ConfFile::
+sections_begin() const
+{
+ return sections.begin();
+}
+
+ConfFile::const_section_iter_t ConfFile::
+sections_end() const
+{
+ return sections.end();
+}
+
+void ConfFile::
+trim_whitespace(std::string &str, bool strip_internal)
+{
+ // strip preceding
+ const char *in = str.c_str();
+ while (true) {
+ char c = *in;
+ if ((!c) || (!isspace(c)))
+ break;
+ ++in;
+ }
+ char output[strlen(in) + 1];
+ strcpy(output, in);
+
+ // strip trailing
+ char *o = output + strlen(output);
+ while (true) {
+ if (o == output)
+ break;
+ --o;
+ if (!isspace(*o)) {
+ ++o;
+ *o = '\0';
+ break;
+ }
+ }
+
+ if (!strip_internal) {
+ str.assign(output);
+ return;
+ }
+
+ // strip internal
+ char output2[strlen(output) + 1];
+ char *out2 = output2;
+ bool prev_was_space = false;
+ for (char *u = output; *u; ++u) {
+ char c = *u;
+ if (isspace(c)) {
+ if (!prev_was_space)
+ *out2++ = c;
+ prev_was_space = true;
+ }
+ else {
+ *out2++ = c;
+ prev_was_space = false;
+ }
+ }
+ *out2++ = '\0';
+ str.assign(output2);
+}
+
+/* Normalize a key name.
+ *
+ * Normalized key names have no leading or trailing whitespace, and all
+ * whitespace is stored as underscores. The main reason for selecting this
+ * normal form is so that in common/config.cc, we can use a macro to stringify
+ * the field names of md_config_t and get a key in normal form.
+ */
+std::string ConfFile::
+normalize_key_name(const std::string &key)
+{
+ string k(key);
+ ConfFile::trim_whitespace(k, true);
+ std::replace(k.begin(), k.end(), ' ', '_');
+ return k;
+}
+
+std::ostream &operator<<(std::ostream &oss, const ConfFile &cf)
+{
+ for (ConfFile::const_section_iter_t s = cf.sections_begin();
+ s != cf.sections_end(); ++s) {
+ oss << "[" << s->first << "]\n";
+ for (ConfSection::const_line_iter_t l = s->second.lines.begin();
+ l != s->second.lines.end(); ++l) {
+ if (!l->key.empty()) {
+ oss << "\t" << l->key << " = \"" << l->val << "\"\n";
+ }
+ }
+ }
+ return oss;
+}
+
+void ConfFile::
+load_from_buffer(const char *buf, size_t sz, std::deque<std::string> *errors,
+ std::ostream *warnings)
+{
+ errors->clear();
+
+ section_iter_t::value_type vt("global", ConfSection());
+ pair < section_iter_t, bool > vr(sections.insert(vt));
+ assert(vr.second);
+ section_iter_t cur_section = vr.first;
+ std::string acc;
+
+ const char *b = buf;
+ int line_no = 0;
+  size_t line_len = -1; // wraps to SIZE_MAX so the first iteration advances b by 0
+ size_t rem = sz;
+ while (1) {
+ b += line_len + 1;
+ rem -= line_len + 1;
+ if (rem == 0)
+ break;
+ line_no++;
+
+ // look for the next newline
+ const char *end = (const char*)memchr(b, '\n', rem);
+ if (!end) {
+ ostringstream oss;
+ oss << "read_conf: ignoring line " << line_no << " because it doesn't "
+ << "end with a newline! Please end the config file with a newline.";
+ errors->push_back(oss.str());
+ break;
+ }
+
+ // find length of line, and search for NULLs
+ line_len = 0;
+ bool found_null = false;
+ for (const char *tmp = b; tmp != end; ++tmp) {
+ line_len++;
+ if (*tmp == '\0') {
+ found_null = true;
+ }
+ }
+
+ if (found_null) {
+ ostringstream oss;
+ oss << "read_conf: ignoring line " << line_no << " because it has "
+ << "an embedded null.";
+ errors->push_back(oss.str());
+ acc.clear();
+ continue;
+ }
+
+ if ((line_len >= 1) && (b[line_len-1] == '\\')) {
+ // A backslash at the end of a line serves as a line continuation marker.
+ // Combine the next line with this one.
+ // Remove the backslash itself from the text.
+ acc.append(b, line_len - 1);
+ continue;
+ }
+
+ acc.append(b, line_len);
+
+ //cerr << "acc = '" << acc << "'" << std::endl;
+ ConfLine *cline = process_line(line_no, acc.c_str(), errors);
+ acc.clear();
+ if (!cline)
+ continue;
+ const std::string &csection(cline->newsection);
+ if (!csection.empty()) {
+ std::map <std::string, ConfSection>::value_type nt(csection, ConfSection());
+ pair < section_iter_t, bool > nr(sections.insert(nt));
+ cur_section = nr.first;
+ }
+ else {
+ if (cur_section->second.lines.count(*cline)) {
+ // replace an existing key/line in this section, so that
+ // [mysection]
+ // foo = 1
+ // foo = 2
+ // will result in foo = 2.
+ cur_section->second.lines.erase(*cline);
+ if (cline->key.length() && warnings)
+ *warnings << "warning: line " << line_no << ": '" << cline->key << "' in section '"
+ << cur_section->first << "' redefined " << std::endl;
+ }
+ // add line to current section
+ //std::cerr << "cur_section = " << cur_section->first << ", " << *cline << std::endl;
+ cur_section->second.lines.insert(*cline);
+ }
+ delete cline;
+ }
+
+ if (!acc.empty()) {
+ ostringstream oss;
+ oss << "read_conf: don't end with lines that end in backslashes!";
+ errors->push_back(oss.str());
+ }
+}
+
+/*
+ * A simple state-machine based parser.
+ * This probably could/should be rewritten with something like boost::spirit
+ * or yacc if the grammar ever gets more complex.
+ */
+ConfLine* ConfFile::
+process_line(int line_no, const char *line, std::deque<std::string> *errors)
+{
+ enum acceptor_state_t {
+ ACCEPT_INIT,
+ ACCEPT_SECTION_NAME,
+ ACCEPT_KEY,
+ ACCEPT_VAL_START,
+ ACCEPT_UNQUOTED_VAL,
+ ACCEPT_QUOTED_VAL,
+ ACCEPT_COMMENT_START,
+ ACCEPT_COMMENT_TEXT,
+ };
+ const char *l = line;
+ acceptor_state_t state = ACCEPT_INIT;
+ string key, val, newsection, comment;
+ bool escaping = false;
+ while (true) {
+ char c = *l++;
+ switch (state) {
+ case ACCEPT_INIT:
+ if (c == '\0')
+ return NULL; // blank line. Not an error, but not interesting either.
+ else if (c == '[')
+ state = ACCEPT_SECTION_NAME;
+ else if ((c == '#') || (c == ';'))
+ state = ACCEPT_COMMENT_TEXT;
+ else if (c == ']') {
+ ostringstream oss;
+ oss << "unexpected right bracket at char " << (l - line)
+ << ", line " << line_no;
+ errors->push_back(oss.str());
+ return NULL;
+ }
+ else if (isspace(c)) {
+ // ignore whitespace here
+ }
+ else {
+ // try to accept this character as a key
+ state = ACCEPT_KEY;
+ --l;
+ }
+ break;
+ case ACCEPT_SECTION_NAME:
+ if (c == '\0') {
+ ostringstream oss;
+ oss << "error parsing new section name: expected right bracket "
+ << "at char " << (l - line) << ", line " << line_no;
+ errors->push_back(oss.str());
+ return NULL;
+ }
+ else if ((c == ']') && (!escaping)) {
+ trim_whitespace(newsection, true);
+ if (newsection.empty()) {
+ ostringstream oss;
+ oss << "error parsing new section name: no section name found? "
+ << "at char " << (l - line) << ", line " << line_no;
+ errors->push_back(oss.str());
+ return NULL;
+ }
+ state = ACCEPT_COMMENT_START;
+ }
+ else if (((c == '#') || (c == ';')) && (!escaping)) {
+ ostringstream oss;
+ oss << "unexpected comment marker while parsing new section name, at "
+ << "char " << (l - line) << ", line " << line_no;
+ errors->push_back(oss.str());
+ return NULL;
+ }
+ else if ((c == '\\') && (!escaping)) {
+ escaping = true;
+ }
+ else {
+ escaping = false;
+ newsection += c;
+ }
+ break;
+ case ACCEPT_KEY:
+ if ((((c == '#') || (c == ';')) && (!escaping)) || (c == '\0')) {
+ ostringstream oss;
+ if (c == '\0') {
+ oss << "end of key=val line " << line_no
+ << " reached, no \"=val\" found...missing =?";
+ } else {
+ oss << "unexpected character while parsing putative key value, "
+ << "at char " << (l - line) << ", line " << line_no;
+ }
+ errors->push_back(oss.str());
+ return NULL;
+ }
+ else if ((c == '=') && (!escaping)) {
+ key = normalize_key_name(key);
+ if (key.empty()) {
+ ostringstream oss;
+ oss << "error parsing key name: no key name found? "
+ << "at char " << (l - line) << ", line " << line_no;
+ errors->push_back(oss.str());
+ return NULL;
+ }
+ state = ACCEPT_VAL_START;
+ }
+ else if ((c == '\\') && (!escaping)) {
+ escaping = true;
+ }
+ else {
+ escaping = false;
+ key += c;
+ }
+ break;
+ case ACCEPT_VAL_START:
+ if (c == '\0')
+ return new ConfLine(key, val, newsection, comment, line_no);
+ else if ((c == '#') || (c == ';'))
+ state = ACCEPT_COMMENT_TEXT;
+ else if (c == '"')
+ state = ACCEPT_QUOTED_VAL;
+ else if (isspace(c)) {
+ // ignore whitespace
+ }
+ else {
+ // try to accept character as a val
+ state = ACCEPT_UNQUOTED_VAL;
+ --l;
+ }
+ break;
+ case ACCEPT_UNQUOTED_VAL:
+ if (c == '\0') {
+ if (escaping) {
+ ostringstream oss;
+ oss << "error parsing value name: unterminated escape sequence "
+ << "at char " << (l - line) << ", line " << line_no;
+ errors->push_back(oss.str());
+ return NULL;
+ }
+ trim_whitespace(val, false);
+ return new ConfLine(key, val, newsection, comment, line_no);
+ }
+ else if (((c == '#') || (c == ';')) && (!escaping)) {
+ trim_whitespace(val, false);
+ state = ACCEPT_COMMENT_TEXT;
+ }
+ else if ((c == '\\') && (!escaping)) {
+ escaping = true;
+ }
+ else {
+ escaping = false;
+ val += c;
+ }
+ break;
+ case ACCEPT_QUOTED_VAL:
+ if (c == '\0') {
+ ostringstream oss;
+ oss << "found opening quote for value, but not the closing quote. "
+ << "line " << line_no;
+ errors->push_back(oss.str());
+ return NULL;
+ }
+ else if ((c == '"') && (!escaping)) {
+ state = ACCEPT_COMMENT_START;
+ }
+ else if ((c == '\\') && (!escaping)) {
+ escaping = true;
+ }
+ else {
+ escaping = false;
+ // Add anything, including whitespace.
+ val += c;
+ }
+ break;
+ case ACCEPT_COMMENT_START:
+ if (c == '\0') {
+ return new ConfLine(key, val, newsection, comment, line_no);
+ }
+ else if ((c == '#') || (c == ';')) {
+ state = ACCEPT_COMMENT_TEXT;
+ }
+ else if (isspace(c)) {
+ // ignore whitespace
+ }
+ else {
+ ostringstream oss;
+ oss << "unexpected character at char " << (l - line) << " of line "
+ << line_no;
+ errors->push_back(oss.str());
+ return NULL;
+ }
+ break;
+ case ACCEPT_COMMENT_TEXT:
+ if (c == '\0')
+ return new ConfLine(key, val, newsection, comment, line_no);
+ else
+ comment += c;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ assert(c != '\0'); // We better not go past the end of the input string.
+ }
+}
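The state machine above accepts inline comments introduced by # or ;, double-quoted values that preserve whitespace, and backslash line continuation (which splices lines with no separator). The standalone sketch below exercises those cases by writing a throwaway file and reading it back; the path and keys are illustrative.

    #include <cstdio>
    #include <deque>
    #include <fstream>
    #include <iostream>
    #include <sstream>
    #include <string>

    #include "ConfUtils.h"

    int main() {
      const char* path = "confutils_demo.conf";
      {
        std::ofstream f(path);
        f << "[demo]\n"
             "plain = value  ; trailing comment\n"
             "quoted = \" spaces kept \"  # also a comment\n"
             "joined = first\\\n"
             "second\n";
      }

      ConfFile cf;
      std::deque<std::string> errors;
      std::ostringstream warnings;
      if (cf.parse_file(path, &errors, &warnings) == 0) {
        std::string v;
        cf.read("demo", "plain", v);   // "value" -- comment stripped
        std::cout << "plain=[" << v << "]\n";
        cf.read("demo", "quoted", v);  // " spaces kept " -- quotes removed
        std::cout << "quoted=[" << v << "]\n";
        cf.read("demo", "joined", v);  // "firstsecond" -- lines spliced
        std::cout << "joined=[" << v << "]\n";
      }
      std::remove(path);
      return 0;
    }
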
diff --git a/src/dmclock/sim/src/ConfUtils.h b/src/dmclock/sim/src/ConfUtils.h
new file mode 100644
index 00000000000..6c9c2c6c9c8
--- /dev/null
+++ b/src/dmclock/sim/src/ConfUtils.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CONFUTILS_H
+#define CEPH_CONFUTILS_H
+
+#include <deque>
+#include <map>
+#include <set>
+#include <string>
+
+/*
+ * Ceph configuration file support.
+ *
+ * This class loads an INI-style configuration from a file or bufferlist, and
+ * holds it in memory. In general, an INI configuration file is composed of
+ * sections, which contain key/value pairs. You can put comments at the end of
+ * lines by using either a hash mark (#) or a semicolon (;).
+ *
+ * You can get information out of ConfFile by calling get_key or by examining
+ * individual sections.
+ *
+ * This class could be extended to support modifying configuration files and
+ * writing them back out without too much difficulty. Currently, this is not
+ * implemented, and the file is read-only.
+ */
+class ConfLine {
+public:
+ ConfLine(const std::string &key_, const std::string val_,
+ const std::string newsection_, const std::string comment_, int line_no_);
+ bool operator<(const ConfLine &rhs) const;
+ friend std::ostream &operator<<(std::ostream& oss, const ConfLine &l);
+
+ std::string key, val, newsection;
+};
+
+class ConfSection {
+public:
+ typedef std::set <ConfLine>::const_iterator const_line_iter_t;
+
+ std::set <ConfLine> lines;
+};
+
+class ConfFile {
+public:
+ typedef std::map <std::string, ConfSection>::iterator section_iter_t;
+ typedef std::map <std::string, ConfSection>::const_iterator const_section_iter_t;
+
+ ConfFile();
+ ~ConfFile();
+ void clear();
+ int parse_file(const std::string &fname, std::deque<std::string> *errors, std::ostream *warnings);
+ int read(const std::string &section, const std::string &key,
+ std::string &val) const;
+
+ const_section_iter_t sections_begin() const;
+ const_section_iter_t sections_end() const;
+
+ static void trim_whitespace(std::string &str, bool strip_internal);
+ static std::string normalize_key_name(const std::string &key);
+ friend std::ostream &operator<<(std::ostream &oss, const ConfFile &cf);
+
+private:
+ void load_from_buffer(const char *buf, size_t sz,
+ std::deque<std::string> *errors, std::ostream *warnings);
+ static ConfLine* process_line(int line_no, const char *line,
+ std::deque<std::string> *errors);
+
+ std::map <std::string, ConfSection> sections;
+};
+
+#endif
diff --git a/src/dmclock/sim/src/config.cc b/src/dmclock/sim/src/config.cc
new file mode 100644
index 00000000000..a6702897cd6
--- /dev/null
+++ b/src/dmclock/sim/src/config.cc
@@ -0,0 +1,171 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+
+#include <unistd.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include <iostream>
+#include <vector>
+#include <list>
+
+#include "config.h"
+#include "str_list.h"
+
+
+static void dashes_to_underscores(const char *input, char *output) {
+ char c = 0;
+ char *o = output;
+ const char *i = input;
+ // first two characters are copied as-is
+ *o = *i++;
+ if (*o++ == '\0')
+ return;
+ *o = *i++;
+ if (*o++ == '\0')
+ return;
+ for (; ((c = *i)); ++i) {
+ if (c == '=') {
+ strcpy(o, i);
+ return;
+ }
+ if (c == '-')
+ *o++ = '_';
+ else
+ *o++ = c;
+ }
+ *o++ = '\0';
+}
+
+static int va_ceph_argparse_witharg(std::vector<const char*> &args,
+ std::vector<const char*>::iterator &i, std::string *ret,
+ std::ostream &oss, va_list ap) {
+ const char *first = *i;
+ char tmp[strlen(first)+1];
+ dashes_to_underscores(first, tmp);
+ first = tmp;
+
+ // does this argument match any of the possibilities?
+ while (1) {
+ const char *a = va_arg(ap, char*);
+ if (a == NULL)
+ return 0;
+ int strlen_a = strlen(a);
+ char a2[strlen_a+1];
+ dashes_to_underscores(a, a2);
+ if (strncmp(a2, first, strlen(a2)) == 0) {
+ if (first[strlen_a] == '=') {
+ *ret = first + strlen_a + 1;
+ i = args.erase(i);
+ return 1;
+ }
+ else if (first[strlen_a] == '\0') {
+ // find second part (or not)
+ if (i+1 == args.end()) {
+ oss << "Option " << *i << " requires an argument." << std::endl;
+ i = args.erase(i);
+ return -EINVAL;
+ }
+ i = args.erase(i);
+ *ret = *i;
+ i = args.erase(i);
+ return 1;
+ }
+ }
+ }
+}
+
+bool crimson::qos_simulation::ceph_argparse_witharg(std::vector<const char*> &args,
+ std::vector<const char*>::iterator &i, std::string *ret, ...) {
+ int r;
+ va_list ap;
+ va_start(ap, ret);
+ r = va_ceph_argparse_witharg(args, i, ret, std::cerr, ap);
+ va_end(ap);
+ if (r < 0)
+ _exit(1);
+ return r != 0;
+}
+
+void crimson::qos_simulation::ceph_argparse_early_args(std::vector<const char*>& args, std::string *conf_file_list) {
+ std::string val;
+
+ std::vector<const char *> orig_args = args;
+
+ for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+ if (ceph_argparse_witharg(args, i, &val, "--conf", "-c", (char*)NULL)) {
+ *conf_file_list = val;
+ }
+ else {
+ // ignore
+ ++i;
+ }
+ }
+ return;
+}
+
+static bool stobool(const std::string & v) {
+ return !v.empty () &&
+ (strcasecmp (v.c_str (), "true") == 0 ||
+ atoi (v.c_str ()) != 0);
+}
+
+int crimson::qos_simulation::parse_config_file(const std::string &fname, sim_config_t &g_conf) {
+ ConfFile cf;
+ std::deque<std::string> err;
+ std::ostringstream warn;
+ int ret = cf.parse_file(fname.c_str(), &err, &warn);
+ if (ret) {
+ // error
+ return ret;
+ }
+
+ std::string val;
+ if (!cf.read("global", "server_groups", val))
+ g_conf.server_groups = std::stoul(val);
+ if (!cf.read("global", "client_groups", val))
+ g_conf.client_groups = std::stoul(val);
+ if (!cf.read("global", "server_random_selection", val))
+ g_conf.server_random_selection = stobool(val);
+ if (!cf.read("global", "server_soft_limit", val))
+ g_conf.server_soft_limit = stobool(val);
+
+ for (uint i = 0; i < g_conf.server_groups; i++) {
+ srv_group_t st;
+ std::string section = "server." + std::to_string(i);
+ if (!cf.read(section, "server_count", val))
+ st.server_count = std::stoul(val);
+ if (!cf.read(section, "server_iops", val))
+ st.server_iops = std::stoul(val);
+ if (!cf.read(section, "server_threads", val))
+ st.server_threads = std::stoul(val);
+ g_conf.srv_group.push_back(st);
+ }
+
+ for (uint i = 0; i < g_conf.client_groups; i++) {
+ cli_group_t ct;
+ std::string section = "client." + std::to_string(i);
+ if (!cf.read(section, "client_count", val))
+ ct.client_count = std::stoul(val);
+ if (!cf.read(section, "client_wait", val))
+ ct.client_wait = std::chrono::seconds(std::stoul(val));
+ if (!cf.read(section, "client_total_ops", val))
+ ct.client_total_ops = std::stoul(val);
+ if (!cf.read(section, "client_server_select_range", val))
+ ct.client_server_select_range = std::stoul(val);
+ if (!cf.read(section, "client_iops_goal", val))
+ ct.client_iops_goal = std::stoul(val);
+ if (!cf.read(section, "client_outstanding_ops", val))
+ ct.client_outstanding_ops = std::stoul(val);
+ if (!cf.read(section, "client_reservation", val))
+ ct.client_reservation = std::stod(val);
+ if (!cf.read(section, "client_limit", val))
+ ct.client_limit = std::stod(val);
+ if (!cf.read(section, "client_weight", val))
+ ct.client_weight = std::stod(val);
+ g_conf.cli_group.push_back(ct);
+ }
+
+ return 0;
+}
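A minimal driver for these helpers might look like the sketch below: ceph_argparse_early_args extracts a -c/--conf path, and parse_config_file overwrites only the fields present in the file, leaving the defaults from cli_group_t and srv_group_t in force for anything omitted. The program is illustrative, not part of the simulator.

    #include <iostream>
    #include <string>
    #include <vector>

    #include "config.h"

    using namespace crimson::qos_simulation;

    int main(int argc, const char* argv[]) {
      std::vector<const char*> args(argv + 1, argv + argc);

      // Pull out "-c <file>" or "--conf=<file>"; other arguments are ignored.
      std::string conf_path;
      ceph_argparse_early_args(args, &conf_path);

      sim_config_t conf;  // defaults apply where the file is silent
      if (!conf_path.empty() && parse_config_file(conf_path, conf) != 0) {
        std::cerr << "failed to parse " << conf_path << std::endl;
        return 1;
      }
      std::cout << conf << std::endl;
      return 0;
    }
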
diff --git a/src/dmclock/sim/src/config.h b/src/dmclock/sim/src/config.h
new file mode 100644
index 00000000000..010f33a743e
--- /dev/null
+++ b/src/dmclock/sim/src/config.h
@@ -0,0 +1,138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+
+#pragma once
+
+
+#include <string.h>
+
+#include <chrono>
+#include <vector>
+#include <sstream>
+#include <iomanip>
+
+#include "ConfUtils.h"
+
+
+namespace crimson {
+ namespace qos_simulation {
+
+ struct cli_group_t {
+ uint client_count;
+ std::chrono::seconds client_wait;
+ uint client_total_ops;
+ uint client_server_select_range;
+ uint client_iops_goal;
+ uint client_outstanding_ops;
+ double client_reservation;
+ double client_limit;
+ double client_weight;
+
+ cli_group_t(uint _client_count = 100,
+ uint _client_wait = 0,
+ uint _client_total_ops = 1000,
+ uint _client_server_select_range = 10,
+ uint _client_iops_goal = 50,
+ uint _client_outstanding_ops = 100,
+ double _client_reservation = 20.0,
+ double _client_limit = 60.0,
+ double _client_weight = 1.0) :
+ client_count(_client_count),
+ client_wait(std::chrono::seconds(_client_wait)),
+ client_total_ops(_client_total_ops),
+ client_server_select_range(_client_server_select_range),
+ client_iops_goal(_client_iops_goal),
+ client_outstanding_ops(_client_outstanding_ops),
+ client_reservation(_client_reservation),
+ client_limit(_client_limit),
+ client_weight(_client_weight)
+ {
+ // empty
+ }
+
+ friend std::ostream& operator<<(std::ostream& out,
+ const cli_group_t& cli_group) {
+ out <<
+ "client_count = " << cli_group.client_count << "\n" <<
+ "client_wait = " << cli_group.client_wait.count() << "\n" <<
+ "client_total_ops = " << cli_group.client_total_ops << "\n" <<
+ "client_server_select_range = " << cli_group.client_server_select_range << "\n" <<
+ "client_iops_goal = " << cli_group.client_iops_goal << "\n" <<
+ "client_outstanding_ops = " << cli_group.client_outstanding_ops << "\n" <<
+ std::fixed << std::setprecision(1) <<
+ "client_reservation = " << cli_group.client_reservation << "\n" <<
+ "client_limit = " << cli_group.client_limit << "\n" <<
+ "client_weight = " << cli_group.client_weight;
+ return out;
+ }
+ }; // class cli_group_t
+
+
+ struct srv_group_t {
+ uint server_count;
+ uint server_iops;
+ uint server_threads;
+
+ srv_group_t(uint _server_count = 100,
+ uint _server_iops = 40,
+ uint _server_threads = 1) :
+ server_count(_server_count),
+ server_iops(_server_iops),
+ server_threads(_server_threads)
+ {
+ // empty
+ }
+
+ friend std::ostream& operator<<(std::ostream& out,
+ const srv_group_t& srv_group) {
+ out <<
+ "server_count = " << srv_group.server_count << "\n" <<
+ "server_iops = " << srv_group.server_iops << "\n" <<
+ "server_threads = " << srv_group.server_threads;
+ return out;
+ }
+ }; // class srv_group_t
+
+
+ struct sim_config_t {
+ uint server_groups;
+ uint client_groups;
+ bool server_random_selection;
+ bool server_soft_limit;
+
+ std::vector<cli_group_t> cli_group;
+ std::vector<srv_group_t> srv_group;
+
+ sim_config_t(uint _server_groups = 1,
+ uint _client_groups = 1,
+ bool _server_random_selection = false,
+ bool _server_soft_limit = true) :
+ server_groups(_server_groups),
+ client_groups(_client_groups),
+ server_random_selection(_server_random_selection),
+ server_soft_limit(_server_soft_limit)
+ {
+ srv_group.reserve(server_groups);
+ cli_group.reserve(client_groups);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out,
+ const sim_config_t& sim_config) {
+ out <<
+ "server_groups = " << sim_config.server_groups << "\n" <<
+ "client_groups = " << sim_config.client_groups << "\n" <<
+ "server_random_selection = " << sim_config.server_random_selection << "\n" <<
+ "server_soft_limit = " << sim_config.server_soft_limit;
+ return out;
+ }
+ }; // class sim_config_t
+
+
+ bool ceph_argparse_witharg(std::vector<const char*> &args,
+ std::vector<const char*>::iterator &i, std::string *ret, ...);
+ void ceph_argparse_early_args(std::vector<const char*>& args, std::string *conf_file_list);
+ int parse_config_file(const std::string &fname, sim_config_t &g_conf);
+
+ }; // namespace qos_simulation
+}; // namespace crimson
diff --git a/src/dmclock/sim/src/sim_client.h b/src/dmclock/sim/src/sim_client.h
new file mode 100644
index 00000000000..6538dab2c08
--- /dev/null
+++ b/src/dmclock/sim/src/sim_client.h
@@ -0,0 +1,329 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#pragma once
+
+
+#include <atomic>
+#include <mutex>
+#include <condition_variable>
+#include <thread>
+#include <chrono>
+#include <vector>
+#include <deque>
+#include <iostream>
+
+#include "sim_recs.h"
+
+
+namespace crimson {
+ namespace qos_simulation {
+
+ struct req_op_t {};
+ struct wait_op_t {};
+ constexpr struct req_op_t req_op {};
+ constexpr struct wait_op_t wait_op {};
+
+
+ enum class CliOp { req, wait };
+ struct CliInst {
+ CliOp op;
+ union {
+ std::chrono::milliseconds wait_time;
+ struct {
+ uint32_t count;
+ std::chrono::microseconds time_bw_reqs;
+ uint16_t max_outstanding;
+ } req_params;
+ } args;
+
+ // D is a duration type
+ template<typename D>
+ CliInst(wait_op_t, D duration) :
+ op(CliOp::wait)
+ {
+ args.wait_time =
+ std::chrono::duration_cast<std::chrono::milliseconds>(duration);
+ }
+
+ CliInst(req_op_t,
+ uint32_t count, double ops_per_sec, uint16_t max_outstanding) :
+ op(CliOp::req)
+ {
+ args.req_params.count = count;
+ args.req_params.max_outstanding = max_outstanding;
+ uint32_t us = uint32_t(0.5 + 1.0 / ops_per_sec * 1000000);
+ args.req_params.time_bw_reqs = std::chrono::microseconds(us);
+ }
+ };
+
+
+ using ServerSelectFunc = std::function<const ServerId&(uint64_t seed)>;
+
+
+ template<typename SvcTrk, typename ReqPm, typename RespPm, typename Accum>
+ class SimulatedClient {
+ public:
+
+ struct InternalStats {
+ std::mutex mtx;
+ std::chrono::nanoseconds track_resp_time;
+ std::chrono::nanoseconds get_req_params_time;
+ uint32_t track_resp_count;
+ uint32_t get_req_params_count;
+
+ InternalStats() :
+ track_resp_time(0),
+ get_req_params_time(0),
+ track_resp_count(0),
+ get_req_params_count(0)
+ {
+ // empty
+ }
+ };
+
+ using SubmitFunc =
+ std::function<void(const ServerId&,
+ const TestRequest&,
+ const ClientId&,
+ const ReqPm&)>;
+
+ using ClientAccumFunc = std::function<void(Accum&,const RespPm&)>;
+
+ typedef std::chrono::time_point<std::chrono::steady_clock> TimePoint;
+
+ static TimePoint now() { return std::chrono::steady_clock::now(); }
+
+ protected:
+
+ struct RespQueueItem {
+ TestResponse response;
+ ServerId server_id;
+ RespPm resp_params;
+ };
+
+ const ClientId id;
+ const SubmitFunc submit_f;
+ const ServerSelectFunc server_select_f;
+ const ClientAccumFunc accum_f;
+
+ std::vector<CliInst> instructions;
+
+ SvcTrk service_tracker;
+
+ // TODO: use lock rather than atomic???
+ std::atomic_ulong outstanding_ops;
+ std::atomic_bool requests_complete;
+
+ std::deque<RespQueueItem> resp_queue;
+
+ std::mutex mtx_req;
+ std::condition_variable cv_req;
+
+ std::mutex mtx_resp;
+ std::condition_variable cv_resp;
+
+ using RespGuard = std::lock_guard<decltype(mtx_resp)>;
+ using Lock = std::unique_lock<std::mutex>;
+
+ // data collection
+
+ std::vector<TimePoint> op_times;
+ Accum accumulator;
+ InternalStats internal_stats;
+
+ std::thread thd_req;
+ std::thread thd_resp;
+
+ public:
+
+ SimulatedClient(ClientId _id,
+ const SubmitFunc& _submit_f,
+ const ServerSelectFunc& _server_select_f,
+ const ClientAccumFunc& _accum_f,
+ const std::vector<CliInst>& _instrs) :
+ id(_id),
+ submit_f(_submit_f),
+ server_select_f(_server_select_f),
+ accum_f(_accum_f),
+ instructions(_instrs),
+ service_tracker(),
+ outstanding_ops(0),
+ requests_complete(false)
+ {
+ size_t op_count = 0;
+ for (auto i : instructions) {
+ if (CliOp::req == i.op) {
+ op_count += i.args.req_params.count;
+ }
+ }
+ op_times.reserve(op_count);
+
+ thd_resp = std::thread(&SimulatedClient::run_resp, this);
+ thd_req = std::thread(&SimulatedClient::run_req, this);
+ }
+
+
+ SimulatedClient(ClientId _id,
+ const SubmitFunc& _submit_f,
+ const ServerSelectFunc& _server_select_f,
+ const ClientAccumFunc& _accum_f,
+ uint16_t _ops_to_run,
+ double _iops_goal,
+ uint16_t _outstanding_ops_allowed) :
+ SimulatedClient(_id,
+ _submit_f, _server_select_f, _accum_f,
+ {{req_op, _ops_to_run, _iops_goal, _outstanding_ops_allowed}})
+ {
+ // empty
+ }
+
+
+ SimulatedClient(const SimulatedClient&) = delete;
+ SimulatedClient(SimulatedClient&&) = delete;
+ SimulatedClient& operator=(const SimulatedClient&) = delete;
+ SimulatedClient& operator=(SimulatedClient&&) = delete;
+
+ virtual ~SimulatedClient() {
+ wait_until_done();
+ }
+
+ void receive_response(const TestResponse& resp,
+ const ServerId& server_id,
+ const RespPm& resp_params) {
+ RespGuard g(mtx_resp);
+ resp_queue.push_back(RespQueueItem{resp, server_id, resp_params});
+ cv_resp.notify_one();
+ }
+
+ const std::vector<TimePoint>& get_op_times() const { return op_times; }
+
+ void wait_until_done() {
+ if (thd_req.joinable()) thd_req.join();
+ if (thd_resp.joinable()) thd_resp.join();
+ }
+
+ const Accum& get_accumulator() const { return accumulator; }
+
+ const InternalStats& get_internal_stats() const { return internal_stats; }
+
+ protected:
+
+ void run_req() {
+ size_t ops_count = 0;
+ for (auto i : instructions) {
+ if (CliOp::wait == i.op) {
+ std::this_thread::sleep_for(i.args.wait_time);
+ } else if (CliOp::req == i.op) {
+ Lock l(mtx_req);
+ for (uint64_t o = 0; o < i.args.req_params.count; ++o) {
+ while (outstanding_ops >= i.args.req_params.max_outstanding) {
+ cv_req.wait(l);
+ }
+
+ l.unlock();
+ auto now = std::chrono::steady_clock::now();
+ const ServerId& server = server_select_f(o);
+
+ ReqPm rp =
+ time_stats_w_return<decltype(internal_stats.get_req_params_time),
+ ReqPm>(internal_stats.mtx,
+ internal_stats.get_req_params_time,
+ [&]() -> ReqPm {
+ return service_tracker.get_req_params(server);
+ });
+ count_stats(internal_stats.mtx,
+ internal_stats.get_req_params_count);
+
+ TestRequest req(server, o, 12);
+ submit_f(server, req, id, rp);
+ ++outstanding_ops;
+ l.lock(); // lock for return to top of loop
+
+ auto delay_time = now + i.args.req_params.time_bw_reqs;
+ while (std::chrono::steady_clock::now() < delay_time) {
+ cv_req.wait_until(l, delay_time);
+ } // while
+ } // for
+ ops_count += i.args.req_params.count;
+ } else {
+ assert(false);
+ }
+ } // for loop
+
+ requests_complete = true;
+
+ // all requests made, thread ends
+ }
+
+
+ void run_resp() {
+ std::chrono::milliseconds delay(1000);
+ int op = 0;
+
+ Lock l(mtx_resp);
+
+	// the following code would otherwise be repeated (except for the
+	// call to notify_one) in the two loops below, so define it once
+	// here to avoid repetition.
+ const auto proc_resp = [this, &op, &l](const bool notify_req_cv) {
+ if (!resp_queue.empty()) {
+ RespQueueItem item = resp_queue.front();
+ resp_queue.pop_front();
+
+ l.unlock();
+
+ // data collection
+
+ op_times.push_back(now());
+ accum_f(accumulator, item.resp_params);
+
+ // processing
+
+#if 0 // not needed
+ TestResponse& resp = item.response;
+#endif
+
+ time_stats(internal_stats.mtx,
+ internal_stats.track_resp_time,
+ [&](){
+ service_tracker.track_resp(item.server_id, item.resp_params);
+ });
+ count_stats(internal_stats.mtx,
+ internal_stats.track_resp_count);
+
+ --outstanding_ops;
+ if (notify_req_cv) {
+ cv_req.notify_one();
+ }
+
+ l.lock();
+ }
+ };
+
+ while(!requests_complete.load()) {
+ while(resp_queue.empty() && !requests_complete.load()) {
+ cv_resp.wait_for(l, delay);
+ }
+ proc_resp(true);
+ }
+
+ while(outstanding_ops.load() > 0) {
+ while(resp_queue.empty() && outstanding_ops.load() > 0) {
+ cv_resp.wait_for(l, delay);
+ }
+ proc_resp(false); // don't call notify_one as all requests are complete
+ }
+
+ // all responses received, thread ends
+ }
+ }; // class SimulatedClient
+
+
+ }; // namespace qos_simulation
+}; // namespace crimson
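The req_op constructor above turns an IOPS goal into the gap enforced between requests: time_bw_reqs = round(10^6 / ops_per_sec) microseconds. The standalone check below repeats that rounding for the 200-IOPS goal used in the example config.

    #include <cassert>
    #include <cstdint>

    int main() {
      // Same rounding as CliInst's req_op constructor.
      double ops_per_sec = 200.0;
      uint32_t us = uint32_t(0.5 + 1.0 / ops_per_sec * 1000000);
      assert(us == 5000);  // 200 IOPS -> one request every 5000 microseconds
      return 0;
    }
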
diff --git a/src/dmclock/sim/src/sim_recs.h b/src/dmclock/sim/src/sim_recs.h
new file mode 100644
index 00000000000..759ab4e1413
--- /dev/null
+++ b/src/dmclock/sim/src/sim_recs.h
@@ -0,0 +1,123 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#pragma once
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <signal.h>
+
+#include <sys/time.h>
+
+#include <chrono>
+#include <cmath>
+#include <limits>
+#include <string>
+#include <mutex>
+#include <iostream>
+#include <functional>
+
+
+using ClientId = uint;
+using ServerId = uint;
+
+
+namespace crimson {
+ namespace qos_simulation {
+
+ inline void debugger() {
+ raise(SIGCONT);
+ }
+
+ template<typename T>
+ void time_stats(std::mutex& mtx,
+ T& time_accumulate,
+ std::function<void()> code) {
+ auto t1 = std::chrono::steady_clock::now();
+ code();
+ auto t2 = std::chrono::steady_clock::now();
+ auto duration = t2 - t1;
+ auto cast_duration = std::chrono::duration_cast<T>(duration);
+ std::lock_guard<std::mutex> lock(mtx);
+ time_accumulate += cast_duration;
+ }
+
+ // unfortunately it's hard for the compiler to infer the types,
+ // and therefore when called the template params might have to be
+ // explicit
+ template<typename T, typename R>
+ R time_stats_w_return(std::mutex& mtx,
+ T& time_accumulate,
+ std::function<R()> code) {
+ auto t1 = std::chrono::steady_clock::now();
+ R result = code();
+ auto t2 = std::chrono::steady_clock::now();
+ auto duration = t2 - t1;
+ auto cast_duration = std::chrono::duration_cast<T>(duration);
+ std::lock_guard<std::mutex> lock(mtx);
+ time_accumulate += cast_duration;
+ return result;
+ }
+
+ template<typename T>
+ void count_stats(std::mutex& mtx,
+ T& counter) {
+ std::lock_guard<std::mutex> lock(mtx);
+ ++counter;
+ }
+
+ struct TestRequest {
+ ServerId server; // allows debugging
+ uint32_t epoch;
+ uint32_t op;
+
+ TestRequest(ServerId _server,
+ uint32_t _epoch,
+ uint32_t _op) :
+ server(_server),
+ epoch(_epoch),
+ op(_op)
+ {
+ // empty
+ }
+
+ TestRequest(const TestRequest& r) :
+ TestRequest(r.server, r.epoch, r.op)
+ {
+ // empty
+ }
+ }; // struct TestRequest
+
+
+ struct TestResponse {
+ uint32_t epoch;
+
+ TestResponse(uint32_t _epoch) :
+ epoch(_epoch)
+ {
+ // empty
+ }
+
+ TestResponse(const TestResponse& r) :
+ epoch(r.epoch)
+ {
+ // empty
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const TestResponse& resp) {
+ out << "{ ";
+ out << "epoch:" << resp.epoch;
+ out << " }";
+ return out;
+ }
+ }; // class TestResponse
+
+ }; // namespace qos_simulation
+}; // namespace crimson
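As the comment above time_stats_w_return notes, its template arguments usually cannot be deduced and must be spelled out at the call site, whereas the void flavor deduces its duration type from the accumulator. A small sketch of both, with a stand-in mutex and accumulator:

    #include <chrono>
    #include <iostream>
    #include <mutex>
    #include <thread>

    #include "sim_recs.h"

    using namespace crimson::qos_simulation;

    int main() {
      std::mutex mtx;
      std::chrono::nanoseconds elapsed(0);

      // Duration type deduced from the accumulator.
      time_stats(mtx, elapsed, [](){
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
      });

      // Duration and return type must be explicit here.
      int answer = time_stats_w_return<std::chrono::nanoseconds, int>(
        mtx, elapsed, [](){ return 42; });

      std::cout << answer << " computed; ~" << elapsed.count()
                << " ns accumulated\n";
      return 0;
    }
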
diff --git a/src/dmclock/sim/src/sim_server.h b/src/dmclock/sim/src/sim_server.h
new file mode 100644
index 00000000000..a61cc3204e4
--- /dev/null
+++ b/src/dmclock/sim/src/sim_server.h
@@ -0,0 +1,225 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#pragma once
+
+
+#include <memory>
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include <chrono>
+#include <deque>
+
+#include "sim_recs.h"
+
+
+namespace crimson {
+ namespace qos_simulation {
+
+ template<typename Q, typename ReqPm, typename RespPm, typename Accum>
+ class SimulatedServer {
+
+ struct QueueItem {
+ ClientId client;
+ std::unique_ptr<TestRequest> request;
+ RespPm additional;
+
+ QueueItem(const ClientId& _client,
+ std::unique_ptr<TestRequest>&& _request,
+ const RespPm& _additional) :
+ client(_client),
+ request(std::move(_request)),
+ additional(_additional)
+ {
+ // empty
+ }
+ }; // QueueItem
+
+ public:
+
+ struct InternalStats {
+ std::mutex mtx;
+ std::chrono::nanoseconds add_request_time;
+ std::chrono::nanoseconds request_complete_time;
+ uint32_t add_request_count;
+ uint32_t request_complete_count;
+
+ InternalStats() :
+ add_request_time(0),
+ request_complete_time(0),
+ add_request_count(0),
+ request_complete_count(0)
+ {
+ // empty
+ }
+ };
+
+ using ClientRespFunc = std::function<void(ClientId,
+ const TestResponse&,
+ const ServerId&,
+ const RespPm&)>;
+
+ using ServerAccumFunc = std::function<void(Accum& accumulator,
+ const RespPm& additional)>;
+
+ protected:
+
+ const ServerId id;
+ Q* priority_queue;
+ ClientRespFunc client_resp_f;
+ int iops;
+ size_t thread_pool_size;
+
+ bool finishing;
+ std::chrono::microseconds op_time;
+
+ std::mutex inner_queue_mtx;
+ std::condition_variable inner_queue_cv;
+ std::deque<QueueItem> inner_queue;
+
+ std::thread* threads;
+
+ using InnerQGuard = std::lock_guard<decltype(inner_queue_mtx)>;
+ using Lock = std::unique_lock<std::mutex>;
+
+ // data collection
+
+ ServerAccumFunc accum_f;
+ Accum accumulator;
+
+ InternalStats internal_stats;
+
+ public:
+
+ using CanHandleRequestFunc = std::function<bool(void)>;
+ using HandleRequestFunc =
+ std::function<void(const ClientId&,std::unique_ptr<TestRequest>,const RespPm&)>;
+ using CreateQueueF = std::function<Q*(CanHandleRequestFunc,HandleRequestFunc)>;
+
+
+ SimulatedServer(ServerId _id,
+ int _iops,
+ size_t _thread_pool_size,
+ const ClientRespFunc& _client_resp_f,
+ const ServerAccumFunc& _accum_f,
+ CreateQueueF _create_queue_f) :
+ id(_id),
+ priority_queue(_create_queue_f(std::bind(&SimulatedServer::has_avail_thread,
+ this),
+ std::bind(&SimulatedServer::inner_post,
+ this,
+ std::placeholders::_1,
+ std::placeholders::_2,
+ std::placeholders::_3))),
+ client_resp_f(_client_resp_f),
+ iops(_iops),
+ thread_pool_size(_thread_pool_size),
+ finishing(false),
+ accum_f(_accum_f)
+ {
+ op_time =
+ std::chrono::microseconds((int) (0.5 +
+ thread_pool_size * 1000000.0 / iops));
+ std::chrono::milliseconds delay(1000);
+ threads = new std::thread[thread_pool_size];
+ for (size_t i = 0; i < thread_pool_size; ++i) {
+ threads[i] = std::thread(&SimulatedServer::run, this, delay);
+ }
+ }
+
+ virtual ~SimulatedServer() {
+ Lock l(inner_queue_mtx);
+ finishing = true;
+ inner_queue_cv.notify_all();
+ l.unlock();
+
+ for (size_t i = 0; i < thread_pool_size; ++i) {
+ threads[i].join();
+ }
+
+ delete[] threads;
+ }
+
+ void post(const TestRequest& request,
+ const ClientId& client_id,
+ const ReqPm& req_params)
+ {
+ time_stats(internal_stats.mtx,
+ internal_stats.add_request_time,
+ [&](){
+ priority_queue->add_request(request, client_id, req_params);
+ });
+ count_stats(internal_stats.mtx,
+ internal_stats.add_request_count);
+ }
+
+ bool has_avail_thread() {
+ InnerQGuard g(inner_queue_mtx);
+ return inner_queue.size() <= thread_pool_size;
+ }
+
+ const Accum& get_accumulator() const { return accumulator; }
+ const Q& get_priority_queue() const { return *priority_queue; }
+ const InternalStats& get_internal_stats() const { return internal_stats; }
+
+ protected:
+
+ void inner_post(const ClientId& client,
+ std::unique_ptr<TestRequest> request,
+ const RespPm& additional) {
+ Lock l(inner_queue_mtx);
+ assert(!finishing);
+ accum_f(accumulator, additional);
+ inner_queue.emplace_back(QueueItem(client,
+ std::move(request),
+ additional));
+ inner_queue_cv.notify_one();
+ }
+
+ void run(std::chrono::milliseconds check_period) {
+ Lock l(inner_queue_mtx);
+ while(true) {
+ while(inner_queue.empty() && !finishing) {
+ inner_queue_cv.wait_for(l, check_period);
+ }
+ if (!inner_queue.empty()) {
+ auto& front = inner_queue.front();
+ auto client = front.client;
+ auto req = std::move(front.request);
+ auto additional = front.additional;
+ inner_queue.pop_front();
+
+ l.unlock();
+
+	  // simulate the operation by sleeping; then call the function
+	  // to notify the client of completion
+ std::this_thread::sleep_for(op_time);
+
+ TestResponse resp(req->epoch);
+ // TODO: rather than assuming this constructor exists, perhaps
+ // pass in a function that does this mapping?
+ client_resp_f(client, resp, id, additional);
+
+ time_stats(internal_stats.mtx,
+ internal_stats.request_complete_time,
+ [&](){
+ priority_queue->request_completed();
+ });
+ count_stats(internal_stats.mtx,
+ internal_stats.request_complete_count);
+
+ l.lock(); // in prep for next iteration of loop
+ } else {
+ break;
+ }
+ }
+ }
+ }; // class SimulatedServer
+
+ }; // namespace qos_simulation
+}; // namespace crimson
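The constructor sizes op_time so that the pool as a whole sustains the configured rate: each of thread_pool_size threads sleeps op_time per request, giving roughly thread_pool_size / op_time requests per second. The standalone check below repeats the rounding for the default server group (1 thread, 40 IOPS).

    #include <cassert>
    #include <chrono>

    int main() {
      // Same formula as SimulatedServer's constructor.
      unsigned thread_pool_size = 1;
      int iops = 40;
      auto op_time = std::chrono::microseconds(
        (int) (0.5 + thread_pool_size * 1000000.0 / iops));
      assert(op_time.count() == 25000);  // 40 IOPS -> 25 ms per op per thread
      return 0;
    }
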
diff --git a/src/dmclock/sim/src/simulate.h b/src/dmclock/sim/src/simulate.h
new file mode 100644
index 00000000000..18e752d8a35
--- /dev/null
+++ b/src/dmclock/sim/src/simulate.h
@@ -0,0 +1,431 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#pragma once
+
+
+#include <assert.h>
+
+#include <memory>
+#include <chrono>
+#include <functional>
+#include <map>
+#include <random>
+#include <iostream>
+#include <iomanip>
+#include <string>
+
+
+namespace crimson {
+ namespace qos_simulation {
+
+ template<typename ServerId, typename ClientId, typename TS, typename TC>
+ class Simulation {
+
+ public:
+
+ using TimePoint = std::chrono::time_point<std::chrono::steady_clock>;
+
+ protected:
+
+ using ClientMap = std::map<ClientId,TC*>;
+ using ServerMap = std::map<ServerId,TS*>;
+
+ uint server_count = 0;
+ uint client_count = 0;
+
+ ServerMap servers;
+ ClientMap clients;
+ std::vector<ServerId> server_ids;
+
+ TimePoint early_time;
+ TimePoint servers_created_time;
+ TimePoint clients_created_time;
+ TimePoint clients_finished_time;
+ TimePoint late_time;
+
+ std::default_random_engine prng;
+
+ bool has_run = false;
+
+
+ public:
+
+ double fmt_tp(const TimePoint& t) {
+ auto c = t.time_since_epoch().count();
+ return uint64_t(c / 1000000.0 + 0.5) % 100000 / 1000.0;
+ }
+
+ TimePoint now() {
+ return std::chrono::steady_clock::now();
+ }
+
+ using ClientBasedServerSelectFunc =
+ std::function<const ServerId&(uint64_t, uint16_t)>;
+
+ using ClientFilter = std::function<bool(const ClientId&)>;
+
+ using ServerFilter = std::function<bool(const ServerId&)>;
+
+ using ServerDataOutF =
+ std::function<void(std::ostream& out,
+ Simulation* sim, ServerFilter,
+ int header_w, int data_w, int data_prec)>;
+
+ using ClientDataOutF =
+ std::function<void(std::ostream& out,
+ Simulation* sim, ClientFilter,
+ int header_w, int data_w, int data_prec)>;
+
+ Simulation() :
+ early_time(now()),
+ prng(std::chrono::system_clock::now().time_since_epoch().count())
+ {
+ // empty
+ }
+
+ uint get_client_count() const { return client_count; }
+ uint get_server_count() const { return server_count; }
+ TC& get_client(ClientId id) { return *clients[id]; }
+ TS& get_server(ServerId id) { return *servers[id]; }
+ const ServerId& get_server_id(uint index) const {
+ return server_ids[index];
+ }
+
+
+ void add_servers(uint count,
+ std::function<TS*(ServerId)> create_server_f) {
+ uint i = server_count;
+
+ // increment server_count before creating servers since they
+ // will start running immediately and may use the server_count
+ // value; NB: this could still be an issue if servers are
+ // added with multiple add_servers calls; consider using a
+ // separate start function after all servers (and clients?)
+ // have been added
+ server_count += count;
+
+ for (; i < server_count; ++i) {
+ server_ids.push_back(i);
+ servers[i] = create_server_f(i);
+ }
+
+ servers_created_time = now();
+ }
+
+
+ void add_clients(uint count,
+ std::function<TC*(ClientId)> create_client_f) {
+ uint i = client_count;
+
+ // increment client_count before creating clients since they
+ // will start running immediately and may use the client_count
+ // value (e.g., in the server selection function); NB: this could
+ // still be an issue if clients are added with multiple
+ // add_clients calls; consider using a separate start function
+ // after all clients have been added
+ client_count += count;
+
+ for (; i < client_count; ++i) {
+ clients[i] = create_client_f(i);
+ }
+
+ clients_created_time = now();
+ }
+
+
+ void run() {
+ assert(server_count > 0);
+ assert(client_count > 0);
+
+ std::cout << "simulation started" << std::endl;
+
+ // clients are now running; wait for all to finish
+
+ for (auto const &i : clients) {
+ i.second->wait_until_done();
+ }
+
+ late_time = clients_finished_time = now();
+
+ std::cout << "simulation completed in " <<
+ std::chrono::duration_cast<std::chrono::milliseconds>(clients_finished_time - servers_created_time).count() <<
+ " millisecs" << std::endl;
+
+ has_run = true;
+ } // run
+
+
+ void display_stats(std::ostream& out,
+ ServerDataOutF server_out_f, ClientDataOutF client_out_f,
+ ServerFilter server_filter =
+ [] (const ServerId&) { return true; },
+ ClientFilter client_filter =
+ [] (const ClientId&) { return true; },
+ int head_w = 12, int data_w = 7, int data_prec = 2) {
+ assert(has_run);
+
+	// amount of initial data to skip (currently none)
+	const std::chrono::seconds skip_amount(0);
+	// bucket the data into groups of this many seconds
+	const std::chrono::seconds measure_unit(2);
+	// unit in which to report rates
+	const std::chrono::seconds report_unit(1);
+
+ // compute and display stats
+
+ TimePoint earliest_start = late_time;
+ TimePoint latest_start = early_time;
+ TimePoint earliest_finish = late_time;
+ TimePoint latest_finish = early_time;
+
+ for (auto const &c : clients) {
+ auto start = c.second->get_op_times().front();
+ auto end = c.second->get_op_times().back();
+
+ if (start < earliest_start) { earliest_start = start; }
+ if (start > latest_start) { latest_start = start; }
+ if (end < earliest_finish) { earliest_finish = end; }
+ if (end > latest_finish) { latest_finish = end; }
+ }
+
+ double ops_factor =
+ std::chrono::duration_cast<std::chrono::duration<double>>(measure_unit) /
+ std::chrono::duration_cast<std::chrono::duration<double>>(report_unit);
+
+ const auto start_edge = clients_created_time + skip_amount;
+
+ std::map<ClientId,std::vector<double>> ops_data;
+
+ for (auto const &c : clients) {
+ auto it = c.second->get_op_times().begin();
+ const auto end = c.second->get_op_times().end();
+ while (it != end && *it < start_edge) { ++it; }
+
+ for (auto time_edge = start_edge + measure_unit;
+ time_edge <= latest_finish + measure_unit;
+ time_edge += measure_unit) {
+ int count = 0;
+ for (; it != end && *it < time_edge; ++count, ++it) { /* empty */ }
+ double ops_per_second = double(count) / ops_factor;
+ ops_data[c.first].push_back(ops_per_second);
+ }
+ }
+
+ out << "==== Client Data ====" << std::endl;
+
+ out << std::setw(head_w) << "client:";
+ for (auto const &c : clients) {
+ if (!client_filter(c.first)) continue;
+ out << " " << std::setw(data_w) << c.first;
+ }
+ out << std::setw(data_w) << "total" << std::endl;
+
+ {
+ bool has_data;
+ size_t i = 0;
+ do {
+ std::string line_header = "t_" + std::to_string(i) + ":";
+ out << std::setw(head_w) << line_header;
+ has_data = false;
+ double total = 0.0;
+ for (auto const &c : clients) {
+ double data = 0.0;
+ if (i < ops_data[c.first].size()) {
+ data = ops_data[c.first][i];
+ has_data = true;
+ }
+ total += data;
+
+ if (!client_filter(c.first)) continue;
+
+ out << " " << std::setw(data_w) << std::setprecision(data_prec) <<
+ std::fixed << data;
+ }
+ out << " " << std::setw(data_w) << std::setprecision(data_prec) <<
+ std::fixed << total << std::endl;
+ ++i;
+ } while(has_data);
+ }
+
+ client_out_f(out, this, client_filter, head_w, data_w, data_prec);
+
+ display_client_internal_stats<std::chrono::nanoseconds>(out,
+ "nanoseconds");
+
+ out << std::endl << "==== Server Data ====" << std::endl;
+
+ out << std::setw(head_w) << "server:";
+ for (auto const &s : servers) {
+ if (!server_filter(s.first)) continue;
+ out << " " << std::setw(data_w) << s.first;
+ }
+ out << " " << std::setw(data_w) << "total" << std::endl;
+
+ server_out_f(out, this, server_filter, head_w, data_w, data_prec);
+
+ display_server_internal_stats<std::chrono::nanoseconds>(out,
+ "nanoseconds");
+
+ // clean up clients then servers
+
+ for (auto i = clients.begin(); i != clients.end(); ++i) {
+ delete i->second;
+ i->second = nullptr;
+ }
+
+ for (auto i = servers.begin(); i != servers.end(); ++i) {
+ delete i->second;
+ i->second = nullptr;
+ }
+ } // display_stats
+
+
+ template<typename T>
+ void display_server_internal_stats(std::ostream& out,
+ std::string time_unit) {
+ T add_request_time(0);
+ T request_complete_time(0);
+ uint32_t add_request_count = 0;
+ uint32_t request_complete_count = 0;
+
+ for (uint i = 0; i < get_server_count(); ++i) {
+ const auto& server = get_server(i);
+ const auto& is = server.get_internal_stats();
+ add_request_time +=
+ std::chrono::duration_cast<T>(is.add_request_time);
+ request_complete_time +=
+ std::chrono::duration_cast<T>(is.request_complete_time);
+ add_request_count += is.add_request_count;
+ request_complete_count += is.request_complete_count;
+ }
+
+ double add_request_time_per_unit =
+	  double(add_request_time.count()) / add_request_count;
+ out << "total time to add requests: " <<
+ std::fixed << add_request_time.count() << " " << time_unit <<
+ ";" << std::endl <<
+ " count: " << add_request_count << ";" << std::endl <<
+ " average: " << add_request_time_per_unit <<
+ " " << time_unit << " per request/response" << std::endl;
+
+ double request_complete_time_unit =
+	  double(request_complete_time.count()) / request_complete_count;
+ out << "total time to note requests complete: " << std::fixed <<
+ request_complete_time.count() << " " << time_unit << ";" <<
+ std::endl <<
+ " count: " << request_complete_count << ";" << std::endl <<
+ " average: " << request_complete_time_unit <<
+ " " << time_unit << " per request/response" << std::endl;
+
+ out << std::endl;
+
+ assert(add_request_count == request_complete_count);
+ out << "server timing for QOS algorithm: " <<
+ add_request_time_per_unit + request_complete_time_unit <<
+ " " << time_unit << " per request/response" << std::endl;
+ }
+
+
+ template<typename T>
+ void display_client_internal_stats(std::ostream& out,
+ std::string time_unit) {
+ T track_resp_time(0);
+ T get_req_params_time(0);
+ uint32_t track_resp_count = 0;
+ uint32_t get_req_params_count = 0;
+
+ for (uint i = 0; i < get_client_count(); ++i) {
+ const auto& client = get_client(i);
+ const auto& is = client.get_internal_stats();
+ track_resp_time +=
+ std::chrono::duration_cast<T>(is.track_resp_time);
+ get_req_params_time +=
+ std::chrono::duration_cast<T>(is.get_req_params_time);
+ track_resp_count += is.track_resp_count;
+ get_req_params_count += is.get_req_params_count;
+ }
+
+ double track_resp_time_unit =
+ double(track_resp_time.count()) / track_resp_count;
+ out << "total time to track responses: " <<
+ std::fixed << track_resp_time.count() << " " << time_unit << ";" <<
+ std::endl <<
+ " count: " << track_resp_count << ";" << std::endl <<
+ " average: " << track_resp_time_unit << " " << time_unit <<
+ " per request/response" << std::endl;
+
+ double get_req_params_time_unit =
+ double(get_req_params_time.count()) / get_req_params_count;
+ out << "total time to get request parameters: " <<
+ std::fixed << get_req_params_time.count() << " " << time_unit <<
+ ";" << std::endl <<
+ " count: " << get_req_params_count << ";" << std::endl <<
+ " average: " << get_req_params_time_unit << " " << time_unit <<
+ " per request/response" << std::endl;
+
+ out << std::endl;
+
+ assert(track_resp_count == get_req_params_count);
+ out << "client timing for QOS algorithm: " <<
+ track_resp_time_unit + get_req_params_time_unit << " " <<
+ time_unit << " per request/response" << std::endl;
+ }
+
+
+ // **** server selection functions ****
+
+
+ const ServerId& server_select_alternate(uint64_t seed,
+ uint16_t client_idx) {
+ uint index = (client_idx + seed) % server_count;
+ return server_ids[index];
+ }
+
+
+      // returns a lambda that selects among a window of servers_per
+      // servers for each client
+ ClientBasedServerSelectFunc
+ make_server_select_alt_range(uint16_t servers_per) {
+ return [servers_per,this](uint64_t seed, uint16_t client_idx)
+ -> const ServerId& {
+ double factor = double(server_count) / client_count;
+ uint offset = seed % servers_per;
+ uint index = (uint(0.5 + client_idx * factor) + offset) % server_count;
+ return server_ids[index];
+ };
+ }
+
+
+ // function to choose a server randomly
+ const ServerId& server_select_random(uint64_t seed, uint16_t client_idx) {
+ uint index = prng() % server_count;
+ return server_ids[index];
+ }
+
+
+      // returns a lambda that chooses a server randomly within a range
+ ClientBasedServerSelectFunc
+ make_server_select_ran_range(uint16_t servers_per) {
+ return [servers_per,this](uint64_t seed, uint16_t client_idx)
+ -> const ServerId& {
+ double factor = double(server_count) / client_count;
+ uint offset = prng() % servers_per;
+ uint index = (uint(0.5 + client_idx * factor) + offset) % server_count;
+ return server_ids[index];
+ };
+ }
+
+
+ // function to always choose the first server
+ const ServerId& server_select_0(uint64_t seed, uint16_t client_idx) {
+ return server_ids[0];
+ }
+ }; // class Simulation
+
+ }; // namespace qos_simulation
+}; // namespace crimson
diff --git a/src/dmclock/sim/src/ssched/ssched_client.h b/src/dmclock/sim/src/ssched/ssched_client.h
new file mode 100644
index 00000000000..dcbe0771de5
--- /dev/null
+++ b/src/dmclock/sim/src/ssched/ssched_client.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#pragma once
+
+#include "ssched_recs.h"
+
+
+namespace crimson {
+ namespace simple_scheduler {
+
+ // S is server identifier type
+ template<typename S>
+ class ServiceTracker {
+
+ public:
+
+ // this tracker keeps no per-server state, so the constructor
+ // has nothing to initialize
+ ServiceTracker()
+ {
+ // empty
+ }
+
+
+ void track_resp(const S& server_id, const NullData& ignore) {
+ // empty
+ }
+
+
+ /*
+ * Returns the ReqParams for the given server.
+ */
+ ReqParams get_req_params(const S& server) {
+ return ReqParams();
+ } // get_req_params
+ }; // class ServiceTracker
+ } // namespace simple_scheduler
+} // namespace crimson
diff --git a/src/dmclock/sim/src/ssched/ssched_recs.h b/src/dmclock/sim/src/ssched/ssched_recs.h
new file mode 100644
index 00000000000..3332d5a4933
--- /dev/null
+++ b/src/dmclock/sim/src/ssched/ssched_recs.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#pragma once
+
+
+#include <ostream>
+#include <assert.h>
+
+
+namespace crimson {
+ namespace simple_scheduler {
+
+ // since we send no additional data out
+ // NOTE: Change name to RespParams? Is it used elsewhere?
+ struct NullData {
+ friend std::ostream& operator<<(std::ostream& out, const NullData& n) {
+ out << "NullData{ EMPTY }";
+ return out;
+ }
+ }; // struct NullData
+
+
+ struct ReqParams {
+ friend std::ostream& operator<<(std::ostream& out, const ReqParams& rp) {
+ out << "ReqParams{ EMPTY }";
+ return out;
+ }
+ };
+
+ }
+}
diff --git a/src/dmclock/sim/src/ssched/ssched_server.h b/src/dmclock/sim/src/ssched/ssched_server.h
new file mode 100644
index 00000000000..ee4c1e6e3ef
--- /dev/null
+++ b/src/dmclock/sim/src/ssched/ssched_server.h
@@ -0,0 +1,182 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+#pragma once
+
+#include <memory>
+#include <mutex>
+#include <deque>
+#include <functional>
+
+#include "boost/variant.hpp"
+
+#include "ssched_recs.h"
+
+#ifdef PROFILE
+#include "profile.h"
+#endif
+
+namespace crimson {
+
+ namespace simple_scheduler {
+
+ template<typename C, typename R, typename Time>
+ class SimpleQueue {
+
+ public:
+
+ using RequestRef = std::unique_ptr<R>;
+
+ // a function to see whether the server can handle another request
+ using CanHandleRequestFunc = std::function<bool(void)>;
+
+ // a function to submit a request to the server; the second
+ // parameter is a callback when it's completed
+ using HandleRequestFunc =
+ std::function<void(const C&,RequestRef,NullData)>;
+
+ struct PullReq {
+ enum class Type { returning, none };
+
+ struct Retn {
+ C client;
+ RequestRef request;
+ };
+
+ Type type;
+ boost::variant<Retn> data;
+ };
+
+ protected:
+
+ enum class Mechanism { push, pull };
+
+ struct QRequest {
+ C client;
+ RequestRef request;
+ };
+
+ bool finishing = false;
+ Mechanism mechanism;
+
+ CanHandleRequestFunc can_handle_f;
+ HandleRequestFunc handle_f;
+
+ mutable std::mutex queue_mtx;
+ using DataGuard = std::lock_guard<decltype(queue_mtx)>;
+
+ std::deque<QRequest> queue;
+
+#ifdef PROFILE
+ public:
+ ProfileTimer<std::chrono::nanoseconds> pull_request_timer;
+ ProfileTimer<std::chrono::nanoseconds> add_request_timer;
+ ProfileTimer<std::chrono::nanoseconds> request_complete_timer;
+ protected:
+#endif
+
+ public:
+
+ // push full constructor
+ SimpleQueue(CanHandleRequestFunc _can_handle_f,
+ HandleRequestFunc _handle_f) :
+ mechanism(Mechanism::push),
+ can_handle_f(_can_handle_f),
+ handle_f(_handle_f)
+ {
+ // empty
+ }
+
+ SimpleQueue() :
+ mechanism(Mechanism::pull)
+ {
+ // empty
+ }
+
+ ~SimpleQueue() {
+ finishing = true;
+ }
+
+ void add_request(const R& request,
+ const C& client_id,
+ const ReqParams& req_params) {
+ add_request(RequestRef(new R(request)), client_id, req_params);
+ }
+
+ void add_request(RequestRef&& request,
+ const C& client_id,
+ const ReqParams& req_params) {
+ DataGuard g(queue_mtx);
+
+#ifdef PROFILE
+ add_request_timer.start();
+#endif
+ queue.emplace_back(QRequest{client_id, std::move(request)});
+
+ if (Mechanism::push == mechanism) {
+ schedule_request();
+ }
+
+#ifdef PROFILE
+ add_request_timer.stop();
+#endif
+ } // add_request
+
+ void request_completed() {
+ assert(Mechanism::push == mechanism);
+ DataGuard g(queue_mtx);
+
+#ifdef PROFILE
+ request_complete_timer.start();
+#endif
+ schedule_request();
+
+#ifdef PROFILE
+ request_complete_timer.stop();
+#endif
+ } // request_completed
+
+ PullReq pull_request() {
+ assert(Mechanism::pull == mechanism);
+ PullReq result;
+ DataGuard g(queue_mtx);
+
+#ifdef PROFILE
+ pull_request_timer.start();
+#endif
+
+ if (queue.empty()) {
+ result.type = PullReq::Type::none;
+ } else {
+ auto& front = queue.front();
+ result.type = PullReq::Type::returning;
+ result.data =
+ typename PullReq::Retn{front.client, std::move(front.request)};
+ queue.pop_front();
+ }
+
+#ifdef PROFILE
+ pull_request_timer.stop();
+#endif
+
+ return result;
+ }
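+
+ // A pull-side usage sketch (hypothetical caller, assuming a
+ // pull-mode SimpleQueue named q; "handle" is an assumed
+ // function, not part of this class):
+ //
+ // auto pr = q.pull_request();
+ // if (PullReq::Type::returning == pr.type) {
+ // auto& retn = boost::get<typename PullReq::Retn>(pr.data);
+ // handle(retn.client, std::move(retn.request));
+ // }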
+
+ protected:
+
+ // queue_mtx should be held when called; should only be called
+ // when mechanism is push
+ void schedule_request() {
+ if (!queue.empty() && can_handle_f()) {
+ auto& front = queue.front();
+ static NullData null_data;
+ handle_f(front.client, std::move(front.request), null_data);
+ queue.pop_front();
+ }
+ }
+ };
+ };
+};
diff --git a/src/dmclock/sim/src/str_list.cc b/src/dmclock/sim/src/str_list.cc
new file mode 100644
index 00000000000..22109e00840
--- /dev/null
+++ b/src/dmclock/sim/src/str_list.cc
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009-2010 Dreamhost
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "str_list.h"
+
+using std::string;
+using std::vector;
+using std::set;
+using std::list;
+
+static bool get_next_token(const string &s, size_t& pos, const char *delims, string& token)
+{
+ int start = s.find_first_not_of(delims, pos);
+ int end;
+
+ if (start < 0){
+ pos = s.size();
+ return false;
+ }
+
+ end = s.find_first_of(delims, start);
+ if (end >= 0)
+ pos = end + 1;
+ else {
+ pos = end = s.size();
+ }
+
+ token = s.substr(start, end - start);
+ return true;
+}
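+
+// For example (illustrative only): scanning "a,,b" with delims ","
+// first yields "a" and sets pos to 2; the next call skips the run of
+// delimiters and yields "b". A string consisting only of delimiters
+// yields no tokens and leaves pos == s.size(). Note the int/size_t
+// mixing above relies on std::string::npos converting to a negative int.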
+
+void get_str_list(const string& str, const char *delims, list<string>& str_list)
+{
+ size_t pos = 0;
+ string token;
+
+ str_list.clear();
+
+ while (pos < str.size()) {
+ if (get_next_token(str, pos, delims, token)) {
+ if (token.size() > 0) {
+ str_list.push_back(token);
+ }
+ }
+ }
+}
+
+void get_str_list(const string& str, list<string>& str_list)
+{
+ const char *delims = ";,= \t";
+ return get_str_list(str, delims, str_list);
+}
+
+void get_str_vec(const string& str, const char *delims, vector<string>& str_vec)
+{
+ size_t pos = 0;
+ string token;
+ str_vec.clear();
+
+ while (pos < str.size()) {
+ if (get_next_token(str, pos, delims, token)) {
+ if (token.size() > 0) {
+ str_vec.push_back(token);
+ }
+ }
+ }
+}
+
+void get_str_vec(const string& str, vector<string>& str_vec)
+{
+ const char *delims = ";,= \t";
+ return get_str_vec(str, delims, str_vec);
+}
+
+void get_str_set(const string& str, const char *delims, set<string>& str_set)
+{
+ size_t pos = 0;
+ string token;
+
+ str_set.clear();
+
+ while (pos < str.size()) {
+ if (get_next_token(str, pos, delims, token)) {
+ if (token.size() > 0) {
+ str_set.insert(token);
+ }
+ }
+ }
+}
+
+void get_str_set(const string& str, set<string>& str_set)
+{
+ const char *delims = ";,= \t";
+ return get_str_set(str, delims, str_set);
+}
diff --git a/src/dmclock/sim/src/str_list.h b/src/dmclock/sim/src/str_list.h
new file mode 100644
index 00000000000..4ba0cadd960
--- /dev/null
+++ b/src/dmclock/sim/src/str_list.h
@@ -0,0 +1,94 @@
+#ifndef CEPH_STRLIST_H
+#define CEPH_STRLIST_H
+
+#include <list>
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+
+/**
+ * Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_list**.
+ *
+ * @param [in] str String to split and save as list
+ * @param [out] str_list List modified containing str after it has been split
+**/
+extern void get_str_list(const std::string& str,
+ std::list<std::string>& str_list);
+
+/**
+ * Split **str** into a list of strings, using the **delims** delimiters and output the result in **str_list**.
+ *
+ * @param [in] str String to split and save as list
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_list List modified containing str after it has been split
+**/
+extern void get_str_list(const std::string& str,
+ const char *delims,
+ std::list<std::string>& str_list);
+
+/**
+ * Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_vec**.
+ *
+ * @param [in] str String to split and save as Vector
+ * @param [out] str_vec Vector modified containing str after it has been split
+**/
+extern void get_str_vec(const std::string& str,
+ std::vector<std::string>& str_vec);
+
+/**
+ * Split **str** into a list of strings, using the **delims** delimiters and output the result in **str_vec**.
+ *
+ * @param [in] str String to split and save as Vector
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_vec Vector modified containing str after it has been split
+**/
+extern void get_str_vec(const std::string& str,
+ const char *delims,
+ std::vector<std::string>& str_vec);
+
+/**
+ * Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_list**.
+ *
+ * @param [in] str String to split and save as Set
+ * @param [out] str_list Set modified containing str after it has been split
+**/
+extern void get_str_set(const std::string& str,
+ std::set<std::string>& str_list);
+
+/**
+ * Split **str** into a list of strings, using the **delims** delimiters and output the result in **str_list**.
+ *
+ * @param [in] str String to split and save as Set
+ * @param [in] delims characters used to split **str**
+ * @param [out] str_list Set modified containing str after it has been split
+**/
+extern void get_str_set(const std::string& str,
+ const char *delims,
+ std::set<std::string>& str_list);
+
+/**
+ * Return a String containing the vector **v** joined with **sep**
+ *
+ * If **v** is empty, the function returns an empty string
+ * For each element in **v**,
+ * it will concatenate this element and **sep** with result
+ *
+ * @param [in] v Vector to join as a String
+ * @param [in] sep String used to join each element from **v**
+ * @return empty string if **v** is empty or concatenated string
+**/
+inline std::string str_join(const std::vector<std::string>& v, std::string sep)
+{
+ if (v.empty())
+ return std::string();
+ std::vector<std::string>::const_iterator i = v.begin();
+ std::string r = *i;
+ for (++i; i != v.end(); ++i) {
+ r += sep;
+ r += *i;
+ }
+ return r;
+}
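+
+// A quick illustration (not part of the interface):
+// str_join({"a", "b", "c"}, ", ") returns "a, b, c"
+// str_join({}, ", ") returns ""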
+
+#endif
diff --git a/src/dmclock/sim/src/test_dmclock.cc b/src/dmclock/sim/src/test_dmclock.cc
new file mode 100644
index 00000000000..8e7aa4ab219
--- /dev/null
+++ b/src/dmclock/sim/src/test_dmclock.cc
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#include "dmclock_recs.h"
+#include "dmclock_server.h"
+#include "dmclock_client.h"
+
+#include "sim_recs.h"
+#include "sim_server.h"
+#include "sim_client.h"
+
+#include "test_dmclock.h"
+
+
+namespace test = crimson::test_dmc;
+
+
+void test::dmc_server_accumulate_f(test::DmcAccum& a,
+ const test::dmc::PhaseType& phase) {
+ if (test::dmc::PhaseType::reservation == phase) {
+ ++a.reservation_count;
+ } else {
+ ++a.proportion_count;
+ }
+}
+
+
+void test::dmc_client_accumulate_f(test::DmcAccum& a,
+ const test::dmc::PhaseType& phase) {
+ if (test::dmc::PhaseType::reservation == phase) {
+ ++a.reservation_count;
+ } else {
+ ++a.proportion_count;
+ }
+}
diff --git a/src/dmclock/sim/src/test_dmclock.h b/src/dmclock/sim/src/test_dmclock.h
new file mode 100644
index 00000000000..7f1e55439ed
--- /dev/null
+++ b/src/dmclock/sim/src/test_dmclock.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#include "dmclock_recs.h"
+#include "dmclock_server.h"
+#include "dmclock_client.h"
+
+#include "sim_recs.h"
+#include "sim_server.h"
+#include "sim_client.h"
+
+#include "simulate.h"
+
+
+namespace crimson {
+ namespace test_dmc {
+
+ namespace dmc = crimson::dmclock;
+ namespace sim = crimson::qos_simulation;
+
+ struct DmcAccum {
+ uint64_t reservation_count = 0;
+ uint64_t proportion_count = 0;
+ };
+
+ using DmcQueue = dmc::PushPriorityQueue<ClientId,sim::TestRequest>;
+
+ using DmcServer = sim::SimulatedServer<DmcQueue,
+ dmc::ReqParams,
+ dmc::PhaseType,
+ DmcAccum>;
+
+ using DmcClient = sim::SimulatedClient<dmc::ServiceTracker<ServerId>,
+ dmc::ReqParams,
+ dmc::PhaseType,
+ DmcAccum>;
+
+ using CreateQueueF = std::function<DmcQueue*(DmcQueue::CanHandleRequestFunc,
+ DmcQueue::HandleRequestFunc)>;
+
+ using MySim = sim::Simulation<ServerId,ClientId,DmcServer,DmcClient>;
+
+ using SubmitFunc = DmcClient::SubmitFunc;
+
+ extern void dmc_server_accumulate_f(DmcAccum& a,
+ const dmc::PhaseType& phase);
+
+ extern void dmc_client_accumulate_f(DmcAccum& a,
+ const dmc::PhaseType& phase);
+ } // namespace test_dmc
+} // namespace crimson
diff --git a/src/dmclock/sim/src/test_dmclock_main.cc b/src/dmclock/sim/src/test_dmclock_main.cc
new file mode 100644
index 00000000000..c3ba1e18fbd
--- /dev/null
+++ b/src/dmclock/sim/src/test_dmclock_main.cc
@@ -0,0 +1,322 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#include "test_dmclock.h"
+#include "config.h"
+
+#ifdef PROFILE
+#include "profile.h"
+#endif
+
+
+namespace dmc = crimson::dmclock;
+namespace test = crimson::test_dmc;
+namespace sim = crimson::qos_simulation;
+
+using namespace std::placeholders;
+
+
+namespace crimson {
+ namespace test_dmc {
+ void server_data(std::ostream& out,
+ test::MySim* sim,
+ test::MySim::ServerFilter server_disp_filter,
+ int head_w, int data_w, int data_prec);
+
+ void client_data(std::ostream& out,
+ test::MySim* sim,
+ test::MySim::ClientFilter client_disp_filter,
+ int head_w, int data_w, int data_prec);
+ }
+}
+
+
+int main(int argc, char* argv[]) {
+ std::vector<const char*> args;
+ for (int i = 1; i < argc; ++i) {
+ args.push_back(argv[i]);
+ }
+
+ std::string conf_file_list;
+ sim::ceph_argparse_early_args(args, &conf_file_list);
+
+ sim::sim_config_t g_conf;
+ std::vector<sim::cli_group_t> &cli_group = g_conf.cli_group;
+ std::vector<sim::srv_group_t> &srv_group = g_conf.srv_group;
+
+ if (!conf_file_list.empty()) {
+ int ret;
+ ret = sim::parse_config_file(conf_file_list, g_conf);
+ if (ret) {
+ // error
+ _exit(1);
+ }
+ } else {
+ // default simulation parameter
+ g_conf.client_groups = 2;
+
+ sim::srv_group_t st;
+ srv_group.push_back(st);
+
+ sim::cli_group_t ct1(99, 0);
+ cli_group.push_back(ct1);
+
+ sim::cli_group_t ct2(1, 10);
+ cli_group.push_back(ct2);
+ }
+
+ const uint server_groups = g_conf.server_groups;
+ const uint client_groups = g_conf.client_groups;
+ const bool server_random_selection = g_conf.server_random_selection;
+ const bool server_soft_limit = g_conf.server_soft_limit;
+ uint server_total_count = 0;
+ uint client_total_count = 0;
+
+ for (uint i = 0; i < client_groups; ++i) {
+ client_total_count += cli_group[i].client_count;
+ }
+
+ for (uint i = 0; i < server_groups; ++i) {
+ server_total_count += srv_group[i].server_count;
+ }
+
+ std::vector<test::dmc::ClientInfo> client_info;
+ for (uint i = 0; i < client_groups; ++i) {
+ client_info.push_back(test::dmc::ClientInfo
+ { cli_group[i].client_reservation,
+ cli_group[i].client_weight,
+ cli_group[i].client_limit } );
+ }
+
+ auto ret_client_group_f = [&](const ClientId& c) -> uint {
+ uint group_max = 0;
+ uint i = 0;
+ for (; i < client_groups; ++i) {
+ group_max += cli_group[i].client_count;
+ if (c < group_max) {
+ break;
+ }
+ }
+ return i;
+ };
+
+ auto ret_server_group_f = [&](const ServerId& s) -> uint {
+ uint group_max = 0;
+ uint i = 0;
+ for (; i < server_groups; ++i) {
+ group_max += srv_group[i].server_count;
+ if (s < group_max) {
+ break;
+ }
+ }
+ return i;
+ };
+
+ auto client_info_f = [=](const ClientId& c) -> test::dmc::ClientInfo {
+ return client_info[ret_client_group_f(c)];
+ };
+
+ auto client_disp_filter = [=] (const ClientId& i) -> bool {
+ return i < 3 || i >= (client_total_count - 3);
+ };
+
+ auto server_disp_filter = [=] (const ServerId& i) -> bool {
+ return i < 3 || i >= (server_total_count - 3);
+ };
+
+
+ test::MySim *simulation;
+
+
+ // lambda to post a request to the identified server; called by client
+ test::SubmitFunc server_post_f =
+ [&simulation](const ServerId& server,
+ const sim::TestRequest& request,
+ const ClientId& client_id,
+ const test::dmc::ReqParams& req_params) {
+ test::DmcServer& s = simulation->get_server(server);
+ s.post(request, client_id, req_params);
+ };
+
+ std::vector<std::vector<sim::CliInst>> cli_inst;
+ for (uint i = 0; i < client_groups; ++i) {
+ if (cli_group[i].client_wait == std::chrono::seconds(0)) {
+ cli_inst.push_back(
+ { { sim::req_op,
+ (uint32_t)cli_group[i].client_total_ops,
+ (double)cli_group[i].client_iops_goal,
+ (uint16_t)cli_group[i].client_outstanding_ops } } );
+ } else {
+ cli_inst.push_back(
+ { { sim::wait_op, cli_group[i].client_wait },
+ { sim::req_op,
+ (uint32_t)cli_group[i].client_total_ops,
+ (double)cli_group[i].client_iops_goal,
+ (uint16_t)cli_group[i].client_outstanding_ops } } );
+ }
+ }
+
+ simulation = new test::MySim();
+
+ test::DmcServer::ClientRespFunc client_response_f =
+ [&simulation](ClientId client_id,
+ const sim::TestResponse& resp,
+ const ServerId& server_id,
+ const dmc::PhaseType& phase) {
+ simulation->get_client(client_id).receive_response(resp,
+ server_id,
+ phase);
+ };
+
+ test::CreateQueueF create_queue_f =
+ [&](test::DmcQueue::CanHandleRequestFunc can_f,
+ test::DmcQueue::HandleRequestFunc handle_f) -> test::DmcQueue* {
+ return new test::DmcQueue(client_info_f, can_f, handle_f, server_soft_limit);
+ };
+
+
+ auto create_server_f = [&](ServerId id) -> test::DmcServer* {
+ uint i = ret_server_group_f(id);
+ return new test::DmcServer(id,
+ srv_group[i].server_iops,
+ srv_group[i].server_threads,
+ client_response_f,
+ test::dmc_server_accumulate_f,
+ create_queue_f);
+ };
+
+ auto create_client_f = [&](ClientId id) -> test::DmcClient* {
+ uint i = ret_client_group_f(id);
+ test::MySim::ClientBasedServerSelectFunc server_select_f;
+ uint client_server_select_range = cli_group[i].client_server_select_range;
+ if (!server_random_selection) {
+ server_select_f = simulation->make_server_select_alt_range(client_server_select_range);
+ } else {
+ server_select_f = simulation->make_server_select_ran_range(client_server_select_range);
+ }
+ return new test::DmcClient(id,
+ server_post_f,
+ std::bind(server_select_f, _1, id),
+ test::dmc_client_accumulate_f,
+ cli_inst[i]);
+ };
+
+#if 1
+ std::cout << "[global]" << std::endl << g_conf << std::endl;
+ for (uint i = 0; i < client_groups; ++i) {
+ std::cout << std::endl << "[client." << i << "]" << std::endl;
+ std::cout << cli_group[i] << std::endl;
+ }
+ for (uint i = 0; i < server_groups; ++i) {
+ std::cout << std::endl << "[server." << i << "]" << std::endl;
+ std::cout << srv_group[i] << std::endl;
+ }
+ std::cout << std::endl;
+#endif
+
+ simulation->add_servers(server_total_count, create_server_f);
+ simulation->add_clients(client_total_count, create_client_f);
+
+ simulation->run();
+ simulation->display_stats(std::cout,
+ &test::server_data, &test::client_data,
+ server_disp_filter, client_disp_filter);
+} // main
+
+
+void test::client_data(std::ostream& out,
+ test::MySim* sim,
+ test::MySim::ClientFilter client_disp_filter,
+ int head_w, int data_w, int data_prec) {
+ // report how many ops were done by reservation and proportion for
+ // each client
+
+ int total_r = 0;
+ out << std::setw(head_w) << "res_ops:";
+ for (uint i = 0; i < sim->get_client_count(); ++i) {
+ const auto& client = sim->get_client(i);
+ auto r = client.get_accumulator().reservation_count;
+ total_r += r;
+ if (!client_disp_filter(i)) continue;
+ out << " " << std::setw(data_w) << r;
+ }
+ out << " " << std::setw(data_w) << std::setprecision(data_prec) <<
+ std::fixed << total_r << std::endl;
+
+ int total_p = 0;
+ out << std::setw(head_w) << "prop_ops:";
+ for (uint i = 0; i < sim->get_client_count(); ++i) {
+ const auto& client = sim->get_client(i);
+ auto p = client.get_accumulator().proportion_count;
+ total_p += p;
+ if (!client_disp_filter(i)) continue;
+ out << " " << std::setw(data_w) << p;
+ }
+ out << " " << std::setw(data_w) << std::setprecision(data_prec) <<
+ std::fixed << total_p << std::endl;
+}
+
+
+void test::server_data(std::ostream& out,
+ test::MySim* sim,
+ test::MySim::ServerFilter server_disp_filter,
+ int head_w, int data_w, int data_prec) {
+ out << std::setw(head_w) << "res_ops:";
+ int total_r = 0;
+ for (uint i = 0; i < sim->get_server_count(); ++i) {
+ const auto& server = sim->get_server(i);
+ auto rc = server.get_accumulator().reservation_count;
+ total_r += rc;
+ if (!server_disp_filter(i)) continue;
+ out << " " << std::setw(data_w) << rc;
+ }
+ out << " " << std::setw(data_w) << std::setprecision(data_prec) <<
+ std::fixed << total_r << std::endl;
+
+ out << std::setw(head_w) << "prop_ops:";
+ int total_p = 0;
+ for (uint i = 0; i < sim->get_server_count(); ++i) {
+ const auto& server = sim->get_server(i);
+ auto pc = server.get_accumulator().proportion_count;
+ total_p += pc;
+ if (!server_disp_filter(i)) continue;
+ out << " " << std::setw(data_w) << pc;
+ }
+ out << " " << std::setw(data_w) << std::setprecision(data_prec) <<
+ std::fixed << total_p << std::endl;
+
+ const auto& q = sim->get_server(0).get_priority_queue();
+ out << std::endl <<
+ " k-way heap: " << q.get_heap_branching_factor() << std::endl
+ << std::endl;
+
+#ifdef PROFILE
+ crimson::ProfileCombiner<std::chrono::nanoseconds> art_combiner;
+ crimson::ProfileCombiner<std::chrono::nanoseconds> rct_combiner;
+ for (uint i = 0; i < sim->get_server_count(); ++i) {
+ const auto& q = sim->get_server(i).get_priority_queue();
+ const auto& art = q.add_request_timer;
+ art_combiner.combine(art);
+ const auto& rct = q.request_complete_timer;
+ rct_combiner.combine(rct);
+ }
+ out << "Server add_request_timer: count:" << art_combiner.get_count() <<
+ ", mean:" << art_combiner.get_mean() <<
+ ", std_dev:" << art_combiner.get_std_dev() <<
+ ", low:" << art_combiner.get_low() <<
+ ", high:" << art_combiner.get_high() << std::endl;
+ out << "Server request_complete_timer: count:" << rct_combiner.get_count() <<
+ ", mean:" << rct_combiner.get_mean() <<
+ ", std_dev:" << rct_combiner.get_std_dev() <<
+ ", low:" << rct_combiner.get_low() <<
+ ", high:" << rct_combiner.get_high() << std::endl;
+ out << "Server combined mean: " <<
+ (art_combiner.get_mean() + rct_combiner.get_mean()) <<
+ std::endl;
+#endif
+}
diff --git a/src/dmclock/sim/src/test_ssched.cc b/src/dmclock/sim/src/test_ssched.cc
new file mode 100644
index 00000000000..e28b015cbdb
--- /dev/null
+++ b/src/dmclock/sim/src/test_ssched.cc
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#include "ssched_recs.h"
+#include "ssched_server.h"
+#include "ssched_client.h"
+
+#include "sim_recs.h"
+#include "sim_server.h"
+#include "sim_client.h"
+
+#include "test_ssched.h"
+
+
+namespace test = crimson::test_simple_scheduler;
+namespace ssched = crimson::simple_scheduler;
+
+
+void test::simple_server_accumulate_f(test::SimpleAccum& a,
+ const ssched::NullData& add_info) {
+ ++a.request_count;
+}
+
+
+void test::simple_client_accumulate_f(test::SimpleAccum& a,
+ const ssched::NullData& ignore) {
+ // empty
+}
diff --git a/src/dmclock/sim/src/test_ssched.h b/src/dmclock/sim/src/test_ssched.h
new file mode 100644
index 00000000000..96ac33ff376
--- /dev/null
+++ b/src/dmclock/sim/src/test_ssched.h
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#include "ssched_server.h"
+#include "ssched_client.h"
+
+#include "sim_recs.h"
+#include "sim_server.h"
+#include "sim_client.h"
+
+#include "simulate.h"
+
+
+namespace crimson {
+ namespace test_simple_scheduler {
+
+ namespace ssched = crimson::simple_scheduler;
+ namespace sim = crimson::qos_simulation;
+
+ using Time = double;
+
+ struct SimpleAccum {
+ uint32_t request_count = 0;
+ };
+
+ using SimpleQueue = ssched::SimpleQueue<ClientId,sim::TestRequest,Time>;
+
+ using SimpleServer = sim::SimulatedServer<SimpleQueue,
+ ssched::ReqParams,
+ ssched::NullData,
+ SimpleAccum>;
+ using SimpleClient = sim::SimulatedClient<ssched::ServiceTracker<ServerId>,
+ ssched::ReqParams,
+ ssched::NullData,
+ SimpleAccum>;
+
+ using CreateQueueF =
+ std::function<SimpleQueue*(SimpleQueue::CanHandleRequestFunc,
+ SimpleQueue::HandleRequestFunc)>;
+
+
+ using MySim = sim::Simulation<ServerId,ClientId,SimpleServer,SimpleClient>;
+
+ using SubmitFunc = SimpleClient::SubmitFunc;
+
+ extern void simple_server_accumulate_f(SimpleAccum& a,
+ const ssched::NullData& add_info);
+
+ extern void simple_client_accumulate_f(SimpleAccum& a,
+ const ssched::NullData& ignore);
+ } // namespace test_simple_scheduler
+} // namespace crimson
diff --git a/src/dmclock/sim/src/test_ssched_main.cc b/src/dmclock/sim/src/test_ssched_main.cc
new file mode 100644
index 00000000000..6df20dc5f89
--- /dev/null
+++ b/src/dmclock/sim/src/test_ssched_main.cc
@@ -0,0 +1,187 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#include "test_ssched.h"
+
+
+#ifdef PROFILE
+#include "profile.h"
+#endif
+
+
+namespace test = crimson::test_simple_scheduler;
+namespace ssched = crimson::simple_scheduler;
+namespace sim = crimson::qos_simulation;
+
+using namespace std::placeholders;
+
+
+namespace crimson {
+ namespace test_simple_scheduler {
+ void client_data(std::ostream& out,
+ test::MySim* sim,
+ test::MySim::ClientFilter client_disp_filter,
+ int head_w, int data_w, int data_prec);
+
+ void server_data(std::ostream& out,
+ test::MySim* sim,
+ test::MySim::ServerFilter server_disp_filter,
+ int head_w, int data_w, int data_prec);
+ } // namespace test_simple_scheduler
+} // namespace crimson
+
+
+int main(int argc, char* argv[]) {
+ // server params
+
+ const uint server_count = 100;
+ const uint server_iops = 40;
+ const uint server_threads = 1;
+
+ // client params
+
+ const uint client_total_ops = 1000;
+ const uint client_count = 100;
+ const uint client_server_select_range = 10;
+ const uint client_wait_count = 1;
+ const uint client_iops_goal = 50;
+ const uint client_outstanding_ops = 100;
+ const std::chrono::seconds client_wait(10);
+
+ auto client_disp_filter = [=] (const ClientId& i) -> bool {
+ return i < 3 || i >= (client_count - 3);
+ };
+
+ auto server_disp_filter = [=] (const ServerId& i) -> bool {
+ return i < 3 || i >= (server_count - 3);
+ };
+
+
+ test::MySim *simulation;
+
+ // lambda to post a request to the identified server; called by client
+ test::SubmitFunc server_post_f =
+ [&simulation](const ServerId& server_id,
+ const sim::TestRequest& request,
+ const ClientId& client_id,
+ const ssched::ReqParams& req_params) {
+ auto& server = simulation->get_server(server_id);
+ server.post(request, client_id, req_params);
+ };
+
+ static std::vector<sim::CliInst> no_wait =
+ { { sim::req_op, client_total_ops, client_iops_goal, client_outstanding_ops } };
+ static std::vector<sim::CliInst> wait =
+ { { sim::wait_op, client_wait },
+ { sim::req_op, client_total_ops, client_iops_goal, client_outstanding_ops } };
+
+ simulation = new test::MySim();
+
+#if 1
+ test::MySim::ClientBasedServerSelectFunc server_select_f =
+ simulation->make_server_select_alt_range(client_server_select_range);
+#elif 0
+ test::MySim::ClientBasedServerSelectFunc server_select_f =
+ std::bind(&test::MySim::server_select_random, simulation, _1, _2);
+#else
+ test::MySim::ClientBasedServerSelectFunc server_select_f =
+ std::bind(&test::MySim::server_select_0, simulation, _1, _2);
+#endif
+
+ test::SimpleServer::ClientRespFunc client_response_f =
+ [&simulation](ClientId client_id,
+ const sim::TestResponse& resp,
+ const ServerId& server_id,
+ const ssched::NullData& resp_params) {
+ simulation->get_client(client_id).receive_response(resp,
+ server_id,
+ resp_params);
+ };
+
+ test::CreateQueueF create_queue_f =
+ [&](test::SimpleQueue::CanHandleRequestFunc can_f,
+ test::SimpleQueue::HandleRequestFunc handle_f) -> test::SimpleQueue* {
+ return new test::SimpleQueue(can_f, handle_f);
+ };
+
+ auto create_server_f = [&](ServerId id) -> test::SimpleServer* {
+ return new test::SimpleServer(id,
+ server_iops, server_threads,
+ client_response_f,
+ test::simple_server_accumulate_f,
+ create_queue_f);
+ };
+
+ auto create_client_f = [&](ClientId id) -> test::SimpleClient* {
+ return new test::SimpleClient(id,
+ server_post_f,
+ std::bind(server_select_f, _1, id),
+ test::simple_client_accumulate_f,
+ id < (client_count - client_wait_count)
+ ? no_wait : wait);
+ };
+
+ simulation->add_servers(server_count, create_server_f);
+ simulation->add_clients(client_count, create_client_f);
+
+ simulation->run();
+ simulation->display_stats(std::cout,
+ &test::server_data, &test::client_data,
+ server_disp_filter, client_disp_filter);
+} // main
+
+
+void test::client_data(std::ostream& out,
+ test::MySim* sim,
+ test::MySim::ClientFilter client_disp_filter,
+ int head_w, int data_w, int data_prec) {
+ // empty
+}
+
+
+void test::server_data(std::ostream& out,
+ test::MySim* sim,
+ test::MySim::ServerFilter server_disp_filter,
+ int head_w, int data_w, int data_prec) {
+ out << std::setw(head_w) << "requests:";
+ int total_req = 0;
+ for (uint i = 0; i < sim->get_server_count(); ++i) {
+ const auto& server = sim->get_server(i);
+ auto req_count = server.get_accumulator().request_count;
+ total_req += req_count;
+ if (!server_disp_filter(i)) continue;
+ out << std::setw(data_w) << req_count;
+ }
+ out << std::setw(data_w) << std::setprecision(data_prec) <<
+ std::fixed << total_req << std::endl;
+
+#ifdef PROFILE
+ crimson::ProfileCombiner<std::chrono::nanoseconds> art_combiner;
+ crimson::ProfileCombiner<std::chrono::nanoseconds> rct_combiner;
+ for (uint i = 0; i < sim->get_server_count(); ++i) {
+ const auto& q = sim->get_server(i).get_priority_queue();
+ const auto& art = q.add_request_timer;
+ art_combiner.combine(art);
+ const auto& rct = q.request_complete_timer;
+ rct_combiner.combine(rct);
+ }
+ out << "Server add_request_timer: count:" << art_combiner.get_count() <<
+ ", mean:" << art_combiner.get_mean() <<
+ ", std_dev:" << art_combiner.get_std_dev() <<
+ ", low:" << art_combiner.get_low() <<
+ ", high:" << art_combiner.get_high() << std::endl;
+ out << "Server request_complete_timer: count:" << rct_combiner.get_count() <<
+ ", mean:" << rct_combiner.get_mean() <<
+ ", std_dev:" << rct_combiner.get_std_dev() <<
+ ", low:" << rct_combiner.get_low() <<
+ ", high:" << rct_combiner.get_high() << std::endl;
+ out << "Server combined mean: " <<
+ (art_combiner.get_mean() + rct_combiner.get_mean()) <<
+ std::endl;
+#endif
+}
diff --git a/src/dmclock/src/CMakeLists.txt b/src/dmclock/src/CMakeLists.txt
new file mode 100644
index 00000000000..691e64cce43
--- /dev/null
+++ b/src/dmclock/src/CMakeLists.txt
@@ -0,0 +1,19 @@
+include_directories(../support/src)
+include_directories(${BOOST_INCLUDE_DIR})
+
+set(local_flags "-Wall -pthread")
+
+set(dmc_srcs dmclock_util.cc ../support/src/run_every.cc)
+
+set_source_files_properties(${dmc_srcs}
+ PROPERTIES
+ COMPILE_FLAGS "${local_flags}"
+ )
+
+if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+ set(warnings_off " -Wno-unused-variable -Wno-unused-function")
+elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+ set(warnings_off " -Wno-unused-but-set-variable -Wno-unused-function")
+endif()
+
+add_library(dmclock STATIC ${dmc_srcs})
diff --git a/src/dmclock/src/dmclock_client.h b/src/dmclock/src/dmclock_client.h
new file mode 100644
index 00000000000..b44e1211b53
--- /dev/null
+++ b/src/dmclock/src/dmclock_client.h
@@ -0,0 +1,194 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+
+
+#pragma once
+
+#include <map>
+#include <deque>
+#include <chrono>
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include <functional>
+
+#include "run_every.h"
+#include "dmclock_util.h"
+#include "dmclock_recs.h"
+
+#include "gtest/gtest_prod.h"
+
+
+namespace crimson {
+ namespace dmclock {
+ struct ServerInfo {
+ Counter delta_prev_req;
+ Counter rho_prev_req;
+ uint32_t my_delta;
+ uint32_t my_rho;
+
+ ServerInfo(Counter _delta_prev_req,
+ Counter _rho_prev_req) :
+ delta_prev_req(_delta_prev_req),
+ rho_prev_req(_rho_prev_req),
+ my_delta(0),
+ my_rho(0)
+ {
+ // empty
+ }
+
+ inline void req_update(Counter delta, Counter rho) {
+ delta_prev_req = delta;
+ rho_prev_req = rho;
+ my_delta = 0;
+ my_rho = 0;
+ }
+
+ inline void resp_update(PhaseType phase) {
+ ++my_delta;
+ if (phase == PhaseType::reservation) ++my_rho;
+ }
+ };
+
+
+ // S is server identifier type
+ template<typename S>
+ class ServiceTracker {
+ FRIEND_TEST(dmclock_client, server_erase);
+
+ using TimePoint = decltype(std::chrono::steady_clock::now());
+ using Duration = std::chrono::milliseconds;
+ using MarkPoint = std::pair<TimePoint,Counter>;
+
+ Counter delta_counter; // # reqs completed
+ Counter rho_counter; // # reqs completed via reservation
+ std::map<S,ServerInfo> server_map;
+ mutable std::mutex data_mtx; // protects Counters and map
+
+ using DataGuard = std::lock_guard<decltype(data_mtx)>;
+
+ // clean config
+
+ std::deque<MarkPoint> clean_mark_points;
+ Duration clean_age; // age at which ServerInfo cleaned
+
+ // NB: All threads declared at end, so they're destructed first!
+
+ std::unique_ptr<RunEvery> cleaning_job;
+
+
+ public:
+
+ // we have to start the counters at 1, as 0 is used in the
+ // cleaning process
+ template<typename Rep, typename Per>
+ ServiceTracker(std::chrono::duration<Rep,Per> _clean_every,
+ std::chrono::duration<Rep,Per> _clean_age) :
+ delta_counter(1),
+ rho_counter(1),
+ clean_age(std::chrono::duration_cast<Duration>(_clean_age))
+ {
+ cleaning_job =
+ std::unique_ptr<RunEvery>(
+ new RunEvery(_clean_every,
+ std::bind(&ServiceTracker::do_clean, this)));
+ }
+
+
+ // the reason we're overloading the constructor rather than
+ // using default values for the arguments is so that callers
+ // have to either use all defaults or specify all timings; with
+ // default arguments they could specify some without others
+ ServiceTracker() :
+ ServiceTracker(std::chrono::minutes(5), std::chrono::minutes(10))
+ {
+ // empty
+ }
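+
+ // Hypothetical instantiation (names assumed for illustration):
+ //
+ // crimson::dmclock::ServiceTracker<ServerId> tracker(
+ // std::chrono::minutes(3), // clean_every
+ // std::chrono::minutes(6)); // clean_age
+ //
+ // Note both durations must be of the same std::chrono type for
+ // template deduction; a caller mixing, say, minutes and seconds
+ // would need an explicit duration_cast.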
+
+
+ /*
+ * Incorporates the RespParams received into the various counter.
+ */
+ void track_resp(const S& server_id, const PhaseType& phase) {
+ DataGuard g(data_mtx);
+
+ auto it = server_map.find(server_id);
+ if (server_map.end() == it) {
+ // this code can only run if a request did not precede the
+ // response or if the record was cleaned up b/w when
+ // the request was made and now
+ ServerInfo si(delta_counter, rho_counter);
+ si.resp_update(phase);
+ server_map.emplace(server_id, si);
+ } else {
+ it->second.resp_update(phase);
+ }
+
+ ++delta_counter;
+ if (PhaseType::reservation == phase) {
+ ++rho_counter;
+ }
+ }
+
+
+ /*
+ * Returns the ReqParams for the given server.
+ */
+ ReqParams get_req_params(const S& server) {
+ DataGuard g(data_mtx);
+ auto it = server_map.find(server);
+ if (server_map.end() == it) {
+ server_map.emplace(server, ServerInfo(delta_counter, rho_counter));
+ return ReqParams(1, 1);
+ } else {
+ Counter delta =
+ 1 + delta_counter - it->second.delta_prev_req - it->second.my_delta;
+ Counter rho =
+ 1 + rho_counter - it->second.rho_prev_req - it->second.my_rho;
+
+ it->second.req_update(delta_counter, rho_counter);
+
+ return ReqParams(uint32_t(delta), uint32_t(rho));
+ }
+ }
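+
+ // Worked example (counter values made up): suppose the last
+ // request to this server saw delta_prev_req = 4 and
+ // rho_prev_req = 2, that my_delta = 2 responses (my_rho = 1)
+ // have arrived from it since, and that the global counters now
+ // read delta_counter = 10 and rho_counter = 5. Then
+ //
+ // delta = 1 + 10 - 4 - 2 = 5
+ // rho = 1 + 5 - 2 - 1 = 3
+ //
+ // and rho <= delta holds, as ReqParams' constructor asserts.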
+
+ private:
+
+ /*
+ * This is being called regularly by RunEvery. Every time it's
+ * called it notes the time and delta counter (mark point) in a
+ * deque. It also looks at the deque to find the most recent
+ * mark point that is older than clean_age. It then walks the
+ * map and delete all server entries that were last used before
+ * that mark point.
+ */
+ void do_clean() {
+ TimePoint now = std::chrono::steady_clock::now();
+ DataGuard g(data_mtx);
+ clean_mark_points.emplace_back(MarkPoint(now, delta_counter));
+
+ Counter earliest = 0;
+ auto point = clean_mark_points.front();
+ while (point.first <= now - clean_age) {
+ earliest = point.second;
+ clean_mark_points.pop_front();
+ point = clean_mark_points.front();
+ }
+
+ if (earliest > 0) {
+ for (auto i = server_map.begin();
+ i != server_map.end();
+ /* empty */) {
+ auto i2 = i++;
+ if (i2->second.delta_prev_req <= earliest) {
+ server_map.erase(i2);
+ }
+ }
+ }
+ } // do_clean
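+
+ // Timeline sketch (using the default clean_every = 5 min and
+ // clean_age = 10 min): mark points accumulate at t = 0, 5, 10,
+ // ...; when do_clean runs at t = 20, the mark from t = 10 is the
+ // newest one at least clean_age old, so any ServerInfo whose
+ // delta_prev_req is at or below that mark's counter is erased.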
+ }; // class ServiceTracker
+ }
+}
diff --git a/src/dmclock/src/dmclock_recs.h b/src/dmclock/src/dmclock_recs.h
new file mode 100644
index 00000000000..f7a5aaadb10
--- /dev/null
+++ b/src/dmclock/src/dmclock_recs.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+
+
+#pragma once
+
+
+#include <ostream>
+#include <assert.h>
+
+
+namespace crimson {
+ namespace dmclock {
+ using Counter = uint64_t;
+
+ enum class PhaseType { reservation, priority };
+
+ inline std::ostream& operator<<(std::ostream& out, const PhaseType& phase) {
+ out << (PhaseType::reservation == phase ? "reservation" : "priority");
+ return out;
+ }
+
+ struct ReqParams {
+ // count of all replies since last request; MUSTN'T BE 0
+ uint32_t delta;
+
+ // count of reservation replies since last request; MUSTN'T BE 0
+ uint32_t rho;
+
+ ReqParams(uint32_t _delta, uint32_t _rho) :
+ delta(_delta),
+ rho(_rho)
+ {
+ assert(0 != delta && 0 != rho && rho <= delta);
+ }
+
+ ReqParams() :
+ ReqParams(1, 1)
+ {
+ // empty
+ }
+
+ ReqParams(const ReqParams& other) :
+ delta(other.delta),
+ rho(other.rho)
+ {
+ // empty
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const ReqParams& rp) {
+ out << "ReqParams{ delta:" << rp.delta <<
+ ", rho:" << rp.rho << " }";
+ return out;
+ }
+ }; // class ReqParams
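+
+ // For illustration: ReqParams(5, 3) says "since my last request
+ // to this server I have received 5 responses in total, 3 of them
+ // in reservation phase"; the default ReqParams() is the
+ // conservative one-of-each. ReqParams(0, 1) or ReqParams(2, 3)
+ // would trip the constructor's assert.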
+ }
+}
diff --git a/src/dmclock/src/dmclock_server.h b/src/dmclock/src/dmclock_server.h
new file mode 100644
index 00000000000..65013063fa7
--- /dev/null
+++ b/src/dmclock/src/dmclock_server.h
@@ -0,0 +1,1588 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+
+
+#pragma once
+
+/* COMPILATION OPTIONS
+ *
+ * By default we include an optimization over the originally published
+ * dmclock algorithm using not the values of rho and delta that were
+ * sent in with a request but instead the most recent rho and delta
+ * values from the request's client. To restore the algorithm's
+ * original behavior, define DO_NOT_DELAY_TAG_CALC (i.e., compiler
+ * argument -DDO_NOT_DELAY_TAG_CALC).
+ *
+ * The prop_heap does not seem to be necessary. The only thing it
+ * would help with is quickly finding the minimum proportion/priority
+ * when an idle client became active. To have the code maintain the
+ * proportional heap, define USE_PROP_HEAP (i.e., compiler argument
+ * -DUSE_PROP_HEAP).
+ */
+
+#include <assert.h>
+
+#include <cmath>
+#include <memory>
+#include <map>
+#include <deque>
+#include <queue>
+#include <atomic>
+#include <mutex>
+#include <condition_variable>
+#include <thread>
+#include <iostream>
+#include <sstream>
+#include <limits>
+#include <functional>
+
+#include <boost/variant.hpp>
+
+#include "indirect_intrusive_heap.h"
+#include "run_every.h"
+#include "dmclock_util.h"
+#include "dmclock_recs.h"
+
+#ifdef PROFILE
+#include "profile.h"
+#endif
+
+#include "gtest/gtest_prod.h"
+
+
+namespace crimson {
+
+ namespace dmclock {
+
+ namespace c = crimson;
+
+ constexpr double max_tag = std::numeric_limits<double>::is_iec559 ?
+ std::numeric_limits<double>::infinity() :
+ std::numeric_limits<double>::max();
+ constexpr double min_tag = std::numeric_limits<double>::is_iec559 ?
+ -std::numeric_limits<double>::infinity() :
+ std::numeric_limits<double>::lowest();
+ constexpr uint tag_modulo = 1000000;
+
+ struct ClientInfo {
+ const double reservation; // minimum
+ const double weight; // proportional
+ const double limit; // maximum
+
+ // multiplicative inverses of above, which we use in calculations
+ // and don't want to recalculate repeatedly
+ const double reservation_inv;
+ const double weight_inv;
+ const double limit_inv;
+
+ // order parameters -- min, "normal", max
+ ClientInfo(double _reservation, double _weight, double _limit) :
+ reservation(_reservation),
+ weight(_weight),
+ limit(_limit),
+ reservation_inv(0.0 == reservation ? 0.0 : 1.0 / reservation),
+ weight_inv( 0.0 == weight ? 0.0 : 1.0 / weight),
+ limit_inv( 0.0 == limit ? 0.0 : 1.0 / limit)
+ {
+ // empty
+ }
+
+
+ friend std::ostream& operator<<(std::ostream& out,
+ const ClientInfo& client) {
+ out <<
+ "{ ClientInfo:: r:" << client.reservation <<
+ " w:" << std::fixed << client.weight <<
+ " l:" << std::fixed << client.limit <<
+ " 1/r:" << std::fixed << client.reservation_inv <<
+ " 1/w:" << std::fixed << client.weight_inv <<
+ " 1/l:" << std::fixed << client.limit_inv <<
+ " }";
+ return out;
+ }
+ }; // class ClientInfo
+
+
+ struct RequestTag {
+ double reservation;
+ double proportion;
+ double limit;
+ bool ready; // true when within limit
+#ifndef DO_NOT_DELAY_TAG_CALC
+ Time arrival;
+#endif
+
+ RequestTag(const RequestTag& prev_tag,
+ const ClientInfo& client,
+ const ReqParams& req_params,
+ const Time& time,
+ const double cost = 0.0) :
+ reservation(cost + tag_calc(time,
+ prev_tag.reservation,
+ client.reservation_inv,
+ req_params.rho,
+ true)),
+ proportion(tag_calc(time,
+ prev_tag.proportion,
+ client.weight_inv,
+ req_params.delta,
+ true)),
+ limit(tag_calc(time,
+ prev_tag.limit,
+ client.limit_inv,
+ req_params.delta,
+ false)),
+ ready(false)
+#ifndef DO_NOT_DELAY_TAG_CALC
+ , arrival(time)
+#endif
+ {
+ assert(reservation < max_tag || proportion < max_tag);
+ }
+
+ RequestTag(double _res, double _prop, double _lim, const Time& _arrival) :
+ reservation(_res),
+ proportion(_prop),
+ limit(_lim),
+ ready(false)
+#ifndef DO_NOT_DELAY_TAG_CALC
+ , arrival(_arrival)
+#endif
+ {
+ assert(reservation < max_tag || proportion < max_tag);
+ }
+
+ RequestTag(const RequestTag& other) :
+ reservation(other.reservation),
+ proportion(other.proportion),
+ limit(other.limit),
+ ready(other.ready)
+#ifndef DO_NOT_DELAY_TAG_CALC
+ , arrival(other.arrival)
+#endif
+ {
+ // empty
+ }
+
+ static std::string format_tag_change(double before, double after) {
+ if (before == after) {
+ return std::string("same");
+ } else {
+ std::stringstream ss;
+ ss << format_tag(before) << "=>" << format_tag(after);
+ return ss.str();
+ }
+ }
+
+ static std::string format_tag(double value) {
+ if (max_tag == value) {
+ return std::string("max");
+ } else if (min_tag == value) {
+ return std::string("min");
+ } else {
+ return format_time(value, tag_modulo);
+ }
+ }
+
+ private:
+
+ static double tag_calc(const Time& time,
+ double prev,
+ double increment,
+ uint32_t dist_req_val,
+ bool extreme_is_high) {
+ if (0.0 == increment) {
+ return extreme_is_high ? max_tag : min_tag;
+ } else {
+ if (0 != dist_req_val) {
+ increment *= dist_req_val;
+ }
+ return std::max(time, prev + increment);
+ }
+ }
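+
+ // In dmClock-paper terms this computes, for the reservation tag
+ // (increment = 1/r, dist_req_val = rho; a sketch that elides the
+ // cost term added by the caller):
+ //
+ // R_i = max( now, R_{i-1} + rho / r )
+ //
+ // and analogously the proportion tag with delta/w and the limit
+ // tag with delta/l; a zero increment (an unset r, w or l) pins
+ // the tag to max_tag or min_tag instead.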
+
+ friend std::ostream& operator<<(std::ostream& out,
+ const RequestTag& tag) {
+ out <<
+ "{ RequestTag:: ready:" << (tag.ready ? "true" : "false") <<
+ " r:" << format_tag(tag.reservation) <<
+ " p:" << format_tag(tag.proportion) <<
+ " l:" << format_tag(tag.limit) <<
+#if 0 // try to resolve this to make sure Time is operator<<'able.
+#ifndef DO_NOT_DELAY_TAG_CALC
+ " arrival:" << tag.arrival <<
+#endif
+#endif
+ " }";
+ return out;
+ }
+ }; // class RequestTag
+
+
+ // C is client identifier type, R is request type, B is heap
+ // branching factor
+ template<typename C, typename R, uint B>
+ class PriorityQueueBase {
+ FRIEND_TEST(dmclock_server, client_idle_erase);
+
+ public:
+
+ using RequestRef = std::unique_ptr<R>;
+
+ protected:
+
+ using TimePoint = decltype(std::chrono::steady_clock::now());
+ using Duration = std::chrono::milliseconds;
+ using MarkPoint = std::pair<TimePoint,Counter>;
+
+ enum class ReadyOption {ignore, lowers, raises};
+
+ // forward decl for friend decls
+ template<double RequestTag::*, ReadyOption, bool>
+ struct ClientCompare;
+
+ class ClientReq {
+ friend PriorityQueueBase;
+
+ RequestTag tag;
+ C client_id;
+ RequestRef request;
+
+ public:
+
+ ClientReq(const RequestTag& _tag,
+ const C& _client_id,
+ RequestRef&& _request) :
+ tag(_tag),
+ client_id(_client_id),
+ request(std::move(_request))
+ {
+ // empty
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const ClientReq& c) {
+ out << "{ ClientReq:: tag:" << c.tag << " client:" <<
+ c.client_id << " }";
+ return out;
+ }
+ }; // class ClientReq
+
+ public:
+
+ // NOTE: ClientRec is in the "public" section for compatibility
+ // with g++ 4.8.4, which complains if it's not. With g++ 6.3.1
+ // ClientRec could be "protected" with no issue. [See comments
+ // associated with function submit_top_request.]
+ class ClientRec {
+ friend PriorityQueueBase<C,R,B>;
+
+ C client;
+ RequestTag prev_tag;
+ std::deque<ClientReq> requests;
+
+ // amount added from the proportion tag as a result of
+ // an idle client becoming unidle
+ double prop_delta = 0.0;
+
+ c::IndIntruHeapData reserv_heap_data;
+ c::IndIntruHeapData lim_heap_data;
+ c::IndIntruHeapData ready_heap_data;
+#if USE_PROP_HEAP
+ c::IndIntruHeapData prop_heap_data;
+#endif
+
+ public:
+
+ ClientInfo info;
+ bool idle;
+ Counter last_tick;
+ uint32_t cur_rho;
+ uint32_t cur_delta;
+
+ ClientRec(C _client,
+ const ClientInfo& _info,
+ Counter current_tick) :
+ client(_client),
+ prev_tag(0.0, 0.0, 0.0, TimeZero),
+ info(_info),
+ idle(true),
+ last_tick(current_tick),
+ cur_rho(1),
+ cur_delta(1)
+ {
+ // empty
+ }
+
+ inline const RequestTag& get_req_tag() const {
+ return prev_tag;
+ }
+
+ static inline void assign_unpinned_tag(double& lhs, const double rhs) {
+ if (rhs != max_tag && rhs != min_tag) {
+ lhs = rhs;
+ }
+ }
+
+ inline void update_req_tag(const RequestTag& _prev,
+ const Counter& _tick) {
+ assign_unpinned_tag(prev_tag.reservation, _prev.reservation);
+ assign_unpinned_tag(prev_tag.limit, _prev.limit);
+ assign_unpinned_tag(prev_tag.proportion, _prev.proportion);
+ last_tick = _tick;
+ }
+
+ inline void add_request(const RequestTag& tag,
+ const C& client_id,
+ RequestRef&& request) {
+ requests.emplace_back(ClientReq(tag, client_id, std::move(request)));
+ }
+
+ inline const ClientReq& next_request() const {
+ return requests.front();
+ }
+
+ inline ClientReq& next_request() {
+ return requests.front();
+ }
+
+ inline void pop_request() {
+ requests.pop_front();
+ }
+
+ inline bool has_request() const {
+ return !requests.empty();
+ }
+
+ inline size_t request_count() const {
+ return requests.size();
+ }
+
+ // NB: because a deque is the underlying structure, this
+ // operation might be expensive
+ bool remove_by_req_filter_fw(std::function<bool(const R&)> filter_accum) {
+ bool any_removed = false;
+ for (auto i = requests.begin();
+ i != requests.end();
+ /* no inc */) {
+ if (filter_accum(*i->request)) {
+ any_removed = true;
+ i = requests.erase(i);
+ } else {
+ ++i;
+ }
+ }
+ return any_removed;
+ }
+
+ // NB: because a deque is the underlying structure, this
+ // operation might be expensive
+ bool remove_by_req_filter_bw(std::function<bool(const R&)> filter_accum) {
+ bool any_removed = false;
+ for (auto i = requests.rbegin();
+ i != requests.rend();
+ /* no inc */) {
+ if (filter_accum(*i->request)) {
+ any_removed = true;
+ i = decltype(i){ requests.erase(std::next(i).base()) };
+ } else {
+ ++i;
+ }
+ }
+ return any_removed;
+ }
+
+ inline bool
+ remove_by_req_filter(std::function<bool(const R&)> filter_accum,
+ bool visit_backwards) {
+ if (visit_backwards) {
+ return remove_by_req_filter_bw(filter_accum);
+ } else {
+ return remove_by_req_filter_fw(filter_accum);
+ }
+ }
+
+ friend std::ostream&
+ operator<<(std::ostream& out,
+ const typename PriorityQueueBase<C,R,B>::ClientRec& e) {
+ out << "{ ClientRec::" <<
+ " client:" << e.client <<
+ " prev_tag:" << e.prev_tag <<
+ " req_count:" << e.requests.size() <<
+ " top_req:";
+ if (e.has_request()) {
+ out << e.next_request();
+ } else {
+ out << "none";
+ }
+ out << " }";
+
+ return out;
+ }
+ }; // class ClientRec
+
+ using ClientRecRef = std::shared_ptr<ClientRec>;
+
+ // when we try to get the next request, we'll be in one of three
+ // situations -- we'll have one to return, have one that can
+ // fire in the future, or not have any
+ enum class NextReqType { returning, future, none };
+
+ // specifies which queue next request will get popped from
+ enum class HeapId { reservation, ready };
+
+ // this is returned from next_req to tell the caller the situation
+ struct NextReq {
+ NextReqType type;
+ union {
+ HeapId heap_id;
+ Time when_ready;
+ };
+ };
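+
+ // Caller-side sketch (hypothetical; next_req itself appears
+ // further down in this class):
+ //
+ // NextReq next = next_req(now);
+ // switch (next.type) {
+ // case NextReqType::returning: /* pop from next.heap_id */ break;
+ // case NextReqType::future: /* retry at next.when_ready */ break;
+ // case NextReqType::none: /* nothing queued */ break;
+ // }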
+
+
+ // a function that can be called to look up client information
+ using ClientInfoFunc = std::function<ClientInfo(const C&)>;
+
+
+ bool empty() const {
+ DataGuard g(data_mtx);
+ return (resv_heap.empty() || ! resv_heap.top().has_request());
+ }
+
+
+ size_t client_count() const {
+ DataGuard g(data_mtx);
+ return resv_heap.size();
+ }
+
+
+ size_t request_count() const {
+ DataGuard g(data_mtx);
+ size_t total = 0;
+ for (auto i = resv_heap.cbegin(); i != resv_heap.cend(); ++i) {
+ total += i->request_count();
+ }
+ return total;
+ }
+
+
+ bool remove_by_req_filter(std::function<bool(const R&)> filter_accum,
+ bool visit_backwards = false) {
+ bool any_removed = false;
+ DataGuard g(data_mtx);
+ for (auto i : client_map) {
+ bool modified =
+ i.second->remove_by_req_filter(filter_accum, visit_backwards);
+ if (modified) {
+ resv_heap.adjust(*i.second);
+ limit_heap.adjust(*i.second);
+ ready_heap.adjust(*i.second);
+#if USE_PROP_HEAP
+ prop_heap.adjust(*i.second);
+#endif
+ any_removed = true;
+ }
+ }
+ return any_removed;
+ }
+
+
+ // used as the default value when no accumulator is provided
+ static void request_sink(const R& req) {
+ // do nothing
+ }
+
+
+ void remove_by_client(const C& client,
+ bool reverse = false,
+ std::function<void (const R&)> accum = request_sink) {
+ DataGuard g(data_mtx);
+
+ auto i = client_map.find(client);
+
+ if (i == client_map.end()) return;
+
+ if (reverse) {
+ for (auto j = i->second->requests.rbegin();
+ j != i->second->requests.rend();
+ ++j) {
+ accum(*j->request);
+ }
+ } else {
+ for (auto j = i->second->requests.begin();
+ j != i->second->requests.end();
+ ++j) {
+ accum(*j->request);
+ }
+ }
+
+ i->second->requests.clear();
+
+ resv_heap.adjust(*i->second);
+ limit_heap.adjust(*i->second);
+ ready_heap.adjust(*i->second);
+#if USE_PROP_HEAP
+ prop_heap.adjust(*i->second);
+#endif
+ }
+
+
+ uint get_heap_branching_factor() const {
+ return B;
+ }
+
+
+ friend std::ostream& operator<<(std::ostream& out,
+ const PriorityQueueBase& q) {
+ std::lock_guard<decltype(q.data_mtx)> guard(q.data_mtx);
+
+ out << "{ PriorityQueue::";
+ for (const auto& c : q.client_map) {
+ out << " { client:" << c.first << ", record:" << *c.second <<
+ " }";
+ }
+ if (!q.resv_heap.empty()) {
+ const auto& resv = q.resv_heap.top();
+ out << " { reservation_top:" << resv << " }";
+ const auto& ready = q.ready_heap.top();
+ out << " { ready_top:" << ready << " }";
+ const auto& limit = q.limit_heap.top();
+ out << " { limit_top:" << limit << " }";
+ } else {
+ out << " HEAPS-EMPTY";
+ }
+ out << " }";
+
+ return out;
+ }
+
+ // for debugging
+ void display_queues(std::ostream& out,
+ bool show_res = true,
+ bool show_lim = true,
+ bool show_ready = true,
+ bool show_prop = true) const {
+ auto filter = [](const ClientRec& e)->bool { return true; };
+ DataGuard g(data_mtx);
+ if (show_res) {
+ resv_heap.display_sorted(out << "RESER:", filter);
+ }
+ if (show_lim) {
+ limit_heap.display_sorted(out << "LIMIT:", filter);
+ }
+ if (show_ready) {
+ ready_heap.display_sorted(out << "READY:", filter);
+ }
+#if USE_PROP_HEAP
+ if (show_prop) {
+ prop_heap.display_sorted(out << "PROPO:", filter);
+ }
+#endif
+ } // display_queues
+
+
+ protected:
+
+ // The ClientCompare functor is essentially doing a precedes?
+ // operator, returning true if and only if the first parameter
+ // must precede the second parameter. If the second must precede
+ // the first, or if they are equivalent, false should be
+ // returned. The reason for this behavior is that it will be
+ // called to test if two items are out of order and if true is
+ // returned it will reverse the items. Therefore false is the
+ // default return when it doesn't matter to prevent unnecessary
+ // re-ordering.
+ //
+ // The template is supporting variations in sorting based on the
+ // heap in question and allowing these variations to be handled
+ // at compile-time.
+ //
+ // tag_field determines which tag is being used for comparison
+ //
+ // ready_opt determines how the ready flag influences the sort
+ //
+ // use_prop_delta determines whether the proportional delta is
+ // added in for comparison
+ template<double RequestTag::*tag_field,
+ ReadyOption ready_opt,
+ bool use_prop_delta>
+ struct ClientCompare {
+ bool operator()(const ClientRec& n1, const ClientRec& n2) const {
+ if (n1.has_request()) {
+ if (n2.has_request()) {
+ const auto& t1 = n1.next_request().tag;
+ const auto& t2 = n2.next_request().tag;
+ if (ReadyOption::ignore == ready_opt || t1.ready == t2.ready) {
+ // if we don't care about ready or the ready values are the same
+ if (use_prop_delta) {
+ return (t1.*tag_field + n1.prop_delta) <
+ (t2.*tag_field + n2.prop_delta);
+ } else {
+ return t1.*tag_field < t2.*tag_field;
+ }
+ } else if (ReadyOption::raises == ready_opt) {
+ // ready_opt is ReadyOption::raises and the ready flags differ
+ return t1.ready;
+ } else {
+ return t2.ready;
+ }
+ } else {
+ // n1 has request but n2 does not
+ return true;
+ }
+ } else if (n2.has_request()) {
+ // n2 has request but n1 does not
+ return false;
+ } else {
+ // neither has a request; keep order stable by returning false
+ return false;
+ }
+ }
+ };
+
+ ClientInfoFunc client_info_f;
+
+ mutable std::mutex data_mtx;
+ using DataGuard = std::lock_guard<decltype(data_mtx)>;
+
+ // stable mapping between client ids and client queues
+ std::map<C,ClientRecRef> client_map;
+
+ c::IndIntruHeap<ClientRecRef,
+ ClientRec,
+ &ClientRec::reserv_heap_data,
+ ClientCompare<&RequestTag::reservation,
+ ReadyOption::ignore,
+ false>,
+ B> resv_heap;
+#if USE_PROP_HEAP
+ c::IndIntruHeap<ClientRecRef,
+ ClientRec,
+ &ClientRec::prop_heap_data,
+ ClientCompare<&RequestTag::proportion,
+ ReadyOption::ignore,
+ true>,
+ B> prop_heap;
+#endif
+ c::IndIntruHeap<ClientRecRef,
+ ClientRec,
+ &ClientRec::lim_heap_data,
+ ClientCompare<&RequestTag::limit,
+ ReadyOption::lowers,
+ false>,
+ B> limit_heap;
+ c::IndIntruHeap<ClientRecRef,
+ ClientRec,
+ &ClientRec::ready_heap_data,
+ ClientCompare<&RequestTag::proportion,
+ ReadyOption::raises,
+ true>,
+ B> ready_heap;
+
+ // if all reservations are met and all other requests are under
+ // limit, this will allow the request next in terms of
+ // proportion to still get issued
+ bool allow_limit_break;
+
+ std::atomic_bool finishing;
+
+ // every request creates a tick
+ Counter tick = 0;
+
+ // performance data collection
+ size_t reserv_sched_count = 0;
+ size_t prop_sched_count = 0;
+ size_t limit_break_sched_count = 0;
+
+ Duration idle_age;
+ Duration erase_age;
+ Duration check_time;
+ std::deque<MarkPoint> clean_mark_points;
+
+ // NB: All threads declared at end, so they're destructed first!
+
+ std::unique_ptr<RunEvery> cleaning_job;
+
+
+ // COMMON constructor that others feed into; the template lets
+ // the three durations be passed in any std::chrono::duration type
+ template<typename Rep, typename Per>
+ PriorityQueueBase(ClientInfoFunc _client_info_f,
+ std::chrono::duration<Rep,Per> _idle_age,
+ std::chrono::duration<Rep,Per> _erase_age,
+ std::chrono::duration<Rep,Per> _check_time,
+ bool _allow_limit_break) :
+ client_info_f(_client_info_f),
+ allow_limit_break(_allow_limit_break),
+ finishing(false),
+ idle_age(std::chrono::duration_cast<Duration>(_idle_age)),
+ erase_age(std::chrono::duration_cast<Duration>(_erase_age)),
+ check_time(std::chrono::duration_cast<Duration>(_check_time))
+ {
+ assert(_erase_age >= _idle_age);
+ assert(_check_time < _idle_age);
+ cleaning_job =
+ std::unique_ptr<RunEvery>(
+ new RunEvery(check_time,
+ std::bind(&PriorityQueueBase::do_clean, this)));
+ }
+
+
+ ~PriorityQueueBase() {
+ finishing = true;
+ }
+
+
+ // data_mtx must be held by caller
+ void do_add_request(RequestRef&& request,
+ const C& client_id,
+ const ReqParams& req_params,
+ const Time time,
+ const double cost = 0.0) {
+ ++tick;
+
+ // this pointer will let us get a reference to the ClientRec
+ // owned by a shared pointer, no matter which of the two code
+ // paths we take
+ ClientRec* temp_client;
+
+ auto client_it = client_map.find(client_id);
+ if (client_map.end() != client_it) {
+ temp_client = &(*client_it->second); // address of obj of shared_ptr
+ } else {
+ ClientInfo info = client_info_f(client_id);
+ ClientRecRef client_rec =
+ std::make_shared<ClientRec>(client_id, info, tick);
+ resv_heap.push(client_rec);
+#if USE_PROP_HEAP
+ prop_heap.push(client_rec);
+#endif
+ limit_heap.push(client_rec);
+ ready_heap.push(client_rec);
+ client_map[client_id] = client_rec;
+ temp_client = &(*client_rec); // address of obj of shared_ptr
+ }
+
+ // for convenience, create a reference to the ClientRec itself
+ ClientRec& client = *temp_client;
+
+ if (client.idle) {
+ // We need to do an adjustment so that idle clients compete
+ // fairly on proportional tags since those tags may have
+ // drifted from real-time. Either use the lowest existing
+ // proportion tag -- O(1) -- or the client with the lowest
+ // previous proportion tag -- O(n) where n = # clients.
+ //
+ // Since we don't maintain a queue ordered on the proportional
+ // tag alone (we use a ready queue instead), we have to check
+ // each client.
+ //
+ // The alternative would be to maintain a proportional heap
+ // (define USE_PROP_HEAP) and do an O(1) operation here.
+
+ // We were unable to confirm whether equality testing on
+ // std::numeric_limits<double>::max() is guaranteed, so we use
+ // a compile-time calculated trigger that is one third of the
+ // max, which should be much larger than any expected organic
+ // value.
+ constexpr double lowest_prop_tag_trigger =
+ std::numeric_limits<double>::max() / 3.0;
+
+ double lowest_prop_tag = std::numeric_limits<double>::max();
+ for (auto const &c : client_map) {
+ // don't use ourselves (or anything else that might be
+ // listed as idle) since we're now in the map
+ if (!c.second->idle) {
+ double p;
+ // use either lowest proportion tag or previous proportion tag
+ if (c.second->has_request()) {
+ p = c.second->next_request().tag.proportion +
+ c.second->prop_delta;
+ } else {
+ p = c.second->get_req_tag().proportion + c.second->prop_delta;
+ }
+
+ if (p < lowest_prop_tag) {
+ lowest_prop_tag = p;
+ }
+ }
+ }
+
+ // if this conditional does not fire, no non-idle client was
+ // found, so there is nothing to adjust against
+ if (lowest_prop_tag < lowest_prop_tag_trigger) {
+ client.prop_delta = lowest_prop_tag - time;
+ }
+ client.idle = false;
+ } // if this client was idle
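+
+ // Worked example (illustrative numbers, not from the source): if
+ // the lowest active proportion tag is 105.0 and this request's
+ // time is 100.0, the client gets prop_delta = 5.0; adding that
+ // delta during comparisons lets the formerly idle client compete
+ // as though its tags had kept pace with the active clients.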
+
+#ifndef DO_NOT_DELAY_TAG_CALC
+ RequestTag tag(0, 0, 0, time);
+
+ if (!client.has_request()) {
+ tag = RequestTag(client.get_req_tag(), client.info,
+ req_params, time, cost);
+
+ // copy tag to previous tag for client
+ client.update_req_tag(tag, tick);
+ }
+#else
+ RequestTag tag(client.get_req_tag(), client.info, req_params, time, cost);
+ // copy tag to previous tag for client
+ client.update_req_tag(tag, tick);
+#endif
+
+ client.add_request(tag, client.client, std::move(request));
+ if (1 == client.requests.size()) {
+ // NB: can the following 4 calls to adjust be changed to
+ // promote? Can adding a request ever demote a client in the
+ // heaps?
+ resv_heap.adjust(client);
+ limit_heap.adjust(client);
+ ready_heap.adjust(client);
+#if USE_PROP_HEAP
+ prop_heap.adjust(client);
+#endif
+ }
+
+ client.cur_rho = req_params.rho;
+ client.cur_delta = req_params.delta;
+
+ resv_heap.adjust(client);
+ limit_heap.adjust(client);
+ ready_heap.adjust(client);
+#if USE_PROP_HEAP
+ prop_heap.adjust(client);
+#endif
+ } // add_request
+
+
+ // data_mtx should be held when called; top of heap should have
+ // a ready request
+ template<typename C1, IndIntruHeapData ClientRec::*C2, typename C3>
+ void pop_process_request(IndIntruHeap<C1, ClientRec, C2, C3, B>& heap,
+ std::function<void(const C& client,
+ RequestRef& request)> process) {
+ // gain access to data
+ ClientRec& top = heap.top();
+ ClientReq& first = top.next_request();
+ RequestRef request = std::move(first.request);
+
+ // pop request and adjust heaps
+ top.pop_request();
+
+#ifndef DO_NOT_DELAY_TAG_CALC
+ if (top.has_request()) {
+ ClientReq& next_first = top.next_request();
+ next_first.tag = RequestTag(first.tag, top.info,
+ ReqParams(top.cur_delta, top.cur_rho),
+ next_first.tag.arrival);
+
+ // copy tag to previous tag for client
+ top.update_req_tag(next_first.tag, tick);
+ }
+#endif
+
+ resv_heap.demote(top);
+ limit_heap.adjust(top);
+#if USE_PROP_HEAP
+ prop_heap.demote(top);
+#endif
+ ready_heap.demote(top);
+
+ // process
+ process(top.client, request);
+ } // pop_process_request
+
+
+ // data_mtx should be held when called
+ void reduce_reservation_tags(ClientRec& client) {
+ for (auto& r : client.requests) {
+ r.tag.reservation -= client.info.reservation_inv;
+
+#ifndef DO_NOT_DELAY_TAG_CALC
+ // only reduce the front tag, because the later tags' values are invalid
+ break;
+#endif
+ }
+ // don't forget to update previous tag
+ client.prev_tag.reservation -= client.info.reservation_inv;
+ resv_heap.promote(client);
+ }
+
+
+ // data_mtx should be held when called
+ void reduce_reservation_tags(const C& client_id) {
+ auto client_it = client_map.find(client_id);
+
+ // means the client was cleaned from map; should never happen
+ // as long as cleaning times are long enough
+ assert(client_map.end() != client_it);
+ reduce_reservation_tags(*client_it->second);
+ }
+
+
+ // data_mtx should be held when called
+ NextReq do_next_request(Time now) {
+ NextReq result;
+
+ // if reservation queue is empty, all are empty (i.e., no active clients)
+ if(resv_heap.empty()) {
+ result.type = NextReqType::none;
+ return result;
+ }
+
+ // try constraint (reservation) based scheduling
+
+ auto& reserv = resv_heap.top();
+ if (reserv.has_request() &&
+ reserv.next_request().tag.reservation <= now) {
+ result.type = NextReqType::returning;
+ result.heap_id = HeapId::reservation;
+ return result;
+ }
+
+ // no existing reservations before now, so try weight-based
+ // scheduling
+
+ // all items that are within limit are eligible based on
+ // priority
+ auto limits = &limit_heap.top();
+ while (limits->has_request() &&
+ !limits->next_request().tag.ready &&
+ limits->next_request().tag.limit <= now) {
+ limits->next_request().tag.ready = true;
+ ready_heap.promote(*limits);
+ limit_heap.demote(*limits);
+
+ limits = &limit_heap.top();
+ }
+
+ auto& readys = ready_heap.top();
+ if (readys.has_request() &&
+ readys.next_request().tag.ready &&
+ readys.next_request().tag.proportion < max_tag) {
+ result.type = NextReqType::returning;
+ result.heap_id = HeapId::ready;
+ return result;
+ }
+
+ // if nothing is schedulable by reservation or
+ // proportion/weight, and if we allow limit break, try to
+ // schedule something with the lowest proportion tag or
+ // alternatively lowest reservation tag.
+ if (allow_limit_break) {
+ if (readys.has_request() &&
+ readys.next_request().tag.proportion < max_tag) {
+ result.type = NextReqType::returning;
+ result.heap_id = HeapId::ready;
+ return result;
+ } else if (reserv.has_request() &&
+ reserv.next_request().tag.reservation < max_tag) {
+ result.type = NextReqType::returning;
+ result.heap_id = HeapId::reservation;
+ return result;
+ }
+ }
+
+ // nothing scheduled; make sure we re-run when next
+ // reservation item or next limited item comes up
+
+ Time next_call = TimeMax;
+ if (resv_heap.top().has_request()) {
+ next_call =
+ min_not_0_time(next_call,
+ resv_heap.top().next_request().tag.reservation);
+ }
+ if (limit_heap.top().has_request()) {
+ const auto& next = limit_heap.top().next_request();
+ assert(!next.tag.ready || max_tag == next.tag.proportion);
+ next_call = min_not_0_time(next_call, next.tag.limit);
+ }
+ if (next_call < TimeMax) {
+ result.type = NextReqType::future;
+ result.when_ready = next_call;
+ return result;
+ } else {
+ result.type = NextReqType::none;
+ return result;
+ }
+ } // do_next_request
+
+
+ // if possible is non-zero and less than current, return it;
+ // otherwise return current; the idea is that we want the
+ // minimal time while ignoring zero
+ static inline const Time& min_not_0_time(const Time& current,
+ const Time& possible) {
+ return TimeZero == possible ? current : std::min(current, possible);
+ }
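+
+ // e.g., min_not_0_time(5.0, 0.0) == 5.0 since a zero "possible"
+ // is ignored, while min_not_0_time(5.0, 3.0) == 3.0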
+
+
+ /*
+ * This is called regularly by RunEvery. Each call records the
+ * current time and tick counter (a mark point) in a deque. It
+ * then uses the deque to find the most recent mark points older
+ * than erase_age and idle_age, walks the client map, erases
+ * entries last used before the former, and marks idle those
+ * last used before the latter.
+ */
+ void do_clean() {
+ TimePoint now = std::chrono::steady_clock::now();
+ DataGuard g(data_mtx);
+ clean_mark_points.emplace_back(MarkPoint(now, tick));
+
+ // first erase the super-old client records
+
+ Counter erase_point = 0;
+ auto point = clean_mark_points.front();
+ while (point.first <= now - erase_age) {
+ erase_point = point.second;
+ clean_mark_points.pop_front();
+ point = clean_mark_points.front();
+ }
+
+ Counter idle_point = 0;
+ for (auto i : clean_mark_points) {
+ if (i.first <= now - idle_age) {
+ idle_point = i.second;
+ } else {
+ break;
+ }
+ }
+
+ if (erase_point > 0 || idle_point > 0) {
+ for (auto i = client_map.begin(); i != client_map.end(); /* empty */) {
+ auto i2 = i++;
+ if (erase_point && i2->second->last_tick <= erase_point) {
+ delete_from_heaps(i2->second);
+ client_map.erase(i2);
+ } else if (idle_point && i2->second->last_tick <= idle_point) {
+ i2->second->idle = true;
+ }
+ } // for
+ } // if
+ } // do_clean
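+
+ // Worked example (using the default durations of the convenience
+ // constructors below: idle_age = 10 min, erase_age = 15 min,
+ // check_time = 6 min): a client whose last_tick predates the mark
+ // point from roughly 10 minutes ago is flagged idle, and once its
+ // last_tick predates the roughly 15-minute-old mark point its
+ // record is removed from client_map and from all of the heaps.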
+
+
+ // data_mtx must be held by caller
+ template<IndIntruHeapData ClientRec::*C1,typename C2>
+ void delete_from_heap(ClientRecRef& client,
+ c::IndIntruHeap<ClientRecRef,ClientRec,C1,C2,B>& heap) {
+ auto i = heap.rfind(client);
+ heap.remove(i);
+ }
+
+
+ // data_mtx must be held by caller
+ void delete_from_heaps(ClientRecRef& client) {
+ delete_from_heap(client, resv_heap);
+#if USE_PROP_HEAP
+ delete_from_heap(client, prop_heap);
+#endif
+ delete_from_heap(client, limit_heap);
+ delete_from_heap(client, ready_heap);
+ }
+ }; // class PriorityQueueBase
+
+
+ template<typename C, typename R, uint B=2>
+ class PullPriorityQueue : public PriorityQueueBase<C,R,B> {
+ using super = PriorityQueueBase<C,R,B>;
+
+ public:
+
+ // When a request is pulled, this is the return type.
+ struct PullReq {
+ struct Retn {
+ C client;
+ typename super::RequestRef request;
+ PhaseType phase;
+ };
+
+ typename super::NextReqType type;
+ boost::variant<Retn,Time> data;
+
+ bool is_none() const { return type == super::NextReqType::none; }
+
+ bool is_retn() const { return type == super::NextReqType::returning; }
+ Retn& get_retn() {
+ return boost::get<Retn>(data);
+ }
+
+ bool is_future() const { return type == super::NextReqType::future; }
+ Time getTime() const { return boost::get<Time>(data); }
+ };
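+
+ // Illustrative use (a sketch; ClientId, MyReq, handle_retn, and
+ // retry_at are hypothetical application names). Given a queue
+ // PullPriorityQueue<ClientId,MyReq> q:
+ //
+ // auto pr = q.pull_request();
+ // if (pr.is_retn()) {
+ // auto& retn = pr.get_retn();
+ // handle_retn(retn.client, std::move(retn.request), retn.phase);
+ // } else if (pr.is_future()) {
+ // retry_at(pr.getTime());
+ // }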
+
+
+#ifdef PROFILE
+ ProfileTimer<std::chrono::nanoseconds> pull_request_timer;
+ ProfileTimer<std::chrono::nanoseconds> add_request_timer;
+#endif
+
+ template<typename Rep, typename Per>
+ PullPriorityQueue(typename super::ClientInfoFunc _client_info_f,
+ std::chrono::duration<Rep,Per> _idle_age,
+ std::chrono::duration<Rep,Per> _erase_age,
+ std::chrono::duration<Rep,Per> _check_time,
+ bool _allow_limit_break = false) :
+ super(_client_info_f,
+ _idle_age, _erase_age, _check_time,
+ _allow_limit_break)
+ {
+ // empty
+ }
+
+
+ // pull convenience constructor
+ PullPriorityQueue(typename super::ClientInfoFunc _client_info_f,
+ bool _allow_limit_break = false) :
+ PullPriorityQueue(_client_info_f,
+ std::chrono::minutes(10),
+ std::chrono::minutes(15),
+ std::chrono::minutes(6),
+ _allow_limit_break)
+ {
+ // empty
+ }
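+
+ // Example construction (a sketch; ClientId and MyReq are assumed
+ // application types, and ClientInfo's (reservation, weight, limit)
+ // constructor is the one defined earlier in this header):
+ //
+ // PullPriorityQueue<ClientId,MyReq> q(
+ // [](const ClientId& c) -> ClientInfo {
+ // // illustrative values only
+ // return ClientInfo(10.0, 1.0, 100.0);
+ // });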
+
+
+ inline void add_request(const R& request,
+ const C& client_id,
+ const ReqParams& req_params,
+ double addl_cost = 0.0) {
+ add_request(typename super::RequestRef(new R(request)),
+ client_id,
+ req_params,
+ get_time(),
+ addl_cost);
+ }
+
+
+ inline void add_request(const R& request,
+ const C& client_id,
+ double addl_cost = 0.0) {
+ static const ReqParams null_req_params;
+ add_request(typename super::RequestRef(new R(request)),
+ client_id,
+ null_req_params,
+ get_time(),
+ addl_cost);
+ }
+
+
+
+ inline void add_request_time(const R& request,
+ const C& client_id,
+ const ReqParams& req_params,
+ const Time time,
+ double addl_cost = 0.0) {
+ add_request(typename super::RequestRef(new R(request)),
+ client_id,
+ req_params,
+ time,
+ addl_cost);
+ }
+
+
+ inline void add_request(typename super::RequestRef&& request,
+ const C& client_id,
+ const ReqParams& req_params,
+ double addl_cost = 0.0) {
+ add_request(std::move(request), client_id, req_params, get_time(), addl_cost);
+ }
+
+
+ inline void add_request(typename super::RequestRef&& request,
+ const C& client_id,
+ double addl_cost = 0.0) {
+ static const ReqParams null_req_params;
+ add_request(std::move(request), client_id, null_req_params, get_time(), addl_cost);
+ }
+
+
+ // this does the work; the versions above provide alternate interfaces
+ void add_request(typename super::RequestRef&& request,
+ const C& client_id,
+ const ReqParams& req_params,
+ const Time time,
+ double addl_cost = 0.0) {
+ typename super::DataGuard g(this->data_mtx);
+#ifdef PROFILE
+ add_request_timer.start();
+#endif
+ super::do_add_request(std::move(request),
+ client_id,
+ req_params,
+ time,
+ addl_cost);
+ // no call to schedule_request for pull version
+#ifdef PROFILE
+ add_request_timer.stop();
+#endif
+ }
+
+
+ inline PullReq pull_request() {
+ return pull_request(get_time());
+ }
+
+
+ PullReq pull_request(Time now) {
+ PullReq result;
+ typename super::DataGuard g(this->data_mtx);
+#ifdef PROFILE
+ pull_request_timer.start();
+#endif
+
+ typename super::NextReq next = super::do_next_request(now);
+ result.type = next.type;
+ switch(next.type) {
+ case super::NextReqType::none:
+ return result;
+ break;
+ case super::NextReqType::future:
+ result.data = next.when_ready;
+ return result;
+ break;
+ case super::NextReqType::returning:
+ // to avoid nesting, break out and let code below handle this case
+ break;
+ default:
+ assert(false);
+ }
+
+ // we'll only get here if we're returning an entry
+
+ auto process_f =
+ [&] (PullReq& pull_result, PhaseType phase) ->
+ std::function<void(const C&,
+ typename super::RequestRef&)> {
+ return [&pull_result, phase](const C& client,
+ typename super::RequestRef& request) {
+ pull_result.data =
+ typename PullReq::Retn{client, std::move(request), phase};
+ };
+ };
+
+ switch(next.heap_id) {
+ case super::HeapId::reservation:
+ super::pop_process_request(this->resv_heap,
+ process_f(result, PhaseType::reservation));
+ ++this->reserv_sched_count;
+ break;
+ case super::HeapId::ready:
+ super::pop_process_request(this->ready_heap,
+ process_f(result, PhaseType::priority));
+ { // need to use retn temporarily
+ auto& retn = boost::get<typename PullReq::Retn>(result.data);
+ super::reduce_reservation_tags(retn.client);
+ }
+ ++this->prop_sched_count;
+ break;
+ default:
+ assert(false);
+ }
+
+#ifdef PROFILE
+ pull_request_timer.stop();
+#endif
+ return result;
+ } // pull_request
+
+
+ protected:
+
+
+ // data_mtx should be held when called; unfortunately this
+ // function has to be repeated in both push & pull
+ // specializations
+ typename super::NextReq next_request() {
+ return next_request(get_time());
+ }
+
+ // data_mtx should be held when called
+ typename super::NextReq next_request(Time now) {
+ return super::do_next_request(now);
+ }
+ }; // class PullPriorityQueue
+
+
+ // PUSH version
+ template<typename C, typename R, uint B=2>
+ class PushPriorityQueue : public PriorityQueueBase<C,R,B> {
+
+ protected:
+
+ using super = PriorityQueueBase<C,R,B>;
+
+ public:
+
+ // a function to see whether the server can handle another request
+ using CanHandleRequestFunc = std::function<bool(void)>;
+
+ // a function to submit a request to the server; the second
+ // parameter is a callback when it's completed
+ using HandleRequestFunc =
+ std::function<void(const C&,typename super::RequestRef,PhaseType)>;
+
+ protected:
+
+ CanHandleRequestFunc can_handle_f;
+ HandleRequestFunc handle_f;
+ // for handling timed scheduling
+ std::mutex sched_ahead_mtx;
+ std::condition_variable sched_ahead_cv;
+ Time sched_ahead_when = TimeZero;
+
+#ifdef PROFILE
+ public:
+ ProfileTimer<std::chrono::nanoseconds> add_request_timer;
+ ProfileTimer<std::chrono::nanoseconds> request_complete_timer;
+ protected:
+#endif
+
+ // NB: threads declared last, so constructed last and destructed first
+
+ std::thread sched_ahead_thd;
+
+ public:
+
+ // push full constructor
+ template<typename Rep, typename Per>
+ PushPriorityQueue(typename super::ClientInfoFunc _client_info_f,
+ CanHandleRequestFunc _can_handle_f,
+ HandleRequestFunc _handle_f,
+ std::chrono::duration<Rep,Per> _idle_age,
+ std::chrono::duration<Rep,Per> _erase_age,
+ std::chrono::duration<Rep,Per> _check_time,
+ bool _allow_limit_break = false) :
+ super(_client_info_f,
+ _idle_age, _erase_age, _check_time,
+ _allow_limit_break)
+ {
+ can_handle_f = _can_handle_f;
+ handle_f = _handle_f;
+ sched_ahead_thd = std::thread(&PushPriorityQueue::run_sched_ahead, this);
+ }
+
+
+ // push convenience constructor
+ PushPriorityQueue(typename super::ClientInfoFunc _client_info_f,
+ CanHandleRequestFunc _can_handle_f,
+ HandleRequestFunc _handle_f,
+ bool _allow_limit_break = false) :
+ PushPriorityQueue(_client_info_f,
+ _can_handle_f,
+ _handle_f,
+ std::chrono::minutes(10),
+ std::chrono::minutes(15),
+ std::chrono::minutes(6),
+ _allow_limit_break)
+ {
+ // empty
+ }
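+
+ // Example construction (a sketch; ClientId, MyReq, client_info_f,
+ // and server are assumed application pieces, and RequestRef is the
+ // unique_ptr alias defined earlier in this header):
+ //
+ // PushPriorityQueue<ClientId,MyReq> q(
+ // client_info_f,
+ // [&]() { return server.ops_in_flight() < 16; },
+ // [&](const ClientId& c, std::unique_ptr<MyReq> req, PhaseType phase) {
+ // server.begin_op(c, std::move(req), phase);
+ // });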
+
+
+ ~PushPriorityQueue() {
+ this->finishing = true;
+ sched_ahead_cv.notify_one();
+ sched_ahead_thd.join();
+ }
+
+ public:
+
+ inline void add_request(const R& request,
+ const C& client_id,
+ const ReqParams& req_params,
+ double addl_cost = 0.0) {
+ add_request(typename super::RequestRef(new R(request)),
+ client_id,
+ req_params,
+ get_time(),
+ addl_cost);
+ }
+
+
+ inline void add_request(typename super::RequestRef&& request,
+ const C& client_id,
+ const ReqParams& req_params,
+ double addl_cost = 0.0) {
+ add_request(std::move(request), client_id, req_params, get_time(), addl_cost);
+ }
+
+
+ inline void add_request_time(const R& request,
+ const C& client_id,
+ const ReqParams& req_params,
+ const Time time,
+ double addl_cost = 0.0) {
+ add_request(typename super::RequestRef(new R(request)),
+ client_id,
+ req_params,
+ time,
+ addl_cost);
+ }
+
+
+ void add_request(typename super::RequestRef&& request,
+ const C& client_id,
+ const ReqParams& req_params,
+ const Time time,
+ double addl_cost = 0.0) {
+ typename super::DataGuard g(this->data_mtx);
+#ifdef PROFILE
+ add_request_timer.start();
+#endif
+ super::do_add_request(std::move(request),
+ client_id,
+ req_params,
+ time,
+ addl_cost);
+ schedule_request();
+#ifdef PROFILE
+ add_request_timer.stop();
+#endif
+ }
+
+
+ void request_completed() {
+ typename super::DataGuard g(this->data_mtx);
+#ifdef PROFILE
+ request_complete_timer.start();
+#endif
+ schedule_request();
+#ifdef PROFILE
+ request_complete_timer.stop();
+#endif
+ }
+
+ protected:
+
+ // data_mtx should be held when called; furthermore, the heap
+ // should not be empty and the top element of the heap should
+ // not be already handled
+ //
+ // NOTE: the use of "super::ClientRec" in either the template
+ // construct or as a parameter to submit_top_request generated
+ // a compiler error in g++ 4.8.4, when ClientRec was
+ // "protected" rather than "public". By g++ 6.3.1 this was not
+ // an issue. But for backwards compatibility
+ // PriorityQueueBase::ClientRec is public.
+ template<typename C1,
+ IndIntruHeapData super::ClientRec::*C2,
+ typename C3,
+ uint B4>
+ C submit_top_request(IndIntruHeap<C1,typename super::ClientRec,C2,C3,B4>& heap,
+ PhaseType phase) {
+ C client_result;
+ super::pop_process_request(heap,
+ [this, phase, &client_result]
+ (const C& client,
+ typename super::RequestRef& request) {
+ client_result = client;
+ handle_f(client, std::move(request), phase);
+ });
+ return client_result;
+ }
+
+
+ // data_mtx should be held when called
+ void submit_request(typename super::HeapId heap_id) {
+ C client;
+ switch(heap_id) {
+ case super::HeapId::reservation:
+ // don't need to note client
+ (void) submit_top_request(this->resv_heap, PhaseType::reservation);
+ // unlike the other two cases, we do not reduce reservation
+ // tags here
+ ++this->reserv_sched_count;
+ break;
+ case super::HeapId::ready:
+ client = submit_top_request(this->ready_heap, PhaseType::priority);
+ super::reduce_reservation_tags(client);
+ ++this->prop_sched_count;
+ break;
+ default:
+ assert(false);
+ }
+ } // submit_request
+
+
+ // data_mtx should be held when called; unfortunately this
+ // function has to be repeated in both push & pull
+ // specializations
+ typename super::NextReq next_request() {
+ return next_request(get_time());
+ }
+
+
+ // data_mtx should be held when called; overrides member
+ // function in base class to add check for whether a request can
+ // be pushed to the server
+ typename super::NextReq next_request(Time now) {
+ if (!can_handle_f()) {
+ typename super::NextReq result;
+ result.type = super::NextReqType::none;
+ return result;
+ } else {
+ return super::do_next_request(now);
+ }
+ } // next_request
+
+
+ // data_mtx should be held when called
+ void schedule_request() {
+ typename super::NextReq next_req = next_request();
+ switch (next_req.type) {
+ case super::NextReqType::none:
+ return;
+ case super::NextReqType::future:
+ sched_at(next_req.when_ready);
+ break;
+ case super::NextReqType::returning:
+ submit_request(next_req.heap_id);
+ break;
+ default:
+ assert(false);
+ }
+ }
+
+
+ // this is the thread that handles running schedule_request at
+ // future times when nothing can be scheduled immediately
+ void run_sched_ahead() {
+ std::unique_lock<std::mutex> l(sched_ahead_mtx);
+
+ while (!this->finishing) {
+ if (TimeZero == sched_ahead_when) {
+ sched_ahead_cv.wait(l);
+ } else {
+ Time now;
+ while (!this->finishing && (now = get_time()) < sched_ahead_when) {
+ long microseconds_l = long(1 + 1000000 * (sched_ahead_when - now));
+ auto microseconds = std::chrono::microseconds(microseconds_l);
+ sched_ahead_cv.wait_for(l, microseconds);
+ }
+ sched_ahead_when = TimeZero;
+ if (this->finishing) return;
+
+ l.unlock();
+ if (!this->finishing) {
+ typename super::DataGuard g(this->data_mtx);
+ schedule_request();
+ }
+ l.lock();
+ }
+ }
+ }
+
+
+ void sched_at(Time when) {
+ std::lock_guard<std::mutex> l(sched_ahead_mtx);
+ if (TimeZero == sched_ahead_when || when < sched_ahead_when) {
+ sched_ahead_when = when;
+ sched_ahead_cv.notify_one();
+ }
+ }
+ }; // class PushPriorityQueue
+
+ } // namespace dmclock
+} // namespace crimson
diff --git a/src/dmclock/src/dmclock_util.cc b/src/dmclock/src/dmclock_util.cc
new file mode 100644
index 00000000000..865b60d42a8
--- /dev/null
+++ b/src/dmclock/src/dmclock_util.cc
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+
+
+#include <signal.h>
+
+#include <iomanip>
+#include <sstream>
+
+#include "dmclock_util.h"
+
+
+std::string crimson::dmclock::format_time(const Time& time, uint modulo) {
+ long subtract = long(time / modulo) * modulo;
+ std::stringstream ss;
+ ss << std::fixed << std::setprecision(4) << (time - subtract);
+ return ss.str();
+}
+
+
+void crimson::dmclock::debugger() {
+ raise(SIGCONT);
+}
diff --git a/src/dmclock/src/dmclock_util.h b/src/dmclock/src/dmclock_util.h
new file mode 100644
index 00000000000..d12c6f9eb63
--- /dev/null
+++ b/src/dmclock/src/dmclock_util.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2017 Red Hat Inc.
+ */
+
+
+#pragma once
+
+
+#include <unistd.h>
+#include <assert.h>
+#include <sys/time.h>
+
+#include <limits>
+#include <cmath>
+#include <chrono>
+
+
+namespace crimson {
+ namespace dmclock {
+ // we're using double to represent time, but we could change it by
+ // changing the following declarations (and by making sure a min
+ // function exists)
+ using Time = double;
+ static const Time TimeZero = 0.0;
+ static const Time TimeMax = std::numeric_limits<Time>::max();
+ static const double NaN = nan("");
+
+
+ inline Time get_time() {
+ struct timeval now;
+ auto result = gettimeofday(&now, NULL);
+ (void) result;
+ assert(0 == result);
+ return now.tv_sec + (now.tv_usec / 1000000.0);
+ }
+
+ std::string format_time(const Time& time, uint modulo = 1000);
+
+ void debugger();
+
+ } // namespace dmclock
+} // namespace crimson
diff --git a/src/dmclock/support/CMakeLists.txt b/src/dmclock/support/CMakeLists.txt
new file mode 100644
index 00000000000..552439ebc59
--- /dev/null
+++ b/src/dmclock/support/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(test)
diff --git a/src/dmclock/support/src/debug.h b/src/dmclock/support/src/debug.h
new file mode 100644
index 00000000000..2a78cc82309
--- /dev/null
+++ b/src/dmclock/support/src/debug.h
@@ -0,0 +1,17 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#pragma once
+
+
+#include <signal.h>
+
+
+inline void debugger() {
+ raise(SIGCONT);
+}
diff --git a/src/dmclock/support/src/heap.h b/src/dmclock/support/src/heap.h
new file mode 100644
index 00000000000..0f4d24f7c2d
--- /dev/null
+++ b/src/dmclock/support/src/heap.h
@@ -0,0 +1,240 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#pragma once
+
+
+#include <vector>
+#include <ostream>
+
+#include "assert.h"
+
+
+namespace crimson {
+
+ /*
+ * T : type of data held in the heap.
+ *
+ * C : class that implements operator() with two arguments and
+ * returns true when the first argument is greater than (i.e.,
+ * higher in priority than) the second.
+ */
+ template<typename T, typename C>
+ class Heap {
+
+ public:
+
+ class iterator {
+
+ friend Heap<T,C>;
+
+ Heap<T,C>& heap;
+ int index;
+
+ iterator(Heap<T,C>& _heap, int _index) :
+ heap(_heap),
+ index(_index)
+ {
+ // empty
+ }
+
+ public:
+
+ iterator(iterator&& other) :
+ heap(other.heap),
+ index(other.index)
+ {
+ // empty
+ }
+
+ iterator& operator++() {
+ ++index;
+ return *this;
+ }
+
+ bool operator==(const iterator& other) const {
+ return index == other.index;
+ }
+
+ bool operator!=(const iterator& other) const {
+ return !(*this == other);
+ }
+
+ T& operator*() {
+ return heap.data[index];
+ }
+
+ // restore the heap invariant after the item this iterator
+ // refers to has increased in priority
+ void increase() {
+ heap.siftUp(index);
+ }
+ }; // class iterator
+
+ friend iterator;
+
+ protected:
+
+ std::vector<T> data;
+ int count;
+ C comparator;
+
+ // NB: C++ integer division truncates toward zero, so parent(0)
+ // evaluates to 0 rather than a negative value; siftUp never
+ // asks for the parent of the root, so this is harmless
+ static inline int parent(int i) { return (i - 1) / 2; }
+
+ static inline int lhs(int i) { return 2*i + 1; }
+
+ static inline int rhs(int i) { return 2*i + 2; }
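+
+ // index arithmetic example: for i == 3, parent(3) == 1,
+ // lhs(3) == 7, and rhs(3) == 8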
+
+ void siftUp(int i) {
+ assert(i < count);
+
+ while (i > 0) {
+ int pi = parent(i);
+ if (!comparator(data[i], data[pi])) {
+ break;
+ }
+
+ std::swap(data[i], data[pi]);
+ i = pi;
+ }
+ }
+
+ void siftDown(int i) {
+ while (i < count) {
+ int li = lhs(i);
+ int ri = rhs(i);
+
+ if (li < count) {
+ if (comparator(data[li], data[i])) {
+ if (ri < count && comparator(data[ri], data[li])) {
+ std::swap(data[i], data[ri]);
+ i = ri;
+ } else {
+ std::swap(data[i], data[li]);
+ i = li;
+ }
+ } else if (ri < count && comparator(data[ri], data[i])) {
+ std::swap(data[i], data[ri]);
+ i = ri;
+ } else {
+ break;
+ }
+ } else {
+ break;
+ }
+ }
+ }
+
+
+ public:
+
+ Heap() :
+ count(0)
+ {
+ // empty
+ }
+
+ Heap(const Heap<T,C>& other) {
+ data.resize(other.data.size());
+ for (int i = 0; i < other.count; ++i) {
+ data[i] = other.data[i];
+ }
+ count = other.count;
+ }
+
+ const Heap<T,C>& operator=(const Heap<T,C>& other) {
+ data.resize(other.data.size());
+ for (int i = 0; i < other.count; ++i) {
+ data[i] = other.data[i];
+ }
+ count = other.count;
+ return *this;
+ }
+
+ bool empty() const { return 0 == count; }
+
+ T& top() { return data[0]; }
+
+ void push(T item) {
+ int i = count++;
+ data.push_back(item);
+ siftUp(i);
+ }
+
+ void pop() {
+ data[0] = data[--count];
+ data.resize(count);
+ siftDown(0);
+ }
+
+ void updateTop() {
+ siftDown(0);
+ }
+
+ void clear() {
+ count = 0;
+ data.resize(0);
+ }
+
+ iterator begin() {
+ return iterator(*this, 0);
+ }
+
+ iterator end() {
+ return iterator(*this, count);
+ }
+
+ std::ostream& displaySorted(std::ostream& out,
+ std::function<bool(const T&)> filter,
+ bool insert_line_breaks = true) const {
+ Heap<T,C> temp = *this;
+
+ bool first = true;
+ out << "[ ";
+
+ while(!temp.empty()) {
+ const T& top = temp.top();
+ if (filter(top)) {
+ if (!first) {
+ out << ", ";
+ }
+ if (insert_line_breaks) {
+ out << std::endl << " ";
+ }
+ out << temp.top();
+ first = false;
+ }
+ temp.pop();
+ }
+
+ out << " ]";
+ if (insert_line_breaks) {
+ out << std::endl;
+ }
+ return out;
+ }
+
+ template<typename T1, typename T2>
+ friend std::ostream& operator<<(std::ostream&, const Heap<T1,T2>&);
+ }; // class Heap
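+
+ // Illustrative use (a sketch, not part of this header): a min-heap
+ // of ints.
+ //
+ // struct IntLess {
+ // bool operator()(const int& a, const int& b) const { return a < b; }
+ // };
+ // crimson::Heap<int,IntLess> h;
+ // h.push(3); h.push(1); h.push(2);
+ // // h.top() is now 1; h.pop() leaves 2 at the top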
+
+
+ template<typename T1, typename T2>
+ std::ostream& operator<<(std::ostream& out, const Heap<T1,T2>& h) {
+ out << "[ ";
+ if (h.count) {
+ out << h.data[0];
+ }
+ for (int i = 1; i < h.count; i++) {
+ out << ", " << h.data[i];
+ }
+ out << " ]";
+ return out;
+ }
+} // namespace
diff --git a/src/dmclock/support/src/indirect_intrusive_heap.h b/src/dmclock/support/src/indirect_intrusive_heap.h
new file mode 100644
index 00000000000..b6075bda22f
--- /dev/null
+++ b/src/dmclock/support/src/indirect_intrusive_heap.h
@@ -0,0 +1,549 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#pragma once
+
+
+#include <memory>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <functional>
+#include <algorithm>
+
+#include "assert.h"
+
+
+namespace crimson {
+ using IndIntruHeapData = size_t;
+
+ /* T is the ultimate data that's being stored in the heap, although
+ * through indirection.
+ *
+ * I is the indirect type that will actually be stored in the heap
+ * and that must allow dereferencing (via operator*) to yield a
+ * T&.
+ *
+ * C is a functor that, when given two const T&'s, returns true if
+ * the first must precede the second.
+ *
+ * heap_info is a data member pointer as to where the heap data in T
+ * is stored.
+ *
+ * K is the branching factor of the heap, default is 2 (binary heap).
+ */
+ template<typename I,
+ typename T,
+ IndIntruHeapData T::*heap_info,
+ typename C,
+ uint K = 2>
+ class IndIntruHeap {
+
+ // shorthand
+ using HeapIndex = IndIntruHeapData;
+
+ static_assert(
+ std::is_same<T,typename std::pointer_traits<I>::element_type>::value,
+ "class I must resolve to class T by indirection (pointer dereference)");
+
+ static_assert(
+ std::is_same<bool,
+ typename std::result_of<C(const T&,const T&)>::type>::value,
+ "class C must define operator() to take two const T& and return a bool");
+
+ static_assert(K >= 2, "K (degree of branching) must be at least 2");
+
+ class Iterator {
+ friend IndIntruHeap<I, T, heap_info, C, K>;
+
+ IndIntruHeap<I, T, heap_info, C, K>& heap;
+ HeapIndex index;
+
+ Iterator(IndIntruHeap<I, T, heap_info, C, K>& _heap, HeapIndex _index) :
+ heap(_heap),
+ index(_index)
+ {
+ // empty
+ }
+
+ public:
+
+ Iterator(Iterator&& other) :
+ heap(other.heap),
+ index(other.index)
+ {
+ // empty
+ }
+
+ Iterator(const Iterator& other) :
+ heap(other.heap),
+ index(other.index)
+ {
+ // empty
+ }
+
+ Iterator& operator=(Iterator&& other) {
+ std::swap(heap, other.heap);
+ std::swap(index, other.index);
+ return *this;
+ }
+
+ Iterator& operator=(const Iterator& other) {
+ heap = other.heap;
+ index = other.index;
+ return *this;
+ }
+
+ Iterator& operator++() {
+ if (index <= heap.count) {
+ ++index;
+ }
+ return *this;
+ }
+
+ bool operator==(const Iterator& other) const {
+ return &heap == &other.heap && index == other.index;
+ }
+
+ bool operator!=(const Iterator& other) const {
+ return !(*this == other);
+ }
+
+ T& operator*() {
+ return *heap.data[index];
+ }
+
+ T* operator->() {
+ return &(*heap.data[index]);
+ }
+
+#if 0
+ // the item this iterator refers to
+ void increase() {
+ heap.sift_up(index);
+ }
+#endif
+ }; // class Iterator
+
+
+ class ConstIterator {
+ friend IndIntruHeap<I, T, heap_info, C, K>;
+
+ const IndIntruHeap<I, T, heap_info, C, K>& heap;
+ HeapIndex index;
+
+ ConstIterator(const IndIntruHeap<I, T, heap_info, C, K>& _heap,
+ HeapIndex _index) :
+ heap(_heap),
+ index(_index)
+ {
+ // empty
+ }
+
+ public:
+
+ ConstIterator(ConstIterator&& other) :
+ heap(other.heap),
+ index(other.index)
+ {
+ // empty
+ }
+
+ ConstIterator(const ConstIterator& other) :
+ heap(other.heap),
+ index(other.index)
+ {
+ // empty
+ }
+
+ ConstIterator& operator=(ConstIterator&& other) {
+ std::swap(heap, other.heap);
+ std::swap(index, other.index);
+ return *this;
+ }
+
+ ConstIterator& operator=(const ConstIterator& other) {
+ heap = other.heap;
+ index = other.index;
+ return *this;
+ }
+
+ ConstIterator& operator++() {
+ if (index <= heap.count) {
+ ++index;
+ }
+ return *this;
+ }
+
+ bool operator==(const ConstIterator& other) const {
+ return &heap == &other.heap && index == other.index;
+ }
+
+ bool operator!=(const ConstIterator& other) const {
+ return !(*this == other);
+ }
+
+ const T& operator*() {
+ return *heap.data[index];
+ }
+
+ const T* operator->() {
+ return &(*heap.data[index]);
+ }
+ }; // class ConstIterator
+
+
+ protected:
+
+ std::vector<I> data;
+ HeapIndex count;
+ C comparator;
+
+ public:
+
+ IndIntruHeap() :
+ count(0)
+ {
+ // empty
+ }
+
+ IndIntruHeap(const IndIntruHeap<I,T,heap_info,C,K>& other) :
+ count(other.count)
+ {
+ for (HeapIndex i = 0; i < other.count; ++i) {
+ data.push_back(other.data[i]);
+ }
+ }
+
+ bool empty() const { return 0 == count; }
+
+ size_t size() const { return (size_t) count; }
+
+ T& top() { return *data[0]; }
+
+ const T& top() const { return *data[0]; }
+
+ I& top_ind() { return data[0]; }
+
+ const I& top_ind() const { return data[0]; }
+
+ void push(I&& item) {
+ HeapIndex i = count++;
+ intru_data_of(item) = i;
+ data.emplace_back(std::move(item));
+ sift_up(i);
+ }
+
+ void push(const I& item) {
+ I copy(item);
+ push(std::move(copy));
+ }
+
+ void pop() {
+ remove(0);
+ }
+
+ void remove(Iterator& i) {
+ remove(i.index);
+ i = end();
+ }
+
+ Iterator find(const I& ind_item) {
+ for (HeapIndex i = 0; i < count; ++i) {
+ if (data[i] == ind_item) {
+ return Iterator(*this, i);
+ }
+ }
+ return end();
+ }
+
+ // when passing in value we do a comparison via operator==
+ Iterator find(const T& item) {
+ for (HeapIndex i = 0; i < count; ++i) {
+ if (*data[i] == item) {
+ return Iterator(*this, i);
+ }
+ }
+ return end();
+ }
+
+ // reverse find -- start looking from bottom of heap
+ Iterator rfind(const I& ind_item) {
+ // HeapIndex is unsigned, so we can't allow it to go negative;
+ // instead we keep it one greater than the actual index
+ for (HeapIndex i = count; i > 0; --i) {
+ if (data[i-1] == ind_item) {
+ return Iterator(*this, i-1);
+ }
+ }
+ return end();
+ }
+
+ // reverse find -- start looking from bottom of heap
+ Iterator rfind(const T& item) {
+ // HeapIndex is unsigned, so we can't allow it to go negative;
+ // instead we keep it one greater than the actual index
+ for (HeapIndex i = count; i > 0; --i) {
+ if (*data[i-1] == item) {
+ return Iterator(*this, i-1);
+ }
+ }
+ return end();
+ }
+
+ ConstIterator find(const I& ind_item) const {
+ for (HeapIndex i = 0; i < count; ++i) {
+ if (data[i] == ind_item) {
+ return ConstIterator(*this, i);
+ }
+ }
+ return cend();
+ }
+
+ // when passing in value we do a comparison via operator==
+ ConstIterator find(const T& item) const {
+ for (HeapIndex i = 0; i < count; ++i) {
+ if (*data[i] == item) {
+ return ConstIterator(*this, i);
+ }
+ }
+ return cend();
+ }
+
+ // reverse find -- start looking from bottom of heap
+ ConstIterator rfind(const I& ind_item) const {
+ // HeapIndex is unsigned, so we can't allow it to go negative;
+ // instead we keep it one greater than the actual index
+ for (HeapIndex i = count; i > 0; --i) {
+ if (data[i-1] == ind_item) {
+ return ConstIterator(*this, i-1);
+ }
+ }
+ return cend();
+ }
+
+ // reverse find -- start looking from bottom of heap
+ ConstIterator rfind(const T& item) const {
+ // HeapIndex is unsigned, so we can't allow it to go negative;
+ // instead we keep it one greater than the actual index
+ for (HeapIndex i = count; i > 0; --i) {
+ if (*data[i-1] == item) {
+ return ConstIterator(*this, i-1);
+ }
+ }
+ return cend();
+ }
+
+ void promote(T& item) {
+ sift_up(item.*heap_info);
+ }
+
+ void demote(T& item) {
+ sift_down(item.*heap_info);
+ }
+
+ void adjust(T& item) {
+ sift(item.*heap_info);
+ }
+
+ Iterator begin() {
+ return Iterator(*this, 0);
+ }
+
+ Iterator end() {
+ return Iterator(*this, count);
+ }
+
+ ConstIterator cbegin() const {
+ return ConstIterator(*this, 0);
+ }
+
+ ConstIterator cend() const {
+ return ConstIterator(*this, count);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const IndIntruHeap& h) {
+ auto i = h.data.cbegin();
+ if (i != h.data.cend()) {
+ out << **i;
+ ++i;
+ while (i != h.data.cend()) {
+ out << ", " << **i;
+ ++i;
+ }
+ }
+ return out;
+ }
+
+ // can only be called if I is copyable; copies heap into a vector
+ // and sorts it before displaying it
+ std::ostream&
+ display_sorted(std::ostream& out,
+ std::function<bool(const T&)> filter = all_filter) const {
+ static_assert(std::is_copy_constructible<I>::value,
+ "cannot call display_sorted when class I is not copy"
+ " constructible");
+ auto compare = [this] (const I first, const I second) -> bool {
+ return this->comparator(*first, *second);
+ };
+ std::vector<I> copy(data);
+ std::sort(copy.begin(), copy.end(), compare);
+
+ bool first = true;
+ for (auto c = copy.begin(); c != copy.end(); ++c) {
+ if (filter(**c)) {
+ if (!first) {
+ out << ", ";
+ } else {
+ first = false;
+ }
+ out << **c;
+ }
+ }
+
+ return out;
+ }
+
+
+ protected:
+
+ static IndIntruHeapData& intru_data_of(I& item) {
+ return (*item).*heap_info;
+ }
+
+ void remove(HeapIndex i) {
+ std::swap(data[i], data[--count]);
+ intru_data_of(data[i]) = i;
+ data.pop_back();
+
+ // the following needs to be sift (and not sift_down) as the
+ // swapped-in element can go up or down the heap; imagine the
+ // heap vector contains 0, 10, 100, 20, 30, 200, 300, 40; if 200
+ // is removed, 40 takes its place and has to be sifted upwards
+ sift(i);
+ }
+
+ // default value of filter parameter to display_sorted
+ static bool all_filter(const T& data) { return true; }
+
+ // parent of the root is undefined; HeapIndex is unsigned, so
+ // guard against i == 0 with an assert
+ static inline HeapIndex parent(HeapIndex i) {
+ assert(0 != i);
+ return (i - 1) / K;
+ }
+
+ // index of left child when K==2, index of left-most child when K>2
+ static inline HeapIndex lhs(HeapIndex i) { return K*i + 1; }
+
+ // index of right child when K==2, index of right-most child when K>2
+ static inline HeapIndex rhs(HeapIndex i) { return K*i + K; }
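+
+ // example with K == 4: the children of node 0 are nodes 1 through
+ // 4 (lhs(0) == 1, rhs(0) == 4), and parent(4) == 0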
+
+ void sift_up(HeapIndex i) {
+ while (i > 0) {
+ HeapIndex pi = parent(i);
+ if (!comparator(*data[i], *data[pi])) {
+ break;
+ }
+
+ std::swap(data[i], data[pi]);
+ intru_data_of(data[i]) = i;
+ intru_data_of(data[pi]) = pi;
+ i = pi;
+ }
+ } // sift_up
+
+ // use this sift_down definition when K>2; it's more general and
+ // uses a loop; the EnableBool parameter makes the enable_if
+ // condition depend on a template parameter, so SFINAE applies
+ template<bool EnableBool=true>
+ typename std::enable_if<(K>2)&&EnableBool,void>::type sift_down(HeapIndex i) {
+ if (i >= count) return;
+ while (true) {
+ HeapIndex li = lhs(i);
+
+ if (li < count) {
+ HeapIndex ri = std::min(rhs(i), count - 1);
+
+ // find the index of min. child
+ HeapIndex min_i = li;
+ for (HeapIndex k = li + 1; k <= ri; ++k) {
+ if (comparator(*data[k], *data[min_i])) {
+ min_i = k;
+ }
+ }
+
+ if (comparator(*data[min_i], *data[i])) {
+ std::swap(data[i], data[min_i]);
+ intru_data_of(data[i]) = i;
+ intru_data_of(data[min_i]) = min_i;
+ i = min_i;
+ } else {
+ // no child is smaller
+ break;
+ }
+ } else {
+ // no children
+ break;
+ }
+ }
+ } // sift_down
+
+ // use this sift_down definition when K==2; the EnableBool
+ // parameter makes the enable_if condition depend on a template
+ // parameter, so SFINAE applies
+ template<bool EnableBool=true>
+ typename std::enable_if<K==2&&EnableBool,void>::type sift_down(HeapIndex i) {
+ if (i >= count) return;
+ while (true) {
+ const HeapIndex li = lhs(i);
+ const HeapIndex ri = 1 + li;
+
+ if (li < count) {
+ if (comparator(*data[li], *data[i])) {
+ if (ri < count && comparator(*data[ri], *data[li])) {
+ std::swap(data[i], data[ri]);
+ intru_data_of(data[i]) = i;
+ intru_data_of(data[ri]) = ri;
+ i = ri;
+ } else {
+ std::swap(data[i], data[li]);
+ intru_data_of(data[i]) = i;
+ intru_data_of(data[li]) = li;
+ i = li;
+ }
+ } else if (ri < count && comparator(*data[ri], *data[i])) {
+ std::swap(data[i], data[ri]);
+ intru_data_of(data[i]) = i;
+ intru_data_of(data[ri]) = ri;
+ i = ri;
+ } else {
+ // no child is smaller
+ break;
+ }
+ } else {
+ // no children
+ break;
+ }
+ } // while
+ } // sift_down
+
+ void sift(HeapIndex i) {
+ if (i == 0) {
+ // if we're at top, can only go down
+ sift_down(i);
+ } else {
+ HeapIndex pi = parent(i);
+ if (comparator(*data[i], *data[pi])) {
+ // if we can go up, we will
+ sift_up(i);
+ } else {
+ // otherwise we'll try to go down
+ sift_down(i);
+ }
+ }
+ } // sift
+ }; // class IndIntruHeap
+
+} // namespace crimson
diff --git a/src/dmclock/support/src/intrusive_heap.h b/src/dmclock/support/src/intrusive_heap.h
new file mode 100644
index 00000000000..291e5798149
--- /dev/null
+++ b/src/dmclock/support/src/intrusive_heap.h
@@ -0,0 +1,214 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#pragma once
+
+
+#include <vector>
+#include <string>
+#include <iostream>
+#include <functional>
+
+#include "assert.h"
+
+
+namespace crimson {
+ using IntruHeapData = size_t;
+
+ // T = type of data in heap; I = functor that returns a non-const
+ // reference to IntruHeapData; C = functor that compares two const
+ // refs and returns true if the first precedes the second
+ template<typename T, typename I, typename C>
+ class IntruHeap {
+
+ static_assert(
+ std::is_same<IntruHeapData&,typename std::result_of<I(T&)>::type>::value,
+ "class I must define operator() to take T& and return a IntruHeapData&.");
+
+ static_assert(
+ std::is_same<bool,typename std::result_of<C(const T&,const T&)>::type>::value,
+ "class C must define operator() to take two const T& and return a bool.");
+
+
+ protected:
+ using index_t = IntruHeapData;
+
+ std::vector<T> data;
+ index_t count;
+ I intru_data_of;
+ C comparator;
+
+ public:
+
+ IntruHeap() :
+ count(0)
+ {
+ // empty
+ }
+
+ IntruHeap(const IntruHeap<T,I,C>& other) :
+ count(other.count)
+ {
+ for (uint i = 0; i < other.count; ++i) {
+ data.push_back(other.data[i]);
+ }
+ }
+
+ bool empty() const { return 0 == count; }
+
+ T& top() { return data[0]; }
+
+ void push(T&& item) {
+ index_t i = count++;
+ intru_data_of(item) = i;
+ data.emplace_back(item);
+ sift_up(i);
+ }
+
+ void push(const T& item) {
+ T copy(item);
+ push(std::move(copy));
+ }
+
+ void pop() {
+ std::swap(data[0], data[--count]);
+ intru_data_of(data[0]) = 0;
+ data.pop_back();
+ sift_down(0);
+ }
+
+ void adjust_up(T& item) {
+ sift_up(intru_data_of(item));
+ }
+
+ void adjust_down(T& item) {
+ sift_down(intru_data_of(item));
+ }
+
+ void adjust(T& item) {
+ sift(intru_data_of(item));
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const IntruHeap& h) {
+ for (uint i = 0; i < h.count; ++i) {
+ out << h.data[i] << ", ";
+ }
+ return out;
+ }
+
+ std::ostream&
+ display_sorted(std::ostream& out,
+ bool insert_line_breaks = true,
+ std::function<bool(const T&)> filter = all_filter) const {
+ IntruHeap<T,I,C> copy = *this;
+
+ bool first = true;
+ out << "[ ";
+
+ while(!copy.empty()) {
+ const T& top = copy.top();
+ if (filter(top)) {
+ if (!first) {
+ out << ", ";
+ }
+ if (insert_line_breaks) {
+ out << std::endl << " ";
+ }
+ out << copy.top();
+ first = false;
+ }
+ copy.pop();
+ }
+
+ out << " ]";
+ if (insert_line_breaks) {
+ out << std::endl;
+ }
+
+ return out;
+ }
+
+
+ protected:
+
+ // default value of filter parameter to display_sorted
+ static bool all_filter(const T& data) { return true; }
+
+ // parent of the root is undefined; index_t is unsigned, so
+ // guard against i == 0 with an assert
+ static inline index_t parent(index_t i) {
+ assert(0 != i);
+ return (i - 1) / 2;
+ }
+
+ static inline index_t lhs(index_t i) { return 2*i + 1; }
+
+ static inline index_t rhs(index_t i) { return 2*i + 2; }
+
+ void sift_up(index_t i) {
+ while (i > 0) {
+ index_t pi = parent(i);
+ if (!comparator(data[i], data[pi])) {
+ break;
+ }
+
+ std::swap(data[i], data[pi]);
+ intru_data_of(data[i]) = i;
+ intru_data_of(data[pi]) = pi;
+ i = pi;
+ }
+ } // sift_up
+
+ void sift_down(index_t i) {
+ while (i < count) {
+ index_t li = lhs(i);
+ index_t ri = rhs(i);
+
+ if (li < count) {
+ if (comparator(data[li], data[i])) {
+ if (ri < count && comparator(data[ri], data[li])) {
+ std::swap(data[i], data[ri]);
+ intru_data_of(data[i]) = i;
+ intru_data_of(data[ri]) = ri;
+ i = ri;
+ } else {
+ std::swap(data[i], data[li]);
+ intru_data_of(data[i]) = i;
+ intru_data_of(data[li]) = li;
+ i = li;
+ }
+ } else if (ri < count && comparator(data[ri], data[i])) {
+ std::swap(data[i], data[ri]);
+ intru_data_of(data[i]) = i;
+ intru_data_of(data[ri]) = ri;
+ i = ri;
+ } else {
+ break;
+ }
+ } else {
+ break;
+ }
+ }
+ } // sift_down
+
+ void sift(index_t i) {
+ if (i == 0) {
+ // if we're at top, can only go down
+ sift_down(i);
+ } else {
+ index_t pi = parent(i);
+ if (comparator(data[i], data[pi])) {
+ // if we can go up, we will
+ sift_up(i);
+ } else {
+ // otherwise we'll try to go down
+ sift_down(i);
+ }
+ }
+ } // sift
+ }; // class IntruHeap
+} // namespace crimson
diff --git a/src/dmclock/support/src/profile.h b/src/dmclock/support/src/profile.h
new file mode 100644
index 00000000000..77493c75be5
--- /dev/null
+++ b/src/dmclock/support/src/profile.h
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#pragma once
+
+
+#include <cmath>
+#include <chrono>
+
+
+namespace crimson {
+ template<typename T>
+ class ProfileBase {
+
+ protected:
+
+ using clock = std::chrono::steady_clock;
+
+ uint count = 0;
+ typename T::rep sum = 0;
+ typename T::rep sum_squares = 0;
+ typename T::rep low = 0;
+ typename T::rep high = 0;
+
+ public:
+
+ uint get_count() const { return count; }
+ typename T::rep get_sum() const { return sum; }
+ typename T::rep get_low() const { return low; }
+ typename T::rep get_high() const { return high; }
+ double get_mean() const {
+ if (0 == count) return nan("");
+ return sum / double(count);
+ }
+ double get_std_dev() const {
+ if (0 == count) return nan("");
+ double variance =
+ (count * sum_squares - sum * sum) / double(count * count);
+ return sqrt(variance);
+ }
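+
+ // worked example (illustrative): two samples, 3 and 5, give
+ // count == 2, sum == 8, and sum_squares == 34, so the variance is
+ // (2*34 - 8*8) / (2*2) == 1 and the standard deviation is 1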
+ }; // class ProfileBase
+
+
+ // forward declaration for friend
+ template<typename T>
+ class ProfileCombiner;
+
+
+ template<typename T>
+ class ProfileTimer : public ProfileBase<T> {
+ friend ProfileCombiner<T>;
+
+ using super = ProfileBase<T>;
+
+ bool is_timing = false;
+ typename super::clock::time_point start_time;
+
+ public:
+
+ ProfileTimer() {
+ }
+
+ void start() {
+ assert(!is_timing);
+ start_time = super::clock::now();
+ is_timing = true;
+ }
+
+ void stop() {
+ assert(is_timing);
+ T duration = std::chrono::duration_cast<T>(super::clock::now() - start_time);
+ typename T::rep duration_count = duration.count();
+ this->sum += duration_count;
+ this->sum_squares += duration_count * duration_count;
+ if (0 == this->count) {
+ this->low = duration_count;
+ this->high = duration_count;
+ } else {
+ if (duration_count < this->low) this->low = duration_count;
+ else if (duration_count > this->high) this->high = duration_count;
+ }
+ ++this->count;
+ is_timing = false;
+ }
+ }; // class ProfileTimer
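+
+ // Illustrative use (a sketch; do_work is a placeholder):
+ //
+ // crimson::ProfileTimer<std::chrono::nanoseconds> timer;
+ // for (int i = 0; i < 100; ++i) {
+ // timer.start();
+ // do_work();
+ // timer.stop();
+ // }
+ // double mean_ns = timer.get_mean();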
+
+
+ template<typename T>
+ class ProfileCombiner : public ProfileBase<T> {
+
+ using super = ProfileBase<T>;
+
+ public:
+
+ ProfileCombiner() {}
+
+ void combine(const ProfileTimer<T>& timer) {
+ if (0 == this->count) {
+ this->low = timer.low;
+ this->high = timer.high;
+ } else {
+ if (timer.low < this->low) this->low = timer.low;
+ else if (timer.high > this->high) this->high = timer.high;
+ }
+ this->count += timer.count;
+ this->sum += timer.sum;
+ this->sum_squares += timer.sum_squares;
+ }
+ }; // class ProfileCombiner
+} // namespace crimson
diff --git a/src/dmclock/support/src/run_every.cc b/src/dmclock/support/src/run_every.cc
new file mode 100644
index 00000000000..258baaa74c0
--- /dev/null
+++ b/src/dmclock/support/src/run_every.cc
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#include <iostream>
+
+#include "run_every.h"
+
+
+// ADD_MOVE_SEMANTICS can be defined, although that code path is not
+// fully debugged or tested
+
+
+namespace chrono = std::chrono;
+
+
+#ifdef ADD_MOVE_SEMANTICS
+crimson::RunEvery::RunEvery()
+{
+ // empty
+}
+
+
+crimson::RunEvery& crimson::RunEvery::operator=(crimson::RunEvery&& other)
+{
+ // finish run every thread
+ {
+ Guard g(mtx);
+ finishing = true;
+ cv.notify_one();
+ }
+ if (thd.joinable()) {
+ thd.join();
+ }
+
+ // transfer info over from previous thread
+ finishing.store(other.finishing);
+ wait_period = other.wait_period;
+ body = other.body;
+
+ // finish other thread
+ other.finishing.store(true);
+ other.cv.notify_one();
+
+ // start this thread
+ thd = std::thread(&RunEvery::run, this);
+
+ return *this;
+}
+#endif
+
+
+crimson::RunEvery::~RunEvery() {
+ finishing = true;
+ cv.notify_all();
+ thd.join();
+}
+
+
+void crimson::RunEvery::run() {
+ Lock l(mtx);
+ while(!finishing) {
+ TimePoint until = chrono::steady_clock::now() + wait_period;
+ while (!finishing && chrono::steady_clock::now() < until) {
+ cv.wait_until(l, until);
+ }
+ if (!finishing) {
+ body();
+ }
+ }
+}
diff --git a/src/dmclock/support/src/run_every.h b/src/dmclock/support/src/run_every.h
new file mode 100644
index 00000000000..58b85563e7e
--- /dev/null
+++ b/src/dmclock/support/src/run_every.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#pragma once
+
+#include <chrono>
+#include <mutex>
+#include <condition_variable>
+#include <thread>
+#include <functional>
+
+namespace crimson {
+ using std::chrono::duration_cast;
+ using std::chrono::milliseconds;
+
+ // repeatedly runs a given function object, waiting wait_period
+ // between invocations; the destructor stops the worker thread
+ // immediately
+ class RunEvery {
+ using Lock = std::unique_lock<std::mutex>;
+ using Guard = std::lock_guard<std::mutex>;
+ using TimePoint = std::chrono::steady_clock::time_point;
+
+ bool finishing = false;
+ std::chrono::milliseconds wait_period;
+ std::function<void()> body;
+ std::mutex mtx;
+ std::condition_variable cv;
+
+ // put threads last so all other variables are initialized first
+
+ std::thread thd;
+
+ public:
+
+#ifdef ADD_MOVE_SEMANTICS
+ RunEvery();
+#endif
+
+ template<typename D>
+ RunEvery(D _wait_period,
+ std::function<void()> _body) :
+ wait_period(duration_cast<milliseconds>(_wait_period)),
+ body(_body)
+ {
+ thd = std::thread(&RunEvery::run, this);
+ }
+
+ RunEvery(const RunEvery& other) = delete;
+ RunEvery& operator=(const RunEvery& other) = delete;
+ RunEvery(RunEvery&& other) = delete;
+#ifdef ADD_MOVE_SEMANTICS
+ RunEvery& operator=(RunEvery&& other);
+#else
+ RunEvery& operator=(RunEvery&& other) = delete;
+#endif
+
+ ~RunEvery();
+
+ protected:
+
+ void run();
+ };
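+
+ // Illustrative use (a sketch): print a heartbeat roughly once a
+ // second until the instance is destructed.
+ //
+ // crimson::RunEvery beat(std::chrono::seconds(1),
+ // []() { std::cout << "tick" << std::endl; });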
+}
diff --git a/src/dmclock/support/test/CMakeLists.txt b/src/dmclock/support/test/CMakeLists.txt
new file mode 100644
index 00000000000..addea6c96a9
--- /dev/null
+++ b/src/dmclock/support/test/CMakeLists.txt
@@ -0,0 +1,29 @@
+include_directories(../src)
+
+set(local_flags "-Wall -pthread")
+
+# dmclock does not use intrusive heap (but it does use indirect
+# intrusive heap), so we won't use this code
+if(false)
+ set(srcs
+ test_intrusive_heap.cc)
+ add_executable(test_intru_heap test_intrusive_heap.cc)
+ set_source_files_properties(${srcs}
+ PROPERTIES
+ COMPILE_FLAGS "${local_flags}")
+endif(false)
+
+set(test_srcs test_indirect_intrusive_heap.cc)
+
+set_source_files_properties(${test_srcs}
+ PROPERTIES
+ COMPILE_FLAGS "${local_flags}"
+ )
+
+add_executable(dmclock-data-struct-tests ${test_srcs})
+
+target_link_libraries(dmclock-data-struct-tests
+ LINK_PRIVATE gtest gtest_main pthread)
+
+add_test(NAME dmclock-data-struct-tests
+ COMMAND $<TARGET_FILE:dmclock-data-struct-tests>)
diff --git a/src/dmclock/support/test/test_ind_intru_heap.cc b/src/dmclock/support/test/test_ind_intru_heap.cc
new file mode 100644
index 00000000000..9ec03b5cacf
--- /dev/null
+++ b/src/dmclock/support/test/test_ind_intru_heap.cc
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#include <memory>
+#include <string>
+#include <iostream>
+
+#include "indirect_intrusive_heap.h"
+
+
+class TestCompare;
+
+
+class Test1 {
+ friend TestCompare;
+
+ int data;
+
+public:
+
+ crimson::IndIntruHeapData heap_data;
+
+ Test1(int _data) : data(_data) {}
+
+ friend std::ostream& operator<<(std::ostream& out, const Test1& d) {
+ out << d.data << " (" << d.heap_data << ")";
+ return out;
+ }
+
+ int& the_data() { return data; }
+};
+
+
+struct TestCompare {
+ bool operator()(const Test1& d1, const Test1& d2) {
+ return d1.data < d2.data;
+ }
+};
+
+
+int main(int argc, char** argv) {
+ Test1 d1(2);
+ Test1 d2(3);
+ Test1 d3(1);
+ Test1 d4(-5);
+
+ crimson::IndIntruHeap<std::shared_ptr<Test1>, Test1, &Test1::heap_data, TestCompare> my_heap;
+
+ const std::shared_ptr<Test1> d99 = std::make_shared<Test1>(99);
+
+ my_heap.push(std::make_shared<Test1>(2));
+ my_heap.push(d99);
+ my_heap.push(std::make_shared<Test1>(1));
+ my_heap.push(std::make_shared<Test1>(-5));
+ my_heap.push(std::make_shared<Test1>(12));
+ my_heap.push(std::make_shared<Test1>(-12));
+ my_heap.push(std::make_shared<Test1>(-7));
+
+ std::cout << my_heap << std::endl;
+
+ auto& t = my_heap.top();
+ t.the_data() = 17;
+ my_heap.adjust_down(t);
+
+ std::cout << my_heap << std::endl;
+
+ my_heap.display_sorted(std::cout);
+
+ while (!my_heap.empty()) {
+ auto& top = my_heap.top();
+ std::cout << top << std::endl;
+ my_heap.pop();
+ std::cout << my_heap << std::endl;
+ }
+
+ return 0;
+}
diff --git a/src/dmclock/support/test/test_indirect_intrusive_heap.cc b/src/dmclock/support/test/test_indirect_intrusive_heap.cc
new file mode 100644
index 00000000000..23863a24ce9
--- /dev/null
+++ b/src/dmclock/support/test/test_indirect_intrusive_heap.cc
@@ -0,0 +1,930 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+#include <cstdlib>
+#include <ctime>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <set>
+#include <sstream>
+
+#include "gtest/gtest.h"
+
+#include "indirect_intrusive_heap.h"
+
+
+struct Elem {
+ int data;
+
+ crimson::IndIntruHeapData heap_data;
+ crimson::IndIntruHeapData heap_data_alt;
+
+ Elem(int _data) : data(_data) { }
+
+ bool operator==(const Elem& other) {
+ return data == other.data;
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const Elem& d) {
+ out << d.data;
+ return out;
+ }
+};
+
+
+// sorted low to high
+struct ElemCompare {
+ bool operator()(const Elem& d1, const Elem& d2) const {
+ return d1.data < d2.data;
+ }
+};
+
+
+// all evens precede all odds; within each group, sorted high to low
+struct ElemCompareAlt {
+ bool operator()(const Elem& d1, const Elem& d2) {
+ if (0 == d1.data % 2) {
+ if (0 == d2.data % 2) {
+ return d1.data > d2.data;
+ } else {
+ return true;
+ }
+ } else if (0 == d2.data % 2) {
+ return false;
+ } else {
+ return d1.data > d2.data;
+ }
+ }
+};
+
+
+class HeapFixture1: public ::testing::Test {
+
+public:
+
+ crimson::IndIntruHeap<std::shared_ptr<Elem>,
+ Elem,
+ &Elem::heap_data,
+ ElemCompare> heap;
+
+ std::shared_ptr<Elem> data1, data2, data3, data4, data5, data6, data7;
+
+ void SetUp() {
+ data1 = std::make_shared<Elem>(2);
+ data2 = std::make_shared<Elem>(99);
+ data3 = std::make_shared<Elem>(1);
+ data4 = std::make_shared<Elem>(-5);
+ data5 = std::make_shared<Elem>(12);
+ data6 = std::make_shared<Elem>(-12);
+ data7 = std::make_shared<Elem>(-7);
+
+ heap.push(data1);
+ heap.push(data2);
+ heap.push(data3);
+ heap.push(data4);
+ heap.push(data5);
+ heap.push(data6);
+ heap.push(data7);
+ }
+
+ void TearDown() {
+ // nothing to do
+ }
+}; // class HeapFixture1
+
+TEST(IndIntruHeap, shared_ptr) {
+ crimson::IndIntruHeap<std::shared_ptr<Elem>,
+ Elem,
+ &Elem::heap_data,
+ ElemCompare> heap;
+
+ EXPECT_TRUE(heap.empty());
+
+ heap.push(std::make_shared<Elem>(2));
+
+ EXPECT_FALSE(heap.empty());
+
+ heap.push(std::make_shared<Elem>(99));
+ heap.push(std::make_shared<Elem>(1));
+ heap.push(std::make_shared<Elem>(-5));
+ heap.push(std::make_shared<Elem>(12));
+ heap.push(std::make_shared<Elem>(-12));
+ heap.push(std::make_shared<Elem>(-7));
+
+ // std::cout << heap << std::endl;
+
+ EXPECT_FALSE(heap.empty());
+
+ EXPECT_EQ(-12, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(-7, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(-5, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(1, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(2, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(12, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(99, heap.top().data);
+
+ EXPECT_FALSE(heap.empty());
+ heap.pop();
+ EXPECT_TRUE(heap.empty());
+}
+
+
+TEST(IndIntruHeap, unique_ptr) {
+ crimson::IndIntruHeap<std::unique_ptr<Elem>,
+ Elem,
+ &Elem::heap_data,
+ ElemCompare> heap;
+
+ EXPECT_TRUE(heap.empty());
+
+ heap.push(std::unique_ptr<Elem>(new Elem(2)));
+
+ EXPECT_FALSE(heap.empty());
+
+ heap.push(std::unique_ptr<Elem>(new Elem(99)));
+ heap.push(std::unique_ptr<Elem>(new Elem(1)));
+ heap.push(std::unique_ptr<Elem>(new Elem(-5)));
+ heap.push(std::unique_ptr<Elem>(new Elem(12)));
+ heap.push(std::unique_ptr<Elem>(new Elem(-12)));
+ heap.push(std::unique_ptr<Elem>(new Elem(-7)));
+
+ EXPECT_FALSE(heap.empty());
+
+ EXPECT_EQ(-12, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(-7, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(-5, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(1, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(2, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(12, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(99, heap.top().data);
+
+ EXPECT_FALSE(heap.empty());
+ heap.pop();
+ EXPECT_TRUE(heap.empty());
+}
+
+
+TEST(IndIntruHeap, regular_ptr) {
+ crimson::IndIntruHeap<Elem*, Elem, &Elem::heap_data, ElemCompare> heap;
+
+ EXPECT_TRUE(heap.empty());
+
+ heap.push(new Elem(2));
+
+ EXPECT_FALSE(heap.empty());
+
+ heap.push(new Elem(99));
+ heap.push(new Elem(1));
+ heap.push(new Elem(-5));
+ heap.push(new Elem(12));
+ heap.push(new Elem(-12));
+ heap.push(new Elem(-7));
+
+ EXPECT_FALSE(heap.empty());
+
+ EXPECT_EQ(-12, heap.top().data);
+ delete &heap.top();
+ heap.pop();
+ EXPECT_EQ(-7, heap.top().data);
+ delete &heap.top();
+ heap.pop();
+ EXPECT_EQ(-5, heap.top().data);
+ delete &heap.top();
+ heap.pop();
+ EXPECT_EQ(1, heap.top().data);
+ delete &heap.top();
+ heap.pop();
+ EXPECT_EQ(2, heap.top().data);
+ delete &heap.top();
+ heap.pop();
+ EXPECT_EQ(12, heap.top().data);
+ delete &heap.top();
+ heap.pop();
+ EXPECT_EQ(99, heap.top().data);
+
+ delete &heap.top();
+
+ EXPECT_FALSE(heap.empty());
+ heap.pop();
+ EXPECT_TRUE(heap.empty());
+}
+
+
+TEST(IndIntruHeap, K_3) {
+ crimson::IndIntruHeap<std::shared_ptr<Elem>,
+ Elem,
+ &Elem::heap_data,
+ ElemCompare,
+ 3> heap;
+
+ EXPECT_TRUE(heap.empty());
+
+ heap.push(std::make_shared<Elem>(2));
+
+ EXPECT_FALSE(heap.empty());
+
+ heap.push(std::make_shared<Elem>(99));
+ heap.push(std::make_shared<Elem>(1));
+ heap.push(std::make_shared<Elem>(-5));
+ heap.push(std::make_shared<Elem>(12));
+ heap.push(std::make_shared<Elem>(-12));
+ heap.push(std::make_shared<Elem>(-7));
+
+ // std::cout << heap << std::endl;
+
+ EXPECT_FALSE(heap.empty());
+
+ EXPECT_EQ(-12, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(-7, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(-5, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(1, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(2, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(12, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(99, heap.top().data);
+
+ EXPECT_FALSE(heap.empty());
+ heap.pop();
+ EXPECT_TRUE(heap.empty());
+}
+
+
+TEST(IndIntruHeap, K_4) {
+ crimson::IndIntruHeap<std::shared_ptr<Elem>,
+ Elem,
+ &Elem::heap_data,
+ ElemCompare,
+ 4> heap;
+
+ EXPECT_TRUE(heap.empty());
+
+ heap.push(std::make_shared<Elem>(2));
+
+ EXPECT_FALSE(heap.empty());
+
+ heap.push(std::make_shared<Elem>(99));
+ heap.push(std::make_shared<Elem>(1));
+ heap.push(std::make_shared<Elem>(-5));
+ heap.push(std::make_shared<Elem>(12));
+ heap.push(std::make_shared<Elem>(-12));
+ heap.push(std::make_shared<Elem>(-7));
+
+ // std::cout << heap << std::endl;
+
+ EXPECT_FALSE(heap.empty());
+
+ EXPECT_EQ(-12, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(-7, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(-5, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(1, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(2, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(12, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(99, heap.top().data);
+
+ EXPECT_FALSE(heap.empty());
+ heap.pop();
+ EXPECT_TRUE(heap.empty());
+}
+
+
+TEST(IndIntruHeap, K_10) {
+ crimson::IndIntruHeap<std::shared_ptr<Elem>,
+ Elem,
+ &Elem::heap_data,
+ ElemCompare,
+ 10> heap;
+
+ EXPECT_TRUE(heap.empty());
+
+ heap.push(std::make_shared<Elem>(2));
+
+ EXPECT_FALSE(heap.empty());
+
+ heap.push(std::make_shared<Elem>(99));
+ heap.push(std::make_shared<Elem>(1));
+ heap.push(std::make_shared<Elem>(-5));
+ heap.push(std::make_shared<Elem>(12));
+ heap.push(std::make_shared<Elem>(-12));
+ heap.push(std::make_shared<Elem>(-7));
+
+ // std::cout << heap << std::endl;
+
+ EXPECT_FALSE(heap.empty());
+
+ EXPECT_EQ(-12, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(-7, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(-5, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(1, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(2, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(12, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(99, heap.top().data);
+
+ EXPECT_FALSE(heap.empty());
+ heap.pop();
+ EXPECT_TRUE(heap.empty());
+}
+
+
+TEST(IndIntruHeap, multi_K) {
+ crimson::IndIntruHeap<std::shared_ptr<Elem>,
+ Elem,
+ &Elem::heap_data,
+ ElemCompare,
+ 2> heap2;
+
+ crimson::IndIntruHeap<std::shared_ptr<Elem>,
+ Elem,
+ &Elem::heap_data,
+ ElemCompare,
+ 3> heap3;
+
+ crimson::IndIntruHeap<std::shared_ptr<Elem>,
+ Elem,
+ &Elem::heap_data,
+ ElemCompare,
+ 4> heap4;
+
+ crimson::IndIntruHeap<std::shared_ptr<Elem>,
+ Elem,
+ &Elem::heap_data,
+ ElemCompare,
+ 10> heap10;
+
+ // 250 should give us at least 4 levels on all heaps
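+ // (for the widest heap, K=10, three levels hold 1 + 10 + 100 = 111
+ // nodes, so 250 forces at least a fourth level)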
+ constexpr size_t count = 250;
+
+ std::srand(std::time(0)); // use current time as seed for random generator
+
+ // insert same set of random values into the four heaps
+ for (size_t i = 0; i < count; ++i) {
+ int value = std::rand() % 201 - 100; // -100...+100
+ auto data = std::make_shared<Elem>(value);
+ heap2.push(data);
+ heap3.push(data);
+ heap4.push(data);
+ heap10.push(data);
+ }
+
+ auto bound = std::numeric_limits<decltype(Elem::data)>::min();
+
+ for (size_t i = 0; i < count; ++i) {
+ auto current = heap2.top().data;
+
+ EXPECT_GE(current, bound) <<
+ "we should never go down, only increase or remain the same";
+ EXPECT_EQ(current, heap3.top().data) <<
+ "heap2's data and heap3's data should match";
+ EXPECT_EQ(current, heap4.top().data) <<
+ "heap2's data and heap4's data should match";
+ EXPECT_EQ(current, heap10.top().data) <<
+ "heap2's data and heap10's data should match";
+
+ heap2.pop();
+ heap3.pop();
+ heap4.pop();
+ heap10.pop();
+
+ bound = current;
+ }
+
+ EXPECT_TRUE(heap2.empty()) << "should be empty after all elements popped";
+ EXPECT_TRUE(heap3.empty()) << "should be empty after all elements popped";
+ EXPECT_TRUE(heap4.empty()) << "should be empty after all elements popped";
+ EXPECT_TRUE(heap10.empty()) << "should be empty after all elements popped";
+}
+
+
+TEST(IndIntruHeap, demote) {
+ crimson::IndIntruHeap<std::unique_ptr<Elem>,
+ Elem,
+ &Elem::heap_data,
+ ElemCompare> heap;
+
+ heap.push(std::unique_ptr<Elem>(new Elem(2)));
+ heap.push(std::unique_ptr<Elem>(new Elem(99)));
+ heap.push(std::unique_ptr<Elem>(new Elem(1)));
+ heap.push(std::unique_ptr<Elem>(new Elem(-5)));
+ heap.push(std::unique_ptr<Elem>(new Elem(12)));
+ heap.push(std::unique_ptr<Elem>(new Elem(-12)));
+ heap.push(std::unique_ptr<Elem>(new Elem(-7)));
+
+ heap.top().data = 24;
+
+ heap.demote(heap.top());
+
+ EXPECT_EQ(-7, heap.top().data);
+
+ heap.pop();
+ heap.pop();
+ heap.pop();
+ heap.pop();
+ heap.pop();
+
+ EXPECT_EQ(24, heap.top().data);
+}
+
+
+TEST(IndIntruHeap, demote_not) {
+ crimson::IndIntruHeap<std::unique_ptr<Elem>,
+ Elem,
+ &Elem::heap_data,
+ ElemCompare> heap;
+
+ heap.push(std::unique_ptr<Elem>(new Elem(2)));
+ heap.push(std::unique_ptr<Elem>(new Elem(99)));
+ heap.push(std::unique_ptr<Elem>(new Elem(1)));
+ heap.push(std::unique_ptr<Elem>(new Elem(-5)));
+ heap.push(std::unique_ptr<Elem>(new Elem(12)));
+ heap.push(std::unique_ptr<Elem>(new Elem(-12)));
+ heap.push(std::unique_ptr<Elem>(new Elem(-7)));
+
+ heap.top().data = -99;
+
+ heap.demote(heap.top());
+
+ EXPECT_EQ(-99, heap.top().data);
+
+ heap.pop();
+
+ EXPECT_EQ(-7, heap.top().data);
+}
+
+
+TEST(IndIntruHeap, promote_and_demote) {
+ crimson::IndIntruHeap<std::shared_ptr<Elem>,
+ Elem,
+ &Elem::heap_data,
+ ElemCompare> heap;
+
+ auto data1 = std::make_shared<Elem>(1);
+
+ heap.push(std::make_shared<Elem>(2));
+ heap.push(std::make_shared<Elem>(99));
+ heap.push(data1);
+ heap.push(std::make_shared<Elem>(-5));
+ heap.push(std::make_shared<Elem>(12));
+ heap.push(std::make_shared<Elem>(-12));
+ heap.push(std::make_shared<Elem>(-7));
+
+ EXPECT_EQ(-12, heap.top().data);
+
+ data1->data = -99;
+ heap.promote(*data1);
+
+ EXPECT_EQ(-99, heap.top().data);
+
+ data1->data = 999;
+ heap.demote(*data1);
+
+ EXPECT_EQ(-12, heap.top().data);
+
+ data1->data = 9;
+ heap.promote(*data1);
+
+ heap.pop(); // remove -12
+ heap.pop(); // remove -7
+ heap.pop(); // remove -5
+ heap.pop(); // remove 2
+
+ EXPECT_EQ(9, heap.top().data);
+}
+
+
+TEST(IndIntruHeap, adjust) {
+ crimson::IndIntruHeap<std::shared_ptr<Elem>,
+ Elem,
+ &Elem::heap_data,
+ ElemCompare> heap;
+
+ auto data1 = std::make_shared<Elem>(1);
+
+ heap.push(std::make_shared<Elem>(2));
+ heap.push(std::make_shared<Elem>(99));
+ heap.push(data1);
+ heap.push(std::make_shared<Elem>(-5));
+ heap.push(std::make_shared<Elem>(12));
+ heap.push(std::make_shared<Elem>(-12));
+ heap.push(std::make_shared<Elem>(-7));
+
+ // heap.display_sorted(std::cout);
+
+ EXPECT_EQ(-12, heap.top().data);
+
+ data1->data = 999;
+ heap.adjust(*data1);
+
+ EXPECT_EQ(-12, heap.top().data);
+
+ data1->data = -99;
+ heap.adjust(*data1);
+
+ EXPECT_EQ(-99, heap.top().data);
+
+ data1->data = 9;
+ heap.adjust(*data1);
+
+ EXPECT_EQ(-12, heap.top().data);
+
+ heap.pop(); // remove -12
+ heap.pop(); // remove -7
+ heap.pop(); // remove -5
+ heap.pop(); // remove 2
+
+ EXPECT_EQ(9, heap.top().data);
+}
+
+
+TEST(IndIntruHeap, remove_careful) {
+ // here we test for a common mistake in implementing remove: after
+ // removing an item and moving the last element of the heap into the
+ // vacated position, that element must be sifted in either direction
+ // (up or down), not merely sifted down.
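+ //
+ // a minimal sketch of the idea (hypothetical names, not this heap's
+ // actual API; bounds checks elided):
+ //
+ // void remove(size_t i) {
+ // data[i] = std::move(data.back());
+ // data.pop_back();
+ // if (i > 0 && comes_before(data[i], data[parent(i)])) {
+ // sift_up(i); // the moved element may belong above its new parent
+ // } else {
+ // sift_down(i); // otherwise restore heap order below it
+ // }
+ // }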
+
+ crimson::IndIntruHeap<std::shared_ptr<Elem>,
+ Elem,
+ &Elem::heap_data,
+ ElemCompare,
+ 2> heap;
+
+ heap.push(std::make_shared<Elem>(0));
+ heap.push(std::make_shared<Elem>(10));
+ heap.push(std::make_shared<Elem>(100));
+ heap.push(std::make_shared<Elem>(20));
+ heap.push(std::make_shared<Elem>(30));
+ heap.push(std::make_shared<Elem>(200));
+ heap.push(std::make_shared<Elem>(300));
+ heap.push(std::make_shared<Elem>(40));
+
+ auto k = heap.find(Elem(200));
+ EXPECT_NE(heap.end(), k) <<
+ "we should have found an element with the value 200, which we'll remove";
+ heap.remove(k);
+
+ auto i = heap.cbegin();
+ EXPECT_EQ(0, i->data);
+ ++i;
+ EXPECT_EQ(10, i->data);
+ ++i;
+ EXPECT_EQ(40, i->data) <<
+ "this needs to be 40 or there's a mistake in implementation";
+ ++i;
+ EXPECT_EQ(20, i->data);
+ ++i;
+ EXPECT_EQ(30, i->data);
+ ++i;
+ EXPECT_EQ(100, i->data) <<
+ "this needs to be 100 or there's a mistake in implementation";
+}
+
+
+TEST_F(HeapFixture1, shared_data) {
+
+ crimson::IndIntruHeap<std::shared_ptr<Elem>,Elem,&Elem::heap_data_alt,ElemCompareAlt> heap2;
+
+ heap2.push(data1);
+ heap2.push(data2);
+ heap2.push(data3);
+ heap2.push(data4);
+ heap2.push(data5);
+ heap2.push(data6);
+ heap2.push(data7);
+
+ data3->data = 32;
+ heap.adjust(*data3);
+ heap2.adjust(*data3);
+
+ EXPECT_EQ(-12, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(-7, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(-5, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(2, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(12, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(32, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(99, heap.top().data);
+
+ EXPECT_EQ(32, heap2.top().data);
+ heap2.pop();
+ EXPECT_EQ(12, heap2.top().data);
+ heap2.pop();
+ EXPECT_EQ(2, heap2.top().data);
+ heap2.pop();
+ EXPECT_EQ(-12, heap2.top().data);
+ heap2.pop();
+ EXPECT_EQ(99, heap2.top().data);
+ heap2.pop();
+ EXPECT_EQ(-5, heap2.top().data);
+ heap2.pop();
+ EXPECT_EQ(-7, heap2.top().data);
+}
+
+
+TEST_F(HeapFixture1, iterator_basics) {
+ {
+ uint count = 0;
+ for(auto i = heap.begin(); i != heap.end(); ++i) {
+ ++count;
+ }
+
+ EXPECT_EQ(7u, count) << "count should be 7";
+ }
+
+ auto i1 = heap.begin();
+
+ EXPECT_EQ(-12, i1->data) <<
+ "first member with -> operator must be smallest";
+
+ EXPECT_EQ(-12, (*i1).data) <<
+ "first member with * operator must be smallest";
+
+ Elem& e1 = *i1;
+ EXPECT_EQ(-12, e1.data) <<
+ "first member accessed through a reference must be smallest";
+
+ {
+ std::set<int> values;
+ values.insert(2);
+ values.insert(99);
+ values.insert(1);
+ values.insert(-5);
+ values.insert(12);
+ values.insert(-12);
+ values.insert(-7);
+
+ for(auto i = heap.begin(); i != heap.end(); ++i) {
+ auto v = *i;
+ EXPECT_NE(values.end(), values.find(v.data)) <<
+ "value in heap must be part of original set";
+ values.erase(v.data);
+ }
+ EXPECT_EQ(0u, values.size()) << "all values must have been seen";
+ }
+}
+
+
+TEST_F(HeapFixture1, const_iterator_basics) {
+ const auto& cheap = heap;
+
+ {
+ uint count = 0;
+ for(auto i = cheap.cbegin(); i != cheap.cend(); ++i) {
+ ++count;
+ }
+
+ EXPECT_EQ(7u, count) << "count should be 7";
+ }
+
+ auto i1 = heap.cbegin();
+
+ EXPECT_EQ(-12, i1->data) <<
+ "first member with -> operator must be smallest";
+
+ EXPECT_EQ(-12, (*i1).data) <<
+ "first member with * operator must be smallest";
+
+ const Elem& e1 = *i1;
+ EXPECT_EQ(-12, e1.data) <<
+ "first member accessed through a reference must be smallest";
+
+ {
+ std::set<int> values;
+ values.insert(2);
+ values.insert(99);
+ values.insert(1);
+ values.insert(-5);
+ values.insert(12);
+ values.insert(-12);
+ values.insert(-7);
+
+ for(auto i = heap.cbegin(); i != heap.cend(); ++i) {
+ auto v = *i;
+ EXPECT_NE(values.end(), values.find(v.data)) <<
+ "value in heap must be part of original set";
+ values.erase(v.data);
+ }
+ EXPECT_EQ(0u, values.size()) << "all values must have been seen";
+ }
+}
+
+
+TEST_F(HeapFixture1, iterator_find_rfind) {
+ {
+ auto it1 = heap.find(data7);
+ EXPECT_NE(heap.end(), it1) <<
+ "find by indirection for included element should succeed";
+ EXPECT_EQ(-7, it1->data) <<
+ "find by indirection for included element should result in right value";
+
+ auto fake_data = std::make_shared<Elem>(-7);
+ auto it2 = heap.find(fake_data);
+ EXPECT_EQ(heap.end(), it2) <<
+ "find by indirection for not included element should fail";
+ }
+
+ {
+ auto it1 = heap.find(Elem(-7));
+ EXPECT_NE(heap.end(), it1) <<
+ "find by value for included element should succeed";
+ EXPECT_EQ(-7, it1->data) <<
+ "find by value for included element should result in right value";
+
+ auto it2 = heap.find(Elem(7));
+ EXPECT_EQ(heap.end(), it2) <<
+ "find by value for not included element should fail";
+ }
+
+ {
+ auto it1 = heap.rfind(data7);
+ EXPECT_NE(heap.end(), it1) <<
+ "reverse find by indirecton for included element should succeed";
+ EXPECT_EQ(-7, it1->data) <<
+ "reverse find by indirection for included element should result "
+ "in right value";
+
+ auto fake_data = std::make_shared<Elem>(-7);
+ auto it2 = heap.rfind(fake_data);
+ EXPECT_EQ(heap.end(), it2) <<
+ "reverse find by indirection for not included element should fail";
+ }
+
+ {
+ auto it1 = heap.rfind(Elem(-7));
+ EXPECT_NE(heap.end(), it1) <<
+ "reverse find by value for included element should succeed";
+ EXPECT_EQ(-7, it1->data) <<
+ "reverse find by value for included element should result "
+ "in right value";
+
+ auto it2 = heap.rfind(Elem(7));
+ EXPECT_EQ(heap.end(), it2) <<
+ "reverse find by value for not included element should fail";
+ }
+}
+
+
+TEST_F(HeapFixture1, const_iterator_find_rfind) {
+ const auto& c_heap = heap;
+
+ {
+ auto it1 = c_heap.find(data7);
+ EXPECT_NE(c_heap.cend(), it1) <<
+ "find by indirection for included element should succeed";
+ EXPECT_EQ(-7, it1->data) <<
+ "find by indirection for included element should result in right value";
+
+ auto fake_data = std::make_shared<Elem>(-7);
+ auto it2 = c_heap.find(fake_data);
+ EXPECT_EQ(c_heap.cend(), it2) <<
+ "find by indirection for not included element should fail";
+ }
+
+ {
+ auto it1 = c_heap.find(Elem(-7));
+ EXPECT_NE(c_heap.cend(), it1) <<
+ "find by value for included element should succeed";
+ EXPECT_EQ(-7, it1->data) <<
+ "find by value for included element should result in right value";
+
+ auto it2 = c_heap.find(Elem(7));
+ EXPECT_EQ(c_heap.cend(), it2) <<
+ "find by value for not included element should fail";
+ }
+
+ {
+ auto it1 = c_heap.rfind(data7);
+ EXPECT_NE(c_heap.cend(), it1) <<
+ "reverse find by indirecton for included element should succeed";
+ EXPECT_EQ(-7, it1->data) <<
+ "reverse find by indirection for included element should result "
+ "in right value";
+
+ auto fake_data = std::make_shared<Elem>(-7);
+ auto it2 = c_heap.rfind(fake_data);
+ EXPECT_EQ(c_heap.cend(), it2) <<
+ "reverse find by indirection for not included element should fail";
+ }
+
+ {
+ auto it1 = c_heap.rfind(Elem(-7));
+ EXPECT_NE(c_heap.cend(), it1) <<
+ "reverse find by value for included element should succeed";
+ EXPECT_EQ(-7, it1->data) <<
+ "reverse find by value for included element should result "
+ "in right value";
+
+ auto it2 = c_heap.rfind(Elem(7));
+ EXPECT_EQ(c_heap.cend(), it2) <<
+ "reverse find by value for not included element should fail";
+ }
+}
+
+
+TEST_F(HeapFixture1, iterator_remove) {
+ auto it1 = heap.find(data7);
+ EXPECT_NE(heap.end(), it1) << "find for included element should succeed";
+
+ heap.remove(it1);
+
+ auto it2 = heap.find(data7);
+ EXPECT_EQ(heap.end(), it2) << "find for removed element should fail";
+
+ for (auto it3 = heap.begin(); it3 != heap.end(); ++it3) {
+ EXPECT_NE(-7, it3->data) <<
+ "iterating through heap should not find removed value";
+ }
+
+ // move through heap without -7
+ EXPECT_EQ(-12, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(-5, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(1, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(2, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(12, heap.top().data);
+ heap.pop();
+ EXPECT_EQ(99, heap.top().data);
+ heap.pop();
+}
+
+
+TEST_F(HeapFixture1, four_tops) {
+ Elem& top1 = heap.top();
+ EXPECT_EQ(-12, top1.data);
+
+ const Elem& top2 = heap.top();
+ EXPECT_EQ(-12, top2.data);
+
+ std::shared_ptr<Elem> top3 = heap.top_ind();
+ EXPECT_EQ(-12, top3->data);
+
+ const std::shared_ptr<Elem> top4 = heap.top_ind();
+ EXPECT_EQ(-12, top4->data);
+
+ const auto& c_heap = heap;
+
+ const Elem& top5 = c_heap.top();
+ EXPECT_EQ(-12, top5.data);
+
+ const std::shared_ptr<Elem> top6 = c_heap.top_ind();
+ EXPECT_EQ(-12, top6->data);
+}
+
+
+TEST_F(HeapFixture1, display_sorted) {
+ std::stringstream ss;
+
+ heap.display_sorted(ss);
+
+ std::string s = ss.str();
+
+ EXPECT_GT(s.length(), 0u);
+
+ auto negseven = s.find("-7");
+ EXPECT_NE(negseven, std::string::npos);
+
+ auto ninetynine = s.find("99");
+ EXPECT_NE(ninetynine, std::string::npos);
+
+ // index of -7 should be less than index of 99
+ EXPECT_LT(negseven, ninetynine);
+
+#if 0
+ std::cout << s << std::endl;
+#endif
+}
diff --git a/src/dmclock/support/test/test_intrusive_heap.cc b/src/dmclock/support/test/test_intrusive_heap.cc
new file mode 100644
index 00000000000..a0ad07524e0
--- /dev/null
+++ b/src/dmclock/support/test/test_intrusive_heap.cc
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#include <string>
+#include <iostream>
+
+#include "intrusive_heap.h"
+
+
+struct TestCompare;
+struct TestIntruData;
+
+
+class Test1 {
+ friend TestCompare;
+ friend TestIntruData;
+
+ int data;
+ crimson::IntruHeapData heap_data;
+
+public:
+ Test1(int _data) : data(_data) {}
+
+ friend std::ostream& operator<<(std::ostream& out, const Test1& d) {
+ out << d.data << " (" << d.heap_data << ")";
+ return out;
+ }
+
+ int& the_data() { return data; }
+};
+
+
+struct TestCompare {
+ bool operator()(const Test1& d1, const Test1& d2) {
+ return d1.data < d2.data;
+ }
+};
+
+
+struct TestIntruData {
+ crimson::IntruHeapData& operator()(Test1& d) {
+ return d.heap_data;
+ }
+};
+
+
+int main(int argc, char** argv) {
+ Test1 d1(2);
+ Test1 d2(3);
+ Test1 d3(1);
+ Test1 d4(-5);
+
+ crimson::IntruHeap<Test1, TestIntruData, TestCompare> my_heap;
+
+ my_heap.push(d1);
+ my_heap.push(d2);
+ my_heap.push(d3);
+ my_heap.push(d4);
+ my_heap.push(Test1(-9));
+ my_heap.push(Test1(99));
+ my_heap.push(Test1(0));
+
+ std::cout << my_heap << std::endl;
+
+ auto& t = my_heap.top();
+ t.the_data() = 17;
+ my_heap.adjust_down(t);
+
+ std::cout << my_heap << std::endl;
+
+ my_heap.display_sorted(std::cout);
+
+ while (!my_heap.empty()) {
+ auto& top = my_heap.top();
+ std::cout << top << std::endl;
+ my_heap.pop();
+ std::cout << my_heap << std::endl;
+ }
+
+ return 0;
+}
diff --git a/src/dmclock/test/CMakeLists.txt b/src/dmclock/test/CMakeLists.txt
new file mode 100644
index 00000000000..e72810b56aa
--- /dev/null
+++ b/src/dmclock/test/CMakeLists.txt
@@ -0,0 +1,35 @@
+include_directories(../src)
+include_directories(../support/src)
+include_directories(../sim/src)
+include_directories(${BOOST_INCLUDE_DIR})
+
+set(local_flags "-Wall -pthread")
+
+set(support_srcs ../sim/src/test_dmclock.cc)
+set(test_srcs
+ test_test_client.cc
+ test_dmclock_server.cc
+ test_dmclock_client.cc
+ )
+
+set_source_files_properties(${support_srcs} ${test_srcs}
+ PROPERTIES
+ COMPILE_FLAGS "${local_flags}"
+ )
+
+add_executable(dmclock-tests ${test_srcs} ${support_srcs})
+
+if (TARGET gtest AND TARGET gtest_main)
+ add_dependencies(dmclock-tests gtest gtest_main)
+ target_link_libraries(dmclock-tests
+ LINK_PRIVATE $<TARGET_FILE:dmclock>
+ pthread
+ $<TARGET_FILE:gtest>
+ $<TARGET_FILE:gtest_main>)
+else()
+ target_link_libraries(dmclock-tests
+ LINK_PRIVATE $<TARGET_FILE:dmclock> pthread ${GTEST_LIBRARY} ${GTEST_MAIN_LIBRARY})
+endif()
+
+add_dependencies(dmclock-tests dmclock)
+
+add_test(NAME dmclock-tests
+ COMMAND $<TARGET_FILE:dmclock-tests>)
diff --git a/src/dmclock/test/test_dmclock_client.cc b/src/dmclock/test/test_dmclock_client.cc
new file mode 100644
index 00000000000..ee4172dc348
--- /dev/null
+++ b/src/dmclock/test/test_dmclock_client.cc
@@ -0,0 +1,219 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#include <chrono>
+#include <functional>
+#include <iostream>
+#include <mutex>
+#include <thread>
+
+
+#include "dmclock_client.h"
+#include "dmclock_util.h"
+#include "gtest/gtest.h"
+
+
+namespace dmc = crimson::dmclock;
+
+
+namespace crimson {
+ namespace dmclock {
+
+ /*
+ * Runs the provided code with the provided mutex locked.
+ */
+ static void test_locked(std::mutex& mtx, std::function<void()> code) {
+ std::lock_guard<std::mutex> l(mtx);
+ code();
+ }
+
+
+ TEST(dmclock_client, server_erase) {
+ using ServerId = int;
+ // using ClientId = int;
+
+ ServerId server = 101;
+ // ClientId client = 3;
+
+ // dmc::PhaseType resp_params = dmc::PhaseType::reservation;
+
+ dmc::ServiceTracker<ServerId> st(std::chrono::seconds(2),
+ std::chrono::seconds(3));
+
+ auto lock_st = [&](std::function<void()> code) {
+ test_locked(st.data_mtx, code);
+ };
+
+ /* The timeline should be as follows:
+ *
+ * 0 seconds : request created
+ *
+ * 1 second : map is size 1
+ *
+ * 2 seconds : clean notes first mark; +2 is base for further calcs
+ *
+ * 4 seconds : clean does nothing except make another mark
+ *
+ * 5 seconds : when we're scheduled to erase (+2 + 3)
+ *
+ * 5 seconds : since the clean job hasn't run yet, map still size 1
+ *
+ * 6 seconds : clean erases server
+ *
+ * 7 seconds : verified server is gone (map size 0)
+ */
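+
+ // (our reading: the two durations given to the ServiceTracker above
+ // are the clean period, 2 s, and the erase age, 3 s; that is what
+ // makes +2 the base and +2+3 the scheduled erase point)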
+
+ lock_st([&] () {
+ EXPECT_EQ(0u, st.server_map.size()) <<
+ "server map initially has size 0";
+ });
+
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+
+ // call for side effects
+ (void) st.get_req_params(server);
+
+ lock_st([&] () {
+ EXPECT_EQ(1u, st.server_map.size()) <<
+ "server map has size 1 after first request";
+ });
+
+ std::this_thread::sleep_for(std::chrono::seconds(4));
+
+ lock_st([&] () {
+ EXPECT_EQ(1u, st.server_map.size()) <<
+ "server map has size 1 just before erase";
+ });
+
+ std::this_thread::sleep_for(std::chrono::seconds(2));
+
+ lock_st([&] () {
+ EXPECT_EQ(0u, st.server_map.size()) <<
+ "server map has size 0 just after erase";
+ });
+ } // TEST
+
+
+ TEST(dmclock_client, delta_rho_values) {
+ using ServerId = int;
+ // using ClientId = int;
+
+ ServerId server1 = 101;
+ ServerId server2 = 7;
+ // ClientId client = 3;
+
+ // RespParams<ServerId> resp_params(server, dmc::PhaseType::reservation);
+
+ dmc::ServiceTracker<ServerId> st(std::chrono::seconds(2),
+ std::chrono::seconds(3));
+
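+ // note on the semantics exercised below (our reading, not a spec):
+ // delta counts responses from other servers seen since this server
+ // was last sent a request, plus one for the request itself; rho does
+ // the same counting only reservation-phase responses
+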
+ auto rp1 = st.get_req_params(server1);
+
+ EXPECT_EQ(1u, rp1.delta) <<
+ "delta should be 1 with no intervening responses by" <<
+ "other servers";
+ EXPECT_EQ(1u, rp1.rho) <<
+ "rho should be 1 with no intervening reservation responses by" <<
+ "other servers";
+
+ auto rp2 = st.get_req_params(server1);
+
+ EXPECT_EQ(1u, rp2.delta) <<
+ "delta should be 1 with no intervening responses by" <<
+ "other servers";
+ EXPECT_EQ(1u, rp2.rho) <<
+ "rho should be 1 with no intervening reservation responses by" <<
+ "other servers";
+
+ st.track_resp(server1, dmc::PhaseType::priority);
+
+ auto rp3 = st.get_req_params(server1);
+
+ EXPECT_EQ(1u, rp3.delta) <<
+ "delta should be 1 with no intervening responses by" <<
+ "other servers";
+ EXPECT_EQ(1u, rp3.rho) <<
+ "rho should be 1 with no intervening reservation responses by" <<
+ "other servers";
+
+ st.track_resp(server2, dmc::PhaseType::priority);
+
+ auto rp4 = st.get_req_params(server1);
+
+ EXPECT_EQ(2u, rp4.delta) <<
+ "delta should be 2 with one intervening priority response by " <<
+ "another server";
+ EXPECT_EQ(1u, rp4.rho) <<
+ "rho should be 1 with one intervening priority responses by " <<
+ "another server";
+
+ auto rp5 = st.get_req_params(server1);
+
+ EXPECT_EQ(1u, rp5.delta) <<
+ "delta should be 1 with no intervening responses by" <<
+ "other servers";
+ EXPECT_EQ(1u, rp5.rho) <<
+ "rho should be 1 with no intervening reservation responses by" <<
+ "other servers";
+
+ st.track_resp(server2, dmc::PhaseType::reservation);
+
+ auto rp6 = st.get_req_params(server1);
+
+ EXPECT_EQ(2u, rp6.delta) <<
+ "delta should be 2 with one intervening reservation response by " <<
+ "another server";
+ EXPECT_EQ(2u, rp6.rho) <<
+ "rho should be 2 with one intervening reservation responses by " <<
+ "another server";
+
+ // auto rp6_b = st.get_req_params(server2);
+
+ st.track_resp(server2, dmc::PhaseType::reservation);
+ st.track_resp(server1, dmc::PhaseType::priority);
+ st.track_resp(server2, dmc::PhaseType::priority);
+ st.track_resp(server2, dmc::PhaseType::reservation);
+ st.track_resp(server1, dmc::PhaseType::reservation);
+ st.track_resp(server1, dmc::PhaseType::priority);
+ st.track_resp(server2, dmc::PhaseType::priority);
+
+ auto rp7 = st.get_req_params(server1);
+
+ EXPECT_EQ(5u, rp7.delta) <<
+ "delta should be 5 with fourintervening responses by " <<
+ "another server";
+ EXPECT_EQ(3u, rp7.rho) <<
+ "rho should be 3 with two intervening reservation responses by " <<
+ "another server";
+
+ auto rp7b = st.get_req_params(server2);
+
+ EXPECT_EQ(4u, rp7b.delta) <<
+ "delta should be 4 with three intervening responses by " <<
+ "another server";
+ EXPECT_EQ(2u, rp7b.rho) <<
+ "rho should be 2 with one intervening reservation responses by " <<
+ "another server";
+
+ auto rp8 = st.get_req_params(server1);
+
+ EXPECT_EQ(1u, rp8.delta) <<
+ "delta should be 1 with no intervening responses by " <<
+ "another server";
+ EXPECT_EQ(1u, rp8.rho) <<
+ "rho should be 1 with no intervening reservation responses by " <<
+ "another server";
+
+ auto rp8b = st.get_req_params(server2);
+ EXPECT_EQ(1u, rp8b.delta) <<
+ "delta should be 1 with no intervening responses by " <<
+ "another server";
+ EXPECT_EQ(1u, rp8b.rho) <<
+ "rho should be 1 with no intervening reservation responses by " <<
+ "another server";
+ } // TEST
+ } // namespace dmclock
+} // namespace crimson
diff --git a/src/dmclock/test/test_dmclock_server.cc b/src/dmclock/test/test_dmclock_server.cc
new file mode 100644
index 00000000000..4555e377323
--- /dev/null
+++ b/src/dmclock/test/test_dmclock_server.cc
@@ -0,0 +1,826 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+
+#include <chrono>
+#include <functional>
+#include <iostream>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+
+#include "dmclock_server.h"
+#include "dmclock_util.h"
+#include "gtest/gtest.h"
+
+
+namespace dmc = crimson::dmclock;
+
+
+// we need a request object; an empty one will do
+struct Request {
+};
+
+
+namespace crimson {
+ namespace dmclock {
+
+ /*
+ * Allows us to test the code provided with the mutex provided locked.
+ */
+ static void test_locked(std::mutex& mtx, std::function<void()> code) {
+ std::unique_lock<std::mutex> l(mtx);
+ code();
+ }
+
+
+ TEST(dmclock_server, bad_tag_deathtest) {
+ using ClientId = int;
+ using Queue = dmc::PullPriorityQueue<ClientId,Request>;
+ using QueueRef = std::unique_ptr<Queue>;
+
+ ClientId client1 = 17;
+ ClientId client2 = 18;
+
+ double reservation = 0.0;
+ double weight = 0.0;
+
+ dmc::ClientInfo ci1(reservation, weight, 0.0);
+ dmc::ClientInfo ci2(reservation, weight, 1.0);
+
+ auto client_info_f = [&] (ClientId c) -> dmc::ClientInfo {
+ if (client1 == c) return ci1;
+ else if (client2 == c) return ci2;
+ else {
+ ADD_FAILURE() << "got request from neither of two clients";
+ return ci1; // must return
+ }
+ };
+
+ QueueRef pq(new Queue(client_info_f, false));
+ Request req;
+ ReqParams req_params(1,1);
+
+ EXPECT_DEATH_IF_SUPPORTED(pq->add_request(req, client1, req_params),
+ "Assertion.*reservation.*max_tag.*"
+ "proportion.*max_tag") <<
+ "we should fail if a client tries to generate a reservation tag "
+ "where reservation and proportion are both 0";
+
+
+ EXPECT_DEATH_IF_SUPPORTED(pq->add_request(req, client2, req_params),
+ "Assertion.*reservation.*max_tag.*"
+ "proportion.*max_tag") <<
+ "we should fail if a client tries to generate a reservation tag "
+ "where reservation and proportion are both 0";
+ }
+
+
+ TEST(dmclock_server, client_idle_erase) {
+ using ClientId = int;
+ using Queue = dmc::PushPriorityQueue<ClientId,Request>;
+ int client = 17;
+ double reservation = 100.0;
+
+ dmc::ClientInfo ci(reservation, 1.0, 0.0);
+ auto client_info_f = [&] (ClientId c) -> dmc::ClientInfo { return ci; };
+ auto server_ready_f = [] () -> bool { return true; };
+ auto submit_req_f = [] (const ClientId& c,
+ std::unique_ptr<Request> req,
+ dmc::PhaseType phase) {
+ // empty; do nothing
+ };
+
+ Queue pq(client_info_f,
+ server_ready_f,
+ submit_req_f,
+ std::chrono::seconds(3),
+ std::chrono::seconds(5),
+ std::chrono::seconds(2),
+ false);
+
+ auto lock_pq = [&](std::function<void()> code) {
+ test_locked(pq.data_mtx, code);
+ };
+
+
+ /* The timeline should be as follows:
+ *
+ * 0 seconds : request created
+ *
+ * 1 second : map is size 1, idle is false
+ *
+ * 2 seconds : clean notes first mark; +2 is base for further calcs
+ *
+ * 4 seconds : clean does nothing except make another mark
+ *
+ * 5 seconds : when we're scheduled to idle (+2 + 3)
+ *
+ * 6 seconds : clean idles client
+ *
+ * 7 seconds : when we're scheduled to erase (+2 + 5)
+ *
+ * 7 seconds : verified client is idle
+ *
+ * 8 seconds : clean erases client info
+ *
+ * 9 seconds : verified client is erased
+ */
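+
+ // (our reading: the three durations given to the Queue above are the
+ // idle age, 3 s, the erase age, 5 s, and the clean period, 2 s; that
+ // is what makes +2 the base for the +2+3 and +2+5 marks)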
+
+ lock_pq([&] () {
+ EXPECT_EQ(0u, pq.client_map.size()) <<
+ "client map initially has size 0";
+ });
+
+ Request req;
+ dmc::ReqParams req_params(1, 1);
+ pq.add_request_time(req, client, req_params, dmc::get_time());
+
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+
+ lock_pq([&] () {
+ EXPECT_EQ(1u, pq.client_map.size()) <<
+ "client map has 1 after 1 client";
+ EXPECT_FALSE(pq.client_map.at(client)->idle) <<
+ "initially client map entry shows not idle.";
+ });
+
+ std::this_thread::sleep_for(std::chrono::seconds(6));
+
+ lock_pq([&] () {
+ EXPECT_TRUE(pq.client_map.at(client)->idle) <<
+ "after idle age client map entry shows idle.";
+ });
+
+ std::this_thread::sleep_for(std::chrono::seconds(2));
+
+ lock_pq([&] () {
+ EXPECT_EQ(0u, pq.client_map.size()) <<
+ "client map loses its entry after erase age";
+ });
+ } // TEST
+
+
+#if 0
+ TEST(dmclock_server, reservation_timing) {
+ using ClientId = int;
+ // NB? PUSH OR PULL
+ using Queue = std::unique_ptr<dmc::PriorityQueue<ClientId,Request>>;
+ using std::chrono::steady_clock;
+
+ int client = 17;
+
+ std::vector<dmc::Time> times;
+ std::mutex times_mtx;
+ using Guard = std::lock_guard<decltype(times_mtx)>;
+
+ // reservation every second
+ dmc::ClientInfo ci(1.0, 0.0, 0.0);
+ Queue pq;
+
+ auto client_info_f = [&] (ClientId c) -> dmc::ClientInfo { return ci; };
+ auto server_ready_f = [] () -> bool { return true; };
+ auto submit_req_f = [&] (const ClientId& c,
+ std::unique_ptr<Request> req,
+ dmc::PhaseType phase) {
+ {
+ Guard g(times_mtx);
+ times.emplace_back(dmc::get_time());
+ }
+ std::thread complete([&](){ pq->request_completed(); });
+ complete.detach();
+ };
+
+ // NB? PUSH OR PULL
+ pq = Queue(new dmc::PriorityQueue<ClientId,Request>(client_info_f,
+ server_ready_f,
+ submit_req_f,
+ false));
+
+ Request req;
+ ReqParams<ClientId> req_params(client, 1, 1);
+
+ for (int i = 0; i < 5; ++i) {
+ pq->add_request_time(req, req_params, dmc::get_time());
+ }
+
+ {
+ Guard g(times_mtx);
+ std::this_thread::sleep_for(std::chrono::milliseconds(5500));
+ EXPECT_EQ(5u, times.size()) <<
+ "after 5.5 seconds, we should have 5 request times at 1 second apart";
+ }
+ } // TEST
+#endif
+
+
+ TEST(dmclock_server, remove_by_req_filter) {
+ struct MyReq {
+ int id;
+
+ MyReq(int _id) :
+ id(_id)
+ {
+ // empty
+ }
+ }; // MyReq
+
+ using ClientId = int;
+ using Queue = dmc::PullPriorityQueue<ClientId,MyReq>;
+
+ ClientId client1 = 17;
+ ClientId client2 = 98;
+
+ dmc::ClientInfo info1(0.0, 1.0, 0.0);
+
+ auto client_info_f = [&] (ClientId c) -> dmc::ClientInfo {
+ return info1;
+ };
+
+ Queue pq(client_info_f, true);
+
+ EXPECT_EQ(0u, pq.client_count());
+ EXPECT_EQ(0u, pq.request_count());
+
+ ReqParams req_params(1,1);
+
+ pq.add_request(MyReq(1), client1, req_params);
+ pq.add_request(MyReq(11), client1, req_params);
+ pq.add_request(MyReq(2), client2, req_params);
+ pq.add_request(MyReq(0), client2, req_params);
+ pq.add_request(MyReq(13), client2, req_params);
+ pq.add_request(MyReq(2), client2, req_params);
+ pq.add_request(MyReq(13), client2, req_params);
+ pq.add_request(MyReq(98), client2, req_params);
+ pq.add_request(MyReq(44), client1, req_params);
+
+ EXPECT_EQ(2u, pq.client_count());
+ EXPECT_EQ(9u, pq.request_count());
+
+ pq.remove_by_req_filter([](const MyReq& r) -> bool {return 1 == r.id % 2;});
+
+ EXPECT_EQ(5u, pq.request_count());
+
+ std::list<MyReq> capture;
+ pq.remove_by_req_filter(
+ [&capture] (const MyReq& r) -> bool {
+ if (0 == r.id % 2) {
+ capture.push_front(r);
+ return true;
+ } else {
+ return false;
+ }
+ },
+ true);
+
+ EXPECT_EQ(0u, pq.request_count());
+ EXPECT_EQ(5u, capture.size());
+ int total = 0;
+ for (auto i : capture) {
+ total += i.id;
+ }
+ EXPECT_EQ(146, total) << " sum of captured items should be 146";
+ } // TEST
+
+
+ TEST(dmclock_server, remove_by_req_filter_ordering_forwards_visit) {
+ struct MyReq {
+ int id;
+
+ MyReq(int _id) :
+ id(_id)
+ {
+ // empty
+ }
+ }; // MyReq
+
+ using ClientId = int;
+ using Queue = dmc::PullPriorityQueue<ClientId,MyReq>;
+
+ ClientId client1 = 17;
+
+ dmc::ClientInfo info1(0.0, 1.0, 0.0);
+
+ auto client_info_f = [&] (ClientId c) -> dmc::ClientInfo {
+ return info1;
+ };
+
+ Queue pq(client_info_f, true);
+
+ EXPECT_EQ(0u, pq.client_count());
+ EXPECT_EQ(0u, pq.request_count());
+
+ ReqParams req_params(1,1);
+
+ pq.add_request(MyReq(1), client1, req_params);
+ pq.add_request(MyReq(2), client1, req_params);
+ pq.add_request(MyReq(3), client1, req_params);
+ pq.add_request(MyReq(4), client1, req_params);
+ pq.add_request(MyReq(5), client1, req_params);
+ pq.add_request(MyReq(6), client1, req_params);
+
+ EXPECT_EQ(1u, pq.client_count());
+ EXPECT_EQ(6u, pq.request_count());
+
+ // remove odd ids in forward order and append to end
+
+ std::vector<MyReq> capture;
+ pq.remove_by_req_filter(
+ [&capture] (const MyReq& r) -> bool {
+ if (1 == r.id % 2) {
+ capture.push_back(r);
+ return true;
+ } else {
+ return false;
+ }
+ },
+ false);
+
+ EXPECT_EQ(3u, pq.request_count());
+ EXPECT_EQ(3u, capture.size());
+ EXPECT_EQ(1, capture[0].id) << "items should come out in forward order";
+ EXPECT_EQ(3, capture[1].id) << "items should come out in forward order";
+ EXPECT_EQ(5, capture[2].id) << "items should come out in forward order";
+
+ // remove even ids, visiting in forward order but inserting at the
+ // front, so they come out reversed
+
+ std::vector<MyReq> capture2;
+ pq.remove_by_req_filter(
+ [&capture2] (const MyReq& r) -> bool {
+ if (0 == r.id % 2) {
+ capture2.insert(capture2.begin(), r);
+ return true;
+ } else {
+ return false;
+ }
+ },
+ false);
+
+ EXPECT_EQ(0u, pq.request_count());
+ EXPECT_EQ(3u, capture2.size());
+ EXPECT_EQ(6, capture2[0].id) << "items should come out in reverse order";
+ EXPECT_EQ(4, capture2[1].id) << "items should come out in reverse order";
+ EXPECT_EQ(2, capture2[2].id) << "items should come out in reverse order";
+ } // TEST
+
+
+ TEST(dmclock_server, remove_by_req_filter_ordering_backwards_visit) {
+ struct MyReq {
+ int id;
+
+ MyReq(int _id) :
+ id(_id)
+ {
+ // empty
+ }
+ }; // MyReq
+
+ using ClientId = int;
+ using Queue = dmc::PullPriorityQueue<ClientId,MyReq>;
+
+ ClientId client1 = 17;
+
+ dmc::ClientInfo info1(0.0, 1.0, 0.0);
+
+ auto client_info_f = [&] (ClientId c) -> dmc::ClientInfo {
+ return info1;
+ };
+
+ Queue pq(client_info_f, true);
+
+ EXPECT_EQ(0u, pq.client_count());
+ EXPECT_EQ(0u, pq.request_count());
+
+ ReqParams req_params(1,1);
+
+ pq.add_request(MyReq(1), client1, req_params);
+ pq.add_request(MyReq(2), client1, req_params);
+ pq.add_request(MyReq(3), client1, req_params);
+ pq.add_request(MyReq(4), client1, req_params);
+ pq.add_request(MyReq(5), client1, req_params);
+ pq.add_request(MyReq(6), client1, req_params);
+
+ EXPECT_EQ(1u, pq.client_count());
+ EXPECT_EQ(6u, pq.request_count());
+
+ // now remove odd ids, visiting in reverse order but inserting at
+ // the front, so they come out forwards
+
+ std::vector<MyReq> capture;
+ pq.remove_by_req_filter(
+ [&capture] (const MyReq& r) -> bool {
+ if (1 == r.id % 2) {
+ capture.insert(capture.begin(), r);
+ return true;
+ } else {
+ return false;
+ }
+ },
+ true);
+
+ EXPECT_EQ(3u, pq.request_count());
+ EXPECT_EQ(3u, capture.size());
+ EXPECT_EQ(1, capture[0].id) << "items should come out in forward order";
+ EXPECT_EQ(3, capture[1].id) << "items should come out in forward order";
+ EXPECT_EQ(5, capture[2].id) << "items should come out in forward order";
+
+ // now remove even ids, visiting in reverse order and appending, so
+ // they come out reversed
+
+ std::vector<MyReq> capture2;
+ pq.remove_by_req_filter(
+ [&capture2] (const MyReq& r) -> bool {
+ if (0 == r.id % 2) {
+ capture2.push_back(r);
+ return true;
+ } else {
+ return false;
+ }
+ },
+ true);
+
+ EXPECT_EQ(0u, pq.request_count());
+ EXPECT_EQ(3u, capture2.size());
+ EXPECT_EQ(6, capture2[0].id) << "items should come out in reverse order";
+ EXPECT_EQ(4, capture2[1].id) << "items should come out in reverse order";
+ EXPECT_EQ(2, capture2[2].id) << "items should come out in reverse order";
+ } // TEST
+
+
+ TEST(dmclock_server, remove_by_client) {
+ struct MyReq {
+ int id;
+
+ MyReq(int _id) :
+ id(_id)
+ {
+ // empty
+ }
+ }; // MyReq
+
+ using ClientId = int;
+ using Queue = dmc::PullPriorityQueue<ClientId,MyReq>;
+
+ ClientId client1 = 17;
+ ClientId client2 = 98;
+
+ dmc::ClientInfo info1(0.0, 1.0, 0.0);
+
+ auto client_info_f = [&] (ClientId c) -> dmc::ClientInfo {
+ return info1;
+ };
+
+ Queue pq(client_info_f, true);
+
+ EXPECT_EQ(0u, pq.client_count());
+ EXPECT_EQ(0u, pq.request_count());
+
+ ReqParams req_params(1,1);
+
+ pq.add_request(MyReq(1), client1, req_params);
+ pq.add_request(MyReq(11), client1, req_params);
+ pq.add_request(MyReq(2), client2, req_params);
+ pq.add_request(MyReq(0), client2, req_params);
+ pq.add_request(MyReq(13), client2, req_params);
+ pq.add_request(MyReq(2), client2, req_params);
+ pq.add_request(MyReq(13), client2, req_params);
+ pq.add_request(MyReq(98), client2, req_params);
+ pq.add_request(MyReq(44), client1, req_params);
+
+ EXPECT_EQ(2u, pq.client_count());
+ EXPECT_EQ(9u, pq.request_count());
+
+ std::list<MyReq> removed;
+
+ pq.remove_by_client(client1,
+ true,
+ [&removed] (const MyReq& r) {
+ removed.push_front(r);
+ });
+
+ EXPECT_EQ(3u, removed.size());
+ EXPECT_EQ(1, removed.front().id);
+ removed.pop_front();
+ EXPECT_EQ(11, removed.front().id);
+ removed.pop_front();
+ EXPECT_EQ(44, removed.front().id);
+ removed.pop_front();
+
+ EXPECT_EQ(6u, pq.request_count());
+
+ Queue::PullReq pr = pq.pull_request();
+ EXPECT_TRUE(pr.is_retn());
+ EXPECT_EQ(2, pr.get_retn().request->id);
+
+ pr = pq.pull_request();
+ EXPECT_TRUE(pr.is_retn());
+ EXPECT_EQ(0, pr.get_retn().request->id);
+
+ pq.remove_by_client(client2);
+ EXPECT_EQ(0u, pq.request_count()) <<
+ "after second client removed, none left";
+ } // TEST
+
+
+ TEST(dmclock_server_pull, pull_weight) {
+ using ClientId = int;
+ using Queue = dmc::PullPriorityQueue<ClientId,Request>;
+ using QueueRef = std::unique_ptr<Queue>;
+
+ ClientId client1 = 17;
+ ClientId client2 = 98;
+
+ dmc::ClientInfo info1(0.0, 1.0, 0.0);
+ dmc::ClientInfo info2(0.0, 2.0, 0.0);
+
+ QueueRef pq;
+
+ auto client_info_f = [&] (ClientId c) -> dmc::ClientInfo {
+ if (client1 == c) return info1;
+ else if (client2 == c) return info2;
+ else {
+ ADD_FAILURE() << "client info looked up for non-existant client";
+ return info1;
+ }
+ };
+
+ pq = QueueRef(new Queue(client_info_f, false));
+
+ Request req;
+ ReqParams req_params(1,1);
+
+ auto now = dmc::get_time();
+
+ for (int i = 0; i < 5; ++i) {
+ pq->add_request(req, client1, req_params);
+ pq->add_request(req, client2, req_params);
+ now += 0.0001;
+ }
+
+ int c1_count = 0;
+ int c2_count = 0;
+ for (int i = 0; i < 6; ++i) {
+ Queue::PullReq pr = pq->pull_request();
+ EXPECT_EQ(Queue::NextReqType::returning, pr.type);
+ auto& retn = boost::get<Queue::PullReq::Retn>(pr.data);
+
+ if (client1 == retn.client) ++c1_count;
+ else if (client2 == retn.client) ++c2_count;
+ else ADD_FAILURE() << "got request from neither of two clients";
+
+ EXPECT_EQ(PhaseType::priority, retn.phase);
+ }
+
+ EXPECT_EQ(2, c1_count) <<
+ "one-third of requests should have come from first client";
+ EXPECT_EQ(4, c2_count) <<
+ "two-thirds of requests should have come from second client";
+ }
+
+
+ TEST(dmclock_server_pull, pull_reservation) {
+ using ClientId = int;
+ using Queue = dmc::PullPriorityQueue<ClientId,Request>;
+ using QueueRef = std::unique_ptr<Queue>;
+
+ ClientId client1 = 52;
+ ClientId client2 = 8;
+
+ dmc::ClientInfo info1(2.0, 0.0, 0.0);
+ dmc::ClientInfo info2(1.0, 0.0, 0.0);
+
+ auto client_info_f = [&] (ClientId c) -> dmc::ClientInfo {
+ if (client1 == c) return info1;
+ else if (client2 == c) return info2;
+ else {
+ ADD_FAILURE() << "client info looked up for non-existant client";
+ return info1;
+ }
+ };
+
+ QueueRef pq(new Queue(client_info_f, false));
+
+ Request req;
+ ReqParams req_params(1,1);
+
+ // make sure all times are well before now
+ auto old_time = dmc::get_time() - 100.0;
+
+ for (int i = 0; i < 5; ++i) {
+ pq->add_request_time(req, client1, req_params, old_time);
+ pq->add_request_time(req, client2, req_params, old_time);
+ old_time += 0.001;
+ }
+
+ int c1_count = 0;
+ int c2_count = 0;
+
+ for (int i = 0; i < 6; ++i) {
+ Queue::PullReq pr = pq->pull_request();
+ EXPECT_EQ(Queue::NextReqType::returning, pr.type);
+ auto& retn = boost::get<Queue::PullReq::Retn>(pr.data);
+
+ if (client1 == retn.client) ++c1_count;
+ else if (client2 == retn.client) ++c2_count;
+ else ADD_FAILURE() << "got request from neither of two clients";
+
+ EXPECT_EQ(PhaseType::reservation, retn.phase);
+ }
+
+ EXPECT_EQ(4, c1_count) <<
+ "two-thirds of requests should have come from first client";
+ EXPECT_EQ(2, c2_count) <<
+ "one-third of requests should have come from second client";
+ } // dmclock_server_pull.pull_reservation
+
+
+ // This test shows what happens when a request can be ready (under
+ // limit) but not schedulable since its proportion tag is 0. We
+ // expect to get some future and none responses.
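+ //
+ // (sketch of the mechanics as we read them: with reservation 1.0 and
+ // weight 0.0, each client earns one reservation slot per second, so
+ // a pull between slots returns future with the next tag time, and
+ // none once the queue is drained)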
+ TEST(dmclock_server_pull, ready_and_under_limit) {
+ using ClientId = int;
+ using Queue = dmc::PullPriorityQueue<ClientId,Request>;
+ using QueueRef = std::unique_ptr<Queue>;
+
+ ClientId client1 = 52;
+ ClientId client2 = 8;
+
+ dmc::ClientInfo info1(1.0, 0.0, 0.0);
+ dmc::ClientInfo info2(1.0, 0.0, 0.0);
+
+ auto client_info_f = [&] (ClientId c) -> dmc::ClientInfo {
+ if (client1 == c) return info1;
+ else if (client2 == c) return info2;
+ else {
+ ADD_FAILURE() << "client info looked up for non-existant client";
+ return info1;
+ }
+ };
+
+ QueueRef pq(new Queue(client_info_f, false));
+
+ Request req;
+ ReqParams req_params(1,1);
+
+ // make sure all times are well before now
+ auto start_time = dmc::get_time() - 100.0;
+
+ // add six requests; for same client reservations spaced one apart
+ for (int i = 0; i < 3; ++i) {
+ pq->add_request_time(req, client1, req_params, start_time);
+ pq->add_request_time(req, client2, req_params, start_time);
+ }
+
+ Queue::PullReq pr = pq->pull_request(start_time + 0.5);
+ EXPECT_EQ(Queue::NextReqType::returning, pr.type);
+
+ pr = pq->pull_request(start_time + 0.5);
+ EXPECT_EQ(Queue::NextReqType::returning, pr.type);
+
+ pr = pq->pull_request(start_time + 0.5);
+ EXPECT_EQ(Queue::NextReqType::future, pr.type) <<
+ "too soon for next reservation";
+
+ pr = pq->pull_request(start_time + 1.5);
+ EXPECT_EQ(Queue::NextReqType::returning, pr.type);
+
+ pr = pq->pull_request(start_time + 1.5);
+ EXPECT_EQ(Queue::NextReqType::returning, pr.type);
+
+ pr = pq->pull_request(start_time + 1.5);
+ EXPECT_EQ(Queue::NextReqType::future, pr.type) <<
+ "too soon for next reservation";
+
+ pr = pq->pull_request(start_time + 2.5);
+ EXPECT_EQ(Queue::NextReqType::returning, pr.type);
+
+ pr = pq->pull_request(start_time + 2.5);
+ EXPECT_EQ(Queue::NextReqType::returning, pr.type);
+
+ pr = pq->pull_request(start_time + 2.5);
+ EXPECT_EQ(Queue::NextReqType::none, pr.type) << "no more requests left";
+ }
+
+
+ TEST(dmclock_server_pull, pull_none) {
+ using ClientId = int;
+ using Queue = dmc::PullPriorityQueue<ClientId,Request>;
+ using QueueRef = std::unique_ptr<Queue>;
+
+ dmc::ClientInfo info(1.0, 1.0, 1.0);
+
+ auto client_info_f = [&] (ClientId c) -> dmc::ClientInfo {
+ return info;
+ };
+
+ QueueRef pq(new Queue(client_info_f, false));
+
+ // Request req;
+ ReqParams req_params(1,1);
+
+ auto now = dmc::get_time();
+
+ Queue::PullReq pr = pq->pull_request(now + 100);
+
+ EXPECT_EQ(Queue::NextReqType::none, pr.type);
+ }
+
+
+ TEST(dmclock_server_pull, pull_future) {
+ using ClientId = int;
+ using Queue = dmc::PullPriorityQueue<ClientId,Request>;
+ using QueueRef = std::unique_ptr<Queue>;
+
+ ClientId client1 = 52;
+ // ClientId client2 = 8;
+
+ dmc::ClientInfo info(1.0, 0.0, 1.0);
+
+ auto client_info_f = [&] (ClientId c) -> dmc::ClientInfo {
+ return info;
+ };
+
+ QueueRef pq(new Queue(client_info_f, false));
+
+ Request req;
+ ReqParams req_params(1,1);
+
+ // make sure all times are well before now
+ auto now = dmc::get_time();
+
+ pq->add_request_time(req, client1, req_params, now + 100);
+ Queue::PullReq pr = pq->pull_request(now);
+
+ EXPECT_EQ(Queue::NextReqType::future, pr.type);
+
+ Time when = boost::get<Time>(pr.data);
+ EXPECT_EQ(now + 100, when);
+ }
+
+
+ TEST(dmclock_server_pull, pull_future_limit_break_weight) {
+ using ClientId = int;
+ using Queue = dmc::PullPriorityQueue<ClientId,Request>;
+ using QueueRef = std::unique_ptr<Queue>;
+
+ ClientId client1 = 52;
+ // ClientId client2 = 8;
+
+ dmc::ClientInfo info(0.0, 1.0, 1.0);
+
+ auto client_info_f = [&] (ClientId c) -> dmc::ClientInfo {
+ return info;
+ };
+
+ QueueRef pq(new Queue(client_info_f, true));
+
+ Request req;
+ ReqParams req_params(1,1);
+
+ // make sure all times are well before now
+ auto now = dmc::get_time();
+
+ pq->add_request_time(req, client1, req_params, now + 100);
+ Queue::PullReq pr = pq->pull_request(now);
+
+ EXPECT_EQ(Queue::NextReqType::returning, pr.type);
+
+ auto& retn = boost::get<Queue::PullReq::Retn>(pr.data);
+ EXPECT_EQ(client1, retn.client);
+ }
+
+
+ TEST(dmclock_server_pull, pull_future_limit_break_reservation) {
+ using ClientId = int;
+ using Queue = dmc::PullPriorityQueue<ClientId,Request>;
+ using QueueRef = std::unique_ptr<Queue>;
+
+ ClientId client1 = 52;
+ // ClientId client2 = 8;
+
+ dmc::ClientInfo info(1.0, 0.0, 1.0);
+
+ auto client_info_f = [&] (ClientId c) -> dmc::ClientInfo {
+ return info;
+ };
+
+ QueueRef pq(new Queue(client_info_f, true));
+
+ Request req;
+ ReqParams req_params(1,1);
+
+ // make sure all times are well before now
+ auto now = dmc::get_time();
+
+ pq->add_request_time(req, client1, req_params, now + 100);
+ Queue::PullReq pr = pq->pull_request(now);
+
+ EXPECT_EQ(Queue::NextReqType::returning, pr.type);
+
+ auto& retn = boost::get<Queue::PullReq::Retn>(pr.data);
+ EXPECT_EQ(client1, retn.client);
+ }
+ } // namespace dmclock
+} // namespace crimson
diff --git a/src/dmclock/test/test_test_client.cc b/src/dmclock/test/test_test_client.cc
new file mode 100644
index 00000000000..6015cb9bf7b
--- /dev/null
+++ b/src/dmclock/test/test_test_client.cc
@@ -0,0 +1,123 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Copyright (C) 2016 Red Hat Inc.
+ */
+
+#include <atomic>
+#include <thread>
+#include <chrono>
+#include <iostream>
+
+#include "gtest/gtest.h"
+
+#include "sim_recs.h"
+#include "sim_client.h"
+
+#include "test_dmclock.h"
+
+
+using namespace std::placeholders;
+
+namespace dmc = crimson::dmclock;
+namespace test = crimson::test_dmc;
+namespace sim = crimson::qos_simulation;
+
+using TimePoint = std::chrono::time_point<std::chrono::system_clock>;
+
+static TimePoint now() { return std::chrono::system_clock::now(); }
+
+
+TEST(test_client, full_bore_timing) {
+ std::atomic_ulong count(0);
+
+ ServerId server_id = 3;
+
+ sim::TestResponse resp(0);
+ dmc::PhaseType resp_params = dmc::PhaseType::priority;
+ test::DmcClient* client;
+
+ auto start = now();
+ client =
+ new test::DmcClient(ClientId(0),
+ [&] (const ServerId& server,
+ const sim::TestRequest& req,
+ const ClientId& client_id,
+ const dmc::ReqParams& req_params) {
+ ++count;
+ client->receive_response(resp, client_id, resp_params);
+ },
+ [&] (const uint64_t seed) -> ServerId& {
+ return server_id;
+ },
+ test::dmc_client_accumulate_f,
+ 1000, // ops to run
+ 100, // iops goal
+ 5); // outstanding ops allowed
+ client->wait_until_done();
+ auto end = now();
+ EXPECT_EQ(1000u, count) << "didn't get right number of ops";
+
+ int milliseconds = (end - start) / std::chrono::milliseconds(1);
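+ // 1000 ops at a 100 IOPS goal should take about 10 s; the 10-12 s
+ // window below allows for overhead.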
+ EXPECT_LT(10000, milliseconds) << "timing too fast to be correct";
+ EXPECT_GT(12000, milliseconds) << "timing suspiciously slow";
+}
+
+
+TEST(test_client, paused_timing) {
+ std::atomic_ulong count(0);
+ std::atomic_ulong unresponded_count(0);
+ std::atomic_bool auto_respond(false);
+
+ ClientId my_client_id = 0;
+ ServerId server_id = 3;
+
+ sim::TestResponse resp(0);
+ dmc::PhaseType resp_params = dmc::PhaseType::priority;
+ test::DmcClient* client;
+
+ auto start = now();
+ client =
+ new test::DmcClient(my_client_id,
+ [&] (const ServerId& server,
+ const sim::TestRequest& req,
+ const ClientId& client_id,
+ const dmc::ReqParams& req_params) {
+ ++count;
+ if (auto_respond.load()) {
+ client->receive_response(resp, client_id, resp_params);
+ } else {
+ ++unresponded_count;
+ }
+ },
+ [&] (const uint64_t seed) -> ServerId& {
+ return server_id;
+ },
+ test::dmc_client_accumulate_f,
+
+ 1000, // ops to run
+ 100, // iops goal
+ 50); // outstanding ops allowed
+ std::thread t([&]() {
+ std::this_thread::sleep_for(std::chrono::seconds(5));
+ EXPECT_EQ(50u, unresponded_count.load()) <<
+ "should have 50 unresponded calls";
+ auto_respond = true;
+ // respond to those 50 calls
+ for(int i = 0; i < 50; ++i) {
+ client->receive_response(resp, my_client_id, resp_params);
+ --unresponded_count;
+ }
+ });
+
+ client->wait_until_done();
+ auto end = now();
+ int milliseconds = (end - start) / std::chrono::milliseconds(1);
+
+ // the 50 outstanding ops allowed mean the first half-second's worth
+ // of requests is answered during the 5-second pause, so we adjust
+ // our expectations down by half a second.
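+ // (so: 1000 ops at 100 IOPS is ~10 s, plus the 5 s pause, minus the
+ // ~0.5 s already served, giving the ~14.5-16.5 s window below)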
+ EXPECT_LT(15000 - 500, milliseconds) << "timing too fast to be correct";
+ EXPECT_GT(17000 - 500, milliseconds) << "timing suspiciously slow";
+ t.join();
+}
diff --git a/src/include/CMakeLists.txt b/src/include/CMakeLists.txt
index fc9cf7d0178..7feffe195a9 100644
--- a/src/include/CMakeLists.txt
+++ b/src/include/CMakeLists.txt
@@ -8,6 +8,7 @@ install(FILES rados/librados.h
memory.h
page.h
crc32c.h
+ rados/objclass.h
DESTINATION include/rados)
install(FILES
radosstriper/libradosstriper.h
diff --git a/src/include/buffer.h b/src/include/buffer.h
index 4d4942adaab..177e95f8a87 100644
--- a/src/include/buffer.h
+++ b/src/include/buffer.h
@@ -17,6 +17,7 @@
#if defined(__linux__) || defined(__FreeBSD__)
#include <stdlib.h>
#endif
+#include <limits.h>
#ifndef _XOPEN_SOURCE
# define _XOPEN_SOURCE 600
@@ -115,6 +116,8 @@ namespace buffer CEPH_BUFFER_API {
int get_cached_crc();
/// count of cached crc hits (mismatching input, required adjustment)
int get_cached_crc_adjusted();
+ /// count of crc cache misses
+ int get_missed_crc();
/// enable/disable tracking of cached crcs
void track_cached_crc(bool b);
@@ -867,9 +870,26 @@ namespace buffer CEPH_BUFFER_API {
int write_fd(int fd) const;
int write_fd(int fd, uint64_t offset) const;
int write_fd_zero_copy(int fd) const;
- void prepare_iov(std::vector<iovec> *piov) const;
+ template<typename VectorT>
+ void prepare_iov(VectorT *piov) const {
+ assert(_buffers.size() <= IOV_MAX);
+ piov->resize(_buffers.size());
+ unsigned n = 0;
+ for (auto& p : _buffers) {
+ (*piov)[n].iov_base = (void *)p.c_str();
+ (*piov)[n].iov_len = p.length();
+ ++n;
+ }
+ }
uint32_t crc32c(uint32_t crc) const;
void invalidate_crc();
+
+ // These functions return a bufferlist with a pointer to a single
+ // static buffer. They /must/ not outlive the memory they
+ // reference.
+ static list static_from_mem(char* c, size_t l);
+ static list static_from_cstring(char* c);
+ static list static_from_string(std::string& s);
};
/*
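
[The new <limits.h> include above supplies IOV_MAX for the assert in the templated prepare_iov(). A hedged in-tree sketch of the new calls; the wrapping main() and the using-declaration are illustrative assumptions:]

    #include <string>
    #include <vector>
    #include <sys/uio.h>          // struct iovec
    #include "include/buffer.h"

    using ceph::bufferlist;       // assumption: alias provided via buffer.h

    int main() {
      std::string payload = "hello";
      // static_from_string() borrows payload's memory; per the comment
      // above, the returned list must not outlive the string.
      bufferlist bl = bufferlist::static_from_string(payload);

      // prepare_iov() is now a template, so any vector-like type with
      // resize() and operator[] works, not just std::vector<iovec>.
      std::vector<iovec> iov;
      bl.prepare_iov(&iov);
      return iov.empty() ? 1 : 0;
    }
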
diff --git a/src/include/byteorder.h b/src/include/byteorder.h
index e76d035ed11..86748601668 100644
--- a/src/include/byteorder.h
+++ b/src/include/byteorder.h
@@ -1,67 +1,45 @@
-/*
- * byteorder.h
- *
- * LGPL 2
- */
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-#ifndef CEPH_BYTEORDER_H
-#define CEPH_BYTEORDER_H
+#pragma once
-#include <sys/param.h>
+#include <type_traits>
+#include "acconfig.h"
#include "int_types.h"
-#if defined(__APPLE__)
-# if __DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN
-# define CEPH_LITTLE_ENDIAN
-# elif __DARWIN_BYTE_ORDER == __DARWIN_BIG_ENDIAN
-# define CEPH_BIG_ENDIAN
-# endif
-#endif
-
-#if defined(__FreeBSD__)
-# if _BYTE_ORDER == _LITTLE_ENDIAN
-# define CEPH_LITTLE_ENDIAN
-# elif _BYTE_ORDER == _BIG_ENDIAN
-# define CEPH_BIG_ENDIAN
-# endif
-#endif
-
-#if defined(__linux__)
-# if BYTE_ORDER == LITTLE_ENDIAN
-# define CEPH_LITTLE_ENDIAN
-# elif BYTE_ORDER == BIG_ENDIAN
-# define CEPH_BIG_ENDIAN
-# endif
-#endif
-
-#if defined(__sun) && defined(__SVR4)
-# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-# define CEPH_BIG_ENDIAN
-# elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-# define CEPH_LITTLE_ENDIAN
-# endif
-#endif
-#if defined(_AIX)
-# define CEPH_BIG_ENDIAN
-#endif
-
-
-
-
-static __inline__ __u16 swab16(__u16 val)
-{
+#ifdef __GNUC__
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint16_t), T>::type
+swab(T val) {
+ return __builtin_bswap16(val);
+}
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint32_t), T>::type
+swab(T val) {
+ return __builtin_bswap32(val);
+}
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint64_t), T>::type
+swab(T val) {
+ return __builtin_bswap64(val);
+}
+#else
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint16_t), T>::type
+swab(T val) {
return (val >> 8) | (val << 8);
}
-static __inline__ __u32 swab32(__u32 val)
-{
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint32_t), T>::type
+swab(T val) {
return (( val >> 24) |
((val >> 8) & 0xff00) |
((val << 8) & 0xff0000) |
((val << 24)));
}
-static __inline__ uint64_t swab64(uint64_t val)
-{
+template<typename T>
+inline typename std::enable_if<sizeof(T) == sizeof(uint64_t), T>::type
+swab(T val) {
return (( val >> 56) |
((val >> 40) & 0xff00ull) |
((val >> 24) & 0xff0000ull) |
@@ -71,45 +49,49 @@ static __inline__ uint64_t swab64(uint64_t val)
((val << 40) & 0xff000000000000ull) |
((val << 56)));
}
+#endif
// mswab == maybe swab (if not LE)
#ifdef CEPH_BIG_ENDIAN
-# define mswab64(a) swab64(a)
-# define mswab32(a) swab32(a)
-# define mswab16(a) swab16(a)
-#elif defined(CEPH_LITTLE_ENDIAN)
-# define mswab64(a) (a)
-# define mswab32(a) (a)
-# define mswab16(a) (a)
+template<typename T>
+inline T mswab(T val) {
+ return swab(val);
+}
#else
-# error "Could not determine endianess"
+template<typename T>
+inline T mswab(T val) {
+ return val;
+}
#endif
-#ifdef __cplusplus
-
-#define MAKE_LE_CLASS(bits) \
- struct ceph_le##bits { \
- __u##bits v; \
- ceph_le##bits &operator=(__u##bits nv) { \
- v = mswab##bits(nv); \
- return *this; \
- } \
- operator __u##bits() const { return mswab##bits(v); } \
- } __attribute__ ((packed)); \
- static inline bool operator==(ceph_le##bits a, ceph_le##bits b) { \
- return a.v == b.v; \
+template<typename T>
+struct ceph_le {
+ T v;
+ ceph_le<T>& operator=(T nv) {
+ v = mswab(nv);
+ return *this;
}
-
-MAKE_LE_CLASS(64)
-MAKE_LE_CLASS(32)
-MAKE_LE_CLASS(16)
-#undef MAKE_LE_CLASS
+ operator T() const { return mswab(v); }
+} __attribute__ ((packed));
-#endif /* __cplusplus */
+template<typename T>
+inline bool operator==(ceph_le<T> a, ceph_le<T> b) {
+ return a.v == b.v;
+}
-#define init_le64(x) { (__u64)mswab64(x) }
-#define init_le32(x) { (__u32)mswab32(x) }
-#define init_le16(x) { (__u16)mswab16(x) }
+using ceph_le64 = ceph_le<__u64>;
+using ceph_le32 = ceph_le<__u32>;
+using ceph_le16 = ceph_le<__u16>;
+
+inline __u64 init_le64(__u64 x) {
+ return mswab<__u64>(x);
+}
+inline __u32 init_le32(__u32 x) {
+ return mswab<__u32>(x);
+}
+inline __u16 init_le16(__u16 x) {
+ return mswab<__u16>(x);
+}
/*
#define cpu_to_le64(x) (x)
@@ -119,5 +101,3 @@ MAKE_LE_CLASS(16)
#define le64_to_cpu(x) ((uint64_t)x)
#define le32_to_cpu(x) ((__u32)x)
#define le16_to_cpu(x) ((__u16)x)
-
-#endif
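
[A sketch of the rewritten interface, assuming the in-tree header; note that CEPH_LITTLE_ENDIAN/CEPH_BIG_ENDIAN now come from acconfig.h, per the config-h.in.cmake hunk further down:]

    #include <cassert>
    #include <cstdint>
    #include "include/byteorder.h"

    int main() {
      // swab() is width-dispatched via enable_if, replacing the old
      // swab16/swab32/swab64 trio.
      assert(swab(static_cast<uint16_t>(0x1234)) == 0x3412);

      // ceph_le32 is now an alias for ceph_le<__u32>; assignment stores
      // little-endian bytes on any host, and the conversion swabs back.
      ceph_le32 le;
      le = 0x11223344u;
      assert(static_cast<uint32_t>(le) == 0x11223344u);
      return 0;
    }
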
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
index 68a55d05d3a..9decdaff3e7 100755
--- a/src/include/ceph_features.h
+++ b/src/include/ceph_features.h
@@ -98,9 +98,10 @@ DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL)
DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS)
-DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap
-DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap
-DEFINE_CEPH_FEATURE(21, 2, OSDMAP_REMAP) // overlap
+DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap
+DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap
+DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP) // overlap
+DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap
DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH)
@@ -161,6 +162,7 @@ DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap
DEFINE_CEPH_FEATURE(59, 1, FS_BTIME)
DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap
DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap
+DEFINE_CEPH_FEATURE(60, 1, BLKIN_TRACING) // *do not share this bit*
DEFINE_CEPH_FEATURE(61, 1, RESERVED2) // unused, but slow down!
 DEFINE_CEPH_FEATURE(62, 1, RESERVED)  // do not use; used as a sentinel
@@ -168,6 +170,15 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
/*
+ * conditionally include blkin in CEPH_FEATURES_ALL/SUPPORTED_DEFAULT
+ */
+#ifdef WITH_BLKIN
+#define CEPH_FEATURES_BLKIN CEPH_FEATURE_BLKIN_TRACING
+#else
+#define CEPH_FEATURES_BLKIN 0
+#endif
+
+/*
* Features supported. Should be everything above.
*/
#define CEPH_FEATURES_ALL \
@@ -227,6 +238,7 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
CEPH_FEATURE_SERVER_LUMINOUS | \
CEPH_FEATURE_RESEND_ON_SPLIT | \
CEPH_FEATURE_RADOS_BACKOFF | \
+ CEPH_FEATURES_BLKIN | \
0ULL)
#define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL
@@ -240,7 +252,8 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
CEPH_FEATURE_CRUSH_TUNABLES3 | \
CEPH_FEATURE_CRUSH_TUNABLES5 | \
CEPH_FEATURE_CRUSH_V2 | \
- CEPH_FEATURE_CRUSH_V4)
+ CEPH_FEATURE_CRUSH_V4 | \
+ CEPH_FEATURE_CRUSH_CHOOSE_ARGS)
/*
* make sure we don't try to use the reserved features
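
[The WITH_BLKIN conditional keeps the feature word well-formed whether or not blkin is compiled in; the pattern reduced to a standalone sketch with illustrative names:]

    #include <cstdint>
    #include <cstdio>

    #define FEATURE_BLKIN_TRACING (1ULL << 60)

    #ifdef WITH_BLKIN
    # define FEATURES_BLKIN FEATURE_BLKIN_TRACING
    #else
    # define FEATURES_BLKIN 0   // the bit simply drops out of the mask
    #endif

    #define FEATURES_ALL (FEATURES_BLKIN | 0ULL /* | other feature bits */)

    int main() {
      std::printf("blkin advertised: %s\n",
                  (FEATURES_ALL & FEATURE_BLKIN_TRACING) ? "yes" : "no");
      return 0;
    }
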
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index 9c2135e56b7..726ee69d62f 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -385,6 +385,20 @@ extern const char *ceph_mds_op_name(int op);
#define CEPH_SETATTR_KILL_SGUID (1 << 10)
/*
+ * open request flags
+ */
+#define CEPH_O_RDONLY 00000000
+#define CEPH_O_WRONLY 00000001
+#define CEPH_O_RDWR 00000002
+#define CEPH_O_CREAT 00000100
+#define CEPH_O_EXCL 00000200
+#define CEPH_O_TRUNC 00001000
+#define CEPH_O_DIRECTORY 00200000
+#define CEPH_O_NOFOLLOW 00400000
+
+int ceph_flags_sys2wire(int flags);
+
+/*
* Ceph setxattr request flags.
*/
#define CEPH_XATTR_CREATE (1 << 0)
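
[Only the declaration of ceph_flags_sys2wire() is added here; its definition lives elsewhere in the tree. A hedged sketch of the kind of mapping it implies (sketch_flags_sys2wire is a made-up name), showing why fixed wire values are needed at all: host O_* constants differ across platforms.]

    #include <fcntl.h>

    // wire values copied from the new CEPH_O_* block above
    #define CEPH_O_RDONLY 00000000
    #define CEPH_O_WRONLY 00000001
    #define CEPH_O_RDWR   00000002
    #define CEPH_O_CREAT  00000100
    #define CEPH_O_TRUNC  00001000

    static int sketch_flags_sys2wire(int flags) {
      int wire = 0;
      switch (flags & O_ACCMODE) {
        case O_RDONLY: wire |= CEPH_O_RDONLY; break;
        case O_WRONLY: wire |= CEPH_O_WRONLY; break;
        case O_RDWR:   wire |= CEPH_O_RDWR;   break;
      }
      if (flags & O_CREAT) wire |= CEPH_O_CREAT;
      if (flags & O_TRUNC) wire |= CEPH_O_TRUNC;
      return wire;
    }

    int main() { return sketch_flags_sys2wire(O_RDWR | O_CREAT) == 0102 ? 0 : 1; }
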
diff --git a/src/include/config-h.in.cmake b/src/include/config-h.in.cmake
index b536433b1e4..55df142f3cc 100644
--- a/src/include/config-h.in.cmake
+++ b/src/include/config-h.in.cmake
@@ -157,8 +157,8 @@
/* define if leveldb is enabled */
#cmakedefine WITH_LEVELDB
-/* define if radosgw's asio frontend enabled */
-#cmakedefine WITH_RADOSGW_ASIO_FRONTEND
+/* define if radosgw's beast frontend enabled */
+#cmakedefine WITH_RADOSGW_BEAST_FRONTEND
/* define if HAVE_THREAD_SAFE_RES_QUERY */
#cmakedefine HAVE_THREAD_SAFE_RES_QUERY
@@ -205,18 +205,12 @@
/* Define to 1 if you have fdatasync. */
#cmakedefine HAVE_FDATASYNC 1
-/* Define to 1 if you have the <inttypes.h> header file. */
-#cmakedefine HAVE_INTTYPES_H 1
-
/* Defined if you have librocksdb enabled */
#cmakedefine HAVE_LIBROCKSDB
/* Define to 1 if you have the <valgrind/helgrind.h> header file. */
#cmakedefine HAVE_VALGRIND_HELGRIND_H 1
-/* Define to 1 if you have the <stdint.h> header file. */
-#cmakedefine HAVE_STDINT_H 1
-
/* Define to 1 if you have the <sys/prctl.h> header file. */
#cmakedefine HAVE_SYS_PRCTL_H 1
@@ -301,6 +295,9 @@
/* Defined if pthread_setname_np() is available */
#cmakedefine HAVE_PTHREAD_SETNAME_NP 1
+/* Defined if blkin enabled */
+#cmakedefine WITH_BLKIN
+
/* Defined if pthread_set_name_np() is available */
#cmakedefine HAVE_PTHREAD_SET_NAME_NP
@@ -310,4 +307,12 @@
/* Support POWER8 instructions */
#cmakedefine HAVE_POWER8
+/* Define if endian type is big endian */
+#cmakedefine CEPH_BIG_ENDIAN
+
+/* Define if endian type is little endian */
+#cmakedefine CEPH_LITTLE_ENDIAN
+
+#cmakedefine PYTHON_EXECUTABLE "@PYTHON_EXECUTABLE@"
+
#endif /* CONFIG_H */
diff --git a/src/include/coredumpctl.h b/src/include/coredumpctl.h
new file mode 100644
index 00000000000..e4424941f43
--- /dev/null
+++ b/src/include/coredumpctl.h
@@ -0,0 +1,39 @@
+#ifdef HAVE_SYS_PRCTL_H
+#include <iostream>
+#include <sys/prctl.h>
+#include "common/errno.h"
+
+struct PrCtl {
+ int saved_state = -1;
+ int set_dumpable(int new_state) {
+ int r = prctl(PR_SET_DUMPABLE, new_state);
+ if (r) {
+ r = -errno;
+ std::cerr << "warning: unable to " << (new_state ? "set" : "unset")
+ << " dumpable flag: " << cpp_strerror(r)
+ << std::endl;
+ }
+ return r;
+ }
+ PrCtl(int new_state = 0) {
+ int r = prctl(PR_GET_DUMPABLE);
+ if (r == -1) {
+ r = errno;
+ std::cerr << "warning: unable to get dumpable flag: " << cpp_strerror(r)
+ << std::endl;
+ } else if (r != new_state) {
+ if (!set_dumpable(new_state)) {
+ saved_state = r;
+ }
+ }
+ }
+ ~PrCtl() {
+ if (saved_state < 0) {
+ return;
+ }
+ set_dumpable(saved_state);
+ }
+};
+#else
+struct PrCtl {};
+#endif
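
[Intended use of the new guard, as a minimal sketch: PrCtl clears PR_SET_DUMPABLE for a scope (e.g. around test code that intentionally crashes) and restores the saved state on exit; on platforms without sys/prctl.h it compiles away to an empty struct.]

    #include "include/coredumpctl.h"

    void run_expected_crash_test() {
      PrCtl unset_dumpable;       // dumpable := 0 for this scope
      // ... exercise code that may abort() without leaving a core ...
    }                             // destructor restores the prior state
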
diff --git a/src/include/crc32c.h b/src/include/crc32c.h
index a568edabe19..35b6cafcc3a 100644
--- a/src/include/crc32c.h
+++ b/src/include/crc32c.h
@@ -1,8 +1,7 @@
#ifndef CEPH_CRC32C_H
#define CEPH_CRC32C_H
-#include <inttypes.h>
-#include <string.h>
+#include <stdint.h>
typedef uint32_t (*ceph_crc32c_func_t)(uint32_t crc, unsigned char const *data, unsigned length);
diff --git a/src/include/denc.h b/src/include/denc.h
index 97b4713ce5e..d04f959d043 100644
--- a/src/include/denc.h
+++ b/src/include/denc.h
@@ -56,6 +56,52 @@ struct denc_traits {
//#include <iostream>
//using std::cout;
+// Define this to compile in a dump of all encoded objects to disk to
+// populate ceph-object-corpus. Note that there is an almost
+// identical implementation in encoding.h, but you only need to define
+// ENCODE_DUMP_PATH here.
+//
+// See src/test/encoding/generate-corpus-objects.sh.
+//
+//#define ENCODE_DUMP_PATH /tmp/something
+
+#ifdef ENCODE_DUMP_PATH
+# include <stdio.h>
+# include <sys/types.h>
+# include <sys/stat.h>
+# include <fcntl.h>
+# define ENCODE_STR(x) #x
+# define ENCODE_STRINGIFY(x) ENCODE_STR(x)
+# define DENC_DUMP_PRE(Type) \
+ char *__denc_dump_pre = p.get_pos();
+ // this hackery with bits below is just to get a semi-reasonable
+ // distribution across time. it is somewhat exponential but not
+ // quite.
+# define DENC_DUMP_POST(Type) \
+ do { \
+ static int i = 0; \
+ i++; \
+ int bits = 0; \
+ for (unsigned t = i; t; bits++) \
+ t &= t - 1; \
+ if (bits > 2) \
+ break; \
+ char fn[PATH_MAX]; \
+ snprintf(fn, sizeof(fn), \
+ ENCODE_STRINGIFY(ENCODE_DUMP_PATH) "/%s__%d.%x", #Type, \
+ getpid(), i++); \
+ int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT, 0644); \
+ if (fd >= 0) { \
+ size_t len = p.get_pos() - __denc_dump_pre; \
+ int r = ::write(fd, __denc_dump_pre, len); \
+ (void)r; \
+ ::close(fd); \
+ } \
+ } while (0)
+#else
+# define DENC_DUMP_PRE(Type)
+# define DENC_DUMP_POST(Type)
+#endif
/*
@@ -344,7 +390,7 @@ inline void denc_signed_varint_lowz(int64_t v,
v = -v;
negative = true;
}
- int lowznib = v ? (ctz(v) / 4) : 0;
+ unsigned lowznib = v ? (ctz(v) / 4) : 0u;
if (lowznib > 3)
lowznib = 3;
v >>= lowznib * 4;
@@ -1511,7 +1557,9 @@ inline typename std::enable_if<traits::supported &&
_denc_friend(*this, p); \
} \
void encode(bufferlist::contiguous_appender& p) const { \
+ DENC_DUMP_PRE(Type); \
_denc_friend(*this, p); \
+ DENC_DUMP_POST(Type); \
} \
void decode(buffer::ptr::iterator& p) { \
_denc_friend(*this, p); \
@@ -1527,7 +1575,9 @@ inline typename std::enable_if<traits::supported &&
_denc_friend(*this, p, f); \
} \
void encode(bufferlist::contiguous_appender& p, uint64_t f) const { \
+ DENC_DUMP_PRE(Type); \
_denc_friend(*this, p, f); \
+ DENC_DUMP_POST(Type); \
} \
void decode(buffer::ptr::iterator& p, uint64_t f=0) { \
_denc_friend(*this, p, f); \
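
[The sampling trick shared by DENC_DUMP_POST and ENCODE_DUMP_POST, reduced to a standalone sketch: an encode is dumped only when its sequence number has at most two set bits, which thins the dumps roughly exponentially over time while still keeping occasional late samples.]

    #include <cstdio>

    static bool should_dump(unsigned i) {
      int bits = 0;
      for (unsigned t = i; t; ++bits)
        t &= t - 1;               // clear lowest set bit (popcount loop)
      return bits <= 2;           // mirrors "if (bits > 2) break;"
    }

    int main() {
      int kept = 0;
      for (unsigned i = 1; i <= 1000; ++i)
        kept += should_dump(i);
      std::printf("kept %d of 1000 encodes\n", kept);  // a small fraction
      return 0;
    }
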
diff --git a/src/include/encoding.h b/src/include/encoding.h
index 9ea25d316d1..953420e8991 100644
--- a/src/include/encoding.h
+++ b/src/include/encoding.h
@@ -111,20 +111,10 @@ WRITE_INTTYPE_ENCODER(int32_t, le32)
WRITE_INTTYPE_ENCODER(uint16_t, le16)
WRITE_INTTYPE_ENCODER(int16_t, le16)
-#ifdef ENCODE_DUMP
-# include <stdio.h>
-# include <sys/types.h>
-# include <sys/stat.h>
-# include <fcntl.h>
-
-# define ENCODE_STR(x) #x
-# define ENCODE_STRINGIFY(x) ENCODE_STR(x)
-
+// see denc.h for ENCODE_DUMP_PATH discussion and definition.
+#ifdef ENCODE_DUMP_PATH
# define ENCODE_DUMP_PRE() \
unsigned pre_off = bl.length()
-
-// NOTE: This is almost an exponential backoff, but because we count
-// bits we get a better sample of things we encode later on.
# define ENCODE_DUMP_POST(cl) \
do { \
static int i = 0; \
@@ -134,8 +124,8 @@ WRITE_INTTYPE_ENCODER(int16_t, le16)
t &= t - 1; \
if (bits > 2) \
break; \
- char fn[200]; \
- snprintf(fn, sizeof(fn), ENCODE_STRINGIFY(ENCODE_DUMP) "/%s__%d.%x", #cl, getpid(), i++); \
+ char fn[PATH_MAX]; \
+ snprintf(fn, sizeof(fn), ENCODE_STRINGIFY(ENCODE_DUMP_PATH) "/%s__%d.%x", #cl, getpid(), i++); \
int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT, 0644); \
if (fd >= 0) { \
bufferlist sub; \
@@ -149,6 +139,7 @@ WRITE_INTTYPE_ENCODER(int16_t, le16)
# define ENCODE_DUMP_POST(cl)
#endif
+
#define WRITE_CLASS_ENCODER(cl) \
inline void encode(const cl &c, bufferlist &bl, uint64_t features=0) { \
ENCODE_DUMP_PRE(); c.encode(bl); ENCODE_DUMP_POST(cl); } \
@@ -389,7 +380,7 @@ inline typename std::enable_if<!traits::supported>::type
{
__u32 n = (__u32)(ls.size()); // c++11 std::list::size() is O(1)
encode(n, bl);
- for (typename std::list<T>::const_iterator p = ls.begin(); p != ls.end(); ++p)
+ for (auto p = ls.begin(); p != ls.end(); ++p)
encode(*p, bl);
}
template<class T, class Alloc, typename traits=denc_traits<T>>
@@ -401,7 +392,7 @@ inline typename std::enable_if<!traits::supported>::type
unsigned pos = bl.length();
unsigned n = 0;
encode(n, bl);
- for (typename std::list<T>::const_iterator p = ls.begin(); p != ls.end(); ++p) {
+ for (auto p = ls.begin(); p != ls.end(); ++p) {
n++;
encode(*p, bl, features);
}
@@ -411,7 +402,7 @@ inline typename std::enable_if<!traits::supported>::type
} else {
__u32 n = (__u32)(ls.size()); // FIXME: this is slow on a list.
encode(n, bl);
- for (typename std::list<T>::const_iterator p = ls.begin(); p != ls.end(); ++p)
+ for (auto p = ls.begin(); p != ls.end(); ++p)
encode(*p, bl, features);
}
}
@@ -436,7 +427,7 @@ inline void encode(const std::list<ceph::shared_ptr<T>, Alloc>& ls,
{
__u32 n = (__u32)(ls.size()); // c++11 std::list::size() is O(1)
encode(n, bl);
- for (typename std::list<ceph::shared_ptr<T> >::const_iterator p = ls.begin(); p != ls.end(); ++p)
+ for (auto p = ls.begin(); p != ls.end(); ++p)
encode(**p, bl);
}
template<class T, class Alloc>
@@ -445,7 +436,7 @@ inline void encode(const std::list<ceph::shared_ptr<T>, Alloc>& ls,
{
__u32 n = (__u32)(ls.size()); // c++11 std::list::size() is O(1)
encode(n, bl);
- for (typename std::list<ceph::shared_ptr<T> >::const_iterator p = ls.begin(); p != ls.end(); ++p)
+ for (auto p = ls.begin(); p != ls.end(); ++p)
encode(**p, bl, features);
}
template<class T, class Alloc>
@@ -469,7 +460,7 @@ inline typename std::enable_if<!traits::supported>::type
{
__u32 n = (__u32)(s.size());
encode(n, bl);
- for (typename std::set<T>::const_iterator p = s.begin(); p != s.end(); ++p)
+ for (auto p = s.begin(); p != s.end(); ++p)
encode(*p, bl);
}
template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
@@ -490,7 +481,7 @@ template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
inline typename std::enable_if<!traits::supported>::type
encode_nohead(const std::set<T,Comp,Alloc>& s, bufferlist& bl)
{
- for (typename std::set<T,Comp>::const_iterator p = s.begin(); p != s.end(); ++p)
+ for (auto p = s.begin(); p != s.end(); ++p)
encode(*p, bl);
}
template<class T, class Comp, class Alloc, typename traits=denc_traits<T>>
@@ -554,7 +545,7 @@ inline void encode(const std::multiset<T,Comp,Alloc>& s, bufferlist& bl)
{
__u32 n = (__u32)(s.size());
encode(n, bl);
- for (typename std::multiset<T,Comp>::const_iterator p = s.begin(); p != s.end(); ++p)
+ for (auto p = s.begin(); p != s.end(); ++p)
encode(*p, bl);
}
template<class T, class Comp, class Alloc>
@@ -623,7 +614,7 @@ template<class T, class Alloc, typename traits=denc_traits<T>>
inline typename std::enable_if<!traits::supported>::type
encode_nohead(const std::vector<T,Alloc>& v, bufferlist& bl)
{
- for (typename std::vector<T>::const_iterator p = v.begin(); p != v.end(); ++p)
+ for (auto p = v.begin(); p != v.end(); ++p)
encode(*p, bl);
}
template<class T, class Alloc, typename traits=denc_traits<T>>
@@ -643,7 +634,7 @@ inline void encode(const std::vector<ceph::shared_ptr<T>,Alloc>& v,
{
__u32 n = (__u32)(v.size());
encode(n, bl);
- for (typename std::vector<ceph::shared_ptr<T> >::const_iterator p = v.begin(); p != v.end(); ++p)
+ for (auto p = v.begin(); p != v.end(); ++p)
if (*p)
encode(**p, bl, features);
else
@@ -655,7 +646,7 @@ inline void encode(const std::vector<ceph::shared_ptr<T>,Alloc>& v,
{
__u32 n = (__u32)(v.size());
encode(n, bl);
- for (typename std::vector<ceph::shared_ptr<T> >::const_iterator p = v.begin(); p != v.end(); ++p)
+ for (auto p = v.begin(); p != v.end(); ++p)
if (*p)
encode(**p, bl);
else
@@ -708,7 +699,7 @@ inline typename std::enable_if<!t_traits::supported ||
{
__u32 n = (__u32)(m.size());
encode(n, bl);
- for (typename std::map<T,U,Comp>::const_iterator p = m.begin(); p != m.end(); ++p) {
+ for (auto p = m.begin(); p != m.end(); ++p) {
encode(p->first, bl);
encode(p->second, bl);
}
@@ -721,7 +712,7 @@ inline typename std::enable_if<!t_traits::supported ||
{
__u32 n = (__u32)(m.size());
encode(n, bl);
- for (typename std::map<T,U,Comp>::const_iterator p = m.begin(); p != m.end(); ++p) {
+ for (auto p = m.begin(); p != m.end(); ++p) {
encode(p->first, bl, features);
encode(p->second, bl, features);
}
@@ -758,7 +749,7 @@ inline typename std::enable_if<!t_traits::supported ||
!u_traits::supported>::type
encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl)
{
- for (typename std::map<T,U,Comp>::const_iterator p = m.begin(); p != m.end(); ++p) {
+ for (auto p = m.begin(); p != m.end(); ++p) {
encode(p->first, bl);
encode(p->second, bl);
}
@@ -769,7 +760,7 @@ inline typename std::enable_if<!t_traits::supported ||
!u_traits::supported>::type
encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features)
{
- for (typename std::map<T,U,Comp>::const_iterator p = m.begin(); p != m.end(); ++p) {
+ for (auto p = m.begin(); p != m.end(); ++p) {
encode(p->first, bl, features);
encode(p->second, bl, features);
}
@@ -812,8 +803,7 @@ template<class T, class U, class Comp, class Alloc,
{
__u32 n = (__u32)(m.size());
encode(n, bl);
- for (typename boost::container::flat_map<T,U,Comp>::const_iterator p
- = m.begin(); p != m.end(); ++p) {
+ for (auto p = m.begin(); p != m.end(); ++p) {
encode(p->first, bl, features);
encode(p->second, bl, features);
}
@@ -852,8 +842,7 @@ template<class T, class U, class Comp, class Alloc,
encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
bufferlist& bl)
{
- for (typename boost::container::flat_map<T,U,Comp>::const_iterator p
- = m.begin(); p != m.end(); ++p) {
+ for (auto p = m.begin(); p != m.end(); ++p) {
encode(p->first, bl);
encode(p->second, bl);
}
@@ -865,8 +854,7 @@ template<class T, class U, class Comp, class Alloc,
encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m,
bufferlist& bl, uint64_t features)
{
- for (typename boost::container::flat_map<T,U,Comp>::const_iterator p
- = m.begin(); p != m.end(); ++p) {
+ for (auto p = m.begin(); p != m.end(); ++p) {
encode(p->first, bl, features);
encode(p->second, bl, features);
}
@@ -892,7 +880,7 @@ inline void encode(const std::multimap<T,U,Comp,Alloc>& m, bufferlist& bl)
{
__u32 n = (__u32)(m.size());
encode(n, bl);
- for (typename std::multimap<T,U,Comp>::const_iterator p = m.begin(); p != m.end(); ++p) {
+ for (auto p = m.begin(); p != m.end(); ++p) {
encode(p->first, bl);
encode(p->second, bl);
}
@@ -918,7 +906,7 @@ inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl,
{
__u32 n = (__u32)(m.size());
encode(n, bl);
- for (typename unordered_map<T,U,Hash,Pred>::const_iterator p = m.begin(); p != m.end(); ++p) {
+ for (auto p = m.begin(); p != m.end(); ++p) {
encode(p->first, bl, features);
encode(p->second, bl, features);
}
@@ -928,7 +916,7 @@ inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl)
{
__u32 n = (__u32)(m.size());
encode(n, bl);
- for (typename unordered_map<T,U,Hash,Pred,Alloc>::const_iterator p = m.begin(); p != m.end(); ++p) {
+ for (auto p = m.begin(); p != m.end(); ++p) {
encode(p->first, bl);
encode(p->second, bl);
}
@@ -952,7 +940,7 @@ inline void encode(const ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist&
{
__u32 n = (__u32)(m.size());
encode(n, bl);
- for (typename ceph::unordered_set<T,Hash,Pred>::const_iterator p = m.begin(); p != m.end(); ++p)
+ for (auto p = m.begin(); p != m.end(); ++p)
encode(*p, bl);
}
template<class T, class Hash, class Pred, class Alloc>
@@ -974,7 +962,7 @@ inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl, uint64_t featu
{
__u32 n = ls.size();
encode(n, bl);
- for (typename std::deque<T>::const_iterator p = ls.begin(); p != ls.end(); ++p)
+ for (auto p = ls.begin(); p != ls.end(); ++p)
encode(*p, bl, features);
}
template<class T, class Alloc>
@@ -982,7 +970,7 @@ inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl)
{
__u32 n = ls.size();
encode(n, bl);
- for (typename std::deque<T>::const_iterator p = ls.begin(); p != ls.end(); ++p)
+ for (auto p = ls.begin(); p != ls.end(); ++p)
encode(*p, bl);
}
template<class T, class Alloc>
@@ -1063,11 +1051,8 @@ decode(std::array<T, N>& v, bufferlist::iterator& p)
#define ENCODE_FINISH(bl) ENCODE_FINISH_NEW_COMPAT(bl, 0)
-#define DECODE_ERR_VERSION(func, v) \
- (std::string(func) + " unknown encoding version > " #v)
-
-#define DECODE_ERR_OLDVERSION(func, v) \
- (std::string(func) + " no longer understand old encoding version < " #v)
+#define DECODE_ERR_OLDVERSION(func, v, compatv) \
+ (std::string(func) + " no longer understand old encoding version " #v " < " #compatv)
#define DECODE_ERR_PAST(func) \
(std::string(func) + " decode past end of struct encoding")
@@ -1081,7 +1066,7 @@ decode(std::array<T, N>& v, bufferlist::iterator& p)
*/
#define DECODE_OLDEST(oldestv) \
if (struct_v < oldestv) \
- throw buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v));
+ throw buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, oldestv));
/**
* start a decoding block
@@ -1094,7 +1079,7 @@ decode(std::array<T, N>& v, bufferlist::iterator& p)
::decode(struct_v, bl); \
::decode(struct_compat, bl); \
if (v < struct_compat) \
- throw buffer::malformed_input(DECODE_ERR_VERSION(__PRETTY_FUNCTION__, v)); \
+ throw buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, struct_compat)); \
__u32 struct_len; \
::decode(struct_len, bl); \
if (struct_len > bl.get_remaining()) \
@@ -1109,7 +1094,7 @@ decode(std::array<T, N>& v, bufferlist::iterator& p)
__u8 struct_compat; \
::decode(struct_compat, bl); \
if (v < struct_compat) \
- throw buffer::malformed_input(DECODE_ERR_VERSION(__PRETTY_FUNCTION__, v)); \
+ throw buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, struct_compat)); \
} else if (skip_v) { \
if ((int)bl.get_remaining() < skip_v) \
throw buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
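
[The revised DECODE_ERR_OLDVERSION stringifies both version tokens into the exception text; a sketch of the expansion in isolation, with literal arguments standing in for the struct_v/struct_compat variables used above:]

    #include <iostream>
    #include <string>

    #define DECODE_ERR_OLDVERSION(func, v, compatv) \
      (std::string(func) + " no longer understand old encoding version " \
       #v " < " #compatv)

    int main() {
      // #v and #compatv stringify the tokens as written at the call site
      std::cout << DECODE_ERR_OLDVERSION("pg_info_t::decode", 1, 2) << "\n";
      return 0;
    }
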
diff --git a/src/include/fs_types.h b/src/include/fs_types.h
index 161d44104ec..5513fcefd79 100644
--- a/src/include/fs_types.h
+++ b/src/include/fs_types.h
@@ -4,7 +4,6 @@
#define CEPH_INCLUDE_FS_TYPES_H
#include "types.h"
-#include "utime.h"
// --------------------------------------
// ino
diff --git a/src/include/int_types.h b/src/include/int_types.h
index 2ffb162cc24..61ad7971392 100644
--- a/src/include/int_types.h
+++ b/src/include/int_types.h
@@ -3,62 +3,11 @@
#include "acconfig.h"
-/*
- * Get 64b integers either from inttypes.h or glib.h
- */
-#ifdef HAVE_INTTYPES_H
-# include <inttypes.h>
-//#else
-//# ifdef HAVE_GLIB
-//# include <glib.h>
-//# endif
-#endif
-
-/*
- * C99 says inttypes.h includes stdint.h, but that's not true on all
- * systems. If it's there, include it always - just in case.
- */
-#ifdef HAVE_STDINT_H
-#include <stdint.h>
-#endif
+#include <inttypes.h>
-/*
- * Include types.h after stdint.h to accomodate for older distributions
- *
- */
#ifdef HAVE_LINUX_TYPES_H
#include <linux/types.h>
-#endif
-
-/*
- * Emergency replacements for PRI*64 modifiers. Some systems have
- * an inttypes.h that doesn't define all the PRI[doxu]64 macros.
- */
-#if !defined(PRIu64)
-# if defined(HAVE_INTTYPES_H) || defined(HAVE_GLIB)
-/* If we have inttypes or glib, assume we have 64-bit long long int */
-# define PRIu64 "llu"
-# define PRIi64 "lli"
-# define PRIx64 "llx"
-# define PRIX64 "llX"
-# define PRIo64 "llo"
-# define PRId64 "lld"
-# else
-/* Assume that we don't have long long, so use long int modifiers */
-# define PRIu64 "lu"
-# define PRIi64 "li"
-# define PRIx64 "lx"
-# define PRIX64 "lX"
-# define PRIo64 "lo"
-# define PRId64 "ld"
-# endif
-#endif
-
-#ifdef HAVE_SYS_TYPES_H
-#include <sys/types.h>
-#endif
-
-#ifndef HAVE_LINUX_TYPES_H
+#else
#ifndef HAVE___U8
typedef uint8_t __u8;
#endif
diff --git a/src/include/intarith.h b/src/include/intarith.h
index f46a1eb2fe2..390cdc024d7 100644
--- a/src/include/intarith.h
+++ b/src/include/intarith.h
@@ -78,17 +78,33 @@
// count trailing zeros.
// NOTE: the builtin is nondeterministic on 0 input
-static inline unsigned ctz(unsigned v) {
+template<class T>
+ inline typename std::enable_if<
+ (std::is_integral<T>::value &&
+ sizeof(T) <= sizeof(unsigned)),
+ unsigned>::type ctz(T v) {
if (v == 0)
return sizeof(v) * 8;
return __builtin_ctz(v);
}
-static inline unsigned ctzl(unsigned long v) {
+
+template<class T>
+ inline typename std::enable_if<
+ (std::is_integral<T>::value &&
+ sizeof(T) > sizeof(unsigned int) &&
+ sizeof(T) <= sizeof(unsigned long)),
+ unsigned>::type ctz(T v) {
if (v == 0)
return sizeof(v) * 8;
return __builtin_ctzl(v);
}
-static inline unsigned ctzll(unsigned long long v) {
+
+template<class T>
+ inline typename std::enable_if<
+ (std::is_integral<T>::value &&
+ sizeof(T) > sizeof(unsigned long) &&
+ sizeof(T) <= sizeof(unsigned long long)),
+ unsigned>::type ctz(T v) {
if (v == 0)
return sizeof(v) * 8;
return __builtin_ctzll(v);
@@ -96,34 +112,66 @@ static inline unsigned ctzll(unsigned long long v) {
// count leading zeros
// NOTE: the builtin is nondeterministic on 0 input
-static inline unsigned clz(unsigned v) {
+template<class T>
+ inline typename std::enable_if<
+ (std::is_integral<T>::value &&
+ sizeof(T) <= sizeof(unsigned)),
+ unsigned>::type clz(T v) {
if (v == 0)
return sizeof(v) * 8;
return __builtin_clz(v);
}
-static inline unsigned clzl(unsigned long v) {
+
+template<class T>
+ inline typename std::enable_if<
+ (std::is_integral<T>::value &&
+ sizeof(T) > sizeof(unsigned int) &&
+ sizeof(T) <= sizeof(unsigned long)),
+ unsigned>::type clz(T v) {
if (v == 0)
return sizeof(v) * 8;
return __builtin_clzl(v);
}
-static inline unsigned clzll(unsigned long long v) {
+
+template<class T>
+ inline typename std::enable_if<
+ (std::is_integral<T>::value &&
+ sizeof(T) > sizeof(unsigned long) &&
+ sizeof(T) <= sizeof(unsigned long long)),
+ unsigned>::type clz(T v) {
if (v == 0)
return sizeof(v) * 8;
return __builtin_clzll(v);
}
// count bits (set + any 0's that follow)
-static inline unsigned cbits(unsigned v) {
+template<class T>
+ inline typename std::enable_if<
+ (std::is_integral<T>::value &&
+ sizeof(T) <= sizeof(unsigned)),
+ unsigned>::type cbits(T v) {
if (v == 0)
return 0;
return (sizeof(v) * 8) - __builtin_clz(v);
}
-static inline unsigned cbitsl(unsigned long v) {
+
+template<class T>
+ inline typename std::enable_if<
+ (std::is_integral<T>::value &&
+ sizeof(T) > sizeof(unsigned int) &&
+ sizeof(T) <= sizeof(unsigned long)),
+ unsigned>::type cbits(T v) {
if (v == 0)
return 0;
return (sizeof(v) * 8) - __builtin_clzl(v);
}
-static inline unsigned cbitsll(unsigned long long v) {
+
+template<class T>
+ inline typename std::enable_if<
+ (std::is_integral<T>::value &&
+ sizeof(T) > sizeof(unsigned long) &&
+ sizeof(T) <= sizeof(unsigned long long)),
+ unsigned>::type cbits(T v) {
if (v == 0)
return 0;
return (sizeof(v) * 8) - __builtin_clzll(v);
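
[A quick in-tree sketch of the width-dispatched replacements for the old ctz/ctzl/ctzll and friends: the enable_if ladder picks the builtin matching sizeof(T), and 0 input is now well-defined.]

    #include <cassert>
    #include <cstdint>
    #include "include/intarith.h"

    int main() {
      assert(ctz(static_cast<uint16_t>(0x0100)) == 8);   // -> __builtin_ctz
      assert(ctz(static_cast<uint64_t>(1) << 40) == 40); // -> __builtin_ctzll
      assert(ctz(static_cast<uint32_t>(0)) == 32);       // deterministic on 0
      assert(cbits(static_cast<uint32_t>(0x10)) == 5);   // highest set bit + 1
      return 0;
    }
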
diff --git a/src/include/ipaddr.h b/src/include/ipaddr.h
index bf1a0830899..ac0dc620fac 100644
--- a/src/include/ipaddr.h
+++ b/src/include/ipaddr.h
@@ -1,10 +1,6 @@
#ifndef CEPH_IPADDR_H
#define CEPH_IPADDR_H
-#include <netinet/in.h>
-#include <sys/types.h>
-#include <ifaddrs.h>
-
/*
Find an IP address that is in the wanted subnet.
diff --git a/src/include/mempool.h b/src/include/mempool.h
index b9a3d047bf2..b995905b545 100644
--- a/src/include/mempool.h
+++ b/src/include/mempool.h
@@ -14,19 +14,15 @@
#ifndef _CEPH_INCLUDE_MEMPOOL_H
#define _CEPH_INCLUDE_MEMPOOL_H
-#include <iostream>
-#include <fstream>
#include <cstddef>
#include <map>
#include <unordered_map>
#include <set>
#include <vector>
-#include <assert.h>
#include <list>
#include <mutex>
#include <atomic>
-#include <climits>
#include <typeinfo>
#include <common/Formatter.h>
@@ -149,6 +145,7 @@ namespace mempool {
f(buffer_meta) \
f(buffer_data) \
f(osd) \
+ f(osdmap) \
f(osdmap_mapping) \
f(unittest_1) \
f(unittest_2)
diff --git a/src/include/msgr.h b/src/include/msgr.h
index f60c9dfa493..1953eb28b40 100644
--- a/src/include/msgr.h
+++ b/src/include/msgr.h
@@ -5,6 +5,8 @@
#include <sys/socket.h> // for struct sockaddr_storage
#endif
+#include "include/int_types.h"
+
/*
* Data types for message passing layer used by Ceph.
*/
diff --git a/src/include/rados.h b/src/include/rados.h
index 4c5d379fdf0..4eb09b89cc2 100644
--- a/src/include/rados.h
+++ b/src/include/rados.h
@@ -270,6 +270,7 @@ extern const char *ceph_osd_state_name(int s);
\
/* ESX/SCSI */ \
f(WRITESAME, __CEPH_OSD_OP(WR, DATA, 38), "write-same") \
+ f(CMPEXT, __CEPH_OSD_OP(RD, DATA, 32), "cmpext") \
\
/** attrs **/ \
/* read */ \
@@ -361,6 +362,7 @@ static inline int ceph_osd_op_uses_extent(int op)
case CEPH_OSD_OP_ZERO:
case CEPH_OSD_OP_APPEND:
case CEPH_OSD_OP_TRIMTRUNC:
+ case CEPH_OSD_OP_CMPEXT:
return true;
default:
return false;
diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h
index e1dc23191a3..e111380a0fb 100644
--- a/src/include/rados/librados.h
+++ b/src/include/rados/librados.h
@@ -317,7 +317,7 @@ struct rados_cluster_stat_t {
* - Creating objects: rados_write_op_create()
* - IO on objects: rados_write_op_append(), rados_write_op_write(), rados_write_op_zero
* rados_write_op_write_full(), rados_write_op_writesame(), rados_write_op_remove,
- * rados_write_op_truncate(), rados_write_op_zero()
+ * rados_write_op_truncate(), rados_write_op_zero(), rados_write_op_cmpext()
* - Hints: rados_write_op_set_alloc_hint()
* - Performing the operation: rados_write_op_operate(), rados_aio_write_op_operate()
*/
@@ -336,7 +336,8 @@ typedef void *rados_write_op_t;
* rados_read_op_omap_cmp()
* - Object properties: rados_read_op_stat(), rados_read_op_assert_exists(),
* rados_read_op_assert_version()
- * - IO on objects: rados_read_op_read(), rados_read_op_checksum()
+ * - IO on objects: rados_read_op_read(), rados_read_op_checksum(),
+ * rados_read_op_cmpext()
* - Custom operations: rados_read_op_exec(), rados_read_op_exec_user_buf()
* - Request properties: rados_read_op_set_flags()
* - Performing the operation: rados_read_op_operate(),
@@ -353,6 +354,12 @@ typedef void *rados_read_op_t;
typedef void *rados_completion_t;
/**
+ * @struct blkin_trace_info
+ * blkin trace information for Zipkin tracing
+ */
+struct blkin_trace_info;
+
+/**
* Get the version of librados.
*
* The version number is major.minor.extra. Note that this is
@@ -648,6 +655,8 @@ CEPH_RADOS_API int rados_wait_for_latest_osdmap(rados_t cluster);
* If len is too short to fit all the pool name entries we need, we will fill
* as much as we can.
*
+ * Buf may be null to determine the buffer size needed to list all pools.
+ *
* @param cluster cluster handle
* @param buf output buffer
* @param len output buffer length
@@ -1520,6 +1529,21 @@ CEPH_RADOS_API int rados_trunc(rados_ioctx_t io, const char *oid,
uint64_t size);
/**
+ * Compare an on-disk object range with a buffer
+ *
+ * @param io the context in which to perform the comparison
+ * @param o name of the object
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @returns 0 on success, negative error code on failure,
+ * (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API int rados_cmpext(rados_ioctx_t io, const char *o,
+ const char *cmp_buf, size_t cmp_len,
+ uint64_t off);
+
+/**
* @name Xattrs
* Extended attributes are stored as extended attributes on the files
* representing an object on the OSDs. Thus, they have the same
@@ -2111,6 +2135,24 @@ CEPH_RADOS_API int rados_aio_stat(rados_ioctx_t io, const char *o,
uint64_t *psize, time_t *pmtime);
/**
+ * Asynchronously compare an on-disk object range with a buffer
+ *
+ * @param io the context in which to perform the comparison
+ * @param o the name of the object to compare with
+ * @param completion what to do when the comparison is complete
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @returns 0 on success, negative error code on failure,
+ * (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API int rados_aio_cmpext(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off);
+
+/**
* Cancel async operation
*
* @param io ioctx
@@ -2723,6 +2765,22 @@ CEPH_RADOS_API void rados_write_op_assert_exists(rados_write_op_t write_op);
CEPH_RADOS_API void rados_write_op_assert_version(rados_write_op_t write_op, uint64_t ver);
/**
+ * Ensure that given object range (extent) satisfies comparison.
+ *
+ * @param write_op operation to add this action to
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @param prval returned result of comparison, 0 on success, negative error code
+ * on failure, (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API void rados_write_op_cmpext(rados_write_op_t write_op,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off,
+ int *prval);
+
+/**
* Ensure that given xattr satisfies comparison.
* If the comparison is not satisfied, the return code of the
* operation will be -ECANCELED
@@ -3024,6 +3082,22 @@ CEPH_RADOS_API void rados_read_op_assert_exists(rados_read_op_t read_op);
CEPH_RADOS_API void rados_read_op_assert_version(rados_read_op_t read_op, uint64_t ver);
/**
+ * Ensure that given object range (extent) satisfies comparison.
+ *
+ * @param read_op operation to add this action to
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @param prval returned result of comparison, 0 on success, negative error code
+ * on failure, (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API void rados_read_op_cmpext(rados_read_op_t read_op,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off,
+ int *prval);
+
+/**
 * Ensure that an xattr satisfies a comparison
* If the comparison is not satisfied, the return code of the
* operation will be -ECANCELED
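
[A hedged usage sketch for the new synchronous cmpext call (error handling trimmed; `io` is an open rados_ioctx_t, "obj" an existing object, and the MAX_ERRNO value below is an assumption):]

    #include <rados/librados.h>

    #ifndef MAX_ERRNO
    #define MAX_ERRNO 4095   // assumption: matches librados' internal value
    #endif

    int check_prefix(rados_ioctx_t io) {
      const char expected[4] = {'a', 'b', 'c', 'd'};
      int r = rados_cmpext(io, "obj", expected, sizeof(expected), 0);
      if (r == 0)
        return 0;               // contents match
      if (r <= -MAX_ERRNO)
        return -MAX_ERRNO - r;  // recover the first mismatching offset
      return r;                 // ordinary negative errno
    }
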
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
index a7e4d1ca141..10e8f203067 100644
--- a/src/include/rados/librados.hpp
+++ b/src/include/rados/librados.hpp
@@ -301,6 +301,7 @@ namespace librados
//flag mean ObjectOperationFlags
void set_op_flags2(int flags);
+ void cmpext(uint64_t off, bufferlist& cmp_bl, int *prval);
void cmpxattr(const char *name, uint8_t op, const bufferlist& val);
void cmpxattr(const char *name, uint8_t op, uint64_t v);
void exec(const char *cls, const char *method, bufferlist& inbl);
@@ -755,6 +756,7 @@ namespace librados
int remove(const std::string& oid, int flags);
int trunc(const std::string& oid, uint64_t size);
int mapext(const std::string& o, uint64_t off, size_t len, std::map<uint64_t,uint64_t>& m);
+ int cmpext(const std::string& o, uint64_t off, bufferlist& cmp_bl);
int sparse_read(const std::string& o, std::map<uint64_t,uint64_t>& m, bufferlist& bl, size_t len, uint64_t off);
int getxattr(const std::string& oid, const char *name, bufferlist& bl);
int getxattrs(const std::string& oid, std::map<std::string, bufferlist>& attrset);
@@ -991,6 +993,20 @@ namespace librados
int aio_sparse_read(const std::string& oid, AioCompletion *c,
std::map<uint64_t,uint64_t> *m, bufferlist *data_bl,
size_t len, uint64_t off, uint64_t snapid);
+ /**
+ * Asynchronously compare an on-disk object range with a buffer
+ *
+ * @param oid the name of the object to read from
+ * @param c what to do when the read is complete
+ * @param off object byte offset at which to start the comparison
+ * @param cmp_bl buffer containing bytes to be compared with object contents
+ * @returns 0 on success, negative error code on failure,
+ * (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+ int aio_cmpext(const std::string& oid,
+ librados::AioCompletion *c,
+ uint64_t off,
+ bufferlist& cmp_bl);
int aio_write(const std::string& oid, AioCompletion *c, const bufferlist& bl,
size_t len, uint64_t off);
int aio_append(const std::string& oid, AioCompletion *c, const bufferlist& bl,
@@ -1078,6 +1094,10 @@ namespace librados
ObjectWriteOperation *op, snap_t seq,
std::vector<snap_t>& snaps);
int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectWriteOperation *op, snap_t seq,
+ std::vector<snap_t>& snaps,
+ const blkin_trace_info *trace_info);
+ int aio_operate(const std::string& oid, AioCompletion *c,
ObjectReadOperation *op, bufferlist *pbl);
int aio_operate(const std::string& oid, AioCompletion *c,
@@ -1088,6 +1108,9 @@ namespace librados
int aio_operate(const std::string& oid, AioCompletion *c,
ObjectReadOperation *op, int flags,
bufferlist *pbl);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, int flags,
+ bufferlist *pbl, const blkin_trace_info *trace_info);
// watch/notify
int watch2(const std::string& o, uint64_t *handle,
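
[And the C++ counterpart, a sketch using the new IoCtx::aio_cmpext() (assumes a connected librados::IoCtx; completion plumbing abbreviated):]

    #include <rados/librados.hpp>

    int check_prefix_async(librados::IoCtx& io) {
      librados::bufferlist cmp_bl;
      cmp_bl.append("abcd");
      librados::AioCompletion* c = librados::Rados::aio_create_completion();
      int r = io.aio_cmpext("obj", c, 0 /* offset */, cmp_bl);
      if (r == 0) {
        c->wait_for_complete();
        r = c->get_return_value();  // 0, -errno, or -MAX_ERRNO - mismatch_off
      }
      c->release();
      return r;
    }
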
diff --git a/src/include/rados/objclass.h b/src/include/rados/objclass.h
new file mode 100644
index 00000000000..08c50869427
--- /dev/null
+++ b/src/include/rados/objclass.h
@@ -0,0 +1,181 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OBJCLASS_OBJCLASS_PUBLIC_H
+#define CEPH_OBJCLASS_OBJCLASS_PUBLIC_H
+
+#ifdef __cplusplus
+
+#include "buffer.h"
+
+extern "C" {
+#endif
+
+#ifndef BUILDING_FOR_EMBEDDED
+#define CLS_VER(maj,min) \
+int __cls_ver__## maj ## _ ##min = 0; \
+int __cls_ver_maj = maj; \
+int __cls_ver_min = min;
+
+#define CLS_NAME(name) \
+int __cls_name__## name = 0; \
+const char *__cls_name = #name;
+#define CLS_INIT(name) \
+void CEPH_CLS_API __cls_init()
+#else
+#define CLS_VER(maj,min)
+#define CLS_NAME(name)
+#define CLS_INIT(name) \
+void CEPH_CLS_API name##_cls_init()
+#endif
+
+#define CLS_METHOD_RD 0x1 /// method executes read operations
+#define CLS_METHOD_WR 0x2 /// method executes write operations
+#define CLS_METHOD_PROMOTE 0x8 /// method cannot be proxied to base tier
+
+#define CLS_LOG(level, fmt, ...) \
+ cls_log(level, "<cls> %s:%d: " fmt, __FILE__, __LINE__, ##__VA_ARGS__)
+#define CLS_ERR(fmt, ...) CLS_LOG(0, fmt, ##__VA_ARGS__)
+
+/**
+ * Initialize a class.
+ */
+void __cls_init();
+
+/**
+ * @typedef cls_handle_t
+ *
+ * A handle for interacting with the object class.
+ */
+typedef void *cls_handle_t;
+
+/**
+ * @typedef cls_method_handle_t
+ *
+ * A handle for interacting with the method of the object class.
+ */
+typedef void *cls_method_handle_t;
+
+/**
+ * @typedef cls_method_context_t
+ *
+ * A context for the method of the object class.
+ */
+typedef void* cls_method_context_t;
+
+/*class utils*/
+extern int cls_log(int level, const char *format, ...)
+ __attribute__((__format__(printf, 2, 3)));
+
+/* class registration api */
+extern int cls_register(const char *name, cls_handle_t *handle);
+
+#ifdef __cplusplus
+}
+
+/**
+ * @typedef cls_method_cxx_call_t
+ *
+ */
+typedef int (*cls_method_cxx_call_t)(cls_method_context_t ctx,
+ class buffer::list *inbl, class buffer::list *outbl);
+
+/**
+ * Register a method.
+ *
+ * @param hclass
+ * @param method
+ * @param flags
+ * @param class_call
+ * @param handle
+ */
+extern int cls_register_cxx_method(cls_handle_t hclass, const char *method, int flags,
+ cls_method_cxx_call_t class_call, cls_method_handle_t *handle);
+
+/**
+ * Create an object.
+ *
+ * @param hctx
+ * @param exclusive
+ */
+extern int cls_cxx_create(cls_method_context_t hctx, bool exclusive);
+
+/**
+ * Remove an object.
+ *
+ * @param hctx
+ */
+extern int cls_cxx_remove(cls_method_context_t hctx);
+
+/**
+ * Check on the status of an object.
+ *
+ * @param hctx
+ * @param size
+ * @param mtime
+ */
+extern int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime);
+
+/**
+ * Read contents of an object.
+ *
+ * @param hctx
+ * @param ofs
+ * @param len
+ * @param bl
+ */
+extern int cls_cxx_read(cls_method_context_t hctx, int ofs, int len, bufferlist *bl);
+
+/**
+ * Write to the object.
+ *
+ * @param hctx
+ * @param ofs
+ * @param len
+ * @param bl
+ */
+extern int cls_cxx_write(cls_method_context_t hctx, int ofs, int len, bufferlist *bl);
+
+/**
+ * Get xattr of the object.
+ *
+ * @param hctx
+ * @param name
+ * @param outbl
+ */
+extern int cls_cxx_getxattr(cls_method_context_t hctx, const char *name,
+ bufferlist *outbl);
+
+/**
+ * Set xattr of the object.
+ *
+ * @param hctx
+ * @param name
+ * @param inbl
+ */
+extern int cls_cxx_setxattr(cls_method_context_t hctx, const char *name,
+ bufferlist *inbl);
+
+/**
+ * Get value corresponding to a key from the map.
+ *
+ * @param hctx
+ * @param key
+ * @param outbl
+ */
+extern int cls_cxx_map_get_val(cls_method_context_t hctx,
+ const std::string &key, bufferlist *outbl);
+
+/**
+ * Set value corresponding to a key in the map.
+ *
+ * @param hctx
+ * @param key
+ * @param inbl
+ */
+extern int cls_cxx_map_set_val(cls_method_context_t hctx,
+ const std::string &key, bufferlist *inbl);
+
+#endif
+
+#endif
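
[A minimal object-class skeleton against the new public header (a sketch; the class name "hello" and its method are illustrative, and the using-declaration is an assumption):]

    #include "rados/objclass.h"

    using ceph::bufferlist;   // assumption: alias provided via buffer.h

    CLS_VER(1,0)
    CLS_NAME(hello)

    cls_handle_t h_class;
    cls_method_handle_t h_say_hello;

    static int say_hello(cls_method_context_t hctx,
                         bufferlist *in, bufferlist *out) {
      CLS_LOG(5, "say_hello called");
      out->append("hello");
      return 0;               // CLS_METHOD_RD: read-only method
    }

    CLS_INIT(hello)
    {
      cls_register("hello", &h_class);
      cls_register_cxx_method(h_class, "say_hello", CLS_METHOD_RD,
                              say_hello, &h_say_hello);
    }
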
diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h
index b435feac1ee..87d04eb4470 100644
--- a/src/include/rbd/librbd.h
+++ b/src/include/rbd/librbd.h
@@ -1,4 +1,4 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
@@ -255,8 +255,14 @@ CEPH_RBD_API int rbd_remove(rados_ioctx_t io, const char *name);
CEPH_RBD_API int rbd_remove_with_progress(rados_ioctx_t io, const char *name,
librbd_progress_fn_t cb,
void *cbdata);
+CEPH_RBD_API int rbd_rename(rados_ioctx_t src_io_ctx, const char *srcname,
+ const char *destname);
+
CEPH_RBD_API int rbd_trash_move(rados_ioctx_t io, const char *name,
uint64_t delay);
+CEPH_RBD_API int rbd_trash_get(rados_ioctx_t io, const char *id,
+ rbd_trash_image_info_t *info);
+CEPH_RBD_API void rbd_trash_get_cleanup(rbd_trash_image_info_t *info);
CEPH_RBD_API int rbd_trash_list(rados_ioctx_t io,
rbd_trash_image_info_t *trash_entries,
size_t *num_entries);
@@ -268,8 +274,6 @@ CEPH_RBD_API int rbd_trash_remove_with_progress(rados_ioctx_t io, const char *id
void *cbdata);
CEPH_RBD_API int rbd_trash_restore(rados_ioctx_t io, const char *id,
const char *name);
-CEPH_RBD_API int rbd_rename(rados_ioctx_t src_io_ctx, const char *srcname,
- const char *destname);
/* pool mirroring */
CEPH_RBD_API int rbd_mirror_mode_get(rados_ioctx_t io_ctx,
@@ -370,6 +374,13 @@ CEPH_RBD_API int rbd_get_parent_info(rbd_image_t image,
char *parent_name, size_t pnamelen,
char *parent_snapname,
size_t psnapnamelen);
+CEPH_RBD_API int rbd_get_parent_info2(rbd_image_t image,
+ char *parent_poolname,
+ size_t ppoolnamelen,
+ char *parent_name, size_t pnamelen,
+ char *parent_id, size_t pidlen,
+ char *parent_snapname,
+ size_t psnapnamelen);
CEPH_RBD_API int rbd_get_flags(rbd_image_t image, uint64_t *flags);
CEPH_RBD_API int rbd_set_image_notification(rbd_image_t image, int fd, int type);
diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp
index f404494b1be..f03ca9f3622 100644
--- a/src/include/rbd/librbd.hpp
+++ b/src/include/rbd/librbd.hpp
@@ -158,15 +158,16 @@ public:
IoCtx& c_ioctx, const char *c_name, ImageOptions& opts);
int remove(IoCtx& io_ctx, const char *name);
int remove_with_progress(IoCtx& io_ctx, const char *name, ProgressContext& pctx);
+ int rename(IoCtx& src_io_ctx, const char *srcname, const char *destname);
+
int trash_move(IoCtx &io_ctx, const char *name, uint64_t delay);
+ int trash_get(IoCtx &io_ctx, const char *id, trash_image_info_t *info);
int trash_list(IoCtx &io_ctx, std::vector<trash_image_info_t> &entries);
int trash_remove(IoCtx &io_ctx, const char *image_id, bool force);
- int trash_remove_with_progress(IoCtx &io_ctx, const char *image_id, bool force,
- ProgressContext &pctx);
+ int trash_remove_with_progress(IoCtx &io_ctx, const char *image_id,
+ bool force, ProgressContext &pctx);
int trash_restore(IoCtx &io_ctx, const char *id, const char *name);
- int rename(IoCtx& src_io_ctx, const char *srcname, const char *destname);
-
// RBD pool mirroring support functions
int mirror_mode_get(IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode);
int mirror_mode_set(IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode);
@@ -253,7 +254,9 @@ public:
std::string get_block_name_prefix();
int64_t get_data_pool_id();
int parent_info(std::string *parent_poolname, std::string *parent_name,
- std::string *parent_snapname);
+ std::string *parent_snapname);
+ int parent_info2(std::string *parent_poolname, std::string *parent_name,
+ std::string *parent_id, std::string *parent_snapname);
int old_format(uint8_t *old);
int size(uint64_t *size);
int get_group(group_spec_t *group_spec);
diff --git a/src/include/small_encoding.h b/src/include/small_encoding.h
deleted file mode 100644
index efd4953fc48..00000000000
--- a/src/include/small_encoding.h
+++ /dev/null
@@ -1,245 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#ifndef CEPH_SMALL_ENCODING_H
-#define CEPH_SMALL_ENCODING_H
-
-#include "include/buffer.h"
-#include "include/int_types.h"
-
-// varint encoding
-//
-// high bit of every byte indicates whether another byte follows.
-template<typename T>
-inline void small_encode_varint(T v, bufferlist& bl) {
- uint8_t byte = v & 0x7f;
- v >>= 7;
- while (v) {
- byte |= 0x80;
- ::encode(byte, bl);
- byte = (v & 0x7f);
- v >>= 7;
- }
- ::encode(byte, bl);
-}
-
-template<typename T>
-inline void small_decode_varint(T& v, bufferlist::iterator& p)
-{
- uint8_t byte;
- ::decode(byte, p);
- v = byte & 0x7f;
- int shift = 7;
- while (byte & 0x80) {
- ::decode(byte, p);
- v |= (T)(byte & 0x7f) << shift;
- shift += 7;
- }
-}
-
-// signed varint encoding
-//
-// low bit = 1 = negative, 0 = positive
-// high bit of every byte indicates whether another byte follows.
-inline void small_encode_signed_varint(int64_t v, bufferlist& bl) {
- if (v < 0) {
- v = (-v << 1) | 1;
- } else {
- v <<= 1;
- }
- small_encode_varint(v, bl);
-}
-
-template<typename T>
-inline void small_decode_signed_varint(T& v, bufferlist::iterator& p)
-{
- int64_t i;
- small_decode_varint(i, p);
- if (i & 1) {
- v = -(i >> 1);
- } else {
- v = i >> 1;
- }
-}
-
-// varint + lowz encoding
-//
-// first(low) 2 bits = how many low zero bits (nibbles)
-// high bit of each byte = another byte follows
-// (so, 5 bits data in first byte, 7 bits data thereafter)
-inline void small_encode_varint_lowz(uint64_t v, bufferlist& bl) {
- int lowznib = v ? (ctz(v) / 4) : 0;
- if (lowznib > 3)
- lowznib = 3;
- v >>= lowznib * 4;
- v <<= 2;
- v |= lowznib;
- small_encode_varint(v, bl);
-}
-
-template<typename T>
-inline void small_decode_varint_lowz(T& v, bufferlist::iterator& p)
-{
- uint64_t i;
- small_decode_varint(i, p);
- int lowznib = (i & 3);
- i >>= 2;
- i <<= lowznib * 4;
- v = i;
-}
-
-// signed varint + lowz encoding
-//
-// first low bit = 1 for negative, 0 for positive
-// next 2 bits = how many low zero bits (nibbles)
-// high bit of each byte = another byte follows
-// (so, 4 bits data in first byte, 7 bits data thereafter)
-inline void small_encode_signed_varint_lowz(int64_t v, bufferlist& bl) {
- bool negative = false;
- if (v < 0) {
- v = -v;
- negative = true;
- }
- int lowznib = v ? (ctz(v) / 4) : 0;
- if (lowznib > 3)
- lowznib = 3;
- v >>= lowznib * 4;
- v <<= 3;
- v |= lowznib << 1;
- v |= (int)negative;
- small_encode_varint(v, bl);
-}
-
-template<typename T>
-inline void small_decode_signed_varint_lowz(T& v, bufferlist::iterator& p)
-{
- int64_t i;
- small_decode_varint(i, p);
- int lowznib = (i & 6) >> 1;
- if (i & 1) {
- i >>= 3;
- i <<= lowznib * 4;
- v = -i;
- } else {
- i >>= 3;
- i <<= lowznib * 4;
- v = i;
- }
-}
-
-
-// LBA
-//
-// first 1-3 bits = how many low zero bits
-// *0 = 12 (common 4 K alignment case)
-// *01 = 16
-// *011 = 20
-// *111 = byte
-// then 28-30 bits of data
-// then last bit = another byte follows
-// high bit of each subsequent byte = another byte follows
-inline void small_encode_lba(uint64_t v, bufferlist& bl) {
- int low_zero_nibbles = v ? (int)(ctz(v) / 4) : 0;
- int pos;
- uint32_t word;
- int t = low_zero_nibbles - 3;
- if (t < 0) {
- pos = 3;
- word = 0x7;
- } else if (t < 3) {
- v >>= (low_zero_nibbles * 4);
- pos = t + 1;
- word = (1 << t) - 1;
- } else {
- v >>= 20;
- pos = 3;
- word = 0x3;
- }
- word |= (v << pos) & 0x7fffffff;
- v >>= 31 - pos;
- if (!v) {
- ::encode(word, bl);
- return;
- }
- word |= 0x80000000;
- ::encode(word, bl);
- uint8_t byte = v & 0x7f;
- v >>= 7;
- while (v) {
- byte |= 0x80;
- ::encode(byte, bl);
- byte = (v & 0x7f);
- v >>= 7;
- }
- ::encode(byte, bl);
-}
-
-inline void small_decode_lba(uint64_t& v, bufferlist::iterator& p) {
- uint32_t word;
- ::decode(word, p);
- int shift;
- switch (word & 7) {
- case 0:
- case 2:
- case 4:
- case 6:
- v = (uint64_t)(word & 0x7ffffffe) << (12 - 1);
- shift = 12 + 30;
- break;
- case 1:
- case 5:
- v = (uint64_t)(word & 0x7ffffffc) << (16 - 2);
- shift = 16 + 29;
- break;
- case 3:
- v = (uint64_t)(word & 0x7ffffff8) << (20 - 3);
- shift = 20 + 28;
- break;
- case 7:
- v = (uint64_t)(word & 0x7ffffff8) >> 3;
- shift = 28;
- }
- uint8_t byte = word >> 24;
- while (byte & 0x80) {
- ::decode(byte, p);
- v |= (uint64_t)(byte & 0x7f) << shift;
- shift += 7;
- }
-}
-
-
-// short bufferptrs, bufferlists, strings
-template<typename T>
-inline void small_encode_buf_lowz(const T& bp, bufferlist& bl) {
- size_t l = bp.length();
- small_encode_varint_lowz(l, bl);
- bl.append(bp);
-}
-template<typename T>
-inline void small_decode_buf_lowz(T& bp, bufferlist::iterator& p) {
- size_t l;
- small_decode_varint_lowz(l, p);
- p.copy_deep(l, bp);
-}
-
-// STL containers
-
-template<typename T>
-inline void small_encode_obj(const std::vector<T>& v, bufferlist& bl) {
- size_t n = v.size();
- small_encode_varint(n, bl);
- for (auto p = v.cbegin(); p != v.cend(); ++p) {
- p->encode(bl);
- }
-}
-template<typename T>
-inline void small_decode_obj(std::vector<T>& v, bufferlist::iterator& p) {
- size_t n;
- small_decode_varint(n, p);
- v.clear();
- while (n--) {
- v.push_back(T());
- v.back().decode(p);
- }
-}
-
-#endif
diff --git a/src/include/str_list.h b/src/include/str_list.h
index 7f4dce53536..8ca07f3d31b 100644
--- a/src/include/str_list.h
+++ b/src/include/str_list.h
@@ -3,7 +3,6 @@
#include <list>
#include <set>
-#include <sstream>
#include <string>
#include <vector>
diff --git a/src/include/utime.h b/src/include/utime.h
index da73f47235e..732eaebcdff 100644
--- a/src/include/utime.h
+++ b/src/include/utime.h
@@ -24,6 +24,7 @@
#include "include/timegm.h"
#include "common/strtol.h"
#include "common/ceph_time.h"
+#include "include/denc.h"
// --------
@@ -136,6 +137,12 @@ public:
#endif
}
+ DENC(utime_t, v, p) {
+ denc(v.tv.tv_sec, p);
+ denc(v.tv.tv_nsec, p);
+ }
+
+
void encode_timeval(struct ceph_timespec *t) const {
t->tv_sec = tv.tv_sec;
t->tv_nsec = tv.tv_nsec;
@@ -381,6 +388,7 @@ public:
}
};
WRITE_CLASS_ENCODER(utime_t)
+WRITE_CLASS_DENC(utime_t)
// arithmetic operators
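
[With the DENC body above, utime_t participates in the newer denc machinery while the legacy encoder keeps working; a small in-tree round-trip sketch under that assumption:]

    #include "include/buffer.h"
    #include "include/utime.h"

    using ceph::bufferlist;   // assumption: alias provided via buffer.h

    bool roundtrip(const utime_t& t) {
      bufferlist bl;
      ::encode(t, bl);        // legacy path, via WRITE_CLASS_ENCODER
      utime_t out;
      auto p = bl.begin();
      ::decode(out, p);
      return out == t;
    }
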
diff --git a/src/init-ceph.in b/src/init-ceph.in
index fda274f201a..26704d62e67 100755
--- a/src/init-ceph.in
+++ b/src/init-ceph.in
@@ -266,7 +266,7 @@ get_name_list "$@"
# Reverse the order if we are stopping
-if [ "$command" = "stop" -o "$command" = "onestop"]; then
+if [ "$command" = "stop" -o "$command" = "onestop" ]; then
for f in $what; do
new_order="$f $new_order"
done
diff --git a/src/krbd.cc b/src/krbd.cc
index fe0d8c2b285..bf7e2558369 100644
--- a/src/krbd.cc
+++ b/src/krbd.cc
@@ -42,6 +42,8 @@
using namespace std;
+const static int POLL_TIMEOUT=120000;
+
struct krbd_ctx {
CephContext *cct;
struct udev *udev;
@@ -189,7 +191,7 @@ static int wait_for_udev_add(struct udev_monitor *mon, const char *pool,
fds[0].fd = udev_monitor_get_fd(mon);
fds[0].events = POLLIN;
- if (poll(fds, 1, -1) < 0)
+ if (poll(fds, 1, POLL_TIMEOUT) < 0)
return -errno;
dev = udev_monitor_receive_device(mon);
@@ -473,7 +475,7 @@ static int wait_for_udev_remove(struct udev_monitor *mon, dev_t devno)
fds[0].fd = udev_monitor_get_fd(mon);
fds[0].events = POLLIN;
- if (poll(fds, 1, -1) < 0)
+ if (poll(fds, 1, POLL_TIMEOUT) < 0)
return -errno;
dev = udev_monitor_receive_device(mon);
diff --git a/src/kv/KeyValueDB.cc b/src/kv/KeyValueDB.cc
index 42036e3c593..7a917b7a183 100644
--- a/src/kv/KeyValueDB.cc
+++ b/src/kv/KeyValueDB.cc
@@ -12,9 +12,6 @@
#ifdef HAVE_KINETIC
#include "KineticStore.h"
#endif
-#ifdef HAVE_LIBAIO
-#include "os/bluestore/BlueStore.h"
-#endif
KeyValueDB *KeyValueDB::create(CephContext *cct, const string& type,
const string& dir,
@@ -37,18 +34,6 @@ KeyValueDB *KeyValueDB::create(CephContext *cct, const string& type,
}
#endif
-#ifdef HAVE_LIBAIO
- if (type == "bluestore-kv") {
- // note: we'll leak this! the only user is ceph-kvstore-tool and
- // we don't care.
- BlueStore *bluestore = new BlueStore(cct, dir);
- KeyValueDB *db = nullptr;
- int r = bluestore->start_kv_only(&db);
- if (r < 0)
- return nullptr; // yes, we leak.
- return db;
- }
-#endif
if ((type == "memdb") &&
cct->check_experimental_feature_enabled("memdb")) {
return new MemDB(cct, dir, p);
diff --git a/src/kv/RocksDBStore.cc b/src/kv/RocksDBStore.cc
index 81577401f12..a535179428a 100644
--- a/src/kv/RocksDBStore.cc
+++ b/src/kv/RocksDBStore.cc
@@ -289,6 +289,11 @@ int RocksDBStore::do_open(ostream &out, bool create_if_missing)
auto cache = rocksdb::NewLRUCache(g_conf->rocksdb_cache_size, g_conf->rocksdb_cache_shard_bits);
bbt_opts.block_size = g_conf->rocksdb_block_size;
bbt_opts.block_cache = cache;
+ if (g_conf->kstore_rocksdb_bloom_bits_per_key > 0) {
+ dout(10) << __func__ << " set bloom filter bits per key to "
+ << g_conf->kstore_rocksdb_bloom_bits_per_key << dendl;
+ bbt_opts.filter_policy.reset(rocksdb::NewBloomFilterPolicy(g_conf->kstore_rocksdb_bloom_bits_per_key));
+ }
opt.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbt_opts));
dout(10) << __func__ << " set block size to " << g_conf->rocksdb_block_size
<< " cache size to " << g_conf->rocksdb_cache_size
diff --git a/src/libcephfs.cc b/src/libcephfs.cc
index 0446edb99f1..bfacd1bae51 100644
--- a/src/libcephfs.cc
+++ b/src/libcephfs.cc
@@ -33,6 +33,7 @@
#include "include/cephfs/libcephfs.h"
+
struct ceph_mount_info
{
public:
@@ -86,7 +87,7 @@ public:
//at last the client
ret = -CEPHFS_ERROR_NEW_CLIENT; //defined in libcephfs.h;
- client = new Client(messenger, monclient);
+ client = new StandaloneClient(messenger, monclient);
if (!client)
goto fail;
@@ -248,7 +249,7 @@ public:
private:
bool mounted;
bool inited;
- Client *client;
+ StandaloneClient *client;
MonClient *monclient;
Messenger *messenger;
CephContext *cct;
@@ -1224,9 +1225,11 @@ extern "C" int ceph_get_osd_crush_location(struct ceph_mount_info *cmount,
string& name = it->second;
needed += type.size() + name.size() + 2;
if (needed <= len) {
- strcpy(path + cur, type.c_str());
+ if (path)
+ strcpy(path + cur, type.c_str());
cur += type.size() + 1;
- strcpy(path + cur, name.c_str());
+ if (path)
+ strcpy(path + cur, name.c_str());
cur += name.size() + 1;
}
}
diff --git a/src/librados/IoCtxImpl.cc b/src/librados/IoCtxImpl.cc
index 2647c57094d..d0d5a1ae507 100644
--- a/src/librados/IoCtxImpl.cc
+++ b/src/librados/IoCtxImpl.cc
@@ -769,7 +769,8 @@ int librados::IoCtxImpl::aio_operate_read(const object_t &oid,
::ObjectOperation *o,
AioCompletionImpl *c,
int flags,
- bufferlist *pbl)
+ bufferlist *pbl,
+ const blkin_trace_info *trace_info)
{
FUNCTRACE();
Context *oncomplete = new C_aio_Complete(c);
@@ -780,16 +781,26 @@ int librados::IoCtxImpl::aio_operate_read(const object_t &oid,
c->is_read = true;
c->io = this;
+ ZTracer::Trace trace;
+ if (trace_info) {
+ ZTracer::Trace parent_trace("", nullptr, trace_info);
+ trace.init("rados operate read", &objecter->trace_endpoint, &parent_trace);
+ }
+
+ trace.event("init root span");
Objecter::Op *objecter_op = objecter->prepare_read_op(oid, oloc,
*o, snap_seq, pbl, flags,
- oncomplete, &c->objver);
+ oncomplete, &c->objver, nullptr, 0, &trace);
objecter->op_submit(objecter_op, &c->tid);
+ trace.event("rados operate read submitted");
+
return 0;
}
int librados::IoCtxImpl::aio_operate(const object_t& oid,
::ObjectOperation *o, AioCompletionImpl *c,
- const SnapContext& snap_context, int flags)
+ const SnapContext& snap_context, int flags,
+ const blkin_trace_info *trace_info)
{
FUNCTRACE();
OID_EVENT_TRACE(oid.name.c_str(), "RADOS_WRITE_OP_BEGIN");
@@ -806,17 +817,25 @@ int librados::IoCtxImpl::aio_operate(const object_t& oid,
c->io = this;
queue_aio_write(c);
+ ZTracer::Trace trace;
+ if (trace_info) {
+ ZTracer::Trace parent_trace("", nullptr, trace_info);
+ trace.init("rados operate", &objecter->trace_endpoint, &parent_trace);
+ }
+
+ trace.event("init root span");
Objecter::Op *op = objecter->prepare_mutate_op(
oid, oloc, *o, snap_context, ut, flags,
- oncomplete, &c->objver);
+ oncomplete, &c->objver, osd_reqid_t(), &trace);
objecter->op_submit(op, &c->tid);
+ trace.event("rados operate op submitted");
return 0;
}
int librados::IoCtxImpl::aio_read(const object_t oid, AioCompletionImpl *c,
bufferlist *pbl, size_t len, uint64_t off,
- uint64_t snapid)
+ uint64_t snapid, const blkin_trace_info *info)
{
FUNCTRACE();
if (len > (size_t) INT_MAX)
@@ -832,17 +851,21 @@ int librados::IoCtxImpl::aio_read(const object_t oid, AioCompletionImpl *c,
c->io = this;
c->blp = pbl;
+ ZTracer::Trace trace;
+ if (info)
+ trace.init("rados read", &objecter->trace_endpoint, info);
+
Objecter::Op *o = objecter->prepare_read_op(
oid, oloc,
off, len, snapid, pbl, 0,
- oncomplete, &c->objver);
+ oncomplete, &c->objver, nullptr, 0, &trace);
objecter->op_submit(o, &c->tid);
return 0;
}
int librados::IoCtxImpl::aio_read(const object_t oid, AioCompletionImpl *c,
char *buf, size_t len, uint64_t off,
- uint64_t snapid)
+ uint64_t snapid, const blkin_trace_info *info)
{
FUNCTRACE();
if (len > (size_t) INT_MAX)
@@ -861,10 +884,14 @@ int librados::IoCtxImpl::aio_read(const object_t oid, AioCompletionImpl *c,
c->blp = &c->bl;
c->out_buf = buf;
+ ZTracer::Trace trace;
+ if (info)
+ trace.init("rados read", &objecter->trace_endpoint, info);
+
Objecter::Op *o = objecter->prepare_read_op(
oid, oloc,
off, len, snapid, &c->bl, 0,
- oncomplete, &c->objver);
+ oncomplete, &c->objver, nullptr, 0, &trace);
objecter->op_submit(o, &c->tid);
return 0;
}
@@ -909,9 +936,57 @@ int librados::IoCtxImpl::aio_sparse_read(const object_t oid,
return 0;
}
+int librados::IoCtxImpl::aio_cmpext(const object_t& oid,
+ AioCompletionImpl *c,
+ uint64_t off,
+ bufferlist& cmp_bl)
+{
+ if (cmp_bl.length() > UINT_MAX/2)
+ return -E2BIG;
+
+ Context *onack = new C_aio_Complete(c);
+
+ c->is_read = true;
+ c->io = this;
+
+ Objecter::Op *o = objecter->prepare_cmpext_op(
+ oid, oloc, off, cmp_bl, snap_seq, 0,
+ onack, &c->objver);
+ objecter->op_submit(o, &c->tid);
+
+ return 0;
+}
+
+/* use m_ops.cmpext() + prepare_read_op() for non-bufferlist C API */
+int librados::IoCtxImpl::aio_cmpext(const object_t& oid,
+ AioCompletionImpl *c,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off)
+{
+ if (cmp_len > UINT_MAX/2)
+ return -E2BIG;
+
+ bufferlist cmp_bl;
+ cmp_bl.append(cmp_buf, cmp_len);
+
+ Context *nested = new C_aio_Complete(c);
+ C_ObjectOperation *onack = new C_ObjectOperation(nested);
+
+ c->is_read = true;
+ c->io = this;
+
+ onack->m_ops.cmpext(off, cmp_len, cmp_buf, NULL);
+
+ Objecter::Op *o = objecter->prepare_read_op(
+ oid, oloc, onack->m_ops, snap_seq, NULL, 0, onack, &c->objver);
+ objecter->op_submit(o, &c->tid);
+ return 0;
+}
+
int librados::IoCtxImpl::aio_write(const object_t &oid, AioCompletionImpl *c,
const bufferlist& bl, size_t len,
- uint64_t off)
+ uint64_t off, const blkin_trace_info *info)
{
FUNCTRACE();
auto ut = ceph::real_clock::now();
@@ -929,13 +1004,17 @@ int librados::IoCtxImpl::aio_write(const object_t &oid, AioCompletionImpl *c,
#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
((C_aio_Complete *) oncomplete)->oid = oid;
#endif
+ ZTracer::Trace trace;
+ if (info)
+ trace.init("rados write", &objecter->trace_endpoint, info);
+
c->io = this;
queue_aio_write(c);
Objecter::Op *o = objecter->prepare_write_op(
oid, oloc,
off, len, snapc, bl, ut, 0,
- oncomplete, &c->objver);
+ oncomplete, &c->objver, nullptr, 0, &trace);
objecter->op_submit(o, &c->tid);
return 0;
@@ -1376,6 +1455,18 @@ int librados::IoCtxImpl::read(const object_t& oid,
return bl.length();
}
+int librados::IoCtxImpl::cmpext(const object_t& oid, uint64_t off,
+ bufferlist& cmp_bl)
+{
+ if (cmp_bl.length() > UINT_MAX/2)
+ return -E2BIG;
+
+ ::ObjectOperation op;
+ prepare_assert_ops(&op);
+ op.cmpext(off, cmp_bl, NULL);
+ return operate_read(oid, &op, NULL);
+}
+
int librados::IoCtxImpl::mapext(const object_t& oid,
uint64_t off, size_t len,
std::map<uint64_t,uint64_t>& m)
diff --git a/src/librados/IoCtxImpl.h b/src/librados/IoCtxImpl.h
index c9d9c98fbc1..7870b831710 100644
--- a/src/librados/IoCtxImpl.h
+++ b/src/librados/IoCtxImpl.h
@@ -18,6 +18,7 @@
#include "common/Cond.h"
#include "common/Mutex.h"
#include "common/snap_types.h"
+#include "common/zipkin_trace.h"
#include "include/atomic.h"
#include "include/types.h"
#include "include/rados/librados.h"
@@ -138,6 +139,7 @@ struct librados::IoCtxImpl {
int stat(const object_t& oid, uint64_t *psize, time_t *pmtime);
int stat2(const object_t& oid, uint64_t *psize, struct timespec *pts);
int trunc(const object_t& oid, uint64_t size);
+ int cmpext(const object_t& oid, uint64_t off, bufferlist& cmp_bl);
int tmap_update(const object_t& oid, bufferlist& cmdbl);
int tmap_put(const object_t& oid, bufferlist& bl);
@@ -155,9 +157,9 @@ struct librados::IoCtxImpl {
int operate_read(const object_t& oid, ::ObjectOperation *o, bufferlist *pbl, int flags=0);
int aio_operate(const object_t& oid, ::ObjectOperation *o,
AioCompletionImpl *c, const SnapContext& snap_context,
- int flags);
+ int flags, const blkin_trace_info *trace_info = nullptr);
int aio_operate_read(const object_t& oid, ::ObjectOperation *o,
- AioCompletionImpl *c, int flags, bufferlist *pbl);
+ AioCompletionImpl *c, int flags, bufferlist *pbl, const blkin_trace_info *trace_info = nullptr);
struct C_aio_stat_Ack : public Context {
librados::AioCompletionImpl *c;
@@ -185,14 +187,21 @@ struct librados::IoCtxImpl {
};
int aio_read(const object_t oid, AioCompletionImpl *c,
- bufferlist *pbl, size_t len, uint64_t off, uint64_t snapid);
+ bufferlist *pbl, size_t len, uint64_t off, uint64_t snapid,
+ const blkin_trace_info *info = nullptr);
int aio_read(object_t oid, AioCompletionImpl *c,
- char *buf, size_t len, uint64_t off, uint64_t snapid);
+ char *buf, size_t len, uint64_t off, uint64_t snapid,
+ const blkin_trace_info *info = nullptr);
int aio_sparse_read(const object_t oid, AioCompletionImpl *c,
std::map<uint64_t,uint64_t> *m, bufferlist *data_bl,
size_t len, uint64_t off, uint64_t snapid);
+ int aio_cmpext(const object_t& oid, AioCompletionImpl *c, uint64_t off,
+ bufferlist& cmp_bl);
+ int aio_cmpext(const object_t& oid, AioCompletionImpl *c,
+ const char *cmp_buf, size_t cmp_len, uint64_t off);
int aio_write(const object_t &oid, AioCompletionImpl *c,
- const bufferlist& bl, size_t len, uint64_t off);
+ const bufferlist& bl, size_t len, uint64_t off,
+ const blkin_trace_info *info = nullptr);
int aio_append(const object_t &oid, AioCompletionImpl *c,
const bufferlist& bl, size_t len);
int aio_write_full(const object_t &oid, AioCompletionImpl *c,
diff --git a/src/librados/RadosXattrIter.h b/src/librados/RadosXattrIter.h
index 35c87043840..6b232bd5873 100644
--- a/src/librados/RadosXattrIter.h
+++ b/src/librados/RadosXattrIter.h
@@ -18,7 +18,7 @@
#include <string>
#include <map>
-#include "include/buffer.h"
+#include "include/buffer.h" // for bufferlist
namespace librados {
diff --git a/src/librados/librados.cc b/src/librados/librados.cc
index 3c063260bf1..e7e75605d44 100644
--- a/src/librados/librados.cc
+++ b/src/librados/librados.cc
@@ -153,6 +153,14 @@ void librados::ObjectOperation::set_op_flags2(int flags)
::set_op_flags(o, flags);
}
+void librados::ObjectOperation::cmpext(uint64_t off,
+ bufferlist &cmp_bl,
+ int *prval)
+{
+ ::ObjectOperation *o = &impl->o;
+ o->cmpext(off, cmp_bl, prval);
+}
+
void librados::ObjectOperation::cmpxattr(const char *name, uint8_t op, const bufferlist& v)
{
::ObjectOperation *o = &impl->o;
@@ -1222,6 +1230,12 @@ int librados::IoCtx::mapext(const std::string& oid, uint64_t off, size_t len,
return io_ctx_impl->mapext(obj, off, len, m);
}
+int librados::IoCtx::cmpext(const std::string& oid, uint64_t off, bufferlist& cmp_bl)
+{
+ object_t obj(oid);
+ return io_ctx_impl->cmpext(obj, off, cmp_bl);
+}
+
int librados::IoCtx::sparse_read(const std::string& oid, std::map<uint64_t,uint64_t>& m,
bufferlist& bl, size_t len, uint64_t off)
{
@@ -1505,6 +1519,21 @@ int librados::IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
}
int librados::IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
+ librados::ObjectWriteOperation *o,
+ snap_t snap_seq, std::vector<snap_t>& snaps,
+ const blkin_trace_info *trace_info)
+{
+ object_t obj(oid);
+ vector<snapid_t> snv;
+ snv.resize(snaps.size());
+ for (size_t i = 0; i < snaps.size(); ++i)
+ snv[i] = snaps[i];
+ SnapContext snapc(snap_seq, snv);
+ return io_ctx_impl->aio_operate(obj, &o->impl->o, c->pc,
+ snapc, 0, trace_info);
+}
+
+int librados::IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
librados::ObjectReadOperation *o,
bufferlist *pbl)
{
@@ -1541,6 +1570,14 @@ int librados::IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
translate_flags(flags), pbl);
}
+int librados::IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
+ librados::ObjectReadOperation *o,
+ int flags, bufferlist *pbl, const blkin_trace_info *trace_info)
+{
+ object_t obj(oid);
+ return io_ctx_impl->aio_operate_read(obj, &o->impl->o, c->pc,
+ translate_flags(flags), pbl, trace_info);
+}
void librados::IoCtx::snap_set_read(snap_t seq)
{
@@ -1829,6 +1866,14 @@ int librados::IoCtx::aio_exec(const std::string& oid,
return io_ctx_impl->aio_exec(obj, c->pc, cls, method, inbl, outbl);
}
+int librados::IoCtx::aio_cmpext(const std::string& oid,
+ librados::AioCompletion *c,
+ uint64_t off,
+ bufferlist& cmp_bl)
+{
+ return io_ctx_impl->aio_cmpext(oid, c->pc, off, cmp_bl);
+}
+
int librados::IoCtx::aio_sparse_read(const std::string& oid, librados::AioCompletion *c,
std::map<uint64_t,uint64_t> *m, bufferlist *data_bl,
size_t len, uint64_t off)
@@ -3046,10 +3091,12 @@ extern "C" int rados_pool_list(rados_t cluster, char *buf, size_t len)
break;
const char* pool = i->second.c_str();
tracepoint(librados, rados_pool_list_pool, pool);
- strncat(b, pool, rl);
+ if (b) {
+ strncat(b, pool, rl);
+ b += rl;
+ }
needed += rl;
len -= rl;
- b += rl;
}
for (; i != p_end; ++i) {
int rl = i->second.length() + 1;
@@ -3924,6 +3971,23 @@ extern "C" int rados_ioctx_snap_get_stamp(rados_ioctx_t io, rados_snap_t id, tim
return retval;
}
+extern "C" int rados_cmpext(rados_ioctx_t io, const char *o,
+ const char *cmp_buf, size_t cmp_len, uint64_t off)
+{
+ tracepoint(librados, rados_cmpext_enter, io, o, cmp_buf, cmp_len, off);
+ librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
+ int ret;
+ object_t oid(o);
+
+ bufferlist cmp_bl;
+ cmp_bl.append(cmp_buf, cmp_len);
+
+ ret = ctx->cmpext(oid, off, cmp_bl);
+ tracepoint(librados, rados_cmpext_exit, ret);
+
+ return ret;
+}
+
extern "C" int rados_getxattr(rados_ioctx_t io, const char *o, const char *name,
char *buf, size_t len)
{
@@ -4505,6 +4569,22 @@ extern "C" int rados_aio_read(rados_ioctx_t io, const char *o,
return retval;
}
+#ifdef WITH_BLKIN
+extern "C" int rados_aio_read_traced(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ char *buf, size_t len, uint64_t off,
+ struct blkin_trace_info *info)
+{
+ tracepoint(librados, rados_aio_read_enter, io, o, completion, len, off);
+ librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
+ object_t oid(o);
+ int retval = ctx->aio_read(oid, (librados::AioCompletionImpl*)completion,
+ buf, len, off, ctx->snap_seq, info);
+ tracepoint(librados, rados_aio_read_exit, retval);
+ return retval;
+}
+#endif
+
extern "C" int rados_aio_write(rados_ioctx_t io, const char *o,
rados_completion_t completion,
const char *buf, size_t len, uint64_t off)
@@ -4522,6 +4602,26 @@ extern "C" int rados_aio_write(rados_ioctx_t io, const char *o,
return retval;
}
+#ifdef WITH_BLKIN
+extern "C" int rados_aio_write_traced(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *buf, size_t len, uint64_t off,
+ struct blkin_trace_info *info)
+{
+ tracepoint(librados, rados_aio_write_enter, io, o, completion, buf, len, off);
+ if (len > UINT_MAX/2)
+ return -E2BIG;
+ librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
+ object_t oid(o);
+ bufferlist bl;
+ bl.append(buf, len);
+ int retval = ctx->aio_write(oid, (librados::AioCompletionImpl*)completion,
+ bl, len, off, info);
+ tracepoint(librados, rados_aio_write_exit, retval);
+ return retval;
+}
+#endif
+
extern "C" int rados_aio_append(rados_ioctx_t io, const char *o,
rados_completion_t completion,
const char *buf, size_t len)
@@ -4727,7 +4827,7 @@ extern "C" int rados_aio_rmxattr(rados_ioctx_t io, const char *o,
return retval;
}
-extern "C" int rados_aio_stat(rados_ioctx_t io, const char *o,
+extern "C" int rados_aio_stat(rados_ioctx_t io, const char *o,
rados_completion_t completion,
uint64_t *psize, time_t *pmtime)
{
@@ -4740,6 +4840,20 @@ extern "C" int rados_aio_stat(rados_ioctx_t io, const char *o,
return retval;
}
+extern "C" int rados_aio_cmpext(rados_ioctx_t io, const char *o,
+ rados_completion_t completion, const char *cmp_buf,
+ size_t cmp_len, uint64_t off)
+{
+ tracepoint(librados, rados_aio_cmpext_enter, io, o, completion, cmp_buf,
+ cmp_len, off);
+ librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
+ object_t oid(o);
+ int retval = ctx->aio_cmpext(oid, (librados::AioCompletionImpl*)completion,
+ cmp_buf, cmp_len, off);
+ tracepoint(librados, rados_aio_cmpext_exit, retval);
+ return retval;
+}
+
extern "C" int rados_aio_cancel(rados_ioctx_t io, rados_completion_t completion)
{
librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
@@ -5200,6 +5314,18 @@ extern "C" void rados_write_op_assert_exists(rados_write_op_t write_op)
tracepoint(librados, rados_write_op_assert_exists_exit);
}
+extern "C" void rados_write_op_cmpext(rados_write_op_t write_op,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off,
+ int *prval)
+{
+ tracepoint(librados, rados_write_op_cmpext_enter, write_op, cmp_buf,
+ cmp_len, off, prval);
+ ((::ObjectOperation *)write_op)->cmpext(off, cmp_len, cmp_buf, prval);
+ tracepoint(librados, rados_write_op_cmpext_exit);
+}
+
extern "C" void rados_write_op_cmpxattr(rados_write_op_t write_op,
const char *name,
uint8_t comparison_operator,
@@ -5519,6 +5645,18 @@ extern "C" void rados_read_op_assert_exists(rados_read_op_t read_op)
tracepoint(librados, rados_read_op_assert_exists_exit);
}
+extern "C" void rados_read_op_cmpext(rados_read_op_t read_op,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off,
+ int *prval)
+{
+ tracepoint(librados, rados_read_op_cmpext_enter, read_op, cmp_buf,
+ cmp_len, off, prval);
+ ((::ObjectOperation *)read_op)->cmpext(off, cmp_len, cmp_buf, prval);
+ tracepoint(librados, rados_read_op_cmpext_exit);
+}
+
extern "C" void rados_read_op_cmpxattr(rados_read_op_t read_op,
const char *name,
uint8_t comparison_operator,
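A hedged usage sketch for the new compare-extent C entry point (the object name and buffer are illustrative, and the hunks above do not show the return convention for a mismatch, so only success and failure are distinguished):

    // assumes an already-open rados_ioctx_t io
    const char expected[] = "known-data";
    int r = rados_cmpext(io, "myobj", expected, sizeof(expected) - 1, 0);
    if (r == 0) {
      // bytes at offset 0 match 'expected'; a guarded write may proceed
    } else if (r < 0) {
      // error, or the extent differs from 'expected'
    }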
diff --git a/src/libradosstriper/RadosStriperImpl.cc b/src/libradosstriper/RadosStriperImpl.cc
index ea3be3b145c..fd5f13f0065 100644
--- a/src/libradosstriper/RadosStriperImpl.cc
+++ b/src/libradosstriper/RadosStriperImpl.cc
@@ -103,13 +103,13 @@
/// default object layout
struct ceph_file_layout default_file_layout = {
- fl_stripe_unit: init_le32(1<<22),
- fl_stripe_count: init_le32(1),
- fl_object_size: init_le32(1<<22),
- fl_cas_hash: init_le32(0),
- fl_object_stripe_unit: init_le32(0),
- fl_unused: init_le32(-1),
- fl_pg_pool : init_le32(-1),
+ init_le32(1<<22), // fl_stripe_unit
+ init_le32(1), // fl_stripe_count
+ init_le32(1<<22), // fl_object_size
+ init_le32(0), // fl_cas_hash
+ init_le32(0), // fl_object_stripe_unit
+ init_le32(-1), // fl_unused
+ init_le32(-1), // fl_pg_pool
};
@@ -1169,10 +1169,13 @@ int libradosstriper::RadosStriperImpl::internal_get_layout_and_size(
// deal with size
size_t ssize;
rc = extract_sizet_attr(attrs, XATTR_SIZE, &ssize);
+ if (rc) {
+ return rc;
+ }
*size = ssize;
// make valgrind happy by setting unused fl_pg_pool
layout->fl_pg_pool = 0;
- return rc;
+ return 0;
}
int libradosstriper::RadosStriperImpl::openStripedObjectForRead(
diff --git a/src/librbd/AsyncObjectThrottle.cc b/src/librbd/AsyncObjectThrottle.cc
index 99f0bfda3e7..1f09091e8f5 100644
--- a/src/librbd/AsyncObjectThrottle.cc
+++ b/src/librbd/AsyncObjectThrottle.cc
@@ -100,4 +100,6 @@ void AsyncObjectThrottle<T>::start_next_op() {
} // namespace librbd
+#ifndef TEST_F
template class librbd::AsyncObjectThrottle<librbd::ImageCtx>;
+#endif
diff --git a/src/librbd/AsyncRequest.cc b/src/librbd/AsyncRequest.cc
index e6c25f076a6..a1e4202997d 100644
--- a/src/librbd/AsyncRequest.cc
+++ b/src/librbd/AsyncRequest.cc
@@ -66,4 +66,6 @@ void AsyncRequest<T>::finish_request() {
} // namespace librbd
+#ifndef TEST_F
template class librbd::AsyncRequest<librbd::ImageCtx>;
+#endif
diff --git a/src/librbd/ExclusiveLock.cc b/src/librbd/ExclusiveLock.cc
index 19ca6e42e13..a6f8b08cf36 100644
--- a/src/librbd/ExclusiveLock.cc
+++ b/src/librbd/ExclusiveLock.cc
@@ -294,6 +294,16 @@ void ExclusiveLock<I>::post_release_lock_handler(bool shutting_down, int r,
}
template <typename I>
+void ExclusiveLock<I>::post_reacquire_lock_handler(int r, Context *on_finish) {
+ ldout(m_image_ctx.cct, 10) << dendl;
+ if (r >= 0) {
+ m_image_ctx.image_watcher->notify_acquired_lock();
+ }
+
+ on_finish->complete(r);
+}
+
+template <typename I>
struct ExclusiveLock<I>::C_InitComplete : public Context {
ExclusiveLock *exclusive_lock;
uint64_t features;
diff --git a/src/librbd/ExclusiveLock.h b/src/librbd/ExclusiveLock.h
index 092f7094a6c..7b2e63c2bcb 100644
--- a/src/librbd/ExclusiveLock.h
+++ b/src/librbd/ExclusiveLock.h
@@ -36,6 +36,7 @@ protected:
Context *on_finish) override;
void post_release_lock_handler(bool shutting_down, int r,
Context *on_finish) override;
+ void post_reacquire_lock_handler(int r, Context *on_finish) override;
private:
diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc
index 9ea00a1bcfb..74366f5c633 100644
--- a/src/librbd/ImageCtx.cc
+++ b/src/librbd/ImageCtx.cc
@@ -198,7 +198,8 @@ struct C_InvalidateCache : public Context {
operations(new Operations<>(*this)),
exclusive_lock(nullptr), object_map(nullptr),
io_work_queue(nullptr), op_work_queue(nullptr),
- asok_hook(nullptr)
+ asok_hook(nullptr),
+ trace_endpoint("librbd")
{
md_ctx.dup(p);
data_ctx.dup(p);
@@ -269,6 +270,7 @@ struct C_InvalidateCache : public Context {
pname += snap_name;
}
+ trace_endpoint.copy_name(pname);
perf_start(pname);
if (cache) {
@@ -713,7 +715,7 @@ struct C_InvalidateCache : public Context {
void ImageCtx::aio_read_from_cache(object_t o, uint64_t object_no,
bufferlist *bl, size_t len,
uint64_t off, Context *onfinish,
- int fadvise_flags) {
+ int fadvise_flags, ZTracer::Trace *trace) {
snap_lock.get_read();
ObjectCacher::OSDRead *rd = object_cacher->prepare_read(snap_id, bl, fadvise_flags);
snap_lock.put_read();
@@ -722,7 +724,7 @@ struct C_InvalidateCache : public Context {
extent.buffer_extents.push_back(make_pair(0, len));
rd->extents.push_back(extent);
cache_lock.Lock();
- int r = object_cacher->readx(rd, object_set, onfinish);
+ int r = object_cacher->readx(rd, object_set, onfinish, trace);
cache_lock.Unlock();
if (r != 0)
onfinish->complete(r);
@@ -730,7 +732,8 @@ struct C_InvalidateCache : public Context {
void ImageCtx::write_to_cache(object_t o, const bufferlist& bl, size_t len,
uint64_t off, Context *onfinish,
- int fadvise_flags, uint64_t journal_tid) {
+ int fadvise_flags, uint64_t journal_tid,
+ ZTracer::Trace *trace) {
snap_lock.get_read();
ObjectCacher::OSDWrite *wr = object_cacher->prepare_write(
snapc, bl, ceph::real_time::min(), fadvise_flags, journal_tid);
@@ -743,7 +746,7 @@ struct C_InvalidateCache : public Context {
wr->extents.push_back(extent);
{
Mutex::Locker l(cache_lock);
- object_cacher->writex(wr, object_set, onfinish);
+ object_cacher->writex(wr, object_set, onfinish, trace);
}
}
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
index b85974224a8..499f1e2781f 100644
--- a/src/librbd/ImageCtx.h
+++ b/src/librbd/ImageCtx.h
@@ -15,6 +15,7 @@
#include "common/Readahead.h"
#include "common/RWLock.h"
#include "common/snap_types.h"
+#include "common/zipkin_trace.h"
#include "include/buffer_fwd.h"
#include "include/rbd/librbd.hpp"
@@ -200,6 +201,8 @@ namespace librbd {
exclusive_lock::Policy *exclusive_lock_policy = nullptr;
journal::Policy *journal_policy = nullptr;
+ ZTracer::Endpoint trace_endpoint;
+
static bool _filter_metadata_confs(const string &prefix, std::map<string, bool> &configs,
const map<string, bufferlist> &pairs, map<string, bufferlist> *res);
@@ -278,10 +281,10 @@ namespace librbd {
uint64_t *overlap) const;
void aio_read_from_cache(object_t o, uint64_t object_no, bufferlist *bl,
size_t len, uint64_t off, Context *onfinish,
- int fadvise_flags);
+ int fadvise_flags, ZTracer::Trace *trace);
void write_to_cache(object_t o, const bufferlist& bl, size_t len,
uint64_t off, Context *onfinish, int fadvise_flags,
- uint64_t journal_tid);
+ uint64_t journal_tid, ZTracer::Trace *trace);
void user_flushed();
void flush_cache(Context *onfinish);
void shut_down_cache(Context *on_finish);
diff --git a/src/librbd/Journal.cc b/src/librbd/Journal.cc
index 050a87b355a..c19a6a80c50 100644
--- a/src/librbd/Journal.cc
+++ b/src/librbd/Journal.cc
@@ -456,25 +456,6 @@ int Journal<I>::reset(librados::IoCtx &io_ctx, const std::string &image_id) {
}
template <typename I>
-int Journal<I>::is_tag_owner(I *image_ctx, bool *is_tag_owner) {
- return Journal<I>::is_tag_owner(image_ctx->md_ctx, image_ctx->id,
- is_tag_owner, image_ctx->op_work_queue);
-}
-
-template <typename I>
-int Journal<I>::is_tag_owner(librados::IoCtx& io_ctx, std::string& image_id,
- bool *is_tag_owner, ContextWQ *op_work_queue) {
- C_SaferCond ctx;
- Journal<I>::is_tag_owner(io_ctx, image_id, is_tag_owner, op_work_queue, &ctx);
-
- int r = ctx.wait();
- if (r < 0) {
- return r;
- }
- return r;
-}
-
-template <typename I>
void Journal<I>::is_tag_owner(I *image_ctx, bool *owner,
Context *on_finish) {
Journal<I>::is_tag_owner(image_ctx->md_ctx, image_ctx->id, owner,
@@ -496,19 +477,6 @@ void Journal<I>::is_tag_owner(librados::IoCtx& io_ctx, std::string& image_id,
}
template <typename I>
-int Journal<I>::get_tag_owner(I *image_ctx, std::string *mirror_uuid) {
- C_SaferCond get_tags_ctx;
- get_tag_owner(image_ctx->md_ctx, image_ctx->id, mirror_uuid,
- image_ctx->op_work_queue, &get_tags_ctx);
-
- int r = get_tags_ctx.wait();
- if (r < 0) {
- return r;
- }
- return 0;
-}
-
-template <typename I>
void Journal<I>::get_tag_owner(IoCtx& io_ctx, std::string& image_id,
std::string *mirror_uuid,
ContextWQ *op_work_queue, Context *on_finish) {
@@ -1800,4 +1768,6 @@ void Journal<I>::remove_listener(journal::Listener *listener) {
} // namespace librbd
+#ifndef TEST_F
template class librbd::Journal<librbd::ImageCtx>;
+#endif
diff --git a/src/librbd/Journal.h b/src/librbd/Journal.h
index 4aafefba6d7..3e3cf4928dd 100644
--- a/src/librbd/Journal.h
+++ b/src/librbd/Journal.h
@@ -5,7 +5,6 @@
#define CEPH_LIBRBD_JOURNAL_H
#include "include/int_types.h"
-#include "include/atomic.h"
#include "include/Context.h"
#include "include/interval_set.h"
#include "common/Cond.h"
@@ -19,9 +18,11 @@
#include "librbd/Utils.h"
#include "librbd/journal/Types.h"
#include "librbd/journal/TypeTraits.h"
+
#include <algorithm>
#include <list>
#include <string>
+#include <atomic>
#include <unordered_map>
class SafeTimer;
@@ -102,15 +103,11 @@ public:
static int remove(librados::IoCtx &io_ctx, const std::string &image_id);
static int reset(librados::IoCtx &io_ctx, const std::string &image_id);
- static int is_tag_owner(ImageCtxT *image_ctx, bool *is_tag_owner);
- static int is_tag_owner(librados::IoCtx& io_ctx, std::string& image_id,
- bool *is_tag_owner, ContextWQ *op_work_queue);
static void is_tag_owner(ImageCtxT *image_ctx, bool *is_tag_owner,
Context *on_finish);
static void is_tag_owner(librados::IoCtx& io_ctx, std::string& image_id,
bool *is_tag_owner, ContextWQ *op_work_queue,
Context *on_finish);
- static int get_tag_owner(ImageCtxT *image_ctx, std::string *mirror_uuid);
static void get_tag_owner(librados::IoCtx& io_ctx, std::string& image_id,
std::string *mirror_uuid,
ContextWQ *op_work_queue, Context *on_finish);
@@ -159,7 +156,7 @@ public:
void wait_event(uint64_t tid, Context *on_safe);
uint64_t allocate_op_tid() {
- uint64_t op_tid = m_op_tid.inc();
+ uint64_t op_tid = ++m_op_tid;
assert(op_tid != 0);
return op_tid;
}
@@ -301,7 +298,7 @@ private:
uint64_t m_event_tid;
Events m_events;
- atomic_t m_op_tid;
+ std::atomic<uint64_t> m_op_tid = { 0 };
TidToFutures m_op_futures;
bool m_processing_entry = false;
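The atomic_t-to-std::atomic migration above relies on pre-increment being an atomic read-modify-write, matching the old atomic_t::inc() which returned the new value. The pattern in miniature:

    #include <atomic>
    #include <cassert>

    std::atomic<uint64_t> op_tid = { 0 };

    uint64_t allocate_op_tid() {
      uint64_t tid = ++op_tid;  // atomic fetch_add(1) + 1
      assert(tid != 0);         // 0 stays reserved as "no tid"
      return tid;
    }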
diff --git a/src/librbd/LibrbdWriteback.cc b/src/librbd/LibrbdWriteback.cc
index f9292caba2a..d1dc08c3113 100644
--- a/src/librbd/LibrbdWriteback.cc
+++ b/src/librbd/LibrbdWriteback.cc
@@ -70,8 +70,8 @@ namespace librbd {
class C_OrderedWrite : public Context {
public:
C_OrderedWrite(CephContext *cct, LibrbdWriteback::write_result_d *result,
- LibrbdWriteback *wb)
- : m_cct(cct), m_result(result), m_wb_handler(wb) {}
+ const ZTracer::Trace &trace, LibrbdWriteback *wb)
+ : m_cct(cct), m_result(result), m_trace(trace), m_wb_handler(wb) {}
~C_OrderedWrite() override {}
void finish(int r) override {
ldout(m_cct, 20) << "C_OrderedWrite completing " << m_result << dendl;
@@ -83,10 +83,12 @@ namespace librbd {
m_wb_handler->complete_writes(m_result->oid);
}
ldout(m_cct, 20) << "C_OrderedWrite finished " << m_result << dendl;
+ m_trace.event("finish");
}
private:
CephContext *m_cct;
LibrbdWriteback::write_result_d *m_result;
+ ZTracer::Trace m_trace;
LibrbdWriteback *m_wb_handler;
};
@@ -99,17 +101,19 @@ namespace librbd {
uint64_t off;
bufferlist bl;
SnapContext snapc;
- Context *req_comp;
uint64_t journal_tid;
- bool request_sent;
+ ZTracer::Trace trace;
+ Context *req_comp;
+ bool request_sent = false;
C_WriteJournalCommit(ImageCtx *_image_ctx, const std::string &_oid,
uint64_t _object_no, uint64_t _off,
const bufferlist &_bl, const SnapContext& _snapc,
- Context *_req_comp, uint64_t _journal_tid)
+ uint64_t _journal_tid,
+ const ZTracer::Trace &trace, Context *_req_comp)
: image_ctx(_image_ctx), oid(_oid), object_no(_object_no), off(_off),
- bl(_bl), snapc(_snapc), req_comp(_req_comp), journal_tid(_journal_tid),
- request_sent(false) {
+ bl(_bl), snapc(_snapc), journal_tid(_journal_tid),
+ trace(trace), req_comp(_req_comp) {
CephContext *cct = image_ctx->cct;
ldout(cct, 20) << this << " C_WriteJournalCommit: "
<< "delaying write until journal tid "
@@ -162,7 +166,7 @@ namespace librbd {
request_sent = true;
auto req = new io::ObjectWriteRequest(image_ctx, oid, object_no, off,
- bl, snapc, this, 0);
+ bl, snapc, 0, trace, this);
req->send();
}
};
@@ -196,7 +200,9 @@ namespace librbd {
const object_locator_t& oloc,
uint64_t off, uint64_t len, snapid_t snapid,
bufferlist *pbl, uint64_t trunc_size,
- __u32 trunc_seq, int op_flags, Context *onfinish)
+ __u32 trunc_seq, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *onfinish)
{
// on completion, take the mutex and then call onfinish.
Context *req = new C_ReadRequest(m_ictx->cct, onfinish, &m_lock);
@@ -217,8 +223,9 @@ namespace librbd {
librados::AioCompletion *rados_completion =
util::create_rados_callback(req);
- int r = m_ictx->data_ctx.aio_operate(oid.name, rados_completion, &op,
- flags, NULL);
+ int r = m_ictx->data_ctx.aio_operate(
+ oid.name, rados_completion, &op, flags, nullptr,
+ (parent_trace.valid() ? parent_trace.get_info() : nullptr));
rados_completion->release();
assert(r >= 0);
}
@@ -254,25 +261,34 @@ namespace librbd {
const bufferlist &bl,
ceph::real_time mtime, uint64_t trunc_size,
__u32 trunc_seq, ceph_tid_t journal_tid,
+ const ZTracer::Trace &parent_trace,
Context *oncommit)
{
+ ZTracer::Trace trace;
+ if (parent_trace.valid()) {
+ trace.init("", &m_ictx->trace_endpoint, &parent_trace);
+ trace.copy_name("writeback " + oid.name);
+ trace.event("start");
+ }
+
uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix);
write_result_d *result = new write_result_d(oid.name, oncommit);
m_writes[oid.name].push(result);
ldout(m_ictx->cct, 20) << "write will wait for result " << result << dendl;
- C_OrderedWrite *req_comp = new C_OrderedWrite(m_ictx->cct, result, this);
+ C_OrderedWrite *req_comp = new C_OrderedWrite(m_ictx->cct, result, trace,
+ this);
// all IO operations are flushed prior to closing the journal
assert(journal_tid == 0 || m_ictx->journal != NULL);
if (journal_tid != 0) {
m_ictx->journal->flush_event(
- journal_tid, new C_WriteJournalCommit(m_ictx, oid.name, object_no, off,
- bl, snapc, req_comp,
- journal_tid));
+ journal_tid, new C_WriteJournalCommit(
+ m_ictx, oid.name, object_no, off, bl, snapc, journal_tid, trace,
+ req_comp));
} else {
- auto req = new io::ObjectWriteRequest(m_ictx, oid.name, object_no,
- off, bl, snapc, req_comp, 0);
+ auto req = new io::ObjectWriteRequest(
+ m_ictx, oid.name, object_no, off, bl, snapc, 0, trace, req_comp);
req->send();
}
return ++m_tid;
diff --git a/src/librbd/LibrbdWriteback.h b/src/librbd/LibrbdWriteback.h
index 9c47f98bc18..6ffba511979 100644
--- a/src/librbd/LibrbdWriteback.h
+++ b/src/librbd/LibrbdWriteback.h
@@ -24,7 +24,8 @@ namespace librbd {
void read(const object_t& oid, uint64_t object_no,
const object_locator_t& oloc, uint64_t off, uint64_t len,
snapid_t snapid, bufferlist *pbl, uint64_t trunc_size,
- __u32 trunc_seq, int op_flags, Context *onfinish) override;
+ __u32 trunc_seq, int op_flags,
+ const ZTracer::Trace &parent_trace, Context *onfinish) override;
// Determine whether a read to this extent could be affected by a
// write-triggered copy-on-write
@@ -37,6 +38,7 @@ namespace librbd {
const SnapContext& snapc, const bufferlist &bl,
ceph::real_time mtime, uint64_t trunc_size,
__u32 trunc_seq, ceph_tid_t journal_tid,
+ const ZTracer::Trace &parent_trace,
Context *oncommit) override;
using WritebackHandler::write;
diff --git a/src/librbd/ManagedLock.cc b/src/librbd/ManagedLock.cc
index 71f5dab4df6..ee321bf93ad 100644
--- a/src/librbd/ManagedLock.cc
+++ b/src/librbd/ManagedLock.cc
@@ -330,6 +330,11 @@ void ManagedLock<I>::post_release_lock_handler(bool shutting_down, int r,
}
template <typename I>
+void ManagedLock<I>::post_reacquire_lock_handler(int r, Context *on_finish) {
+ on_finish->complete(r);
+}
+
+template <typename I>
bool ManagedLock<I>::is_transition_state() const {
switch (m_state) {
case STATE_ACQUIRING:
@@ -565,11 +570,15 @@ void ManagedLock<I>::send_reacquire_lock() {
ldout(m_cct, 10) << dendl;
m_state = STATE_REACQUIRING;
+ auto ctx = create_context_callback<
+ ManagedLock, &ManagedLock<I>::handle_reacquire_lock>(this);
+ ctx = new FunctionContext([this, ctx](int r) {
+ post_reacquire_lock_handler(r, ctx);
+ });
+
using managed_lock::ReacquireRequest;
ReacquireRequest<I>* req = ReacquireRequest<I>::create(m_ioctx, m_oid,
- m_cookie, m_new_cookie, m_mode == EXCLUSIVE,
- create_context_callback<
- ManagedLock, &ManagedLock<I>::handle_reacquire_lock>(this));
+ m_cookie, m_new_cookie, m_mode == EXCLUSIVE, ctx);
m_work_queue->queue(new C_SendLockRequest<ReacquireRequest<I>>(req));
}
diff --git a/src/librbd/ManagedLock.h b/src/librbd/ManagedLock.h
index d18cfbef156..c619f4823dd 100644
--- a/src/librbd/ManagedLock.h
+++ b/src/librbd/ManagedLock.h
@@ -134,6 +134,7 @@ protected:
Context *on_finish);
virtual void post_release_lock_handler(bool shutting_down, int r,
Context *on_finish);
+ virtual void post_reacquire_lock_handler(int r, Context *on_finish);
void execute_next_action();
diff --git a/src/librbd/ObjectMap.cc b/src/librbd/ObjectMap.cc
index 49c8280dae8..9a0683fc68d 100644
--- a/src/librbd/ObjectMap.cc
+++ b/src/librbd/ObjectMap.cc
@@ -242,7 +242,7 @@ void ObjectMap<I>::detained_aio_update(UpdateOperation &&op) {
handle_detained_aio_update(cell, r, on_finish);
});
aio_update(CEPH_NOSNAP, op.start_object_no, op.end_object_no, op.new_state,
- op.current_state, ctx);
+ op.current_state, op.parent_trace, ctx);
}
template <typename I>
@@ -269,6 +269,7 @@ template <typename I>
void ObjectMap<I>::aio_update(uint64_t snap_id, uint64_t start_object_no,
uint64_t end_object_no, uint8_t new_state,
const boost::optional<uint8_t> &current_state,
+ const ZTracer::Trace &parent_trace,
Context *on_finish) {
assert(m_image_ctx.snap_lock.is_locked());
assert((m_image_ctx.features & RBD_FEATURE_OBJECT_MAP) != 0);
@@ -306,7 +307,7 @@ void ObjectMap<I>::aio_update(uint64_t snap_id, uint64_t start_object_no,
auto req = object_map::UpdateRequest<I>::create(
m_image_ctx, &m_object_map, snap_id, start_object_no, end_object_no,
- new_state, current_state, on_finish);
+ new_state, current_state, parent_trace, on_finish);
req->send();
}
diff --git a/src/librbd/ObjectMap.h b/src/librbd/ObjectMap.h
index fd43ae97d23..427ecdf165a 100644
--- a/src/librbd/ObjectMap.h
+++ b/src/librbd/ObjectMap.h
@@ -13,9 +13,8 @@
class Context;
class RWLock;
-namespace librados {
- class IoCtx;
-}
+namespace librados { class IoCtx; }
+namespace ZTracer { struct Trace; }
namespace librbd {
@@ -57,16 +56,17 @@ public:
template <typename T, void(T::*MF)(int) = &T::complete>
bool aio_update(uint64_t snap_id, uint64_t start_object_no, uint8_t new_state,
const boost::optional<uint8_t> &current_state,
- T *callback_object) {
+ const ZTracer::Trace &parent_trace, T *callback_object) {
return aio_update<T, MF>(snap_id, start_object_no, start_object_no + 1,
- new_state, current_state, callback_object);
+ new_state, current_state, parent_trace,
+ callback_object);
}
template <typename T, void(T::*MF)(int) = &T::complete>
bool aio_update(uint64_t snap_id, uint64_t start_object_no,
uint64_t end_object_no, uint8_t new_state,
const boost::optional<uint8_t> &current_state,
- T *callback_object) {
+ const ZTracer::Trace &parent_trace, T *callback_object) {
assert(start_object_no < end_object_no);
if (snap_id == CEPH_NOSNAP) {
uint64_t object_no;
@@ -82,13 +82,13 @@ public:
}
UpdateOperation update_operation(start_object_no, end_object_no,
- new_state, current_state,
+ new_state, current_state, parent_trace,
util::create_context_callback<T, MF>(
callback_object));
detained_aio_update(std::move(update_operation));
} else {
aio_update(snap_id, start_object_no, end_object_no, new_state,
- current_state,
+ current_state, parent_trace,
util::create_context_callback<T, MF>(callback_object));
}
return true;
@@ -104,15 +104,16 @@ private:
uint64_t end_object_no;
uint8_t new_state;
boost::optional<uint8_t> current_state;
+ ZTracer::Trace parent_trace;
Context *on_finish;
UpdateOperation(uint64_t start_object_no, uint64_t end_object_no,
uint8_t new_state,
const boost::optional<uint8_t> &current_state,
- Context *on_finish)
+ const ZTracer::Trace &parent_trace, Context *on_finish)
: start_object_no(start_object_no), end_object_no(end_object_no),
new_state(new_state), current_state(current_state),
- on_finish(on_finish) {
+ parent_trace(parent_trace), on_finish(on_finish) {
}
};
@@ -131,7 +132,7 @@ private:
void aio_update(uint64_t snap_id, uint64_t start_object_no,
uint64_t end_object_no, uint8_t new_state,
const boost::optional<uint8_t> &current_state,
- Context *on_finish);
+ const ZTracer::Trace &parent_trace, Context *on_finish);
bool update_required(uint64_t object_no, uint8_t new_state);
};
diff --git a/src/librbd/Utils.h b/src/librbd/Utils.h
index ce98d3ad6e3..f75d65c43de 100644
--- a/src/librbd/Utils.h
+++ b/src/librbd/Utils.h
@@ -7,6 +7,9 @@
#include "include/rados/librados.hpp"
#include "include/rbd_types.h"
#include "include/Context.h"
+#include "common/zipkin_trace.h"
+
+#include <atomic>
#include <type_traits>
namespace librbd {
@@ -153,15 +156,12 @@ inline ImageCtx *get_image_ctx(ImageCtx *image_ctx) {
/// a shut down of the invoking class instance
class AsyncOpTracker {
public:
- AsyncOpTracker() : m_refs(0) {
- }
-
void start_op() {
- m_refs.inc();
+ m_refs++;
}
void finish_op() {
- if (m_refs.dec() == 0 && m_on_finish != nullptr) {
+ if (--m_refs == 0 && m_on_finish != nullptr) {
Context *on_finish = nullptr;
std::swap(on_finish, m_on_finish);
on_finish->complete(0);
@@ -173,7 +173,7 @@ public:
assert(m_on_finish == nullptr);
on_finish = create_async_context_callback(image_ctx, on_finish);
- if (m_refs.read() == 0) {
+ if (m_refs == 0) {
on_finish->complete(0);
return;
}
@@ -181,7 +181,7 @@ public:
}
private:
- atomic_t m_refs;
+ std::atomic<uint64_t> m_refs = { 0 };
Context *m_on_finish = nullptr;
};
@@ -193,6 +193,16 @@ bool calc_sparse_extent(const bufferptr &bp,
size_t *write_offset,
size_t *write_length,
size_t *offset);
+
+template <typename I>
+inline ZTracer::Trace create_trace(const I &image_ctx, const char *trace_name,
+ const ZTracer::Trace &parent_trace) {
+ if (parent_trace.valid()) {
+ return ZTracer::Trace(trace_name, &image_ctx.trace_endpoint, &parent_trace);
+ }
+ return ZTracer::Trace();
+}
+
} // namespace util
} // namespace librbd
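How the new util::create_trace() helper is meant to be called (the span name below is illustrative): it returns a live child span only when the parent is valid, so tracing stays a no-op when blkin is disabled:

    ZTracer::Trace trace = util::create_trace(*image_ctx, "object write",
                                              parent_trace);
    trace.event("start");
    // ... issue the request ...
    trace.event("finish");  // harmless no-op if parent_trace was invalid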
diff --git a/src/librbd/Watcher.cc b/src/librbd/Watcher.cc
index 2964918dc67..98a0e7f0be4 100644
--- a/src/librbd/Watcher.cc
+++ b/src/librbd/Watcher.cc
@@ -10,6 +10,9 @@
#include "common/WorkQueue.h"
#include <boost/bind.hpp>
+// re-include our assert to clobber the system one; fix dout:
+#include "include/assert.h"
+
#define dout_subsys ceph_subsys_rbd
namespace librbd {
diff --git a/src/librbd/api/Group.cc b/src/librbd/api/Group.cc
index 22339ea4e44..090a98074c0 100644
--- a/src/librbd/api/Group.cc
+++ b/src/librbd/api/Group.cc
@@ -137,8 +137,12 @@ int Group<I>::list(IoCtx& io_ctx, vector<string> *names)
r = cls_client::group_dir_list(&io_ctx, RBD_GROUP_DIRECTORY, last_read,
max_read, &groups);
if (r < 0) {
- lderr(cct) << "error listing group in directory: "
- << cpp_strerror(r) << dendl;
+ if (r != -ENOENT) {
+ lderr(cct) << "error listing group in directory: "
+ << cpp_strerror(r) << dendl;
+ } else {
+ r = 0;
+ }
return r;
}
for (pair<string, string> group : groups) {
diff --git a/src/librbd/api/Mirror.cc b/src/librbd/api/Mirror.cc
index 96b8421c98c..7f424746234 100644
--- a/src/librbd/api/Mirror.cc
+++ b/src/librbd/api/Mirror.cc
@@ -383,13 +383,15 @@ int Mirror<I>::image_resync(I *ictx) {
return r;
}
- std::string mirror_uuid;
- r = Journal<I>::get_tag_owner(ictx, &mirror_uuid);
+ C_SaferCond tag_owner_ctx;
+ bool is_tag_owner;
+ Journal<I>::is_tag_owner(ictx, &is_tag_owner, &tag_owner_ctx);
+ r = tag_owner_ctx.wait();
if (r < 0) {
lderr(cct) << "failed to determine tag ownership: " << cpp_strerror(r)
<< dendl;
return r;
- } else if (mirror_uuid == Journal<>::LOCAL_MIRROR_UUID) {
+ } else if (is_tag_owner) {
lderr(cct) << "image is primary, cannot resync to itself" << dendl;
return -EINVAL;
}
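With the blocking Journal<I>::is_tag_owner()/get_tag_owner() overloads removed (see the Journal.cc/Journal.h hunks above), callers now compose the blocking behavior themselves from the async variant plus C_SaferCond, as image_resync() does here; reduced to its essentials:

    C_SaferCond ctx;
    bool is_tag_owner = false;
    Journal<>::is_tag_owner(ictx, &is_tag_owner, &ctx);
    int r = ctx.wait();  // blocks until the async tag-owner lookup completes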
diff --git a/src/librbd/cache/ImageWriteback.cc b/src/librbd/cache/ImageWriteback.cc
index 83c3b5bb14b..cff7a531db2 100644
--- a/src/librbd/cache/ImageWriteback.cc
+++ b/src/librbd/cache/ImageWriteback.cc
@@ -30,7 +30,7 @@ void ImageWriteback<I>::aio_read(Extents &&image_extents, bufferlist *bl,
auto aio_comp = io::AioCompletion::create_and_start(on_finish, &m_image_ctx,
io::AIO_TYPE_READ);
io::ImageReadRequest<I> req(m_image_ctx, aio_comp, std::move(image_extents),
- io::ReadResult{bl}, fadvise_flags);
+ io::ReadResult{bl}, fadvise_flags, {});
req.set_bypass_image_cache();
req.send();
}
@@ -46,14 +46,15 @@ void ImageWriteback<I>::aio_write(Extents &&image_extents,
auto aio_comp = io::AioCompletion::create_and_start(on_finish, &m_image_ctx,
io::AIO_TYPE_WRITE);
io::ImageWriteRequest<I> req(m_image_ctx, aio_comp, std::move(image_extents),
- std::move(bl), fadvise_flags);
+ std::move(bl), fadvise_flags, {});
req.set_bypass_image_cache();
req.send();
}
template <typename I>
void ImageWriteback<I>::aio_discard(uint64_t offset, uint64_t length,
- bool skip_partial_discard, Context *on_finish) {
+ bool skip_partial_discard,
+ Context *on_finish) {
CephContext *cct = m_image_ctx.cct;
ldout(cct, 20) << "offset=" << offset << ", "
<< "length=" << length << ", "
@@ -62,7 +63,7 @@ void ImageWriteback<I>::aio_discard(uint64_t offset, uint64_t length,
auto aio_comp = io::AioCompletion::create_and_start(on_finish, &m_image_ctx,
io::AIO_TYPE_DISCARD);
io::ImageDiscardRequest<I> req(m_image_ctx, aio_comp, offset, length,
- skip_partial_discard);
+ skip_partial_discard, {});
req.set_bypass_image_cache();
req.send();
}
@@ -74,7 +75,7 @@ void ImageWriteback<I>::aio_flush(Context *on_finish) {
auto aio_comp = io::AioCompletion::create_and_start(on_finish, &m_image_ctx,
io::AIO_TYPE_FLUSH);
- io::ImageFlushRequest<I> req(m_image_ctx, aio_comp);
+ io::ImageFlushRequest<I> req(m_image_ctx, aio_comp, {});
req.set_bypass_image_cache();
req.send();
}
@@ -92,7 +93,7 @@ void ImageWriteback<I>::aio_writesame(uint64_t offset, uint64_t length,
auto aio_comp = io::AioCompletion::create_and_start(on_finish, &m_image_ctx,
io::AIO_TYPE_WRITESAME);
io::ImageWriteSameRequest<I> req(m_image_ctx, aio_comp, offset, length,
- std::move(bl), fadvise_flags);
+ std::move(bl), fadvise_flags, {});
req.set_bypass_image_cache();
req.send();
}
diff --git a/src/librbd/image/OpenRequest.cc b/src/librbd/image/OpenRequest.cc
index 6dca0405011..579088b3cf0 100644
--- a/src/librbd/image/OpenRequest.cc
+++ b/src/librbd/image/OpenRequest.cc
@@ -121,23 +121,19 @@ Context *OpenRequest<I>::handle_v2_detect_header(int *result) {
template <typename I>
void OpenRequest<I>::send_v2_get_id() {
- if (m_image_ctx->id.empty()) {
- CephContext *cct = m_image_ctx->cct;
- ldout(cct, 10) << this << " " << __func__ << dendl;
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 10) << this << " " << __func__ << dendl;
- librados::ObjectReadOperation op;
- cls_client::get_id_start(&op);
+ librados::ObjectReadOperation op;
+ cls_client::get_id_start(&op);
- using klass = OpenRequest<I>;
- librados::AioCompletion *comp =
- create_rados_callback<klass, &klass::handle_v2_get_id>(this);
- m_out_bl.clear();
- m_image_ctx->md_ctx.aio_operate(util::id_obj_name(m_image_ctx->name),
- comp, &op, &m_out_bl);
- comp->release();
- } else {
- send_v2_get_name();
- }
+ using klass = OpenRequest<I>;
+ librados::AioCompletion *comp =
+ create_rados_callback<klass, &klass::handle_v2_get_id>(this);
+ m_out_bl.clear();
+ m_image_ctx->md_ctx.aio_operate(util::id_obj_name(m_image_ctx->name),
+ comp, &op, &m_out_bl);
+ comp->release();
}
template <typename I>
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index 7558169c9e4..b790b3539f7 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -546,8 +546,12 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
bufferlist bl;
int r = io_ctx.read(RBD_DIRECTORY, bl, 0, 0);
- if (r < 0)
+ if (r < 0) {
+ if (r == -ENOENT) {
+ r = 0;
+ }
return r;
+ }
// old format images are in a tmap
if (bl.length()) {
@@ -1045,7 +1049,8 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
}
int get_parent_info(ImageCtx *ictx, string *parent_pool_name,
- string *parent_name, string *parent_snap_name)
+ string *parent_name, string *parent_id,
+ string *parent_snap_name)
{
int r = ictx->state->refresh_if_required();
if (r < 0)
@@ -1064,7 +1069,8 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
} else {
r = ictx->get_parent_spec(ictx->snap_id, &parent_spec);
if (r < 0) {
- lderr(ictx->cct) << "Can't find snapshot id = " << ictx->snap_id << dendl;
+ lderr(ictx->cct) << "Can't find snapshot id = " << ictx->snap_id
+ << dendl;
return r;
}
if (parent_spec.pool_id == -1)
@@ -1093,13 +1099,11 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
}
if (parent_name) {
- r = cls_client::dir_get_name(&ictx->parent->md_ctx, RBD_DIRECTORY,
- parent_spec.image_id, parent_name);
- if (r < 0) {
- lderr(ictx->cct) << "error getting parent image name: "
- << cpp_strerror(r) << dendl;
- return r;
- }
+ RWLock::RLocker snap_locker(ictx->parent->snap_lock);
+ *parent_name = ictx->parent->name;
+ }
+ if (parent_id) {
+ *parent_id = ictx->parent->id;
}
return 0;
@@ -1350,10 +1354,21 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
ImageCtx *ictx = new ImageCtx(image_name, "", nullptr, io_ctx, false);
int r = ictx->state->open(true);
if (r < 0) {
- ldout(cct, 2) << "error opening image: " << cpp_strerror(-r) << dendl;
- delete ictx;
+ ictx = nullptr;
+
if (r != -ENOENT) {
- return r;
+ ldout(cct, 2) << "error opening image: " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ // try to get image id from the directory
+ r = cls_client::dir_get_id(&io_ctx, RBD_DIRECTORY, image_name, &image_id);
+ if (r < 0) {
+ if (r != -ENOENT) {
+ ldout(cct, 2) << "error reading image id from dirctory: "
+ << cpp_strerror(-r) << dendl;
+ }
+ return r;
}
} else {
if (ictx->old_format) {
@@ -1376,6 +1391,9 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
}
BOOST_SCOPE_EXIT_ALL(ictx, cct) {
+ if (ictx == nullptr)
+ return;
+
bool is_locked = ictx->exclusive_lock != nullptr &&
ictx->exclusive_lock->is_lock_owner();
if (is_locked) {
@@ -1433,6 +1451,28 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
return 0;
}
+ int trash_get(IoCtx &io_ctx, const std::string &id,
+ trash_image_info_t *info) {
+ CephContext *cct((CephContext *)io_ctx.cct());
+ ldout(cct, 20) << __func__ << " " << &io_ctx << dendl;
+
+ cls::rbd::TrashImageSpec spec;
+ int r = cls_client::trash_get(&io_ctx, id, &spec);
+ if (r == -ENOENT) {
+ return r;
+ } else if (r < 0) {
+ lderr(cct) << "error retrieving trash entry: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ rbd_trash_image_source_t source = static_cast<rbd_trash_image_source_t>(
+ spec.source);
+ *info = trash_image_info_t{id, spec.name, source, spec.deletion_time.sec(),
+ spec.deferment_end_time.sec()};
+ return 0;
+ }
+
int trash_list(IoCtx &io_ctx, vector<trash_image_info_t> &entries) {
CephContext *cct((CephContext *)io_ctx.cct());
ldout(cct, 20) << "trash_list " << &io_ctx << dendl;
@@ -1443,6 +1483,8 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
if (r != -ENOENT) {
lderr(cct) << "error listing rbd_trash entries: " << cpp_strerror(r)
<< dendl;
+ } else {
+ r = 0;
}
return r;
}
@@ -1812,7 +1854,8 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
m_dest->io_work_queue->aio_write(comp, m_offset + write_offset,
write_length,
std::move(*write_bl),
- LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
+ LIBRADOS_OP_FLAG_FADVISE_DONTNEED,
+ std::move(read_trace));
write_offset = offset;
write_length = 0;
}
@@ -1822,6 +1865,8 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
gather_ctx->activate();
}
+ ZTracer::Trace read_trace;
+
private:
SimpleThrottle *m_throttle;
ImageCtx *m_dest;
@@ -1861,10 +1906,16 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
}
}
+ ZTracer::Trace trace;
+ if (cct->_conf->rbd_blkin_trace_all) {
+ trace.init("copy", &src->trace_endpoint);
+ }
+
RWLock::RLocker owner_lock(src->owner_lock);
SimpleThrottle throttle(src->concurrent_management_ops, false);
uint64_t period = src->get_stripe_period();
- unsigned fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+ unsigned fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
for (uint64_t offset = 0; offset < src_size; offset += period) {
if (throttle.pending_error()) {
return throttle.wait_for_ret();
@@ -1872,11 +1923,16 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
uint64_t len = min(period, src_size - offset);
bufferlist *bl = new bufferlist();
- Context *ctx = new C_CopyRead(&throttle, dest, offset, bl, sparse_size);
- auto comp = io::AioCompletion::create_and_start(ctx, src,
- io::AIO_TYPE_READ);
- io::ImageRequest<>::aio_read(src, comp, {{offset, len}},
- io::ReadResult{bl}, fadvise_flags);
+ auto ctx = new C_CopyRead(&throttle, dest, offset, bl, sparse_size);
+ auto comp = io::AioCompletion::create_and_start<Context>(
+ ctx, src, io::AIO_TYPE_READ);
+
+ io::ImageReadRequest<> req(*src, comp, {{offset, len}},
+ io::ReadResult{bl}, fadvise_flags,
+ std::move(trace));
+ ctx->read_trace = req.get_trace();
+
+ req.send();
prog_ctx.update_progress(offset, src_size);
}
@@ -2091,6 +2147,11 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
uint64_t period = ictx->get_stripe_period();
uint64_t left = mylen;
+ ZTracer::Trace trace;
+ if (ictx->cct->_conf->rbd_blkin_trace_all) {
+ trace.init("read_iterate", &ictx->trace_endpoint);
+ }
+
RWLock::RLocker owner_locker(ictx->owner_lock);
start_time = ceph_clock_now();
while (left > 0) {
@@ -2103,7 +2164,7 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
auto c = io::AioCompletion::create_and_start(&ctx, ictx,
io::AIO_TYPE_READ);
io::ImageRequest<>::aio_read(ictx, c, {{off, read_len}},
- io::ReadResult{&bl}, 0);
+ io::ReadResult{&bl}, 0, std::move(trace));
int ret = ctx.wait();
if (ret < 0) {
@@ -2288,7 +2349,7 @@ int validate_pool(IoCtx &io_ctx, CephContext *cct) {
ictx->readahead.inc_pending();
ictx->aio_read_from_cache(q->oid, q->objectno, NULL,
q->length, q->offset,
- req_comp, 0);
+ req_comp, 0, nullptr);
}
}
ictx->perfcounter->inc(l_librbd_readahead);
diff --git a/src/librbd/internal.h b/src/librbd/internal.h
index 59c9fad926a..deeaf945595 100644
--- a/src/librbd/internal.h
+++ b/src/librbd/internal.h
@@ -128,7 +128,8 @@ namespace librbd {
int get_features(ImageCtx *ictx, uint64_t *features);
int get_overlap(ImageCtx *ictx, uint64_t *overlap);
int get_parent_info(ImageCtx *ictx, std::string *parent_pool_name,
- std::string *parent_name, std::string *parent_snap_name);
+ std::string *parent_name, std::string *parent_id,
+ std::string *parent_snap_name);
int get_flags(ImageCtx *ictx, uint64_t *flags);
int set_image_notification(ImageCtx *ictx, int fd, int type);
int is_exclusive_lock_owner(ImageCtx *ictx, bool *is_owner);
@@ -142,8 +143,10 @@ namespace librbd {
int remove(librados::IoCtx& io_ctx, const std::string &image_name,
const std::string &image_id, ProgressContext& prog_ctx,
bool force=false, bool from_trash_remove=false);
+
int trash_move(librados::IoCtx &io_ctx, rbd_trash_image_source_t source,
const std::string &image_name, uint64_t delay);
+ int trash_get(IoCtx &io_ctx, const std::string &id, trash_image_info_t *info);
int trash_list(librados::IoCtx &io_ctx,
std::vector<trash_image_info_t> &entries);
int trash_remove(librados::IoCtx &io_ctx, const std::string &image_id,
diff --git a/src/librbd/io/AioCompletion.cc b/src/librbd/io/AioCompletion.cc
index 7f83f4deab5..cc40e8ffbe1 100644
--- a/src/librbd/io/AioCompletion.cc
+++ b/src/librbd/io/AioCompletion.cc
@@ -23,7 +23,8 @@
#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
-#define dout_prefix *_dout << "librbd::io::AioCompletion: "
+#define dout_prefix *_dout << "librbd::io::AioCompletion: " << this \
+ << " " << __func__ << ": "
namespace librbd {
namespace io {
@@ -44,7 +45,7 @@ void AioCompletion::finalize(ssize_t rval)
assert(ictx != nullptr);
CephContext *cct = ictx->cct;
- ldout(cct, 20) << this << " " << __func__ << ": r=" << rval << dendl;
+ ldout(cct, 20) << "r=" << rval << dendl;
if (rval >= 0 && aio_type == AIO_TYPE_READ) {
read_result.assemble_result(cct);
}
@@ -133,8 +134,7 @@ void AioCompletion::fail(int r)
assert(ictx != nullptr);
CephContext *cct = ictx->cct;
- lderr(cct) << this << " " << __func__ << ": " << cpp_strerror(r)
- << dendl;
+ lderr(cct) << cpp_strerror(r) << dendl;
assert(pending_count == 0);
rval = r;
complete();
@@ -146,7 +146,7 @@ void AioCompletion::set_request_count(uint32_t count) {
assert(ictx != nullptr);
CephContext *cct = ictx->cct;
- ldout(cct, 20) << this << " " << __func__ << ": pending=" << count << dendl;
+ ldout(cct, 20) << "pending=" << count << dendl;
assert(pending_count == 0);
pending_count = count;
lock.Unlock();
@@ -170,7 +170,7 @@ void AioCompletion::complete_request(ssize_t r)
assert(pending_count);
int count = --pending_count;
- ldout(cct, 20) << this << " " << __func__ << ": cb=" << complete_cb << ", "
+ ldout(cct, 20) << "cb=" << complete_cb << ", "
<< "pending=" << pending_count << dendl;
if (!count && blockers == 0) {
finalize(rval);
diff --git a/src/librbd/io/CopyupRequest.cc b/src/librbd/io/CopyupRequest.cc
index b11dbad27b6..aee2f14d7b0 100644
--- a/src/librbd/io/CopyupRequest.cc
+++ b/src/librbd/io/CopyupRequest.cc
@@ -23,7 +23,8 @@
#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
-#define dout_prefix *_dout << "librbd::io::CopyupRequest: "
+#define dout_prefix *_dout << "librbd::io::CopyupRequest: " << this \
+ << " " << __func__ << ": "
namespace librbd {
namespace io {
@@ -34,9 +35,9 @@ class UpdateObjectMap : public C_AsyncObjectThrottle<> {
public:
UpdateObjectMap(AsyncObjectThrottle<> &throttle, ImageCtx *image_ctx,
uint64_t object_no, const std::vector<uint64_t> *snap_ids,
- size_t snap_id_idx)
- : C_AsyncObjectThrottle(throttle, *image_ctx),
- m_object_no(object_no), m_snap_ids(*snap_ids), m_snap_id_idx(snap_id_idx)
+ const ZTracer::Trace &trace, size_t snap_id_idx)
+ : C_AsyncObjectThrottle(throttle, *image_ctx), m_object_no(object_no),
+ m_snap_ids(*snap_ids), m_trace(trace), m_snap_id_idx(snap_id_idx)
{
}
@@ -48,7 +49,7 @@ public:
assert(m_image_ctx.exclusive_lock->is_lock_owner());
assert(m_image_ctx.object_map != nullptr);
bool sent = m_image_ctx.object_map->aio_update<Context>(
- CEPH_NOSNAP, m_object_no, OBJECT_EXISTS, {}, this);
+ CEPH_NOSNAP, m_object_no, OBJECT_EXISTS, {}, m_trace, this);
return (sent ? 0 : 1);
}
@@ -65,7 +66,7 @@ public:
}
bool sent = m_image_ctx.object_map->aio_update<Context>(
- snap_id, m_object_no, state, {}, this);
+ snap_id, m_object_no, state, {}, m_trace, this);
assert(sent);
return 0;
}
@@ -73,6 +74,7 @@ public:
private:
uint64_t m_object_no;
const std::vector<uint64_t> &m_snap_ids;
+ const ZTracer::Trace &m_trace;
size_t m_snap_id_idx;
};
@@ -80,9 +82,12 @@ private:
CopyupRequest::CopyupRequest(ImageCtx *ictx, const std::string &oid,
- uint64_t objectno, Extents &&image_extents)
+ uint64_t objectno, Extents &&image_extents,
+ const ZTracer::Trace &parent_trace)
: m_ictx(ictx), m_oid(oid), m_object_no(objectno),
- m_image_extents(image_extents), m_state(STATE_READ_FROM_PARENT)
+ m_image_extents(image_extents),
+ m_trace(util::create_trace(*m_ictx, "copy-up", parent_trace)),
+ m_state(STATE_READ_FROM_PARENT)
{
m_async_op.start_op(*m_ictx);
}
@@ -93,7 +98,7 @@ CopyupRequest::~CopyupRequest() {
}
void CopyupRequest::append_request(ObjectRequest<> *req) {
- ldout(m_ictx->cct, 20) << __func__ << " " << this << ": " << req << dendl;
+ ldout(m_ictx->cct, 20) << req << dendl;
m_pending_requests.push_back(req);
}
@@ -101,8 +106,7 @@ void CopyupRequest::complete_requests(int r) {
while (!m_pending_requests.empty()) {
vector<ObjectRequest<> *>::iterator it = m_pending_requests.begin();
ObjectRequest<> *req = *it;
- ldout(m_ictx->cct, 20) << __func__ << " completing request " << req
- << dendl;
+ ldout(m_ictx->cct, 20) << "completing request " << req << dendl;
req->complete(r);
m_pending_requests.erase(it);
}
@@ -117,8 +121,7 @@ bool CopyupRequest::send_copyup() {
add_copyup_op = true;
}
- ldout(m_ictx->cct, 20) << __func__ << " " << this
- << ": oid " << m_oid << dendl;
+ ldout(m_ictx->cct, 20) << "oid " << m_oid << dendl;
m_state = STATE_COPYUP;
m_ictx->snap_lock.get_read();
@@ -128,7 +131,7 @@ bool CopyupRequest::send_copyup() {
std::vector<librados::snap_t> snaps;
if (!copy_on_read) {
- m_pending_copyups.inc();
+ m_pending_copyups++;
}
int r;
@@ -143,17 +146,18 @@ bool CopyupRequest::send_copyup() {
// all snapshots are detected from the parent for this object. If
// this is a CoW request, a second request will be created for the
// actual modification.
- m_pending_copyups.inc();
+ m_pending_copyups++;
- ldout(m_ictx->cct, 20) << __func__ << " " << this << " copyup with "
- << "empty snapshot context" << dendl;
+ ldout(m_ictx->cct, 20) << "copyup with empty snapshot context" << dendl;
librados::AioCompletion *comp = util::create_rados_callback(this);
librados::Rados rados(m_ictx->data_ctx);
r = rados.ioctx_create2(m_ictx->data_ctx.get_id(), m_data_ctx);
assert(r == 0);
- r = m_data_ctx.aio_operate(m_oid, comp, &copyup_op, 0, snaps);
+ r = m_data_ctx.aio_operate(
+ m_oid, comp, &copyup_op, 0, snaps,
+ (m_trace.valid() ? m_trace.get_info() : nullptr));
assert(r == 0);
comp->release();
}
@@ -168,15 +172,17 @@ bool CopyupRequest::send_copyup() {
// merge all pending write ops into this single RADOS op
for (size_t i=0; i<m_pending_requests.size(); ++i) {
ObjectRequest<> *req = m_pending_requests[i];
- ldout(m_ictx->cct, 20) << __func__ << " add_copyup_ops " << req
- << dendl;
- req->add_copyup_ops(&write_op);
+ ldout(m_ictx->cct, 20) << "add_copyup_ops " << req << dendl;
+ bool set_hints = (i == 0);
+ req->add_copyup_ops(&write_op, set_hints);
}
assert(write_op.size() != 0);
snaps.insert(snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
librados::AioCompletion *comp = util::create_rados_callback(this);
- r = m_ictx->data_ctx.aio_operate(m_oid, comp, &write_op);
+ r = m_ictx->data_ctx.aio_operate(
+ m_oid, comp, &write_op, snapc.seq, snaps,
+ (m_trace.valid() ? m_trace.get_info() : nullptr));
assert(r == 0);
comp->release();
}
@@ -201,13 +207,12 @@ void CopyupRequest::send()
AioCompletion *comp = AioCompletion::create_and_start(
this, m_ictx, AIO_TYPE_READ);
- ldout(m_ictx->cct, 20) << __func__ << " " << this
- << ": completion " << comp
+ ldout(m_ictx->cct, 20) << "completion " << comp
<< ", oid " << m_oid
<< ", extents " << m_image_extents
<< dendl;
ImageRequest<>::aio_read(m_ictx->parent, comp, std::move(m_image_extents),
- ReadResult{&m_copyup_data}, 0);
+ ReadResult{&m_copyup_data}, 0, m_trace);
}
void CopyupRequest::complete(int r)
@@ -221,8 +226,7 @@ void CopyupRequest::complete(int r)
bool CopyupRequest::should_complete(int r)
{
CephContext *cct = m_ictx->cct;
- ldout(cct, 20) << __func__ << " " << this
- << ": oid " << m_oid
+ ldout(cct, 20) << "oid " << m_oid
<< ", r " << r << dendl;
uint64_t pending_copyups;
@@ -232,7 +236,7 @@ bool CopyupRequest::should_complete(int r)
remove_from_list();
if (r >= 0 || r == -ENOENT) {
if (is_copyup_required()) {
- ldout(cct, 20) << __func__ << " " << this << " nop, skipping" << dendl;
+ ldout(cct, 20) << "nop, skipping" << dendl;
return true;
}
@@ -252,7 +256,7 @@ bool CopyupRequest::should_complete(int r)
case STATE_COPYUP:
// invoked via a finisher in librados, so thread safe
- pending_copyups = m_pending_copyups.dec();
+ pending_copyups = --m_pending_copyups;
ldout(cct, 20) << "COPYUP (" << pending_copyups << " pending)"
<< dendl;
if (r == -ENOENT) {
@@ -285,7 +289,7 @@ void CopyupRequest::remove_from_list()
bool CopyupRequest::send_object_map_head() {
CephContext *cct = m_ictx->cct;
- ldout(cct, 20) << __func__ << " " << this << dendl;
+ ldout(cct, 20) << dendl;
m_state = STATE_OBJECT_MAP_HEAD;
@@ -321,7 +325,7 @@ bool CopyupRequest::send_object_map_head() {
}
current_state = (*m_ictx->object_map)[m_object_no];
- ldout(cct, 20) << __func__ << " " << req->get_op_type() << " object no "
+ ldout(cct, 20) << req->get_op_type() << " object no "
<< m_object_no << " current state "
<< stringify(static_cast<uint32_t>(current_state))
<< " new state " << stringify(static_cast<uint32_t>(new_state))
@@ -332,7 +336,8 @@ bool CopyupRequest::send_object_map_head() {
if (may_update && (new_state != current_state) &&
m_ictx->object_map->aio_update<CopyupRequest>(
- CEPH_NOSNAP, m_object_no, new_state, current_state, this)) {
+ CEPH_NOSNAP, m_object_no, new_state, current_state, m_trace,
+ this)) {
return false;
}
}
@@ -348,14 +353,13 @@ bool CopyupRequest::send_object_map() {
return send_copyup();
} else {
// update object maps for HEAD and all existing snapshots
- ldout(m_ictx->cct, 20) << __func__ << " " << this
- << ": oid " << m_oid << dendl;
+ ldout(m_ictx->cct, 20) << "oid " << m_oid << dendl;
m_state = STATE_OBJECT_MAP;
RWLock::RLocker owner_locker(m_ictx->owner_lock);
AsyncObjectThrottle<>::ContextFactory context_factory(
boost::lambda::bind(boost::lambda::new_ptr<UpdateObjectMap>(),
- boost::lambda::_1, m_ictx, m_object_no, &m_snap_ids,
+ boost::lambda::_1, m_ictx, m_object_no, &m_snap_ids, m_trace,
boost::lambda::_2));
AsyncObjectThrottle<> *throttle = new AsyncObjectThrottle<>(
NULL, *m_ictx, context_factory, util::create_context_callback(this),
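
Several call sites above forward blkin trace info into librados only when the trace is live, via `(m_trace.valid() ? m_trace.get_info() : nullptr)`. A standalone model of that pattern, with `Trace` and `aio_operate` as illustrative stand-ins rather than the real ZTracer/librados types:

    #include <iostream>

    struct TraceInfo { unsigned long trace_id = 0; };

    struct Trace {
      bool active = false;
      TraceInfo info;
      bool valid() const { return active; }
      const TraceInfo *get_info() const { return &info; }
    };

    // Stand-in for the aio_operate overload that accepts optional trace info.
    void aio_operate(const char *oid, const TraceInfo *trace_info) {
      std::cout << oid << ": trace "
                << (trace_info ? "attached" : "absent") << "\n";
    }

    int main() {
      Trace off, on;
      on.active = true;
      aio_operate("rbd_data.1", off.valid() ? off.get_info() : nullptr);  // absent
      aio_operate("rbd_data.1", on.valid() ? on.get_info() : nullptr);    // attached
      return 0;
    }
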
diff --git a/src/librbd/io/CopyupRequest.h b/src/librbd/io/CopyupRequest.h
index d0183705c5f..0a5b205e92e 100644
--- a/src/librbd/io/CopyupRequest.h
+++ b/src/librbd/io/CopyupRequest.h
@@ -5,13 +5,17 @@
#define CEPH_LIBRBD_IO_COPYUP_REQUEST_H
#include "librbd/AsyncOperation.h"
-#include "include/atomic.h"
#include "include/int_types.h"
#include "include/rados/librados.hpp"
#include "include/buffer.h"
+#include "common/zipkin_trace.h"
#include "librbd/io/Types.h"
+
#include <string>
#include <vector>
+#include <atomic>
+
+namespace ZTracer { struct Trace; }
namespace librbd {
@@ -25,7 +29,7 @@ template <typename I> class ObjectRequest;
class CopyupRequest {
public:
CopyupRequest(ImageCtx *ictx, const std::string &oid, uint64_t objectno,
- Extents &&image_extents);
+ Extents &&image_extents, const ZTracer::Trace &parent_trace);
~CopyupRequest();
void append_request(ObjectRequest<ImageCtx> *req);
@@ -76,10 +80,12 @@ private:
std::string m_oid;
uint64_t m_object_no;
Extents m_image_extents;
+ ZTracer::Trace m_trace;
+
State m_state;
ceph::bufferlist m_copyup_data;
std::vector<ObjectRequest<ImageCtx> *> m_pending_requests;
- atomic_t m_pending_copyups;
+ std::atomic<unsigned> m_pending_copyups { 0 };
AsyncOperation m_async_op;
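
The header swap from `include/atomic.h` to `<atomic>` preserves the old `atomic_t` semantics: `inc()`/`dec()` become `++`/`--`, and `dec()`'s return-the-new-value contract maps onto pre-decrement, which is exactly what `should_complete()` relies on above. A compilable sketch:

    #include <atomic>
    #include <cassert>

    int main() {
      std::atomic<unsigned> pending_copyups{0};

      ++pending_copyups;                       // was m_pending_copyups.inc()
      ++pending_copyups;

      unsigned remaining = --pending_copyups;  // was m_pending_copyups.dec();
                                               // pre-decrement yields the new value
      assert(remaining == 1);
      return 0;
    }
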
diff --git a/src/librbd/io/ImageRequest.cc b/src/librbd/io/ImageRequest.cc
index 71ae944240e..80c7208a144 100644
--- a/src/librbd/io/ImageRequest.cc
+++ b/src/librbd/io/ImageRequest.cc
@@ -16,7 +16,8 @@
#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
-#define dout_prefix *_dout << "librbd::io::ImageRequest: "
+#define dout_prefix *_dout << "librbd::io::ImageRequest: " << this \
+ << " " << __func__ << ": "
namespace librbd {
namespace io {
@@ -38,8 +39,7 @@ struct C_DiscardJournalCommit : public Context {
: image_ctx(_image_ctx), aio_comp(_aio_comp),
object_extents(_object_extents) {
CephContext *cct = image_ctx.cct;
- ldout(cct, 20) << this << " C_DiscardJournalCommit: "
- << "delaying cache discard until journal tid " << tid << " "
+ ldout(cct, 20) << "delaying cache discard until journal tid " << tid << " "
<< "safe" << dendl;
aio_comp->add_request();
@@ -47,7 +47,7 @@ struct C_DiscardJournalCommit : public Context {
void finish(int r) override {
CephContext *cct = image_ctx.cct;
- ldout(cct, 20) << this << " C_DiscardJournalCommit: "
+ ldout(cct, 20) << "C_DiscardJournalCommit: "
<< "journal committed: discarding from cache" << dendl;
Mutex::Locker cache_locker(image_ctx.cache_lock);
@@ -65,8 +65,7 @@ struct C_FlushJournalCommit : public Context {
uint64_t tid)
: image_ctx(_image_ctx), aio_comp(_aio_comp) {
CephContext *cct = image_ctx.cct;
- ldout(cct, 20) << this << " C_FlushJournalCommit: "
- << "delaying flush until journal tid " << tid << " "
+ ldout(cct, 20) << "delaying flush until journal tid " << tid << " "
<< "safe" << dendl;
aio_comp->add_request();
@@ -74,8 +73,7 @@ struct C_FlushJournalCommit : public Context {
void finish(int r) override {
CephContext *cct = image_ctx.cct;
- ldout(cct, 20) << this << " C_FlushJournalCommit: journal committed"
- << dendl;
+ ldout(cct, 20) << "C_FlushJournalCommit: journal committed" << dendl;
aio_comp->complete_request(r);
}
};
@@ -113,41 +111,47 @@ private:
template <typename I>
void ImageRequest<I>::aio_read(I *ictx, AioCompletion *c,
Extents &&image_extents,
- ReadResult &&read_result, int op_flags) {
+ ReadResult &&read_result, int op_flags,
+ const ZTracer::Trace &parent_trace) {
ImageReadRequest<I> req(*ictx, c, std::move(image_extents),
- std::move(read_result), op_flags);
+ std::move(read_result), op_flags, parent_trace);
req.send();
}
template <typename I>
void ImageRequest<I>::aio_write(I *ictx, AioCompletion *c,
Extents &&image_extents, bufferlist &&bl,
- int op_flags) {
+ int op_flags,
+ const ZTracer::Trace &parent_trace) {
ImageWriteRequest<I> req(*ictx, c, std::move(image_extents), std::move(bl),
- op_flags);
+ op_flags, parent_trace);
req.send();
}
template <typename I>
void ImageRequest<I>::aio_discard(I *ictx, AioCompletion *c,
uint64_t off, uint64_t len,
- bool skip_partial_discard) {
- ImageDiscardRequest<I> req(*ictx, c, off, len, skip_partial_discard);
+ bool skip_partial_discard,
+ const ZTracer::Trace &parent_trace) {
+ ImageDiscardRequest<I> req(*ictx, c, off, len, skip_partial_discard,
+ parent_trace);
req.send();
}
template <typename I>
-void ImageRequest<I>::aio_flush(I *ictx, AioCompletion *c) {
- ImageFlushRequest<I> req(*ictx, c);
+void ImageRequest<I>::aio_flush(I *ictx, AioCompletion *c,
+ const ZTracer::Trace &parent_trace) {
+ ImageFlushRequest<I> req(*ictx, c, parent_trace);
req.send();
}
template <typename I>
void ImageRequest<I>::aio_writesame(I *ictx, AioCompletion *c,
uint64_t off, uint64_t len,
- bufferlist &&bl,
- int op_flags) {
- ImageWriteSameRequest<I> req(*ictx, c, off, len, std::move(bl), op_flags);
+ bufferlist &&bl, int op_flags,
+ const ZTracer::Trace &parent_trace) {
+ ImageWriteSameRequest<I> req(*ictx, c, off, len, std::move(bl), op_flags,
+ parent_trace);
req.send();
}
@@ -206,9 +210,10 @@ void ImageRequest<I>::fail(int r) {
template <typename I>
ImageReadRequest<I>::ImageReadRequest(I &image_ctx, AioCompletion *aio_comp,
Extents &&image_extents,
- ReadResult &&read_result,
- int op_flags)
- : ImageRequest<I>(image_ctx, aio_comp, std::move(image_extents)),
+ ReadResult &&read_result, int op_flags,
+ const ZTracer::Trace &parent_trace)
+ : ImageRequest<I>(image_ctx, aio_comp, std::move(image_extents), "read",
+ parent_trace),
m_op_flags(op_flags) {
aio_comp->read_result = std::move(read_result);
}
@@ -273,7 +278,7 @@ void ImageReadRequest<I>::send_request() {
// issue the requests
for (auto &object_extent : object_extents) {
for (auto &extent : object_extent.second) {
- ldout(cct, 20) << " oid " << extent.oid << " " << extent.offset << "~"
+ ldout(cct, 20) << "oid " << extent.oid << " " << extent.offset << "~"
<< extent.length << " from " << extent.buffer_extents
<< dendl;
@@ -281,16 +286,17 @@ void ImageReadRequest<I>::send_request() {
aio_comp);
ObjectReadRequest<I> *req = ObjectReadRequest<I>::create(
&image_ctx, extent.oid.name, extent.objectno, extent.offset,
- extent.length, extent.buffer_extents, snap_id, true, req_comp,
- m_op_flags);
+ extent.length, extent.buffer_extents, snap_id, true, m_op_flags,
+ this->m_trace, req_comp);
req_comp->request = req;
if (image_ctx.object_cacher) {
C_ObjectCacheRead<I> *cache_comp = new C_ObjectCacheRead<I>(image_ctx,
req);
- image_ctx.aio_read_from_cache(extent.oid, extent.objectno,
- &req->data(), extent.length,
- extent.offset, cache_comp, m_op_flags);
+ image_ctx.aio_read_from_cache(
+ extent.oid, extent.objectno, &req->data(), extent.length,
+ extent.offset, cache_comp, m_op_flags,
+ (this->m_trace.valid() ? &this->m_trace : nullptr));
} else {
req->send();
}
@@ -395,7 +401,7 @@ void AbstractImageWriteRequest<I>::send_object_requests(
AioCompletion *aio_comp = this->m_aio_comp;
for (ObjectExtents::const_iterator p = object_extents.begin();
p != object_extents.end(); ++p) {
- ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length
+ ldout(cct, 20) << "oid " << p->oid << " " << p->offset << "~" << p->length
<< " from " << p->buffer_extents << dendl;
C_AioRequest *req_comp = new C_AioRequest(aio_comp);
ObjectRequestHandle *request = create_object_request(*p, snapc,
@@ -471,9 +477,10 @@ void ImageWriteRequest<I>::send_object_cache_requests(
AioCompletion *aio_comp = this->m_aio_comp;
C_AioRequest *req_comp = new C_AioRequest(aio_comp);
- image_ctx.write_to_cache(object_extent.oid, bl, object_extent.length,
- object_extent.offset, req_comp, m_op_flags,
- journal_tid);
+ image_ctx.write_to_cache(
+ object_extent.oid, bl, object_extent.length, object_extent.offset,
+ req_comp, m_op_flags, journal_tid,
+ (this->m_trace.valid() ? &this->m_trace : nullptr));
}
}
@@ -501,7 +508,7 @@ ObjectRequestHandle *ImageWriteRequest<I>::create_object_request(
assemble_extent(object_extent, &bl);
ObjectRequest<I> *req = ObjectRequest<I>::create_write(
&image_ctx, object_extent.oid.name, object_extent.objectno,
- object_extent.offset, bl, snapc, on_finish, m_op_flags);
+ object_extent.offset, bl, snapc, m_op_flags, this->m_trace, on_finish);
return req;
}
@@ -543,7 +550,7 @@ void ImageDiscardRequest<I>::prune_object_extents(ObjectExtents &object_extents)
for (auto p = object_extents.begin(); p != object_extents.end(); ) {
if (p->offset + p->length < image_ctx.layout.object_size) {
- ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~"
+ ldout(cct, 20) << "oid " << p->oid << " " << p->offset << "~"
<< p->length << " from " << p->buffer_extents
<< ": skip partial discard" << dendl;
p = object_extents.erase(p);
@@ -602,16 +609,17 @@ ObjectRequestHandle *ImageDiscardRequest<I>::create_object_request(
if (object_extent.length == image_ctx.layout.object_size) {
req = ObjectRequest<I>::create_remove(
&image_ctx, object_extent.oid.name, object_extent.objectno, snapc,
- on_finish);
+ this->m_trace, on_finish);
} else if (object_extent.offset + object_extent.length ==
image_ctx.layout.object_size) {
req = ObjectRequest<I>::create_truncate(
&image_ctx, object_extent.oid.name, object_extent.objectno,
- object_extent.offset, snapc, on_finish);
+ object_extent.offset, snapc, this->m_trace, on_finish);
} else {
req = ObjectRequest<I>::create_zero(
&image_ctx, object_extent.oid.name, object_extent.objectno,
- object_extent.offset, object_extent.length, snapc, on_finish);
+ object_extent.offset, object_extent.length, snapc,
+ this->m_trace, on_finish);
}
return req;
}
@@ -776,9 +784,10 @@ void ImageWriteSameRequest<I>::send_object_cache_requests(
AioCompletion *aio_comp = this->m_aio_comp;
C_AioRequest *req_comp = new C_AioRequest(aio_comp);
- image_ctx.write_to_cache(object_extent.oid, bl, object_extent.length,
- object_extent.offset, req_comp, m_op_flags,
- journal_tid);
+ image_ctx.write_to_cache(
+ object_extent.oid, bl, object_extent.length, object_extent.offset,
+ req_comp, m_op_flags, journal_tid,
+ (this->m_trace.valid() ? &this->m_trace : nullptr));
}
}
@@ -809,13 +818,12 @@ ObjectRequestHandle *ImageWriteSameRequest<I>::create_object_request(
req = ObjectRequest<I>::create_writesame(
&image_ctx, object_extent.oid.name, object_extent.objectno,
object_extent.offset, object_extent.length,
- bl, snapc, on_finish, m_op_flags);
+ bl, snapc, m_op_flags, this->m_trace, on_finish);
return req;
}
req = ObjectRequest<I>::create_write(
&image_ctx, object_extent.oid.name, object_extent.objectno,
- object_extent.offset,
- bl, snapc, on_finish, m_op_flags);
+ object_extent.offset, bl, snapc, m_op_flags, this->m_trace, on_finish);
return req;
}
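
Every static `aio_*` entry point above gains a `parent_trace` parameter so the request constructed inside can hang its span off the caller's. A toy model of that threading; the names mirror `util::create_trace` but are illustrative only:

    #include <iostream>
    #include <string>

    struct Trace {
      std::string name;
      const Trace *parent = nullptr;
    };

    // Illustrative stand-in for util::create_trace().
    Trace create_trace(const std::string &name, const Trace &parent) {
      return Trace{name, &parent};
    }

    void aio_read(const Trace &parent_trace) {
      Trace t = create_trace("read", parent_trace);  // one child span per request
      std::cout << t.name << " <- "
                << (t.parent ? t.parent->name : "root") << "\n";
    }

    int main() {
      Trace wq{"wq: read"};
      aio_read(wq);
      return 0;
    }
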
diff --git a/src/librbd/io/ImageRequest.h b/src/librbd/io/ImageRequest.h
index 6ae7c289c57..21b88b098cf 100644
--- a/src/librbd/io/ImageRequest.h
+++ b/src/librbd/io/ImageRequest.h
@@ -7,7 +7,9 @@
#include "include/int_types.h"
#include "include/buffer_fwd.h"
#include "common/snap_types.h"
+#include "common/zipkin_trace.h"
#include "osd/osd_types.h"
+#include "librbd/Utils.h"
#include "librbd/io/Types.h"
#include <list>
#include <utility>
@@ -27,18 +29,24 @@ class ImageRequest {
public:
typedef std::vector<std::pair<uint64_t,uint64_t> > Extents;
- virtual ~ImageRequest() {}
+ virtual ~ImageRequest() {
+ m_trace.event("finish");
+ }
static void aio_read(ImageCtxT *ictx, AioCompletion *c,
Extents &&image_extents, ReadResult &&read_result,
- int op_flags);
+ int op_flags, const ZTracer::Trace &parent_trace);
static void aio_write(ImageCtxT *ictx, AioCompletion *c,
- Extents &&image_extents, bufferlist &&bl, int op_flags);
+ Extents &&image_extents, bufferlist &&bl, int op_flags,
+ const ZTracer::Trace &parent_trace);
static void aio_discard(ImageCtxT *ictx, AioCompletion *c, uint64_t off,
- uint64_t len, bool skip_partial_discard);
- static void aio_flush(ImageCtxT *ictx, AioCompletion *c);
+ uint64_t len, bool skip_partial_discard,
+ const ZTracer::Trace &parent_trace);
+ static void aio_flush(ImageCtxT *ictx, AioCompletion *c,
+ const ZTracer::Trace &parent_trace);
static void aio_writesame(ImageCtxT *ictx, AioCompletion *c, uint64_t off,
- uint64_t len, bufferlist &&bl, int op_flags);
+ uint64_t len, bufferlist &&bl, int op_flags,
+ const ZTracer::Trace &parent_trace);
virtual bool is_write_op() const {
return false;
@@ -53,19 +61,28 @@ public:
m_bypass_image_cache = true;
}
+ inline const ZTracer::Trace &get_trace() const {
+ return m_trace;
+ }
+
protected:
typedef std::list<ObjectRequestHandle *> ObjectRequests;
ImageCtxT &m_image_ctx;
AioCompletion *m_aio_comp;
Extents m_image_extents;
+ ZTracer::Trace m_trace;
bool m_bypass_image_cache = false;
ImageRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
- Extents &&image_extents)
+ Extents &&image_extents, const char *trace_name,
+ const ZTracer::Trace &parent_trace)
: m_image_ctx(image_ctx), m_aio_comp(aio_comp),
- m_image_extents(image_extents) {
+ m_image_extents(std::move(image_extents)),
+ m_trace(util::create_trace(image_ctx, trace_name, parent_trace)) {
+ m_trace.event("start");
}
+
virtual int clip_request();
virtual void send_request() = 0;
@@ -82,7 +99,7 @@ public:
ImageReadRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
Extents &&image_extents, ReadResult &&read_result,
- int op_flags);
+ int op_flags, const ZTracer::Trace &parent_trace);
protected:
int clip_request() override;
@@ -120,8 +137,10 @@ protected:
typedef std::vector<ObjectExtent> ObjectExtents;
AbstractImageWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
- Extents &&image_extents)
- : ImageRequest<ImageCtxT>(image_ctx, aio_comp, std::move(image_extents)),
+ Extents &&image_extents, const char *trace_name,
+ const ZTracer::Trace &parent_trace)
+ : ImageRequest<ImageCtxT>(image_ctx, aio_comp, std::move(image_extents),
+ trace_name, parent_trace),
m_synchronous(false) {
}
@@ -156,9 +175,10 @@ public:
using typename ImageRequest<ImageCtxT>::Extents;
ImageWriteRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
- Extents &&image_extents, bufferlist &&bl, int op_flags)
- : AbstractImageWriteRequest<ImageCtxT>(image_ctx, aio_comp,
- std::move(image_extents)),
+ Extents &&image_extents, bufferlist &&bl, int op_flags,
+ const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, std::move(image_extents), "write", parent_trace),
m_bl(std::move(bl)), m_op_flags(op_flags) {
}
@@ -201,8 +221,10 @@ template <typename ImageCtxT = ImageCtx>
class ImageDiscardRequest : public AbstractImageWriteRequest<ImageCtxT> {
public:
ImageDiscardRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
- uint64_t off, uint64_t len, bool skip_partial_discard)
- : AbstractImageWriteRequest<ImageCtxT>(image_ctx, aio_comp, {{off, len}}),
+ uint64_t off, uint64_t len, bool skip_partial_discard,
+ const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, {{off, len}}, "discard", parent_trace),
m_skip_partial_discard(skip_partial_discard) {
}
@@ -239,8 +261,9 @@ private:
template <typename ImageCtxT = ImageCtx>
class ImageFlushRequest : public ImageRequest<ImageCtxT> {
public:
- ImageFlushRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp)
- : ImageRequest<ImageCtxT>(image_ctx, aio_comp, {}) {
+ ImageFlushRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
+ const ZTracer::Trace &parent_trace)
+ : ImageRequest<ImageCtxT>(image_ctx, aio_comp, {}, "flush", parent_trace) {
}
bool is_write_op() const override {
@@ -269,8 +292,9 @@ class ImageWriteSameRequest : public AbstractImageWriteRequest<ImageCtxT> {
public:
ImageWriteSameRequest(ImageCtxT &image_ctx, AioCompletion *aio_comp,
uint64_t off, uint64_t len, bufferlist &&bl,
- int op_flags)
- : AbstractImageWriteRequest<ImageCtxT>(image_ctx, aio_comp, {{off, len}}),
+ int op_flags, const ZTracer::Trace &parent_trace)
+ : AbstractImageWriteRequest<ImageCtxT>(
+ image_ctx, aio_comp, {{off, len}}, "writesame", parent_trace),
m_data_bl(std::move(bl)), m_op_flags(op_flags) {
}
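
The constructor/destructor pair in `ImageRequest` brackets each request's span: `m_trace.event("start")` on construction and `event("finish")` on destruction, so the span closes even on early-exit and error paths. The RAII shape, reduced to stand-in types:

    #include <iostream>

    struct Trace {
      void event(const char *e) { std::cout << "event: " << e << "\n"; }
    };

    class ImageRequestModel {
      Trace m_trace;
    public:
      ImageRequestModel()  { m_trace.event("start"); }
      ~ImageRequestModel() { m_trace.event("finish"); }
    };

    int main() {
      ImageRequestModel req;  // "start" now, "finish" at scope exit
      return 0;
    }
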
diff --git a/src/librbd/io/ImageRequestWQ.cc b/src/librbd/io/ImageRequestWQ.cc
index d6161141216..2758790eb08 100644
--- a/src/librbd/io/ImageRequestWQ.cc
+++ b/src/librbd/io/ImageRequestWQ.cc
@@ -3,6 +3,7 @@
#include "librbd/io/ImageRequestWQ.h"
#include "common/errno.h"
+#include "common/zipkin_trace.h"
#include "librbd/ExclusiveLock.h"
#include "librbd/ImageCtx.h"
#include "librbd/ImageState.h"
@@ -14,7 +15,8 @@
#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
-#define dout_prefix *_dout << "librbd::io::ImageRequestWQ: "
+#define dout_prefix *_dout << "librbd::io::ImageRequestWQ: " << this \
+ << " " << __func__ << ": "
namespace librbd {
namespace io {
@@ -28,14 +30,14 @@ ImageRequestWQ::ImageRequestWQ(ImageCtx *image_ctx, const string &name,
m_queued_writes(0), m_in_flight_ops(0), m_refresh_in_progress(false),
m_shutdown(false), m_on_shutdown(nullptr) {
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 5) << this << " " << ": ictx=" << image_ctx << dendl;
+ ldout(cct, 5) << "ictx=" << image_ctx << dendl;
tp->add_work_queue(this);
}
ssize_t ImageRequestWQ::read(uint64_t off, uint64_t len,
ReadResult &&read_result, int op_flags) {
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << "read: ictx=" << &m_image_ctx << ", off=" << off << ", "
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
<< "len = " << len << dendl;
C_SaferCond cond;
@@ -47,7 +49,7 @@ ssize_t ImageRequestWQ::read(uint64_t off, uint64_t len,
ssize_t ImageRequestWQ::write(uint64_t off, uint64_t len,
bufferlist &&bl, int op_flags) {
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << "write: ictx=" << &m_image_ctx << ", off=" << off << ", "
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
<< "len = " << len << dendl;
m_image_ctx.snap_lock.get_read();
@@ -71,7 +73,7 @@ ssize_t ImageRequestWQ::write(uint64_t off, uint64_t len,
ssize_t ImageRequestWQ::discard(uint64_t off, uint64_t len, bool skip_partial_discard) {
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << "discard: ictx=" << &m_image_ctx << ", off=" << off << ", "
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
<< "len = " << len << dendl;
m_image_ctx.snap_lock.get_read();
@@ -96,7 +98,7 @@ ssize_t ImageRequestWQ::discard(uint64_t off, uint64_t len, bool skip_partial_di
ssize_t ImageRequestWQ::writesame(uint64_t off, uint64_t len, bufferlist &&bl,
int op_flags) {
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << "writesame ictx=" << &m_image_ctx << ", off=" << off << ", "
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
<< "len = " << len << ", data_len " << bl.length() << dendl;
m_image_ctx.snap_lock.get_read();
@@ -121,9 +123,15 @@ ssize_t ImageRequestWQ::writesame(uint64_t off, uint64_t len, bufferlist &&bl,
void ImageRequestWQ::aio_read(AioCompletion *c, uint64_t off, uint64_t len,
ReadResult &&read_result, int op_flags,
bool native_async) {
- c->init_time(&m_image_ctx, AIO_TYPE_READ);
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << "aio_read: ictx=" << &m_image_ctx << ", "
+ ZTracer::Trace trace;
+ if (cct->_conf->rbd_blkin_trace_all) {
+ trace.init("wq: read", &m_image_ctx.trace_endpoint);
+ trace.event("start");
+ }
+
+ c->init_time(&m_image_ctx, AIO_TYPE_READ);
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
<< "completion=" << c << ", off=" << off << ", "
<< "len=" << len << ", " << "flags=" << op_flags << dendl;
@@ -148,21 +156,28 @@ void ImageRequestWQ::aio_read(AioCompletion *c, uint64_t off, uint64_t len,
if (m_image_ctx.non_blocking_aio || writes_blocked() || !writes_empty() ||
lock_required) {
queue(new ImageReadRequest<>(m_image_ctx, c, {{off, len}},
- std::move(read_result), op_flags));
+ std::move(read_result), op_flags, trace));
} else {
c->start_op();
ImageRequest<>::aio_read(&m_image_ctx, c, {{off, len}},
- std::move(read_result), op_flags);
+ std::move(read_result), op_flags, trace);
finish_in_flight_op();
}
+ trace.event("finish");
}
void ImageRequestWQ::aio_write(AioCompletion *c, uint64_t off, uint64_t len,
bufferlist &&bl, int op_flags,
bool native_async) {
- c->init_time(&m_image_ctx, AIO_TYPE_WRITE);
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << "aio_write: ictx=" << &m_image_ctx << ", "
+ ZTracer::Trace trace;
+ if (cct->_conf->rbd_blkin_trace_all) {
+ trace.init("wq: write", &m_image_ctx.trace_endpoint);
+ trace.event("init");
+ }
+
+ c->init_time(&m_image_ctx, AIO_TYPE_WRITE);
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
<< "completion=" << c << ", off=" << off << ", "
<< "len=" << len << ", flags=" << op_flags << dendl;
@@ -177,21 +192,28 @@ void ImageRequestWQ::aio_write(AioCompletion *c, uint64_t off, uint64_t len,
RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
if (m_image_ctx.non_blocking_aio || writes_blocked()) {
queue(new ImageWriteRequest<>(m_image_ctx, c, {{off, len}},
- std::move(bl), op_flags));
+ std::move(bl), op_flags, trace));
} else {
c->start_op();
ImageRequest<>::aio_write(&m_image_ctx, c, {{off, len}},
- std::move(bl), op_flags);
+ std::move(bl), op_flags, trace);
finish_in_flight_op();
}
+ trace.event("finish");
}
void ImageRequestWQ::aio_discard(AioCompletion *c, uint64_t off,
uint64_t len, bool skip_partial_discard,
bool native_async) {
- c->init_time(&m_image_ctx, AIO_TYPE_DISCARD);
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << "aio_discard: ictx=" << &m_image_ctx << ", "
+ ZTracer::Trace trace;
+ if (cct->_conf->rbd_blkin_trace_all) {
+ trace.init("wq: discard", &m_image_ctx.trace_endpoint);
+ trace.event("init");
+ }
+
+ c->init_time(&m_image_ctx, AIO_TYPE_DISCARD);
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
<< "completion=" << c << ", off=" << off << ", len=" << len
<< dendl;
@@ -205,18 +227,27 @@ void ImageRequestWQ::aio_discard(AioCompletion *c, uint64_t off,
RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
if (m_image_ctx.non_blocking_aio || writes_blocked()) {
- queue(new ImageDiscardRequest<>(m_image_ctx, c, off, len, skip_partial_discard));
+ queue(new ImageDiscardRequest<>(m_image_ctx, c, off, len,
+ skip_partial_discard, trace));
} else {
c->start_op();
- ImageRequest<>::aio_discard(&m_image_ctx, c, off, len, skip_partial_discard);
+ ImageRequest<>::aio_discard(&m_image_ctx, c, off, len,
+ skip_partial_discard, trace);
finish_in_flight_op();
}
+ trace.event("finish");
}
void ImageRequestWQ::aio_flush(AioCompletion *c, bool native_async) {
- c->init_time(&m_image_ctx, AIO_TYPE_FLUSH);
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << "aio_flush: ictx=" << &m_image_ctx << ", "
+ ZTracer::Trace trace;
+ if (cct->_conf->rbd_blkin_trace_all) {
+ trace.init("wq: flush", &m_image_ctx.trace_endpoint);
+ trace.event("init");
+ }
+
+ c->init_time(&m_image_ctx, AIO_TYPE_FLUSH);
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
<< "completion=" << c << dendl;
if (native_async && m_image_ctx.event_socket.is_valid()) {
@@ -229,19 +260,26 @@ void ImageRequestWQ::aio_flush(AioCompletion *c, bool native_async) {
RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
if (m_image_ctx.non_blocking_aio || writes_blocked() || !writes_empty()) {
- queue(new ImageFlushRequest<>(m_image_ctx, c));
+ queue(new ImageFlushRequest<>(m_image_ctx, c, trace));
} else {
- ImageRequest<>::aio_flush(&m_image_ctx, c);
+ ImageRequest<>::aio_flush(&m_image_ctx, c, trace);
finish_in_flight_op();
}
+ trace.event("finish");
}
void ImageRequestWQ::aio_writesame(AioCompletion *c, uint64_t off, uint64_t len,
bufferlist &&bl, int op_flags,
bool native_async) {
- c->init_time(&m_image_ctx, AIO_TYPE_WRITESAME);
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << "aio_writesame: ictx=" << &m_image_ctx << ", "
+ ZTracer::Trace trace;
+ if (cct->_conf->rbd_blkin_trace_all) {
+ trace.init("wq: writesame", &m_image_ctx.trace_endpoint);
+ trace.event("init");
+ }
+
+ c->init_time(&m_image_ctx, AIO_TYPE_WRITESAME);
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
<< "completion=" << c << ", off=" << off << ", "
<< "len=" << len << ", data_len = " << bl.length() << ", "
<< "flags=" << op_flags << dendl;
@@ -257,13 +295,14 @@ void ImageRequestWQ::aio_writesame(AioCompletion *c, uint64_t off, uint64_t len,
RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
if (m_image_ctx.non_blocking_aio || writes_blocked()) {
queue(new ImageWriteSameRequest<>(m_image_ctx, c, off, len, std::move(bl),
- op_flags));
+ op_flags, trace));
} else {
c->start_op();
ImageRequest<>::aio_writesame(&m_image_ctx, c, off, len, std::move(bl),
- op_flags);
+ op_flags, trace);
finish_in_flight_op();
}
+ trace.event("finish");
}
void ImageRequestWQ::shut_down(Context *on_shutdown) {
@@ -275,9 +314,9 @@ void ImageRequestWQ::shut_down(Context *on_shutdown) {
m_shutdown = true;
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 5) << __func__ << ": in_flight=" << m_in_flight_ops.read()
+ ldout(cct, 5) << __func__ << ": in_flight=" << m_in_flight_ops.load()
<< dendl;
- if (m_in_flight_ops.read() > 0) {
+ if (m_in_flight_ops > 0) {
m_on_shutdown = on_shutdown;
return;
}
@@ -289,8 +328,8 @@ void ImageRequestWQ::shut_down(Context *on_shutdown) {
bool ImageRequestWQ::is_lock_request_needed() const {
RWLock::RLocker locker(m_lock);
- return (m_queued_writes.read() > 0 ||
- (m_require_lock_on_read && m_queued_reads.read() > 0));
+ return (m_queued_writes > 0 ||
+ (m_require_lock_on_read && m_queued_reads > 0));
}
int ImageRequestWQ::block_writes() {
@@ -306,9 +345,9 @@ void ImageRequestWQ::block_writes(Context *on_blocked) {
{
RWLock::WLocker locker(m_lock);
++m_write_blockers;
- ldout(cct, 5) << __func__ << ": " << &m_image_ctx << ", "
- << "num=" << m_write_blockers << dendl;
- if (!m_write_blocker_contexts.empty() || m_in_progress_writes.read() > 0) {
+ ldout(cct, 5) << &m_image_ctx << ", " << "num="
+ << m_write_blockers << dendl;
+ if (!m_write_blocker_contexts.empty() || m_in_progress_writes > 0) {
m_write_blocker_contexts.push_back(on_blocked);
return;
}
@@ -327,8 +366,8 @@ void ImageRequestWQ::unblock_writes() {
assert(m_write_blockers > 0);
--m_write_blockers;
- ldout(cct, 5) << __func__ << ": " << &m_image_ctx << ", "
- << "num=" << m_write_blockers << dendl;
+ ldout(cct, 5) << &m_image_ctx << ", " << "num="
+ << m_write_blockers << dendl;
if (m_write_blockers == 0) {
wake_up = true;
}
@@ -341,7 +380,7 @@ void ImageRequestWQ::unblock_writes() {
void ImageRequestWQ::set_require_lock_on_read() {
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << __func__ << dendl;
+ ldout(cct, 20) << dendl;
RWLock::WLocker locker(m_lock);
m_require_lock_on_read = true;
@@ -349,7 +388,7 @@ void ImageRequestWQ::set_require_lock_on_read() {
void ImageRequestWQ::clear_require_lock_on_read() {
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << __func__ << dendl;
+ ldout(cct, 20) << dendl;
{
RWLock::WLocker locker(m_lock);
@@ -380,7 +419,7 @@ void *ImageRequestWQ::_void_dequeue() {
// refresh will requeue the op -- don't count it as in-progress
if (!refresh_required) {
- m_in_progress_writes.inc();
+ m_in_progress_writes++;
}
} else if (m_require_lock_on_read) {
return nullptr;
@@ -410,7 +449,7 @@ void *ImageRequestWQ::_void_dequeue() {
void ImageRequestWQ::process(ImageRequest<> *req) {
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << __func__ << ": ictx=" << &m_image_ctx << ", "
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
<< "req=" << req << dendl;
req->send();
@@ -427,11 +466,11 @@ void ImageRequestWQ::process(ImageRequest<> *req) {
void ImageRequestWQ::finish_queued_op(ImageRequest<> *req) {
RWLock::RLocker locker(m_lock);
if (req->is_write_op()) {
- assert(m_queued_writes.read() > 0);
- m_queued_writes.dec();
+ assert(m_queued_writes > 0);
+ m_queued_writes--;
} else {
- assert(m_queued_reads.read() > 0);
- m_queued_reads.dec();
+ assert(m_queued_reads > 0);
+ m_queued_reads--;
}
}
@@ -439,8 +478,8 @@ void ImageRequestWQ::finish_in_progress_write() {
bool writes_blocked = false;
{
RWLock::RLocker locker(m_lock);
- assert(m_in_progress_writes.read() > 0);
- if (m_in_progress_writes.dec() == 0 &&
+ assert(m_in_progress_writes > 0);
+ if (--m_in_progress_writes == 0 &&
!m_write_blocker_contexts.empty()) {
writes_blocked = true;
}
@@ -462,7 +501,7 @@ int ImageRequestWQ::start_in_flight_op(AioCompletion *c) {
return false;
}
- m_in_flight_ops.inc();
+ m_in_flight_ops++;
return true;
}
@@ -470,14 +509,14 @@ void ImageRequestWQ::finish_in_flight_op() {
Context *on_shutdown;
{
RWLock::RLocker locker(m_lock);
- if (m_in_flight_ops.dec() > 0 || !m_shutdown) {
+ if (--m_in_flight_ops > 0 || !m_shutdown) {
return;
}
on_shutdown = m_on_shutdown;
}
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 5) << __func__ << ": completing shut down" << dendl;
+ ldout(cct, 5) << "completing shut down" << dendl;
assert(on_shutdown != nullptr);
m_image_ctx.flush(on_shutdown);
@@ -494,7 +533,7 @@ bool ImageRequestWQ::is_lock_required() const {
void ImageRequestWQ::queue(ImageRequest<> *req) {
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << __func__ << ": ictx=" << &m_image_ctx << ", "
+ ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
<< "req=" << req << dendl;
assert(m_image_ctx.owner_lock.is_locked());
@@ -512,9 +551,9 @@ void ImageRequestWQ::queue(ImageRequest<> *req) {
}
if (write_op) {
- m_queued_writes.inc();
+ m_queued_writes++;
} else {
- m_queued_reads.inc();
+ m_queued_reads++;
}
ThreadPool::PointerWQ<ImageRequest<> >::queue(req);
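
Each `ImageRequestWQ::aio_*` method now builds its trace only when the `rbd_blkin_trace_all` option is set; a default-constructed `ZTracer::Trace` stays invalid and its events are no-ops. A reduced model of that gating, with the config flag passed directly instead of through `CephContext`:

    #include <iostream>
    #include <string>

    struct Trace {
      bool initialized = false;
      void init(const std::string &name) {
        initialized = true;
        std::cout << "span: " << name << "\n";
      }
      void event(const char *) { /* no-op stand-in */ }
      bool valid() const { return initialized; }
    };

    void aio_write(bool rbd_blkin_trace_all) {
      Trace trace;                   // default-constructed == invalid
      if (rbd_blkin_trace_all) {
        trace.init("wq: write");
        trace.event("init");
      }
      // ... queue or dispatch the request, passing `trace` along ...
      trace.event("finish");         // harmless on an invalid trace
    }

    int main() {
      aio_write(false);  // tracing disabled: no span created
      aio_write(true);   // tracing enabled: span "wq: write"
      return 0;
    }
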
diff --git a/src/librbd/io/ImageRequestWQ.h b/src/librbd/io/ImageRequestWQ.h
index f5a8d955da5..0ed843dafeb 100644
--- a/src/librbd/io/ImageRequestWQ.h
+++ b/src/librbd/io/ImageRequestWQ.h
@@ -5,10 +5,11 @@
#define CEPH_LIBRBD_IO_IMAGE_REQUEST_WQ_H
#include "include/Context.h"
-#include "include/atomic.h"
#include "common/RWLock.h"
#include "common/WorkQueue.h"
+
#include <list>
+#include <atomic>
namespace librbd {
@@ -98,10 +99,10 @@ private:
Contexts m_write_blocker_contexts;
uint32_t m_write_blockers;
bool m_require_lock_on_read = false;
- atomic_t m_in_progress_writes;
- atomic_t m_queued_reads;
- atomic_t m_queued_writes;
- atomic_t m_in_flight_ops;
+ std::atomic<unsigned> m_in_progress_writes { 0 };
+ std::atomic<unsigned> m_queued_reads { 0 };
+ std::atomic<unsigned> m_queued_writes { 0 };
+ std::atomic<unsigned> m_in_flight_ops { 0 };
bool m_refresh_in_progress;
@@ -110,7 +111,7 @@ private:
inline bool writes_empty() const {
RWLock::RLocker locker(m_lock);
- return (m_queued_writes.read() == 0);
+ return (m_queued_writes == 0);
}
void finish_queued_op(ImageRequest<ImageCtx> *req);
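
This member change is what lets the `.cc` call sites compare counters directly: `std::atomic<T>` converts to `T` through an implicit `load()`, so `m_queued_writes.read() > 0` becomes plain `m_queued_writes > 0`. For instance:

    #include <atomic>
    #include <iostream>

    int main() {
      std::atomic<unsigned> queued_writes{2};

      if (queued_writes > 0)                        // implicit atomic load
        std::cout << queued_writes.load() << " queued\n";
      return 0;
    }
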
diff --git a/src/librbd/io/ObjectRequest.cc b/src/librbd/io/ObjectRequest.cc
index 2b2b030c052..1cdb696b717 100644
--- a/src/librbd/io/ObjectRequest.cc
+++ b/src/librbd/io/ObjectRequest.cc
@@ -24,7 +24,8 @@
#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
-#define dout_prefix *_dout << "librbd::io::ObjectRequest: "
+#define dout_prefix *_dout << "librbd::io::ObjectRequest: " << this \
+ << " " << __func__ << ": "
namespace librbd {
namespace io {
@@ -34,9 +35,10 @@ ObjectRequest<I>*
ObjectRequest<I>::create_remove(I *ictx, const std::string &oid,
uint64_t object_no,
const ::SnapContext &snapc,
+ const ZTracer::Trace &parent_trace,
Context *completion) {
return new ObjectRemoveRequest(util::get_image_ctx(ictx), oid, object_no,
- snapc, completion);
+ snapc, parent_trace, completion);
}
template <typename I>
@@ -44,9 +46,10 @@ ObjectRequest<I>*
ObjectRequest<I>::create_truncate(I *ictx, const std::string &oid,
uint64_t object_no, uint64_t object_off,
const ::SnapContext &snapc,
- Context *completion) {
+ const ZTracer::Trace &parent_trace,
+ Context *completion) {
return new ObjectTruncateRequest(util::get_image_ctx(ictx), oid, object_no,
- object_off, snapc, completion);
+ object_off, snapc, parent_trace, completion);
}
template <typename I>
@@ -54,10 +57,12 @@ ObjectRequest<I>*
ObjectRequest<I>::create_write(I *ictx, const std::string &oid,
uint64_t object_no, uint64_t object_off,
const ceph::bufferlist &data,
- const ::SnapContext &snapc,
- Context *completion, int op_flags) {
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion) {
return new ObjectWriteRequest(util::get_image_ctx(ictx), oid, object_no,
- object_off, data, snapc, completion, op_flags);
+ object_off, data, snapc, op_flags, parent_trace,
+ completion);
}
template <typename I>
@@ -66,9 +71,11 @@ ObjectRequest<I>::create_zero(I *ictx, const std::string &oid,
uint64_t object_no, uint64_t object_off,
uint64_t object_len,
const ::SnapContext &snapc,
+ const ZTracer::Trace &parent_trace,
Context *completion) {
return new ObjectZeroRequest(util::get_image_ctx(ictx), oid, object_no,
- object_off, object_len, snapc, completion);
+ object_off, object_len, snapc, parent_trace,
+ completion);
}
template <typename I>
@@ -77,21 +84,29 @@ ObjectRequest<I>::create_writesame(I *ictx, const std::string &oid,
uint64_t object_no, uint64_t object_off,
uint64_t object_len,
const ceph::bufferlist &data,
- const ::SnapContext &snapc,
- Context *completion, int op_flags) {
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion) {
return new ObjectWriteSameRequest(util::get_image_ctx(ictx), oid, object_no,
object_off, object_len, data, snapc,
- completion, op_flags);
+ op_flags, parent_trace, completion);
}
template <typename I>
ObjectRequest<I>::ObjectRequest(ImageCtx *ictx, const std::string &oid,
uint64_t objectno, uint64_t off,
uint64_t len, librados::snap_t snap_id,
- Context *completion, bool hide_enoent)
+ bool hide_enoent, const char *trace_name,
+ const ZTracer::Trace &trace,
+ Context *completion)
: m_ictx(ictx), m_oid(oid), m_object_no(objectno), m_object_off(off),
m_object_len(len), m_snap_id(snap_id), m_completion(completion),
- m_hide_enoent(hide_enoent) {
+ m_hide_enoent(hide_enoent),
+ m_trace(util::create_trace(*ictx, "", trace)) {
+ if (m_trace.valid()) {
+ m_trace.copy_name(trace_name + std::string(" ") + oid);
+ m_trace.event("start");
+ }
Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
0, m_ictx->layout.object_size, m_parent_extents);
@@ -105,7 +120,7 @@ template <typename I>
void ObjectRequest<I>::complete(int r)
{
if (should_complete(r)) {
- ldout(m_ictx->cct, 20) << "complete " << this << dendl;
+ ldout(m_ictx->cct, 20) << dendl;
if (m_hide_enoent && r == -ENOENT) {
r = 0;
}
@@ -124,8 +139,8 @@ bool ObjectRequest<I>::compute_parent_extents() {
if (r < 0) {
// NOTE: it's possible for a snapshot to be deleted while we are
// still reading from it
- lderr(m_ictx->cct) << this << " compute_parent_extents: failed to "
- << "retrieve parent overlap: " << cpp_strerror(r)
+ lderr(m_ictx->cct) << "failed to retrieve parent overlap: "
+ << cpp_strerror(r)
<< dendl;
m_has_parent = false;
m_parent_extents.clear();
@@ -135,8 +150,7 @@ bool ObjectRequest<I>::compute_parent_extents() {
uint64_t object_overlap = m_ictx->prune_parent_extents(
m_parent_extents, parent_overlap);
if (object_overlap > 0) {
- ldout(m_ictx->cct, 20) << this << " compute_parent_extents: "
- << "overlap " << parent_overlap << " "
+ ldout(m_ictx->cct, 20) << "overlap " << parent_overlap << " "
<< "extents " << m_parent_extents << dendl;
m_has_parent = !m_parent_extents.empty();
return true;
@@ -159,9 +173,11 @@ ObjectReadRequest<I>::ObjectReadRequest(I *ictx, const std::string &oid,
uint64_t objectno, uint64_t offset,
uint64_t len, Extents& be,
librados::snap_t snap_id, bool sparse,
- Context *completion, int op_flags)
+ int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion)
: ObjectRequest<I>(util::get_image_ctx(ictx), oid, objectno, offset, len,
- snap_id, completion, false),
+ snap_id, false, "read", parent_trace, completion),
m_buffer_extents(be), m_tried_parent(false), m_sparse(sparse),
m_op_flags(op_flags), m_state(LIBRBD_AIO_READ_FLAT) {
guard_read();
@@ -175,7 +191,7 @@ void ObjectReadRequest<I>::guard_read()
RWLock::RLocker parent_locker(image_ctx->parent_lock);
if (this->has_parent()) {
- ldout(image_ctx->cct, 20) << __func__ << " guarding read" << dendl;
+ ldout(image_ctx->cct, 20) << "guarding read" << dendl;
m_state = LIBRBD_AIO_READ_GUARD;
}
}
@@ -184,8 +200,7 @@ template <typename I>
bool ObjectReadRequest<I>::should_complete(int r)
{
ImageCtx *image_ctx = this->m_ictx;
- ldout(image_ctx->cct, 20) << "should_complete " << this << " "
- << this->m_oid << " "
+ ldout(image_ctx->cct, 20) << this->m_oid << " "
<< this->m_object_off << "~" << this->m_object_len
<< " r = " << r << dendl;
@@ -193,8 +208,7 @@ bool ObjectReadRequest<I>::should_complete(int r)
switch (m_state) {
case LIBRBD_AIO_READ_GUARD:
- ldout(image_ctx->cct, 20) << "should_complete " << this
- << " READ_CHECK_GUARD" << dendl;
+ ldout(image_ctx->cct, 20) << "READ_CHECK_GUARD" << dendl;
// This is the step to read from parent
if (!m_tried_parent && r == -ENOENT) {
@@ -235,8 +249,7 @@ bool ObjectReadRequest<I>::should_complete(int r)
}
break;
case LIBRBD_AIO_READ_COPYUP:
- ldout(image_ctx->cct, 20) << "should_complete " << this << " READ_COPYUP"
- << dendl;
+ ldout(image_ctx->cct, 20) << "READ_COPYUP" << dendl;
// This is the extra step for copy-on-read: kick off an asynchronous copyup.
// It is different from copy-on-write as asynchronous copyup will finish
// by itself so state won't go back to LIBRBD_AIO_READ_GUARD.
@@ -250,8 +263,7 @@ bool ObjectReadRequest<I>::should_complete(int r)
}
break;
case LIBRBD_AIO_READ_FLAT:
- ldout(image_ctx->cct, 20) << "should_complete " << this << " READ_FLAT"
- << dendl;
+ ldout(image_ctx->cct, 20) << "READ_FLAT" << dendl;
// The read content should be deposit in m_read_data
break;
default:
@@ -265,8 +277,8 @@ bool ObjectReadRequest<I>::should_complete(int r)
template <typename I>
void ObjectReadRequest<I>::send() {
ImageCtx *image_ctx = this->m_ictx;
- ldout(image_ctx->cct, 20) << "send " << this << " " << this->m_oid << " "
- << this->m_object_off << "~" << this->m_object_len
+ ldout(image_ctx->cct, 20) << this->m_oid << " " << this->m_object_off
+ << "~" << this->m_object_len
<< dendl;
{
@@ -293,8 +305,9 @@ void ObjectReadRequest<I>::send() {
librados::AioCompletion *rados_completion =
util::create_rados_callback(this);
- int r = image_ctx->data_ctx.aio_operate(this->m_oid, rados_completion, &op,
- flags, nullptr);
+ int r = image_ctx->data_ctx.aio_operate(
+ this->m_oid, rados_completion, &op, flags, nullptr,
+ (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
assert(r == 0);
rados_completion->release();
@@ -304,6 +317,9 @@ template <typename I>
void ObjectReadRequest<I>::send_copyup()
{
ImageCtx *image_ctx = this->m_ictx;
+ ldout(image_ctx->cct, 20) << this->m_oid << " " << this->m_object_off
+ << "~" << this->m_object_len << dendl;
+
{
RWLock::RLocker snap_locker(image_ctx->snap_lock);
RWLock::RLocker parent_locker(image_ctx->parent_lock);
@@ -321,7 +337,7 @@ void ObjectReadRequest<I>::send_copyup()
// create and kick off a CopyupRequest
CopyupRequest *new_req = new CopyupRequest(
image_ctx, this->m_oid, this->m_object_no,
- std::move(this->m_parent_extents));
+ std::move(this->m_parent_extents), this->m_trace);
this->m_parent_extents.clear();
image_ctx->copyup_list[this->m_object_no] = new_req;
@@ -336,13 +352,11 @@ void ObjectReadRequest<I>::read_from_parent(Extents&& parent_extents)
AioCompletion *parent_completion = AioCompletion::create_and_start<
ObjectRequest<I> >(this, image_ctx, AIO_TYPE_READ);
- ldout(image_ctx->cct, 20) << "read_from_parent this = " << this
- << " parent completion " << parent_completion
- << " extents " << parent_extents
- << dendl;
+ ldout(image_ctx->cct, 20) << "parent completion " << parent_completion
+ << " extents " << parent_extents << dendl;
ImageRequest<>::aio_read(image_ctx->parent, parent_completion,
std::move(parent_extents),
- ReadResult{&m_read_data}, 0);
+ ReadResult{&m_read_data}, 0, this->m_trace);
}
/** write **/
@@ -353,10 +367,12 @@ AbstractObjectWriteRequest::AbstractObjectWriteRequest(ImageCtx *ictx,
uint64_t object_off,
uint64_t len,
const ::SnapContext &snapc,
- Context *completion,
- bool hide_enoent)
+ bool hide_enoent,
+ const char *trace_name,
+ const ZTracer::Trace &parent_trace,
+ Context *completion)
: ObjectRequest(ictx, oid, object_no, object_off, len, CEPH_NOSNAP,
- completion, hide_enoent),
+ hide_enoent, trace_name, parent_trace, completion),
m_state(LIBRBD_AIO_WRITE_FLAT), m_snap_seq(snapc.seq.val)
{
m_snaps.insert(m_snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
@@ -367,15 +383,15 @@ void AbstractObjectWriteRequest::guard_write()
if (has_parent()) {
m_state = LIBRBD_AIO_WRITE_GUARD;
m_write.assert_exists();
- ldout(m_ictx->cct, 20) << __func__ << " guarding write" << dendl;
+ ldout(m_ictx->cct, 20) << "guarding write" << dendl;
}
}
bool AbstractObjectWriteRequest::should_complete(int r)
{
- ldout(m_ictx->cct, 20) << get_op_type() << " " << this << " " << m_oid
- << " " << m_object_off << "~" << m_object_len
- << " should_complete: r = " << r << dendl;
+ ldout(m_ictx->cct, 20) << get_op_type() << " " << m_oid << " "
+ << m_object_off << "~" << m_object_len
+ << " r = " << r << dendl;
bool finished = true;
switch (m_state) {
@@ -443,9 +459,8 @@ bool AbstractObjectWriteRequest::should_complete(int r)
}
void AbstractObjectWriteRequest::send() {
- ldout(m_ictx->cct, 20) << "send " << get_op_type() << " " << this <<" "
- << m_oid << " " << m_object_off << "~"
- << m_object_len << dendl;
+ ldout(m_ictx->cct, 20) << get_op_type() << " " << m_oid << " "
+ << m_object_off << "~" << m_object_len << dendl;
{
RWLock::RLocker snap_lock(m_ictx->snap_lock);
if (m_ictx->object_map == nullptr) {
@@ -461,7 +476,7 @@ void AbstractObjectWriteRequest::send() {
}
void AbstractObjectWriteRequest::send_pre_object_map_update() {
- ldout(m_ictx->cct, 20) << __func__ << dendl;
+ ldout(m_ictx->cct, 20) << dendl;
{
RWLock::RLocker snap_lock(m_ictx->snap_lock);
@@ -469,13 +484,12 @@ void AbstractObjectWriteRequest::send_pre_object_map_update() {
uint8_t new_state;
pre_object_map_update(&new_state);
RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
- ldout(m_ictx->cct, 20) << __func__ << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len
- << dendl;
+ ldout(m_ictx->cct, 20) << m_oid << " " << m_object_off
+ << "~" << m_object_len << dendl;
m_state = LIBRBD_AIO_WRITE_PRE;
if (m_ictx->object_map->aio_update<ObjectRequest>(
- CEPH_NOSNAP, m_object_no, new_state, {}, this)) {
+ CEPH_NOSNAP, m_object_no, new_state, {}, this->m_trace, this)) {
return;
}
}
@@ -485,6 +499,8 @@ void AbstractObjectWriteRequest::send_pre_object_map_update() {
}
bool AbstractObjectWriteRequest::send_post_object_map_update() {
+ ldout(m_ictx->cct, 20) << dendl;
+
RWLock::RLocker snap_locker(m_ictx->snap_lock);
if (m_ictx->object_map == nullptr || !post_object_map_update()) {
return true;
@@ -494,12 +510,13 @@ bool AbstractObjectWriteRequest::send_post_object_map_update() {
assert(m_ictx->exclusive_lock->is_lock_owner());
RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
- ldout(m_ictx->cct, 20) << __func__ << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len << dendl;
+ ldout(m_ictx->cct, 20) << m_oid << " " << m_object_off
+ << "~" << m_object_len << dendl;
m_state = LIBRBD_AIO_WRITE_POST;
if (m_ictx->object_map->aio_update<ObjectRequest>(
- CEPH_NOSNAP, m_object_no, OBJECT_NONEXISTENT, OBJECT_PENDING, this)) {
+ CEPH_NOSNAP, m_object_no, OBJECT_NONEXISTENT, OBJECT_PENDING,
+ this->m_trace, this)) {
return false;
}
@@ -507,8 +524,7 @@ bool AbstractObjectWriteRequest::send_post_object_map_update() {
}
void AbstractObjectWriteRequest::send_write() {
- ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len
+ ldout(m_ictx->cct, 20) << m_oid << " " << m_object_off << "~" << m_object_len
<< " object exist " << m_object_exist << dendl;
if (!m_object_exist && has_parent()) {
@@ -521,8 +537,8 @@ void AbstractObjectWriteRequest::send_write() {
void AbstractObjectWriteRequest::send_copyup()
{
- ldout(m_ictx->cct, 20) << "send_copyup " << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len << dendl;
+ ldout(m_ictx->cct, 20) << m_oid << " " << m_object_off
+ << "~" << m_object_len << dendl;
m_state = LIBRBD_AIO_WRITE_COPYUP;
m_ictx->copyup_list_lock.Lock();
@@ -531,7 +547,8 @@ void AbstractObjectWriteRequest::send_copyup()
if (it == m_ictx->copyup_list.end()) {
CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
m_object_no,
- std::move(m_parent_extents));
+ std::move(m_parent_extents),
+ this->m_trace);
m_parent_extents.clear();
// make sure to wait on this CopyupRequest
@@ -552,13 +569,14 @@ void AbstractObjectWriteRequest::send_write_op()
guard_write();
}
- add_write_ops(&m_write);
+ add_write_ops(&m_write, true);
assert(m_write.size() != 0);
librados::AioCompletion *rados_completion =
util::create_rados_callback(this);
- int r = m_ictx->data_ctx.aio_operate(m_oid, rados_completion, &m_write,
- m_snap_seq, m_snaps);
+ int r = m_ictx->data_ctx.aio_operate(
+ m_oid, rados_completion, &m_write, m_snap_seq, m_snaps,
+ (this->m_trace.valid() ? this->m_trace.get_info() : nullptr));
assert(r == 0);
rados_completion->release();
}
@@ -581,9 +599,10 @@ void AbstractObjectWriteRequest::handle_write_guard()
}
}
-void ObjectWriteRequest::add_write_ops(librados::ObjectWriteOperation *wr) {
+void ObjectWriteRequest::add_write_ops(librados::ObjectWriteOperation *wr,
+ bool set_hints) {
RWLock::RLocker snap_locker(m_ictx->snap_lock);
- if (m_ictx->enable_alloc_hint &&
+ if (set_hints && m_ictx->enable_alloc_hint &&
(m_ictx->object_map == nullptr || !m_object_exist)) {
wr->set_alloc_hint(m_ictx->get_object_size(), m_ictx->get_object_size());
}
@@ -598,8 +617,7 @@ void ObjectWriteRequest::add_write_ops(librados::ObjectWriteOperation *wr) {
void ObjectWriteRequest::send_write() {
bool write_full = (m_object_off == 0 && m_object_len == m_ictx->get_object_size());
- ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len
+ ldout(m_ictx->cct, 20) << m_oid << " " << m_object_off << "~" << m_object_len
<< " object exist " << m_object_exist
<< " write_full " << write_full << dendl;
if (write_full && !has_parent()) {
@@ -617,14 +635,13 @@ void ObjectRemoveRequest::guard_write() {
}
}
void ObjectRemoveRequest::send_write() {
- ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len << dendl;
+ ldout(m_ictx->cct, 20) << m_oid << " " << m_object_off
+ << "~" << m_object_len << dendl;
send_pre_object_map_update();
}
void ObjectTruncateRequest::send_write() {
- ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid
- << " truncate " << m_object_off << dendl;
+ ldout(m_ictx->cct, 20) << m_oid << " truncate " << m_object_off << dendl;
if (!m_object_exist && ! has_parent()) {
m_state = LIBRBD_AIO_WRITE_FLAT;
Context *ctx = util::create_context_callback<ObjectRequest>(this);
@@ -634,9 +651,10 @@ void ObjectTruncateRequest::send_write() {
}
}
-void ObjectWriteSameRequest::add_write_ops(librados::ObjectWriteOperation *wr) {
+void ObjectWriteSameRequest::add_write_ops(librados::ObjectWriteOperation *wr,
+ bool set_hints) {
RWLock::RLocker snap_locker(m_ictx->snap_lock);
- if (m_ictx->enable_alloc_hint &&
+ if (set_hints && m_ictx->enable_alloc_hint &&
(m_ictx->object_map == nullptr || !m_object_exist)) {
wr->set_alloc_hint(m_ictx->get_object_size(), m_ictx->get_object_size());
}
@@ -647,8 +665,7 @@ void ObjectWriteSameRequest::add_write_ops(librados::ObjectWriteOperation *wr) {
void ObjectWriteSameRequest::send_write() {
bool write_full = (m_object_off == 0 && m_object_len == m_ictx->get_object_size());
- ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " "
- << m_object_off << "~" << m_object_len
+ ldout(m_ictx->cct, 20) << m_oid << " " << m_object_off << "~" << m_object_len
<< " write_full " << write_full << dendl;
if (write_full && !has_parent()) {
m_guard = false;
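
The new `set_hints` flag exists because `CopyupRequest::send_copyup()` merges several pending object requests into a single RADOS write op; only the first merged request should contribute the allocation hint, per `bool set_hints = (i == 0)` above. An illustrative model with stand-in types:

    #include <iostream>
    #include <vector>

    struct WriteOp { int hints = 0; int writes = 0; };

    void add_write_ops(WriteOp *wr, bool set_hints) {
      if (set_hints)
        ++wr->hints;   // wr->set_alloc_hint(...) in the real request
      ++wr->writes;    // the write payload itself
    }

    int main() {
      WriteOp op;
      std::vector<int> pending_requests{1, 2, 3};
      for (size_t i = 0; i < pending_requests.size(); ++i)
        add_write_ops(&op, /*set_hints=*/ i == 0);
      std::cout << op.hints << " hint, " << op.writes << " writes\n";  // 1 hint, 3 writes
      return 0;
    }
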
diff --git a/src/librbd/io/ObjectRequest.h b/src/librbd/io/ObjectRequest.h
index f57bbe0b72f..85529a19356 100644
--- a/src/librbd/io/ObjectRequest.h
+++ b/src/librbd/io/ObjectRequest.h
@@ -5,13 +5,12 @@
#define CEPH_LIBRBD_IO_OBJECT_REQUEST_H
#include "include/int_types.h"
-
-#include <map>
-
-#include "common/snap_types.h"
#include "include/buffer.h"
#include "include/rados/librados.hpp"
+#include "common/snap_types.h"
+#include "common/zipkin_trace.h"
#include "librbd/ObjectMap.h"
+#include <map>
class Context;
@@ -50,23 +49,27 @@ public:
const std::string &oid,
uint64_t object_no,
const ::SnapContext &snapc,
+ const ZTracer::Trace &parent_trace,
Context *completion);
static ObjectRequest* create_truncate(ImageCtxT *ictx,
const std::string &oid,
uint64_t object_no,
uint64_t object_off,
const ::SnapContext &snapc,
+ const ZTracer::Trace &parent_trace,
Context *completion);
static ObjectRequest* create_write(ImageCtxT *ictx, const std::string &oid,
uint64_t object_no,
uint64_t object_off,
const ceph::bufferlist &data,
- const ::SnapContext &snapc,
- Context *completion, int op_flags);
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion);
static ObjectRequest* create_zero(ImageCtxT *ictx, const std::string &oid,
uint64_t object_no, uint64_t object_off,
uint64_t object_len,
const ::SnapContext &snapc,
+ const ZTracer::Trace &parent_trace,
Context *completion);
static ObjectRequest* create_writesame(ImageCtxT *ictx,
const std::string &oid,
@@ -75,15 +78,22 @@ public:
uint64_t object_len,
const ceph::bufferlist &data,
const ::SnapContext &snapc,
- Context *completion, int op_flags);
+ int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion);
ObjectRequest(ImageCtx *ictx, const std::string &oid,
uint64_t objectno, uint64_t off, uint64_t len,
- librados::snap_t snap_id,
- Context *completion, bool hide_enoent);
- ~ObjectRequest() override {}
+ librados::snap_t snap_id, bool hide_enoent,
+ const char *trace_name, const ZTracer::Trace &parent_trace,
+ Context *completion);
+ ~ObjectRequest() override {
+ m_trace.event("finish");
+ }
- virtual void add_copyup_ops(librados::ObjectWriteOperation *wr) {};
+ virtual void add_copyup_ops(librados::ObjectWriteOperation *wr,
+ bool set_hints) {
+ };
void complete(int r) override;
@@ -111,6 +121,7 @@ protected:
Context *m_completion;
Extents m_parent_extents;
bool m_hide_enoent;
+ ZTracer::Trace m_trace;
private:
bool m_has_parent = false;
@@ -126,16 +137,19 @@ public:
uint64_t objectno, uint64_t offset,
uint64_t len, Extents &buffer_extents,
librados::snap_t snap_id, bool sparse,
- Context *completion, int op_flags) {
+ int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion) {
return new ObjectReadRequest(ictx, oid, objectno, offset, len,
- buffer_extents, snap_id, sparse, completion,
- op_flags);
+ buffer_extents, snap_id, sparse, op_flags,
+ parent_trace, completion);
}
ObjectReadRequest(ImageCtxT *ictx, const std::string &oid,
uint64_t objectno, uint64_t offset, uint64_t len,
Extents& buffer_extents, librados::snap_t snap_id,
- bool sparse, Context *completion, int op_flags);
+ bool sparse, int op_flags,
+ const ZTracer::Trace &parent_trace, Context *completion);
bool should_complete(int r) override;
void send() override;
@@ -207,11 +221,14 @@ public:
AbstractObjectWriteRequest(ImageCtx *ictx, const std::string &oid,
uint64_t object_no, uint64_t object_off,
uint64_t len, const ::SnapContext &snapc,
- Context *completion, bool hide_enoent);
+ bool hide_enoent, const char *trace_name,
+ const ZTracer::Trace &parent_trace,
+ Context *completion);
- void add_copyup_ops(librados::ObjectWriteOperation *wr) override
+ void add_copyup_ops(librados::ObjectWriteOperation *wr,
+ bool set_hints) override
{
- add_write_ops(wr);
+ add_write_ops(wr, set_hints);
}
bool should_complete(int r) override;
@@ -270,7 +287,8 @@ protected:
bool m_object_exist;
bool m_guard = true;
- virtual void add_write_ops(librados::ObjectWriteOperation *wr) = 0;
+ virtual void add_write_ops(librados::ObjectWriteOperation *wr,
+ bool set_hints) = 0;
virtual void guard_write();
virtual bool post_object_map_update() {
return false;
@@ -290,10 +308,11 @@ class ObjectWriteRequest : public AbstractObjectWriteRequest {
public:
ObjectWriteRequest(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
uint64_t object_off, const ceph::bufferlist &data,
- const ::SnapContext &snapc, Context *completion,
- int op_flags)
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace, Context *completion)
: AbstractObjectWriteRequest(ictx, oid, object_no, object_off,
- data.length(), snapc, completion, false),
+ data.length(), snapc, false, "write",
+ parent_trace, completion),
m_write_data(data), m_op_flags(op_flags) {
}
@@ -311,7 +330,8 @@ public:
}
protected:
- void add_write_ops(librados::ObjectWriteOperation *wr) override;
+ void add_write_ops(librados::ObjectWriteOperation *wr,
+ bool set_hints) override;
void send_write() override;
@@ -324,9 +344,9 @@ class ObjectRemoveRequest : public AbstractObjectWriteRequest {
public:
ObjectRemoveRequest(ImageCtx *ictx, const std::string &oid,
uint64_t object_no, const ::SnapContext &snapc,
- Context *completion)
- : AbstractObjectWriteRequest(ictx, oid, object_no, 0, 0, snapc, completion,
- true),
+ const ZTracer::Trace &parent_trace, Context *completion)
+ : AbstractObjectWriteRequest(ictx, oid, object_no, 0, 0, snapc, true,
+ "remote", parent_trace, completion),
m_object_state(OBJECT_NONEXISTENT) {
}
@@ -358,7 +378,8 @@ public:
void send_write() override;
protected:
- void add_write_ops(librados::ObjectWriteOperation *wr) override {
+ void add_write_ops(librados::ObjectWriteOperation *wr,
+ bool set_hints) override {
if (has_parent()) {
wr->truncate(0);
} else {
@@ -376,10 +397,10 @@ public:
// update is needed. pre update is decided as usual (by checking
// the state of the object in the map).
ObjectTrimRequest(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
- const ::SnapContext &snapc, Context *completion,
- bool post_object_map_update)
- : AbstractObjectWriteRequest(ictx, oid, object_no, 0, 0, snapc, completion,
- true),
+ const ::SnapContext &snapc, bool post_object_map_update,
+ Context *completion)
+ : AbstractObjectWriteRequest(ictx, oid, object_no, 0, 0, snapc, true,
+ "trim", {}, completion),
m_post_object_map_update(post_object_map_update) {
}
@@ -397,7 +418,8 @@ public:
}
protected:
- void add_write_ops(librados::ObjectWriteOperation *wr) override {
+ void add_write_ops(librados::ObjectWriteOperation *wr,
+ bool set_hints) override {
wr->remove();
}
@@ -409,9 +431,10 @@ class ObjectTruncateRequest : public AbstractObjectWriteRequest {
public:
ObjectTruncateRequest(ImageCtx *ictx, const std::string &oid,
uint64_t object_no, uint64_t object_off,
- const ::SnapContext &snapc, Context *completion)
+ const ::SnapContext &snapc,
+ const ZTracer::Trace &parent_trace, Context *completion)
: AbstractObjectWriteRequest(ictx, oid, object_no, object_off, 0, snapc,
- completion, true) {
+ true, "truncate", parent_trace, completion) {
}
const char* get_op_type() const override {
@@ -429,7 +452,8 @@ public:
void send_write() override;
protected:
- void add_write_ops(librados::ObjectWriteOperation *wr) override {
+ void add_write_ops(librados::ObjectWriteOperation *wr,
+ bool set_hints) override {
wr->truncate(m_object_off);
}
};
@@ -438,9 +462,11 @@ class ObjectZeroRequest : public AbstractObjectWriteRequest {
public:
ObjectZeroRequest(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
uint64_t object_off, uint64_t object_len,
- const ::SnapContext &snapc, Context *completion)
+ const ::SnapContext &snapc,
+ const ZTracer::Trace &parent_trace, Context *completion)
: AbstractObjectWriteRequest(ictx, oid, object_no, object_off, object_len,
- snapc, completion, true) {
+ snapc, true, "zero", parent_trace,
+ completion) {
}
const char* get_op_type() const override {
@@ -453,20 +479,23 @@ public:
}
protected:
- void add_write_ops(librados::ObjectWriteOperation *wr) override {
+ void add_write_ops(librados::ObjectWriteOperation *wr,
+ bool set_hints) override {
wr->zero(m_object_off, m_object_len);
}
};
class ObjectWriteSameRequest : public AbstractObjectWriteRequest {
public:
- ObjectWriteSameRequest(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
- uint64_t object_off, uint64_t object_len,
- const ceph::bufferlist &data,
- const ::SnapContext &snapc, Context *completion,
- int op_flags)
+ ObjectWriteSameRequest(ImageCtx *ictx, const std::string &oid,
+ uint64_t object_no, uint64_t object_off,
+ uint64_t object_len, const ceph::bufferlist &data,
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion)
: AbstractObjectWriteRequest(ictx, oid, object_no, object_off,
- object_len, snapc, completion, false),
+ object_len, snapc, false, "writesame",
+ parent_trace, completion),
m_write_data(data), m_op_flags(op_flags) {
}
@@ -480,7 +509,8 @@ public:
}
protected:
- void add_write_ops(librados::ObjectWriteOperation *wr) override;
+ void add_write_ops(librados::ObjectWriteOperation *wr,
+ bool set_hints) override;
void send_write() override;
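
Note on the signature churn above: every factory now takes a ZTracer::Trace parent and moves Context *completion to the final position. A minimal caller sketch under those signatures (the wrapper function and its arguments are illustrative, not part of this patch):

    // Forward the image-level span so per-object writes appear as
    // child spans when blkin tracing is enabled.
    void queue_object_write(librbd::ImageCtx *ictx, const std::string &oid,
                            uint64_t object_no, uint64_t object_off,
                            const ceph::bufferlist &data,
                            const ::SnapContext &snapc,
                            const ZTracer::Trace &parent_trace,
                            Context *on_finish) {
      auto *req = librbd::io::ObjectRequest<>::create_write(
          ictx, oid, object_no, object_off, data, snapc,
          0 /* op_flags */, parent_trace, on_finish);
      req->send();
    }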
diff --git a/src/librbd/io/ReadResult.cc b/src/librbd/io/ReadResult.cc
index 433dac73ca8..1b8cdb0e6ac 100644
--- a/src/librbd/io/ReadResult.cc
+++ b/src/librbd/io/ReadResult.cc
@@ -10,7 +10,8 @@
#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
-#define dout_prefix *_dout << "librbd::io::ReadResult: "
+#define dout_prefix *_dout << "librbd::io::ReadResult: " << this \
+ << " " << __func__ << ": "
namespace librbd {
namespace io {
@@ -89,7 +90,7 @@ void ReadResult::C_ReadRequest::finish(int r) {
void ReadResult::C_ImageReadRequest::finish(int r) {
CephContext *cct = aio_completion->ictx->cct;
- ldout(cct, 10) << "C_ImageReadRequest::finish() " << this << ": r=" << r
+ ldout(cct, 10) << "C_ImageReadRequest: r=" << r
<< dendl;
if (r >= 0) {
size_t length = 0;
@@ -114,7 +115,7 @@ void ReadResult::C_SparseReadRequestBase::finish(ExtentMap &extent_map,
bufferlist &bl, int r) {
aio_completion->lock.Lock();
CephContext *cct = aio_completion->ictx->cct;
- ldout(cct, 10) << "C_SparseReadRequest::finish() " << this << " r = " << r
+ ldout(cct, 10) << "C_SparseReadRequestBase: r = " << r
<< dendl;
if (r >= 0 || r == -ENOENT) { // this was a sparse_read operation
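
The prefix change above hoists the invariant parts of each message (object identity via this, the method via __func__) into dout_prefix, so the call sites shrink to their variable payload. The general idiom, as a sketch:

    // Per-translation-unit logging prefix: repeat-free call sites.
    #undef dout_prefix
    #define dout_prefix *_dout << "my::Class: " << this << " " \
                               << __func__ << ": "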
diff --git a/src/librbd/journal/DemoteRequest.cc b/src/librbd/journal/DemoteRequest.cc
index c41961d73ba..d51f777efe9 100644
--- a/src/librbd/journal/DemoteRequest.cc
+++ b/src/librbd/journal/DemoteRequest.cc
@@ -131,7 +131,7 @@ void DemoteRequest<I>::append_event() {
CephContext *cct = m_image_ctx.cct;
ldout(cct, 20) << dendl;
- EventEntry event_entry{DemoteEvent{}, ceph_clock_now()};
+ EventEntry event_entry{DemotePromoteEvent{}, {}};
bufferlist event_entry_bl;
::encode(event_entry, event_entry_bl);
diff --git a/src/librbd/journal/PromoteRequest.cc b/src/librbd/journal/PromoteRequest.cc
index b541e48197e..fb85bcb8661 100644
--- a/src/librbd/journal/PromoteRequest.cc
+++ b/src/librbd/journal/PromoteRequest.cc
@@ -102,6 +102,93 @@ void PromoteRequest<I>::handle_allocate_tag(int r) {
if (r < 0) {
m_ret_val = r;
lderr(cct) << "failed to allocate tag: " << cpp_strerror(r) << dendl;
+ shut_down();
+ return;
+ }
+
+ m_tag_tid = m_tag.tid;
+ append_event();
+}
+
+template <typename I>
+void PromoteRequest<I>::append_event() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ EventEntry event_entry{DemotePromoteEvent{}, {}};
+ bufferlist event_entry_bl;
+ ::encode(event_entry, event_entry_bl);
+
+ m_journaler->start_append(0, 0, 0);
+ m_future = m_journaler->append(m_tag_tid, event_entry_bl);
+
+ auto ctx = create_context_callback<
+ PromoteRequest<I>, &PromoteRequest<I>::handle_append_event>(this);
+ m_future.flush(ctx);
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_append_event(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_ret_val = r;
+ lderr(cct) << "failed to append promotion journal event: "
+ << cpp_strerror(r) << dendl;
+ stop_append();
+ return;
+ }
+
+ commit_event();
+}
+
+template <typename I>
+void PromoteRequest<I>::commit_event() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ m_journaler->committed(m_future);
+
+ auto ctx = create_context_callback<
+ PromoteRequest<I>, &PromoteRequest<I>::handle_commit_event>(this);
+ m_journaler->flush_commit_position(ctx);
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_commit_event(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ m_ret_val = r;
+ lderr(cct) << "failed to flush promote commit position: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ stop_append();
+}
+
+template <typename I>
+void PromoteRequest<I>::stop_append() {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << dendl;
+
+ auto ctx = create_context_callback<
+ PromoteRequest<I>, &PromoteRequest<I>::handle_stop_append>(this);
+ m_journaler->stop_append(ctx);
+}
+
+template <typename I>
+void PromoteRequest<I>::handle_stop_append(int r) {
+ CephContext *cct = m_image_ctx->cct;
+ ldout(cct, 20) << "r=" << r << dendl;
+
+ if (r < 0) {
+ if (m_ret_val == 0) {
+ m_ret_val = r;
+ }
+ lderr(cct) << "failed to stop journal append: " << cpp_strerror(r) << dendl;
}
shut_down();
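
With the new steps, promotion appends and commits a DemotePromoteEvent before tearing the journaler down, and every failure after the append funnels through stop_append() so m_journaler->stop_append() always runs. The chaining relies on the usual create_context_callback idiom, roughly:

    // Wrap a member function as a Context; complete(r) forwards r.
    auto ctx = create_context_callback<
        PromoteRequest<I>, &PromoteRequest<I>::handle_commit_event>(this);
    m_journaler->flush_commit_position(ctx);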
diff --git a/src/librbd/journal/PromoteRequest.h b/src/librbd/journal/PromoteRequest.h
index 4ef1d116c58..0d01f596108 100644
--- a/src/librbd/journal/PromoteRequest.h
+++ b/src/librbd/journal/PromoteRequest.h
@@ -7,6 +7,7 @@
#include "include/int_types.h"
#include "common/Mutex.h"
#include "cls/journal/cls_journal_types.h"
+#include "journal/Future.h"
#include "librbd/journal/Types.h"
#include "librbd/journal/TypeTraits.h"
@@ -37,13 +38,22 @@ private:
* <start>
* |
* v
- * OPEN
- * |
- * v
- * ALLOCATE_TAG
- * |
- * v
- * SHUT_DOWN
+ * OPEN * * * * * * * * * *
+ * | *
+ * v *
+ * ALLOCATE_TAG * * * * * *
+ * | *
+ * v *
+ * APPEND_EVENT * * * *
+ * | * *
+ * v * *
+ * COMMIT_EVENT * *
+ * | * *
+ * v * *
+ * STOP_APPEND <* * * *
+ * | *
+ * v *
+ * SHUT_DOWN <* * * * * * *
* |
* v
* <finish>
@@ -52,6 +62,7 @@ private:
*/
typedef typename TypeTraits<ImageCtxT>::Journaler Journaler;
+ typedef typename TypeTraits<ImageCtxT>::Future Future;
ImageCtxT *m_image_ctx;
bool m_force;
@@ -66,6 +77,7 @@ private:
TagData m_tag_data;
cls::journal::Tag m_tag;
+ Future m_future;
void send_open();
void handle_open(int r);
@@ -73,6 +85,15 @@ private:
void allocate_tag();
void handle_allocate_tag(int r);
+ void append_event();
+ void handle_append_event(int r);
+
+ void commit_event();
+ void handle_commit_event(int r);
+
+ void stop_append();
+ void handle_stop_append(int r);
+
void shut_down();
void handle_shut_down(int r);
diff --git a/src/librbd/journal/Replay.cc b/src/librbd/journal/Replay.cc
index 481f7510c53..2d408d5b8eb 100644
--- a/src/librbd/journal/Replay.cc
+++ b/src/librbd/journal/Replay.cc
@@ -249,7 +249,7 @@ void Replay<I>::shut_down(bool cancel_ops, Context *on_finish) {
// execute the following outside of lock scope
if (flush_comp != nullptr) {
RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
- io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp);
+ io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp, {});
}
if (on_finish != nullptr) {
on_finish->complete(0);
@@ -266,7 +266,7 @@ void Replay<I>::flush(Context *on_finish) {
}
RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
- io::ImageRequest<I>::aio_flush(&m_image_ctx, aio_comp);
+ io::ImageRequest<I>::aio_flush(&m_image_ctx, aio_comp, {});
}
template <typename I>
@@ -319,13 +319,14 @@ void Replay<I>::handle_event(const journal::AioDiscardEvent &event,
io::AIO_TYPE_DISCARD,
&flush_required);
io::ImageRequest<I>::aio_discard(&m_image_ctx, aio_comp, event.offset,
- event.length, event.skip_partial_discard);
+ event.length, event.skip_partial_discard,
+ {});
if (flush_required) {
m_lock.Lock();
auto flush_comp = create_aio_flush_completion(nullptr);
m_lock.Unlock();
- io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp);
+ io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp, {});
}
}
@@ -342,13 +343,13 @@ void Replay<I>::handle_event(const journal::AioWriteEvent &event,
&flush_required);
io::ImageRequest<I>::aio_write(&m_image_ctx, aio_comp,
{{event.offset, event.length}},
- std::move(data), 0);
+ std::move(data), 0, {});
if (flush_required) {
m_lock.Lock();
auto flush_comp = create_aio_flush_completion(nullptr);
m_lock.Unlock();
- io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp);
+ io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp, {});
}
}
@@ -363,7 +364,7 @@ void Replay<I>::handle_event(const journal::AioFlushEvent &event,
Mutex::Locker locker(m_lock);
aio_comp = create_aio_flush_completion(on_safe);
}
- io::ImageRequest<I>::aio_flush(&m_image_ctx, aio_comp);
+ io::ImageRequest<I>::aio_flush(&m_image_ctx, aio_comp, {});
on_ready->complete(0);
}
@@ -380,13 +381,13 @@ void Replay<I>::handle_event(const journal::AioWriteSameEvent &event,
io::AIO_TYPE_WRITESAME,
&flush_required);
io::ImageRequest<I>::aio_writesame(&m_image_ctx, aio_comp, event.offset,
- event.length, std::move(data), 0);
+ event.length, std::move(data), 0, {});
if (flush_required) {
m_lock.Lock();
auto flush_comp = create_aio_flush_completion(nullptr);
m_lock.Unlock();
- io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp);
+ io::ImageRequest<I>::aio_flush(&m_image_ctx, flush_comp, {});
}
}
@@ -670,10 +671,10 @@ void Replay<I>::handle_event(const journal::FlattenEvent &event,
}
template <typename I>
-void Replay<I>::handle_event(const journal::DemoteEvent &event,
+void Replay<I>::handle_event(const journal::DemotePromoteEvent &event,
Context *on_ready, Context *on_safe) {
CephContext *cct = m_image_ctx.cct;
- ldout(cct, 20) << ": Demote event" << dendl;
+ ldout(cct, 20) << ": Demote/Promote event" << dendl;
on_ready->complete(0);
on_safe->complete(0);
}
diff --git a/src/librbd/journal/Replay.h b/src/librbd/journal/Replay.h
index afe71b994f7..d4a65e089bb 100644
--- a/src/librbd/journal/Replay.h
+++ b/src/librbd/journal/Replay.h
@@ -157,7 +157,7 @@ private:
Context *on_safe);
void handle_event(const FlattenEvent &event, Context *on_ready,
Context *on_safe);
- void handle_event(const DemoteEvent &event, Context *on_ready,
+ void handle_event(const DemotePromoteEvent &event, Context *on_ready,
Context *on_safe);
void handle_event(const SnapLimitEvent &event, Context *on_ready,
Context *on_safe);
diff --git a/src/librbd/journal/Types.cc b/src/librbd/journal/Types.cc
index c30a5752df2..857542bd74b 100644
--- a/src/librbd/journal/Types.cc
+++ b/src/librbd/journal/Types.cc
@@ -272,13 +272,13 @@ void ResizeEvent::dump(Formatter *f) const {
f->dump_unsigned("size", size);
}
-void DemoteEvent::encode(bufferlist& bl) const {
+void DemotePromoteEvent::encode(bufferlist& bl) const {
}
-void DemoteEvent::decode(__u8 version, bufferlist::iterator& it) {
+void DemotePromoteEvent::decode(__u8 version, bufferlist::iterator& it) {
}
-void DemoteEvent::dump(Formatter *f) const {
+void DemotePromoteEvent::dump(Formatter *f) const {
}
void UpdateFeaturesEvent::encode(bufferlist& bl) const {
@@ -400,8 +400,8 @@ void EventEntry::decode(bufferlist::iterator& it) {
case EVENT_TYPE_FLATTEN:
event = FlattenEvent();
break;
- case EVENT_TYPE_DEMOTE:
- event = DemoteEvent();
+ case EVENT_TYPE_DEMOTE_PROMOTE:
+ event = DemotePromoteEvent();
break;
case EVENT_TYPE_UPDATE_FEATURES:
event = UpdateFeaturesEvent();
@@ -484,7 +484,7 @@ void EventEntry::generate_test_instances(std::list<EventEntry *> &o) {
o.push_back(new EventEntry(FlattenEvent(123), utime_t(1, 1)));
- o.push_back(new EventEntry(DemoteEvent()));
+ o.push_back(new EventEntry(DemotePromoteEvent()));
o.push_back(new EventEntry(UpdateFeaturesEvent()));
o.push_back(new EventEntry(UpdateFeaturesEvent(123, 127, true), utime_t(1, 1)));
@@ -749,8 +749,8 @@ std::ostream &operator<<(std::ostream &out, const EventType &type) {
case EVENT_TYPE_FLATTEN:
out << "Flatten";
break;
- case EVENT_TYPE_DEMOTE:
- out << "Demote";
+ case EVENT_TYPE_DEMOTE_PROMOTE:
+ out << "Demote/Promote";
break;
case EVENT_TYPE_UPDATE_FEATURES:
out << "UpdateFeatures";
diff --git a/src/librbd/journal/Types.h b/src/librbd/journal/Types.h
index 9c8057bd36a..adc6f295575 100644
--- a/src/librbd/journal/Types.h
+++ b/src/librbd/journal/Types.h
@@ -37,7 +37,7 @@ enum EventType {
EVENT_TYPE_RENAME = 10,
EVENT_TYPE_RESIZE = 11,
EVENT_TYPE_FLATTEN = 12,
- EVENT_TYPE_DEMOTE = 13,
+ EVENT_TYPE_DEMOTE_PROMOTE = 13,
EVENT_TYPE_SNAP_LIMIT = 14,
EVENT_TYPE_UPDATE_FEATURES = 15,
EVENT_TYPE_METADATA_SET = 16,
@@ -316,8 +316,9 @@ struct FlattenEvent : public OpEventBase {
using OpEventBase::dump;
};
-struct DemoteEvent {
- static const EventType TYPE = static_cast<EventType>(EVENT_TYPE_DEMOTE);
+struct DemotePromoteEvent {
+ static const EventType TYPE = static_cast<EventType>(
+ EVENT_TYPE_DEMOTE_PROMOTE);
void encode(bufferlist& bl) const;
void decode(__u8 version, bufferlist::iterator& it);
@@ -395,7 +396,7 @@ typedef boost::variant<AioDiscardEvent,
RenameEvent,
ResizeEvent,
FlattenEvent,
- DemoteEvent,
+ DemotePromoteEvent,
SnapLimitEvent,
UpdateFeaturesEvent,
MetadataSetEvent,
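
The rename deliberately reuses wire value 13, so journal entries written by a pre-rename demote still decode as DemotePromoteEvent; only the C++ identifier and the pretty-printed name change. A compile-time pin of that invariant could look like this (not in the patch):

    static_assert(EVENT_TYPE_DEMOTE_PROMOTE == 13,
                  "shared wire value with the retired DemoteEvent");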
diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc
index 5a4bf51d5ba..86ddd0a8573 100644
--- a/src/librbd/librbd.cc
+++ b/src/librbd/librbd.cc
@@ -184,6 +184,15 @@ void mirror_image_status_cpp_to_c(const librbd::mirror_image_status_t &cpp_statu
c_status->up = cpp_status.up;
}
+void trash_image_info_cpp_to_c(const librbd::trash_image_info_t &cpp_info,
+ rbd_trash_image_info_t *c_info) {
+ c_info->id = strdup(cpp_info.id.c_str());
+ c_info->name = strdup(cpp_info.name.c_str());
+ c_info->source = cpp_info.source;
+ c_info->deletion_time = cpp_info.deletion_time;
+ c_info->deferment_end_time = cpp_info.deferment_end_time;
+}
+
struct C_MirrorImageGetInfo : public Context {
rbd_mirror_image_info_t *mirror_image_info;
Context *on_finish;
@@ -551,16 +560,22 @@ namespace librbd {
return r;
}
+ int RBD::trash_get(IoCtx &io_ctx, const char *id, trash_image_info_t *info) {
+ return librbd::trash_get(io_ctx, id, info);
+ }
+
int RBD::trash_list(IoCtx &io_ctx, vector<trash_image_info_t> &entries) {
TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
tracepoint(librbd, trash_list_enter,
io_ctx.get_pool_name().c_str(), io_ctx.get_id());
int r = librbd::trash_list(io_ctx, entries);
+#ifdef WITH_LTTNG
if (r >= 0) {
for (const auto& entry : entries) {
tracepoint(librbd, trash_list_entry, entry.id.c_str());
}
}
+#endif
tracepoint(librbd, trash_list_exit, r, r);
return r;
}
@@ -1030,11 +1045,23 @@ namespace librbd {
int Image::parent_info(string *parent_pool_name, string *parent_name,
string *parent_snap_name)
{
+ return parent_info2(parent_pool_name, parent_name, nullptr,
+ parent_snap_name);
+ }
+
+ int Image::parent_info2(string *parent_pool_name, string *parent_name,
+ string *parent_id, string *parent_snap_name)
+ {
ImageCtx *ictx = (ImageCtx *)ctx;
- tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
+ tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only);
int r = librbd::get_parent_info(ictx, parent_pool_name, parent_name,
- parent_snap_name);
- tracepoint(librbd, get_parent_info_exit, r, parent_pool_name ? parent_pool_name->c_str() : NULL, parent_name ? parent_name->c_str() : NULL, parent_snap_name ? parent_snap_name->c_str() : NULL);
+ parent_id, parent_snap_name);
+ tracepoint(librbd, get_parent_info_exit, r,
+ parent_pool_name ? parent_pool_name->c_str() : NULL,
+ parent_name ? parent_name->c_str() : NULL,
+ parent_id ? parent_id->c_str() : NULL,
+ parent_snap_name ? parent_snap_name->c_str() : NULL);
return r;
}
@@ -2114,11 +2141,6 @@ extern "C" int rbd_list(rados_ioctx_t p, char *names, size_t *size)
tracepoint(librbd, list_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id());
vector<string> cpp_names;
int r = librbd::list(io_ctx, cpp_names);
- if (r == -ENOENT) {
- tracepoint(librbd, list_exit, 0, *size);
- return 0;
- }
-
if (r < 0) {
tracepoint(librbd, list_exit, r, *size);
return r;
@@ -2283,6 +2305,26 @@ extern "C" int rbd_trash_move(rados_ioctx_t p, const char *name,
return r;
}
+extern "C" int rbd_trash_get(rados_ioctx_t io, const char *id,
+ rbd_trash_image_info_t *info) {
+ librados::IoCtx io_ctx;
+ librados::IoCtx::from_rados_ioctx_t(io, io_ctx);
+
+ librbd::trash_image_info_t cpp_info;
+ int r = librbd::trash_get(io_ctx, id, &cpp_info);
+ if (r < 0) {
+ return r;
+ }
+
+ trash_image_info_cpp_to_c(cpp_info, info);
+ return 0;
+}
+
+extern "C" void rbd_trash_get_cleanup(rbd_trash_image_info_t *info) {
+ free(info->id);
+ free(info->name);
+}
+
extern "C" int rbd_trash_list(rados_ioctx_t p, rbd_trash_image_info_t *entries,
size_t *num_entries) {
librados::IoCtx io_ctx;
@@ -2306,12 +2348,7 @@ extern "C" int rbd_trash_list(rados_ioctx_t p, rbd_trash_image_info_t *entries,
int i=0;
for (const auto &entry : cpp_entries) {
- entries[i].id = strdup(entry.id.c_str());
- entries[i].name = strdup(entry.name.c_str());
- entries[i].source = entry.source;
- entries[i].deletion_time = entry.deletion_time;
- entries[i].deferment_end_time = entry.deferment_end_time;
- i++;
+ trash_image_info_cpp_to_c(entry, &entries[i++]);
}
*num_entries = cpp_entries.size();
@@ -2321,8 +2358,7 @@ extern "C" int rbd_trash_list(rados_ioctx_t p, rbd_trash_image_info_t *entries,
extern "C" void rbd_trash_list_cleanup(rbd_trash_image_info_t *entries,
size_t num_entries) {
for (size_t i=0; i < num_entries; i++) {
- free(entries[i].id);
- free(entries[i].name);
+ rbd_trash_get_cleanup(&entries[i]);
}
}
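
rbd_trash_get() fills a caller-owned rbd_trash_image_info_t whose id/name strings are strdup'd, which is why rbd_trash_get_cleanup() frees exactly those two fields — and why rbd_trash_list_cleanup can now delegate to it per entry. Minimal usage sketch:

    #include <rbd/librbd.h>
    #include <stdio.h>

    int print_trash_entry(rados_ioctx_t ioctx, const char *image_id) {
      rbd_trash_image_info_t info;
      int r = rbd_trash_get(ioctx, image_id, &info);
      if (r < 0)
        return r;
      printf("trashed image %s (was named %s)\n", info.id, info.name);
      rbd_trash_get_cleanup(&info); /* frees info.id and info.name only */
      return 0;
    }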
@@ -2825,22 +2861,38 @@ extern "C" int64_t rbd_get_data_pool_id(rbd_image_t image)
}
extern "C" int rbd_get_parent_info(rbd_image_t image,
- char *parent_pool_name, size_t ppool_namelen, char *parent_name,
- size_t pnamelen, char *parent_snap_name, size_t psnap_namelen)
+ char *parent_pool_name, size_t ppool_namelen,
+ char *parent_name, size_t pnamelen,
+ char *parent_snap_name, size_t psnap_namelen)
+{
+ return rbd_get_parent_info2(image, parent_pool_name, ppool_namelen,
+ parent_name, pnamelen, nullptr, 0,
+ parent_snap_name, psnap_namelen);
+}
+
+extern "C" int rbd_get_parent_info2(rbd_image_t image,
+ char *parent_pool_name,
+ size_t ppool_namelen,
+ char *parent_name, size_t pnamelen,
+ char *parent_id, size_t pidlen,
+ char *parent_snap_name,
+ size_t psnap_namelen)
{
librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
- tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only);
- string p_pool_name, p_name, p_snap_name;
+ tracepoint(librbd, get_parent_info_enter, ictx, ictx->name.c_str(),
+ ictx->snap_name.c_str(), ictx->read_only);
+ string p_pool_name, p_name, p_id, p_snap_name;
- int r = librbd::get_parent_info(ictx, &p_pool_name, &p_name, &p_snap_name);
+ int r = librbd::get_parent_info(ictx, &p_pool_name, &p_name, &p_id,
+ &p_snap_name);
if (r < 0) {
- tracepoint(librbd, get_parent_info_exit, r, NULL, NULL, NULL);
+ tracepoint(librbd, get_parent_info_exit, r, NULL, NULL, NULL, NULL);
return r;
}
if (parent_pool_name) {
if (p_pool_name.length() + 1 > ppool_namelen) {
- tracepoint(librbd, get_parent_info_exit, -ERANGE, NULL, NULL, NULL);
+ tracepoint(librbd, get_parent_info_exit, -ERANGE, NULL, NULL, NULL, NULL);
return -ERANGE;
}
@@ -2848,22 +2900,31 @@ extern "C" int rbd_get_parent_info(rbd_image_t image,
}
if (parent_name) {
if (p_name.length() + 1 > pnamelen) {
- tracepoint(librbd, get_parent_info_exit, -ERANGE, NULL, NULL, NULL);
+ tracepoint(librbd, get_parent_info_exit, -ERANGE, NULL, NULL, NULL, NULL);
return -ERANGE;
}
strcpy(parent_name, p_name.c_str());
}
+ if (parent_id) {
+ if (p_id.length() + 1 > pidlen) {
+ tracepoint(librbd, get_parent_info_exit, -ERANGE, NULL, NULL, NULL, NULL);
+ return -ERANGE;
+ }
+
+ strcpy(parent_id, p_id.c_str());
+ }
if (parent_snap_name) {
if (p_snap_name.length() + 1 > psnap_namelen) {
- tracepoint(librbd, get_parent_info_exit, -ERANGE, NULL, NULL, NULL);
+ tracepoint(librbd, get_parent_info_exit, -ERANGE, NULL, NULL, NULL, NULL);
return -ERANGE;
}
strcpy(parent_snap_name, p_snap_name.c_str());
}
- tracepoint(librbd, get_parent_info_exit, 0, parent_pool_name, parent_name, parent_snap_name);
+ tracepoint(librbd, get_parent_info_exit, 0, parent_pool_name, parent_name,
+ parent_id, parent_snap_name);
return 0;
}
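
As with the original call, every output buffer of rbd_get_parent_info2() is optional (pass NULL to skip a field), and -ERANGE signals that a supplied buffer cannot hold the NUL-terminated value. Sketch:

    char pool[64], name[128], id[64], snap[128];
    int r = rbd_get_parent_info2(image, pool, sizeof(pool),
                                 name, sizeof(name),
                                 id, sizeof(id),
                                 snap, sizeof(snap));
    if (r == -ERANGE) {
      /* grow the offending buffer and retry */
    }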
@@ -3972,14 +4033,6 @@ extern "C" int rbd_group_list(rados_ioctx_t p, char *names, size_t *size)
vector<string> cpp_names;
int r = librbd::list(io_ctx, cpp_names);
-
- if (r == -ENOENT) {
- *size = 0;
- *names = '\0';
- tracepoint(librbd, group_list_exit, 0);
- return 0;
- }
-
if (r < 0) {
tracepoint(librbd, group_list_exit, r);
return r;
diff --git a/src/librbd/object_map/UpdateRequest.cc b/src/librbd/object_map/UpdateRequest.cc
index e88085add41..8c0ec69de8d 100644
--- a/src/librbd/object_map/UpdateRequest.cc
+++ b/src/librbd/object_map/UpdateRequest.cc
@@ -42,7 +42,10 @@ void UpdateRequest<I>::send() {
m_new_state, m_current_state);
librados::AioCompletion *rados_completion = create_callback_completion();
- int r = m_image_ctx.md_ctx.aio_operate(oid, rados_completion, &op);
+ std::vector<librados::snap_t> snaps;
+ int r = m_image_ctx.md_ctx.aio_operate(
+ oid, rados_completion, &op, 0, snaps,
+ (m_trace.valid() ? m_trace.get_info() : nullptr));
assert(r == 0);
rados_completion->release();
}
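
The snaps/trace overload of aio_operate lets the object-map update carry its span down into librados; passing nullptr when the trace is invalid keeps the non-traced path free of tracing overhead. The guard pattern, isolated as a sketch:

    // Only forward blkin span info when tracing is active.
    auto *trace_info = m_trace.valid() ? m_trace.get_info() : nullptr;
    int r = md_ctx.aio_operate(oid, rados_completion, &op,
                               0 /* snap seq */, snaps, trace_info);
    assert(r == 0);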
diff --git a/src/librbd/object_map/UpdateRequest.h b/src/librbd/object_map/UpdateRequest.h
index 0bb9cd756e2..175160752da 100644
--- a/src/librbd/object_map/UpdateRequest.h
+++ b/src/librbd/object_map/UpdateRequest.h
@@ -7,6 +7,8 @@
#include "include/int_types.h"
#include "librbd/object_map/Request.h"
#include "common/bit_vector.hpp"
+#include "common/zipkin_trace.h"
+#include "librbd/Utils.h"
#include <boost/optional.hpp>
class Context;
@@ -25,21 +27,27 @@ public:
uint64_t snap_id, uint64_t start_object_no,
uint64_t end_object_no, uint8_t new_state,
const boost::optional<uint8_t> &current_state,
+ const ZTracer::Trace &parent_trace,
Context *on_finish) {
return new UpdateRequest(image_ctx, object_map, snap_id, start_object_no,
end_object_no, new_state, current_state,
- on_finish);
+ parent_trace, on_finish);
}
UpdateRequest(ImageCtx &image_ctx, ceph::BitVector<2> *object_map,
uint64_t snap_id, uint64_t start_object_no,
uint64_t end_object_no, uint8_t new_state,
const boost::optional<uint8_t> &current_state,
- Context *on_finish)
+ const ZTracer::Trace &parent_trace, Context *on_finish)
: Request(image_ctx, snap_id, on_finish), m_object_map(*object_map),
m_start_object_no(start_object_no), m_end_object_no(end_object_no),
- m_new_state(new_state), m_current_state(current_state)
+ m_new_state(new_state), m_current_state(current_state),
+ m_trace(util::create_trace(image_ctx, "update object map", parent_trace))
{
+ m_trace.event("start");
+ }
+ virtual ~UpdateRequest() {
+ m_trace.event("finish");
}
void send() override;
@@ -53,6 +61,7 @@ private:
uint64_t m_end_object_no;
uint8_t m_new_state;
boost::optional<uint8_t> m_current_state;
+ ZTracer::Trace m_trace;
};
} // namespace object_map
diff --git a/src/librbd/operation/FlattenRequest.cc b/src/librbd/operation/FlattenRequest.cc
index 2cfa1ad1c13..e01dc2bc67b 100644
--- a/src/librbd/operation/FlattenRequest.cc
+++ b/src/librbd/operation/FlattenRequest.cc
@@ -42,7 +42,7 @@ public:
bufferlist bl;
string oid = image_ctx.get_object_name(m_object_no);
auto req = new io::ObjectWriteRequest(&image_ctx, oid, m_object_no, 0,
- bl, m_snapc, this, 0);
+ bl, m_snapc, 0, {}, this);
if (!req->has_parent()) {
// stop early if the parent went away - it just means
// another flatten finished first or the image was resized
diff --git a/src/librbd/operation/Request.cc b/src/librbd/operation/Request.cc
index 7534004edc3..d6201d4cb46 100644
--- a/src/librbd/operation/Request.cc
+++ b/src/librbd/operation/Request.cc
@@ -178,4 +178,6 @@ void Request<I>::handle_op_event_safe(int r) {
} // namespace operation
} // namespace librbd
+#ifndef TEST_F
template class librbd::operation::Request<librbd::ImageCtx>;
+#endif
diff --git a/src/librbd/operation/TrimRequest.cc b/src/librbd/operation/TrimRequest.cc
index 5c59e3c16a7..46ec967b5ed 100644
--- a/src/librbd/operation/TrimRequest.cc
+++ b/src/librbd/operation/TrimRequest.cc
@@ -46,7 +46,7 @@ public:
ldout(image_ctx.cct, 10) << "removing (with copyup) " << oid << dendl;
auto req = new io::ObjectTrimRequest(&image_ctx, oid, m_object_no,
- m_snapc, this, false);
+ m_snapc, false, this);
req->send();
return 0;
}
@@ -277,7 +277,7 @@ void TrimRequest<I>::send_pre_copyup() {
RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
CEPH_NOSNAP, m_copyup_start, m_copyup_end, OBJECT_PENDING,
- OBJECT_EXISTS, this)) {
+ OBJECT_EXISTS, {}, this)) {
return;
}
}
@@ -309,7 +309,7 @@ void TrimRequest<I>::send_pre_remove() {
RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
CEPH_NOSNAP, m_delete_start, m_num_objects, OBJECT_PENDING,
- OBJECT_EXISTS, this)) {
+ OBJECT_EXISTS, {}, this)) {
return;
}
}
@@ -337,7 +337,7 @@ void TrimRequest<I>::send_post_copyup() {
RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
CEPH_NOSNAP, m_copyup_start, m_copyup_end, OBJECT_NONEXISTENT,
- OBJECT_PENDING, this)) {
+ OBJECT_PENDING, {}, this)) {
return;
}
}
@@ -365,7 +365,7 @@ void TrimRequest<I>::send_post_remove() {
RWLock::WLocker object_map_locker(image_ctx.object_map_lock);
if (image_ctx.object_map->template aio_update<AsyncRequest<I> >(
CEPH_NOSNAP, m_delete_start, m_num_objects, OBJECT_NONEXISTENT,
- OBJECT_PENDING, this)) {
+ OBJECT_PENDING, {}, this)) {
return;
}
}
@@ -416,10 +416,10 @@ void TrimRequest<I>::send_clean_boundary() {
io::ObjectRequest<> *req;
if (p->offset == 0) {
req = new io::ObjectTrimRequest(&image_ctx, p->oid.name, p->objectno,
- snapc, req_comp, true);
+ snapc, true, req_comp);
} else {
req = new io::ObjectTruncateRequest(&image_ctx, p->oid.name, p->objectno,
- p->offset, snapc, req_comp);
+ p->offset, snapc, {}, req_comp);
}
req->send();
}
diff --git a/src/log/test.cc b/src/log/test.cc
index 573cc35b5e4..e11505af456 100644
--- a/src/log/test.cc
+++ b/src/log/test.cc
@@ -3,6 +3,7 @@
#include "log/Log.h"
#include "common/Clock.h"
#include "common/PrebufferedStreambuf.h"
+#include "include/coredumpctl.h"
#include "SubsystemMap.h"
using namespace ceph::logging;
@@ -201,7 +202,10 @@ void do_segv()
log.inject_segv();
Entry *e = new Entry(ceph_clock_now(), pthread_self(), 10, 1);
- log.submit_entry(e); // this should segv
+ {
+ PrCtl unset_dumpable;
+ log.submit_entry(e); // this should segv
+ }
log.flush();
log.stop();
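
do_segv() crashes on purpose after log.inject_segv(); the new PrCtl guard from include/coredumpctl.h clears the process's dumpable flag for that scope so the expected SIGSEGV does not leave core files behind under test runners. The shape of such a guard, as a sketch over prctl(2) rather than the actual header:

    #include <sys/prctl.h>

    // RAII: suppress core dumps for a scope; restore on exit.
    struct ScopedNoDump {
      int old_state;
      ScopedNoDump() : old_state(prctl(PR_GET_DUMPABLE)) {
        prctl(PR_SET_DUMPABLE, 0);
      }
      ~ScopedNoDump() {
        if (old_state >= 0)
          prctl(PR_SET_DUMPABLE, old_state);
      }
    };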
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 6be3a1658c6..03ba644631e 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -115,6 +115,7 @@ ostream& operator<<(ostream& out, const CDir& dir)
if (dir.state_test(CDir::STATE_COMPLETE)) out << "|complete";
if (dir.state_test(CDir::STATE_FREEZINGTREE)) out << "|freezingtree";
if (dir.state_test(CDir::STATE_FROZENTREE)) out << "|frozentree";
+ if (dir.state_test(CDir::STATE_AUXSUBTREE)) out << "|auxsubtree";
//if (dir.state_test(CDir::STATE_FROZENTREELEAF)) out << "|frozentreeleaf";
if (dir.state_test(CDir::STATE_FROZENDIR)) out << "|frozendir";
if (dir.state_test(CDir::STATE_FREEZINGDIR)) out << "|freezingdir";
@@ -1009,7 +1010,16 @@ void CDir::merge(list<CDir*>& subs, list<MDSInternalContextBase*>& waiters, bool
{
dout(10) << "merge " << subs << dendl;
- set_dir_auth(subs.front()->get_dir_auth());
+ mds_authority_t new_auth = CDIR_AUTH_DEFAULT;
+ for (auto dir : subs) {
+ if (dir->get_dir_auth() != CDIR_AUTH_DEFAULT &&
+ dir->get_dir_auth() != new_auth) {
+ assert(new_auth == CDIR_AUTH_DEFAULT);
+ new_auth = dir->get_dir_auth();
+ }
+ }
+
+ set_dir_auth(new_auth);
prepare_new_fragment(replay);
nest_info_t rstatdiff;
@@ -1018,8 +1028,7 @@ void CDir::merge(list<CDir*>& subs, list<MDSInternalContextBase*>& waiters, bool
version_t rstat_version = inode->get_projected_inode()->rstat.version;
version_t dirstat_version = inode->get_projected_inode()->dirstat.version;
- for (list<CDir*>::iterator p = subs.begin(); p != subs.end(); ++p) {
- CDir *dir = *p;
+ for (auto dir : subs) {
dout(10) << " subfrag " << dir->get_frag() << " " << *dir << dendl;
assert(!dir->is_auth() || dir->is_complete() || replay);
@@ -1228,10 +1237,7 @@ void CDir::add_waiter(uint64_t tag, MDSInternalContextBase *c)
}
}
- if (tag & WAIT_CREATED) {
- assert(state_test(STATE_CREATING));
- assert(state_test(STATE_FRAGMENTING));
- }
+ assert(!(tag & WAIT_CREATED) || state_test(STATE_CREATING));
MDSCacheObject::add_waiter(tag, c);
}
@@ -1346,11 +1352,9 @@ void CDir::mark_new(LogSegment *ls)
ls->new_dirfrags.push_back(&item_new);
state_clear(STATE_CREATING);
- if (state_test(CDir::STATE_FRAGMENTING)) {
- list<MDSInternalContextBase*> ls;
- take_waiting(CDir::WAIT_CREATED, ls);
- cache->mds->queue_waiters(ls);
- }
+ list<MDSInternalContextBase*> waiters;
+ take_waiting(CDir::WAIT_CREATED, waiters);
+ cache->mds->queue_waiters(waiters);
}
void CDir::mark_clean()
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
index 309af4aba54..3a65a82b3e0 100644
--- a/src/mds/CDir.h
+++ b/src/mds/CDir.h
@@ -92,6 +92,7 @@ public:
static const unsigned STATE_ASSIMRSTAT = (1<<17); // assimilating inode->frag rstats
static const unsigned STATE_DIRTYDFT = (1<<18); // dirty dirfragtree
static const unsigned STATE_BADFRAG = (1<<19); // bad dirfrag
+ static const unsigned STATE_AUXSUBTREE = (1<<20); // no subtree merge
// common states
static const unsigned STATE_CLEAN = 0;
@@ -117,6 +118,7 @@ public:
(STATE_DIRTY|
STATE_EXPORTBOUND |
STATE_IMPORTBOUND |
+ STATE_AUXSUBTREE |
STATE_REJOINUNDEF);
// -- rep spec --
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index a2541b6578d..7af16b5641e 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -244,6 +244,10 @@ ostream& operator<<(ostream& out, const CInode& in)
in.print_pin_set(out);
}
+ if (in.inode.export_pin != MDS_RANK_NONE) {
+ out << " export_pin=" << in.inode.export_pin;
+ }
+
out << " " << &in;
out << "]";
return out;
@@ -648,6 +652,8 @@ CDir *CInode::add_dirfrag(CDir *dir)
dir->get(CDir::PIN_STICKY);
}
+ maybe_export_pin();
+
return dir;
}
@@ -1540,6 +1546,7 @@ void CInode::encode_lock_state(int type, bufferlist& bl)
::encode(inode.ctime, bl);
::encode(inode.layout, bl, mdcache->mds->mdsmap->get_up_features());
::encode(inode.quota, bl);
+ ::encode(inode.export_pin, bl);
}
break;
@@ -1802,6 +1809,8 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
if (inode.ctime < tm) inode.ctime = tm;
::decode(inode.layout, p);
::decode(inode.quota, p);
+ ::decode(inode.export_pin, p);
+ maybe_export_pin();
}
break;
@@ -4397,3 +4406,108 @@ int64_t CInode::get_backtrace_pool() const
return inode.layout.pool_id;
}
}
+
+class C_CInode_ExportPin : public MDSInternalContext {
+public:
+ explicit C_CInode_ExportPin(CInode *in) : MDSInternalContext(in->mdcache->mds), in(in) {
+ in->get(MDSCacheObject::PIN_PTRWAITER);
+ }
+ ~C_CInode_ExportPin() {
+ in->put(MDSCacheObject::PIN_PTRWAITER);
+ }
+
+ void finish(int r) override {
+ in->maybe_export_pin();
+ }
+private:
+ CInode *in;
+};
+
+void CInode::maybe_export_pin()
+{
+ if (g_conf->mds_bal_export_pin && is_dir() && is_normal()) {
+ mds_rank_t pin = get_export_pin(false);
+ dout(20) << "maybe_export_pin export_pin=" << pin << " on " << *this << dendl;
+ if (pin == mdcache->mds->get_nodeid()) {
+ for (auto it = dirfrags.begin(); it != dirfrags.end(); it++) {
+ CDir *cd = it->second;
+ dout(20) << "dirfrag: " << *cd << dendl;
+ if (cd->state_test(CDir::STATE_CREATING)) {
+ /* inode is not journaled yet */
+ cd->add_waiter(CDir::WAIT_CREATED, new C_CInode_ExportPin(this));
+ dout(15) << "aux subtree pin of " << *cd << " delayed for finished creation" << dendl;
+ continue;
+ }
+ if (cd->state_test(CDir::STATE_AUXSUBTREE)) continue;
+ CDir *subtree = mdcache->get_subtree_root(cd);
+ assert(subtree);
+ if (subtree->is_ambiguous_auth()) {
+ subtree->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_CInode_ExportPin(this));
+ dout(15) << "aux subtree pin of " << *cd << " delayed for single auth on subtree " << *subtree << dendl;
+ } else if (subtree->is_auth()) {
+ assert(cd->is_auth());
+ if (subtree->is_frozen() || subtree->is_freezing()) {
+ subtree->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_CInode_ExportPin(this));
+ dout(15) << "aux subtree pin of " << *cd << " delayed for unfreeze on subtree " << *subtree << dendl;
+ } else {
+ cd->state_set(CDir::STATE_AUXSUBTREE);
+ mdcache->adjust_subtree_auth(cd, mdcache->mds->get_nodeid());
+ dout(15) << "aux subtree pinned " << *cd << dendl;
+ }
+ } else {
+ assert(!cd->is_auth());
+ dout(15) << "not setting aux subtree pin for " << *cd << " because not auth" << dendl;
+ }
+ }
+ } else if (pin != MDS_RANK_NONE) {
+ for (auto it = dirfrags.begin(); it != dirfrags.end(); it++) {
+ CDir *cd = it->second;
+ if (cd->is_auth() && cd->state_test(CDir::STATE_AUXSUBTREE)) {
+ assert(!(cd->is_frozen() || cd->is_freezing()));
+ assert(!cd->state_test(CDir::STATE_EXPORTBOUND));
+ cd->state_clear(CDir::STATE_AUXSUBTREE); /* merge will happen eventually */
+ dout(15) << "cleared aux subtree pin " << *cd << dendl;
+ }
+ }
+ dout(20) << "adding to export_pin_queue " << *this << dendl;
+ mdcache->export_pin_queue.insert(this);
+ }
+ }
+}
+
+void CInode::set_export_pin(mds_rank_t rank)
+{
+ assert(is_dir());
+ assert(is_projected());
+ get_projected_inode()->export_pin = rank;
+ maybe_export_pin();
+}
+
+mds_rank_t CInode::get_export_pin(bool inherit) const
+{
+ /* An inode that is export pinned may not necessarily be a subtree root; we
+ * need to traverse the parents. A base or system inode cannot be pinned.
+ * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
+ * have a parent yet.
+ */
+ for (const CInode *in = this; !in->is_base() && !in->is_system() && in->get_projected_parent_dn(); in = in->get_projected_parent_dn()->dir->inode) {
+ mds_rank_t pin = in->get_projected_inode()->export_pin;
+ if (pin >= 0) {
+ return pin;
+ }
+ if (!inherit) break;
+ }
+ return MDS_RANK_NONE;
+}
+
+bool CInode::is_exportable(mds_rank_t dest) const
+{
+ mds_rank_t pin = get_export_pin();
+ if (pin == dest) {
+ return true;
+ } else if (pin >= 0) {
+ return false;
+ } else {
+ return true;
+ }
+}
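
Taken together: get_export_pin() walks the projected parent chain, so pins inherit downward until a closer pin overrides them, and is_exportable(dest) permits a move only when the effective pin is dest or there is no pin at all. Worked example (paths and ranks illustrative):

    // Suppose /a carries export_pin=1 and /a/b sets no pin of its own:
    //   a->get_export_pin()       == 1
    //   b->get_export_pin()       == 1              (inherited from /a)
    //   b->get_export_pin(false)  == MDS_RANK_NONE  (no pin of its own)
    //   b->is_exportable(1)       == true
    //   b->is_exportable(2)       == false          (pinned elsewhere)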
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index 8b8cdfe175b..eae86ef4c33 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -627,6 +627,7 @@ public:
friend class StrayManager;
friend class CDir;
friend class CInodeExport;
+ friend class C_CInode_ExportPin;
// ---------------------------
CInode(MDCache *c, bool auth=true, snapid_t f=2, snapid_t l=CEPH_NOSNAP) :
@@ -682,6 +683,7 @@ public:
bool is_mdsdir() const { return MDS_INO_IS_MDSDIR(inode.ino); }
bool is_base() const { return is_root() || is_mdsdir(); }
bool is_system() const { return inode.ino < MDS_INO_SYSTEM_BASE; }
+ bool is_normal() const { return !(is_base() || is_system() || is_stray()); }
bool is_head() const { return last == CEPH_NOSNAP; }
@@ -1066,6 +1068,13 @@ public:
projected_parent.pop_front();
}
+private:
+ void maybe_export_pin();
+public:
+ void set_export_pin(mds_rank_t rank);
+ mds_rank_t get_export_pin(bool inherit=true) const;
+ bool is_exportable(mds_rank_t dest) const;
+
void print(ostream& out) override;
void dump(Formatter *f) const;
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 58c8795b7a8..18125e3ef45 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -47,8 +47,6 @@
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
-#undef DOUT_COND
-#define DOUT_COND(cct, l) l<=cct->_conf->debug_mds || l <= cct->_conf->debug_mds_locker
#define dout_context g_ceph_context
#define dout_prefix _prefix(_dout, mds)
static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
@@ -1052,8 +1050,8 @@ public:
p->get(MDSCacheObject::PIN_PTRWAITER);
}
void finish(int r) override {
- p->put(MDSCacheObject::PIN_PTRWAITER);
locker->try_eval(p, mask);
+ p->put(MDSCacheObject::PIN_PTRWAITER);
}
};
@@ -1755,30 +1753,29 @@ version_t Locker::issue_file_data_version(CInode *in)
class C_Locker_FileUpdate_finish : public LockerLogContext {
CInode *in;
MutationRef mut;
- bool share;
+ bool share_max;
+ bool need_issue;
client_t client;
- Capability *cap;
MClientCaps *ack;
public:
C_Locker_FileUpdate_finish(Locker *l, CInode *i, MutationRef& m,
- bool e=false, client_t c=-1,
- Capability *cp = 0,
+ bool sm=false, bool ni=false, client_t c=-1,
MClientCaps *ac = 0)
- : LockerLogContext(l), in(i), mut(m), share(e), client(c), cap(cp),
- ack(ac) {
+ : LockerLogContext(l), in(i), mut(m), share_max(sm), need_issue(ni),
+ client(c), ack(ac) {
in->get(CInode::PIN_PTRWAITER);
}
void finish(int r) override {
- locker->file_update_finish(in, mut, share, client, cap, ack);
+ locker->file_update_finish(in, mut, share_max, need_issue, client, ack);
+ in->put(CInode::PIN_PTRWAITER);
}
};
-void Locker::file_update_finish(CInode *in, MutationRef& mut, bool share, client_t client,
- Capability *cap, MClientCaps *ack)
+void Locker::file_update_finish(CInode *in, MutationRef& mut, bool share_max, bool issue_client_cap,
+ client_t client, MClientCaps *ack)
{
dout(10) << "file_update_finish on " << *in << dendl;
in->pop_and_dirty_projected_inode(mut->ls);
- in->put(CInode::PIN_PTRWAITER);
mut->apply();
@@ -1823,12 +1820,13 @@ void Locker::file_update_finish(CInode *in, MutationRef& mut, bool share, client
eval_cap_gather(in, &need_issue);
}
} else {
- if (cap && (cap->wanted() & ~cap->pending()) &&
- need_issue.count(in) == 0) { // if we won't issue below anyway
- issue_caps(in, cap);
+ if (issue_client_cap && need_issue.count(in) == 0) {
+ Capability *cap = in->get_client_cap(client);
+ if (cap && (cap->wanted() & ~cap->pending()))
+ issue_caps(in, cap);
}
- if (share && in->is_auth() &&
+ if (share_max && in->is_auth() &&
(in->filelock.gcaps_allowed(CAP_LONER) & (CEPH_CAP_GWR|CEPH_CAP_GBUFFER)))
share_inode_max_size(in);
}
@@ -1968,10 +1966,12 @@ bool Locker::issue_caps(CInode *in, Capability *only_cap)
<< " wanted " << ccap_string(wanted)
<< dendl;
- // skip if suppress, and not revocation
- if (cap->is_suppress() && !(pending & ~allowed)) {
- dout(20) << " suppressed and !revoke, skipping client." << it->first << dendl;
- continue;
+ if (!(pending & ~allowed)) {
+ // skip if suppress or new, and not revocation
+ if (cap->is_new() || cap->is_suppress()) {
+ dout(20) << " !revoke and new|suppressed, skipping client." << it->first << dendl;
+ continue;
+ }
}
// notify clients about deleted inode, to make sure they release caps ASAP.
@@ -2146,9 +2146,9 @@ public:
in->get(CInode::PIN_PTRWAITER);
}
void finish(int r) override {
- in->put(CInode::PIN_PTRWAITER);
if (!in->is_auth())
locker->request_inode_file_caps(in);
+ in->put(CInode::PIN_PTRWAITER);
}
};
@@ -2225,9 +2225,9 @@ public:
in->get(CInode::PIN_PTRWAITER);
}
void finish(int r) override {
- in->put(CInode::PIN_PTRWAITER);
if (in->is_auth())
locker->check_inode_max_size(in, false, new_max_size, newsize, mtime);
+ in->put(CInode::PIN_PTRWAITER);
}
};
@@ -3073,10 +3073,8 @@ void Locker::_do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t foll
le->metablob.add_client_flush(metareqid_t(m->get_source(), ack->get_client_tid()),
ack->get_oldest_flush_tid());
- mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut,
- false,
- client, NULL,
- ack));
+ mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut, false, false,
+ client, ack));
}
void Locker::_update_cap_fields(CInode *in, int dirty, MClientCaps *m, inode_t *pi)
@@ -3346,9 +3344,8 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap,
ack->get_oldest_flush_tid());
mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut,
- change_max,
- client, cap,
- ack));
+ change_max, !!cap,
+ client, ack));
if (need_flush && !*need_flush &&
((change_max && new_max) || // max INCREASE
_need_flush_mdlog(in, dirty)))
diff --git a/src/mds/Locker.h b/src/mds/Locker.h
index c34c7f82709..eb3d45ce581 100644
--- a/src/mds/Locker.h
+++ b/src/mds/Locker.h
@@ -248,8 +248,8 @@ public:
protected:
void handle_inode_file_caps(class MInodeFileCaps *m);
- void file_update_finish(CInode *in, MutationRef& mut, bool share, client_t client, Capability *cap,
- MClientCaps *ack);
+ void file_update_finish(CInode *in, MutationRef& mut, bool share_max, bool issue_client_cap,
+ client_t client, MClientCaps *ack);
public:
void calc_new_client_ranges(CInode *in, uint64_t size,
map<client_t, client_writeable_range_t>* new_ranges,
diff --git a/src/mds/MDBalancer.cc b/src/mds/MDBalancer.cc
index 0e9f470b599..192dc9d02ec 100644
--- a/src/mds/MDBalancer.cc
+++ b/src/mds/MDBalancer.cc
@@ -28,7 +28,6 @@
#include "include/Context.h"
#include "msg/Messenger.h"
#include "messages/MHeartbeat.h"
-#include "messages/MMDSLoadTargets.h"
#include <fstream>
#include <iostream>
@@ -42,10 +41,19 @@ using std::vector;
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
-#undef DOUT_COND
-#define DOUT_COND(cct, l) l<=cct->_conf->debug_mds || l <= cct->_conf->debug_mds_balancer
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".bal "
+#undef dout
+#define dout(lvl) \
+ do {\
+ auto subsys = ceph_subsys_mds;\
+ if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
+ subsys = ceph_subsys_mds_balancer;\
+ }\
+ dout_impl(dout_context, subsys, lvl) dout_prefix
+#undef dendl
+#define dendl dendl_impl; } while (0)
+
#define MIN_LOAD 50 // ??
#define MIN_REEXPORT 5 // will automatically reexport
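
In place of the deleted DOUT_COND hook, dout/dendl are redefined locally so each message gathers under ceph_subsys_mds_balancer whenever that subsystem's level would accept it, falling back to ceph_subsys_mds otherwise; the balancer debug level can therefore be raised independently of the general mds level. The do { ... } while (0) split across the two macros keeps dout(...) << ... << dendl; a single statement. Approximate expansion of dout(7) << "msg" << dendl; for reference:

    do {
      auto subsys = ceph_subsys_mds;
      if (g_ceph_context->_conf->subsys.should_gather(
              ceph_subsys_mds_balancer, 7)) {
        subsys = ceph_subsys_mds_balancer;
      }
      dout_impl(g_ceph_context, subsys, 7) dout_prefix
          << "msg" << dendl_impl;
    } while (0);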
@@ -62,15 +70,58 @@ int MDBalancer::proc_message(Message *m)
break;
default:
- derr << " balancer unknown message " << m->get_type() << dendl;
+ dout(0) << " balancer unknown message " << m->get_type() << dendl;
assert(0 == "balancer unknown message");
}
return 0;
}
+void MDBalancer::handle_export_pins(void)
+{
+ auto &q = mds->mdcache->export_pin_queue;
+ auto it = q.begin();
+ dout(20) << "export_pin_queue size=" << q.size() << dendl;
+ while (it != q.end()) {
+ auto current = it++;
+ CInode *in = *current;
+ assert(in->is_dir());
+ mds_rank_t export_pin = in->get_export_pin();
+ if (!in->is_exportable(export_pin)) {
+ dout(10) << "can no longer export " << *in << " because export pins have since changed" << dendl;
+ q.erase(current);
+ continue;
+ }
+ dout(10) << "exporting dirfrags of " << *in << " to " << export_pin << dendl;
+ bool has_auth = false;
+ list<frag_t> ls;
+ in->dirfragtree.get_leaves(ls);
+ for (const auto &fg : ls) {
+ CDir *cd = in->get_dirfrag(fg);
+ if (cd && cd->is_auth()) {
+ /* N.B. when we are no longer auth after exporting, this function will remove the inode from the queue */
+ mds->mdcache->migrator->export_dir(cd, export_pin);
+ has_auth = true;
+ }
+ }
+ if (!has_auth) {
+ dout(10) << "can no longer export " << *in << " because I am not auth for any dirfrags" << dendl;
+ q.erase(current);
+ continue;
+ }
+ }
-
+ set<CDir *> authsubs;
+ mds->mdcache->get_auth_subtrees(authsubs);
+ for (auto &cd : authsubs) {
+ mds_rank_t export_pin = cd->inode->get_export_pin();
+ dout(10) << "auth tree " << *cd << " export_pin=" << export_pin << dendl;
+ if (export_pin >= 0 && export_pin != mds->get_nodeid()) {
+ dout(10) << "exporting auth subtree " << *cd->inode << " to " << export_pin << dendl;
+ mds->mdcache->migrator->export_dir(cd, export_pin);
+ }
+ }
+}
void MDBalancer::tick()
{
@@ -80,6 +131,10 @@ void MDBalancer::tick()
utime_t elapsed = now;
elapsed -= first;
+ if (g_conf->mds_bal_export_pin) {
+ handle_export_pins();
+ }
+
// sample?
if ((double)now - (double)last_sample > g_conf->mds_bal_sample_interval) {
dout(15) << "tick last_sample now " << now << dendl;
@@ -159,7 +214,7 @@ mds_load_t MDBalancer::get_load(utime_t now)
if (cpu.is_open())
cpu >> load.cpu_load_avg;
else
- derr << "input file " PROCPREFIX "'/proc/loadavg' not found" << dendl;
+ dout(0) << "input file " PROCPREFIX "'/proc/loadavg' not found" << dendl;
dout(15) << "get_load " << load << dendl;
return load;
@@ -338,14 +393,10 @@ void MDBalancer::export_empties()
{
dout(5) << "export_empties checking for empty imports" << dendl;
- for (map<CDir*,set<CDir*> >::iterator it = mds->mdcache->subtrees.begin();
- it != mds->mdcache->subtrees.end();
- ++it) {
- CDir *dir = it->first;
- if (!dir->is_auth() ||
- dir->is_ambiguous_auth() ||
- dir->is_freezing() ||
- dir->is_frozen())
+ std::set<CDir *> subtrees;
+ mds->mdcache->get_fullauth_subtrees(subtrees);
+ for (auto &dir : subtrees) {
+ if (dir->is_freezing() || dir->is_frozen())
continue;
if (!dir->inode->is_base() &&
@@ -357,7 +408,7 @@ void MDBalancer::export_empties()
-double MDBalancer::try_match(mds_rank_t ex, double& maxex,
+double MDBalancer::try_match(balance_state_t& state, mds_rank_t ex, double& maxex,
mds_rank_t im, double& maxim)
{
if (maxex <= 0 || maxim <= 0) return 0.0;
@@ -368,10 +419,10 @@ double MDBalancer::try_match(mds_rank_t ex, double& maxex,
dout(5) << " - mds." << ex << " exports " << howmuch << " to mds." << im << dendl;
if (ex == mds->get_nodeid())
- my_targets[im] += howmuch;
+ state.targets[im] += howmuch;
- exported[ex] += howmuch;
- imported[im] += howmuch;
+ state.exported[ex] += howmuch;
+ state.imported[im] += howmuch;
maxex -= howmuch;
maxim -= howmuch;
@@ -500,25 +551,20 @@ void MDBalancer::queue_merge(CDir *dir)
void MDBalancer::prep_rebalance(int beat)
{
+ balance_state_t state;
+
if (g_conf->mds_thrash_exports) {
//we're going to randomly export to all the mds in the cluster
- my_targets.clear();
set<mds_rank_t> up_mds;
mds->get_mds_map()->get_up_mds_set(up_mds);
- for (set<mds_rank_t>::iterator i = up_mds.begin();
- i != up_mds.end();
- ++i)
- my_targets[*i] = 0.0;
+ for (const auto &rank : up_mds) {
+ state.targets[rank] = 0.0;
+ }
} else {
int cluster_size = mds->get_mds_map()->get_num_in_mds();
mds_rank_t whoami = mds->get_nodeid();
rebalance_time = ceph_clock_now();
- // reset
- my_targets.clear();
- imported.clear();
- exported.clear();
-
dout(5) << " prep_rebalance: cluster loads are" << dendl;
mds->mdcache->migrator->clear_export_queue();
@@ -616,17 +662,16 @@ void MDBalancer::prep_rebalance(int beat)
for (multimap<double,mds_rank_t>::reverse_iterator ex = exporters.rbegin();
ex != exporters.rend();
++ex) {
- double maxex = get_maxex(ex->second);
+ double maxex = get_maxex(state, ex->second);
if (maxex <= .001) continue;
// check importers. for now, just in arbitrary order (no intelligent matching).
for (map<mds_rank_t, float>::iterator im = mds_import_map[ex->second].begin();
im != mds_import_map[ex->second].end();
++im) {
- double maxim = get_maxim(im->first);
+ double maxim = get_maxim(state, im->first);
if (maxim <= .001) continue;
- try_match(ex->second, maxex,
- im->first, maxim);
+ try_match(state, ex->second, maxex, im->first, maxim);
if (maxex <= .001) break;
}
}
@@ -640,11 +685,10 @@ void MDBalancer::prep_rebalance(int beat)
multimap<double,mds_rank_t>::iterator im = importers.begin();
while (ex != exporters.rend() &&
im != importers.end()) {
- double maxex = get_maxex(ex->second);
- double maxim = get_maxim(im->second);
+ double maxex = get_maxex(state, ex->second);
+ double maxim = get_maxim(state, im->second);
if (maxex < .001 || maxim < .001) break;
- try_match(ex->second, maxex,
- im->second, maxim);
+ try_match(state, ex->second, maxex, im->second, maxim);
if (maxex <= .001) ++ex;
if (maxim <= .001) ++im;
}
@@ -655,23 +699,31 @@ void MDBalancer::prep_rebalance(int beat)
multimap<double,mds_rank_t>::iterator im = importers.begin();
while (ex != exporters.end() &&
im != importers.end()) {
- double maxex = get_maxex(ex->second);
- double maxim = get_maxim(im->second);
+ double maxex = get_maxex(state, ex->second);
+ double maxim = get_maxim(state, im->second);
if (maxex < .001 || maxim < .001) break;
- try_match(ex->second, maxex,
- im->second, maxim);
+ try_match(state, ex->second, maxex, im->second, maxim);
if (maxex <= .001) ++ex;
if (maxim <= .001) ++im;
}
}
}
- try_rebalance();
+ try_rebalance(state);
}
-
+void MDBalancer::hit_targets(const balance_state_t& state)
+{
+ utime_t now = ceph_clock_now();
+ for (auto &it : state.targets) {
+ mds_rank_t target = it.first;
+ mds->hit_export_target(now, target, g_conf->mds_bal_target_decay);
+ }
+}
int MDBalancer::mantle_prep_rebalance()
{
+ balance_state_t state;
+
/* refresh balancer if it has changed */
if (bal_version != mds->mdsmap->get_balancer()) {
bal_version.assign("");
@@ -686,9 +738,6 @@ int MDBalancer::mantle_prep_rebalance()
/* prepare for balancing */
int cluster_size = mds->get_mds_map()->get_num_in_mds();
rebalance_time = ceph_clock_now();
- my_targets.clear();
- imported.clear();
- exported.clear();
mds->mdcache->migrator->clear_export_queue();
/* fill in the metrics for each mds by grabbing load struct */
@@ -709,24 +758,24 @@ int MDBalancer::mantle_prep_rebalance()
/* execute the balancer */
Mantle mantle;
- int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, my_targets);
- dout(2) << " mantle decided that new targets=" << my_targets << dendl;
+ int ret = mantle.balance(bal_code, mds->get_nodeid(), metrics, state.targets);
+ dout(2) << " mantle decided that new targets=" << state.targets << dendl;
/* mantle doesn't know about cluster size, so check target len here */
- if ((int) my_targets.size() != cluster_size)
+ if ((int) state.targets.size() != cluster_size)
return -EINVAL;
else if (ret)
return ret;
- try_rebalance();
+ try_rebalance(state);
return 0;
}
-void MDBalancer::try_rebalance()
+void MDBalancer::try_rebalance(balance_state_t& state)
{
- if (!check_targets())
+ if (!check_targets(state))
return;
if (g_conf->mds_thrash_exports) {
@@ -770,11 +819,9 @@ void MDBalancer::try_rebalance()
// do my exports!
set<CDir*> already_exporting;
- for (map<mds_rank_t,double>::iterator it = my_targets.begin();
- it != my_targets.end();
- ++it) {
- mds_rank_t target = (*it).first;
- double amount = (*it).second;
+ for (auto &it : state.targets) {
+ mds_rank_t target = it.first;
+ double amount = it.second;
if (amount < MIN_OFFLOAD) continue;
if (amount / target_load < .2) continue;
@@ -882,62 +929,15 @@ void MDBalancer::try_rebalance()
}
-/* returns true if all my_target MDS are in the MDSMap.
- */
-bool MDBalancer::check_targets()
+/* Check that all targets are in the MDSMap export_targets for my rank. */
+bool MDBalancer::check_targets(const balance_state_t& state)
{
- // get MonMap's idea of my_targets
- const set<mds_rank_t>& map_targets = mds->mdsmap->get_mds_info(mds->get_nodeid()).export_targets;
-
- bool send = false;
- bool ok = true;
-
- // make sure map targets are in the old_prev_targets map
- for (set<mds_rank_t>::iterator p = map_targets.begin(); p != map_targets.end(); ++p) {
- if (old_prev_targets.count(*p) == 0)
- old_prev_targets[*p] = 0;
- if (my_targets.count(*p) == 0)
- old_prev_targets[*p]++;
- }
-
- // check if the current MonMap has all our targets
- set<mds_rank_t> need_targets;
- for (map<mds_rank_t,double>::iterator i = my_targets.begin();
- i != my_targets.end();
- ++i) {
- need_targets.insert(i->first);
- old_prev_targets[i->first] = 0;
-
- if (!map_targets.count(i->first)) {
- dout(20) << " target mds." << i->first << " not in map's export_targets" << dendl;
- send = true;
- ok = false;
+ for (const auto &it : state.targets) {
+ if (!mds->is_export_target(it.first)) {
+ return false;
}
}
-
- set<mds_rank_t> want_targets = need_targets;
- map<mds_rank_t, int>::iterator p = old_prev_targets.begin();
- while (p != old_prev_targets.end()) {
- if (map_targets.count(p->first) == 0 &&
- need_targets.count(p->first) == 0) {
- old_prev_targets.erase(p++);
- continue;
- }
- dout(20) << " target mds." << p->first << " has been non-target for " << p->second << dendl;
- if (p->second < g_conf->mds_bal_target_removal_min)
- want_targets.insert(p->first);
- if (p->second >= g_conf->mds_bal_target_removal_max)
- send = true;
- ++p;
- }
-
- dout(10) << "check_targets have " << map_targets << " need " << need_targets << " want " << want_targets << dendl;
-
- if (send) {
- MMDSLoadTargets* m = new MMDSLoadTargets(mds_gid_t(mon_client->get_global_id()), want_targets);
- mon_client->send_mon_message(m);
- }
- return ok;
+ return true;
}
void MDBalancer::find_exports(CDir *dir,
diff --git a/src/mds/MDBalancer.h b/src/mds/MDBalancer.h
index 438ffdeb837..cbfc1c479ff 100644
--- a/src/mds/MDBalancer.h
+++ b/src/mds/MDBalancer.h
@@ -25,8 +25,6 @@ using std::map;
#include "include/types.h"
#include "common/Clock.h"
#include "common/Cond.h"
-#include "CInode.h"
-
class MDSRank;
class Message;
@@ -37,90 +35,32 @@ class Messenger;
class MonClient;
class MDBalancer {
- protected:
- MDSRank *mds;
- Messenger *messenger;
- MonClient *mon_client;
- int beat_epoch;
-
- int last_epoch_under;
- int last_epoch_over;
- string bal_code;
- string bal_version;
-
- utime_t last_heartbeat;
- utime_t last_sample;
- utime_t rebalance_time; //ensure a consistent view of load for rebalance
-
- // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir
- // just as soon as a delayed context comes back and triggers it.
- // These sets just prevent us from spawning extra timer contexts for
- // dirfrags that already have one in flight.
- set<dirfrag_t> split_pending, merge_pending;
-
- // per-epoch scatter/gathered info
- map<mds_rank_t, mds_load_t> mds_load;
- map<mds_rank_t, double> mds_meta_load;
- map<mds_rank_t, map<mds_rank_t, float> > mds_import_map;
-
- // per-epoch state
- double my_load, target_load;
- map<mds_rank_t,double> my_targets;
- map<mds_rank_t,double> imported;
- map<mds_rank_t,double> exported;
-
- map<mds_rank_t, int> old_prev_targets; // # iterations they _haven't_ been targets
- bool check_targets();
-
- double try_match(mds_rank_t ex, double& maxex,
- mds_rank_t im, double& maxim);
- double get_maxim(mds_rank_t im) {
- return target_load - mds_meta_load[im] - imported[im];
- }
- double get_maxex(mds_rank_t ex) {
- return mds_meta_load[ex] - target_load - exported[ex];
- }
-
+ friend class C_Bal_SendHeartbeat;
public:
MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
mds(m),
messenger(msgr),
mon_client(monc),
beat_epoch(0),
- last_epoch_under(0), last_epoch_over(0), my_load(0.0), target_load(0.0) { }
-
+ last_epoch_under(0), last_epoch_over(0), my_load(0.0), target_load(0.0)
+ { }
+
mds_load_t get_load(utime_t);
int proc_message(Message *m);
-
- int localize_balancer();
- void send_heartbeat();
- void handle_heartbeat(MHeartbeat *m);
+ /**
+ * Regularly called upkeep function.
+ *
+   * Sends MHeartbeat messages to the other active MDS ranks.
+ */
void tick();
- void export_empties();
- //set up the rebalancing targets for export and do one if the
- //MDSMap is up to date
- void prep_rebalance(int beat);
- int mantle_prep_rebalance();
- /*check if the monitor has recorded the current export targets;
- if it has then do the actual export. Otherwise send off our
- export targets message again*/
- void try_rebalance();
- void find_exports(CDir *dir,
- double amount,
- list<CDir*>& exports,
- double& have,
- set<CDir*>& already_exporting);
-
-
- void subtract_export(class CDir *ex, utime_t now);
- void add_import(class CDir *im, utime_t now);
+ void subtract_export(CDir *ex, utime_t now);
+ void add_import(CDir *im, utime_t now);
- void hit_inode(utime_t now, class CInode *in, int type, int who=-1);
- void hit_dir(utime_t now, class CDir *dir, int type, int who=-1, double amount=1.0);
- void hit_recursive(utime_t now, class CDir *dir, int type, double amount, double rd_adj);
+ void hit_inode(utime_t now, CInode *in, int type, int who=-1);
+ void hit_dir(utime_t now, CDir *dir, int type, int who=-1, double amount=1.0);
void queue_split(const CDir *dir, bool fast);
void queue_merge(CDir *dir);
@@ -132,8 +72,80 @@ public:
* \param hot whether the directory's temperature is enough to split it
*/
void maybe_fragment(CDir *dir, bool hot);
-};
+private:
+ typedef struct {
+ std::map<mds_rank_t, double> targets;
+ std::map<mds_rank_t, double> imported;
+ std::map<mds_rank_t, double> exported;
+ } balance_state_t;
+ //set up the rebalancing targets for export and do one if the
+ //MDSMap is up to date
+ void prep_rebalance(int beat);
+ int mantle_prep_rebalance();
+
+ void handle_export_pins(void);
+
+ void export_empties();
+ int localize_balancer();
+ bool check_targets(const balance_state_t& state);
+ void hit_targets(const balance_state_t& state);
+ void send_heartbeat();
+ void handle_heartbeat(MHeartbeat *m);
+ void find_exports(CDir *dir,
+ double amount,
+ list<CDir*>& exports,
+ double& have,
+ set<CDir*>& already_exporting);
+
+ double try_match(balance_state_t &state,
+ mds_rank_t ex, double& maxex,
+ mds_rank_t im, double& maxim);
+
+ double get_maxim(balance_state_t &state, mds_rank_t im) {
+ return target_load - mds_meta_load[im] - state.imported[im];
+ }
+ double get_maxex(balance_state_t &state, mds_rank_t ex) {
+ return mds_meta_load[ex] - target_load - state.exported[ex];
+ }
+
+ /**
+ * Try to rebalance.
+ *
+ * Check if the monitor has recorded the current export targets;
+ * if it has then do the actual export. Otherwise send off our
+ * export targets message again.
+ */
+ void try_rebalance(balance_state_t& state);
+
+ MDSRank *mds;
+ Messenger *messenger;
+ MonClient *mon_client;
+ int beat_epoch;
+
+ int last_epoch_under;
+ int last_epoch_over;
+ string bal_code;
+ string bal_version;
+
+ utime_t last_heartbeat;
+ utime_t last_sample;
+ utime_t rebalance_time; //ensure a consistent view of load for rebalance
+
+ // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir
+ // just as soon as a delayed context comes back and triggers it.
+ // These sets just prevent us from spawning extra timer contexts for
+ // dirfrags that already have one in flight.
+ set<dirfrag_t> split_pending, merge_pending;
+
+ // per-epoch scatter/gathered info
+ map<mds_rank_t, mds_load_t> mds_load;
+ map<mds_rank_t, double> mds_meta_load;
+ map<mds_rank_t, map<mds_rank_t, float> > mds_import_map;
+
+ // per-epoch state
+ double my_load, target_load;
+};
#endif
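
Note on the header reorganization above: the per-rebalance maps (my_targets/imported/exported) move out of long-lived MDBalancer members into a balance_state_t that exists only for one pass and is threaded by reference through prep_rebalance(), try_rebalance(), check_targets(), and the get_maxim()/get_maxex() helpers, so an aborted or failed pass can no longer leave stale targets behind. A minimal sketch of the pattern, with invented names:

    #include <iostream>
    #include <map>

    // Sketch only: per-pass balancing state lives on the stack and is
    // passed by reference, so nothing persists across passes by accident.
    struct balance_state {
      std::map<int, double> targets;   // rank -> load we decided to ship
      std::map<int, double> imported;  // rank -> load already matched inbound
      std::map<int, double> exported;  // rank -> load already matched outbound
    };

    struct Balancer {
      int rebalance() {
        balance_state state;           // fresh every pass; no clear() needed
        if (plan(state) != 0)
          return -1;                   // early return leaks nothing into *this
        apply(state);
        return 0;
      }
      int plan(balance_state &s) {
        s.targets[1] = 0.5;            // stand-in for the real decision logic
        return s.targets.empty() ? -1 : 0;
      }
      void apply(const balance_state &s) {
        for (auto &[rank, amount] : s.targets)
          std::cout << "export " << amount << " to mds." << rank << "\n";
      }
    };

    int main() { return Balancer{}.rebalance(); }
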
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index cce552b1818..8f0ca76accb 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -295,6 +295,8 @@ void MDCache::remove_inode(CInode *o)
o->item_open_file.remove_myself();
+ export_pin_queue.erase(o);
+
// remove from inode map
inode_map.erase(o->vino());
@@ -353,6 +355,7 @@ void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
in->inode.nlink = 1;
in->inode.truncate_size = -1ull;
in->inode.change_attr = 0;
+ in->inode.export_pin = MDS_RANK_NONE;
memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
if (in->inode.is_dir()) {
@@ -909,7 +912,8 @@ void MDCache::try_subtree_merge_at(CDir *dir, bool do_eval)
if (parent != dir && // we have a parent,
parent->dir_auth == dir->dir_auth && // auth matches,
dir->dir_auth.second == CDIR_AUTH_UNKNOWN && // auth is unambiguous,
- !dir->state_test(CDir::STATE_EXPORTBOUND)) { // not an exportbound,
+ !dir->state_test(CDir::STATE_EXPORTBOUND) && // not an exportbound,
+ !dir->state_test(CDir::STATE_AUXSUBTREE)) { // not aux subtree
// merge with parent.
dout(10) << " subtree merge at " << *dir << dendl;
dir->set_dir_auth(CDIR_AUTH_DEFAULT);
@@ -1260,19 +1264,15 @@ void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
if (bounds != subtrees[dir]) {
dout(0) << "verify_subtree_bounds failed" << dendl;
set<CDir*> b = bounds;
- for (set<CDir*>::iterator p = subtrees[dir].begin();
- p != subtrees[dir].end();
- ++p) {
- if (bounds.count(*p)) {
- b.erase(*p);
+ for (auto &cd : subtrees[dir]) {
+ if (bounds.count(cd)) {
+ b.erase(cd);
continue;
}
- dout(0) << " missing bound " << **p << dendl;
+ dout(0) << " missing bound " << *cd << dendl;
}
- for (set<CDir*>::iterator p = b.begin();
- p != b.end();
- ++p)
- dout(0) << " extra bound " << **p << dendl;
+ for (const auto &cd : b)
+ dout(0) << " extra bound " << *cd << dendl;
}
assert(bounds == subtrees[dir]);
}
@@ -1284,10 +1284,8 @@ void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
// make sure that any bounds i do have are properly noted as such.
int failed = 0;
- for (list<dirfrag_t>::const_iterator p = bounds.begin();
- p != bounds.end();
- ++p) {
- CDir *bd = get_dirfrag(*p);
+ for (const auto &fg : bounds) {
+ CDir *bd = get_dirfrag(fg);
if (!bd) continue;
if (subtrees[dir].count(bd) == 0) {
dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
@@ -2780,6 +2778,9 @@ void MDCache::send_subtree_resolves()
resolves[*p] = new MMDSResolve;
}
+ map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
+ map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
+
// known
for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
p != subtrees.end();
@@ -2800,10 +2801,8 @@ void MDCache::send_subtree_resolves()
vector<dirfrag_t> dfls;
for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
dfls.push_back((*q)->dirfrag());
- for (map<mds_rank_t, MMDSResolve*>::iterator q = resolves.begin();
- q != resolves.end();
- ++q)
- resolves[q->first]->add_ambiguous_import(dir->dirfrag(), dfls);
+
+ my_ambig_imports[dir->dirfrag()] = dfls;
dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
} else {
// not ambiguous.
@@ -2818,11 +2817,9 @@ void MDCache::send_subtree_resolves()
++q) {
CDir *bound = *q;
dfls.push_back(bound->dirfrag());
- for (map<mds_rank_t, MMDSResolve*>::iterator r = resolves.begin();
- r != resolves.end();
- ++r)
- resolves[r->first]->add_subtree_bound(dir->dirfrag(), bound->dirfrag());
}
+
+ my_subtrees[dir->dirfrag()] = dfls;
dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
}
}
@@ -2831,19 +2828,37 @@ void MDCache::send_subtree_resolves()
for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
p != my_ambiguous_imports.end();
++p) {
- for (map<mds_rank_t, MMDSResolve*>::iterator q = resolves.begin();
- q != resolves.end();
- ++q)
- resolves[q->first]->add_ambiguous_import(p->first, p->second);
+ my_ambig_imports[p->first] = p->second;
dout(10) << " ambig " << p->first << " " << p->second << dendl;
}
+ // simplify the claimed subtree.
+ for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
+ unsigned i = 0;
+ while (i < p->second.size()) {
+ dirfrag_t b = p->second[i];
+ if (my_subtrees.count(b)) {
+ vector<dirfrag_t>& bb = my_subtrees[b];
+ dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
+ for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
+ p->second.push_back(*r);
+ my_subtrees.erase(b);
+ p->second.erase(p->second.begin() + i);
+ } else {
+ ++i;
+ }
+ }
+ }
+
// send
for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
p != resolves.end();
++p) {
+ MMDSResolve* m = p->second;
+ m->subtrees = my_subtrees;
+ m->ambiguous_imports = my_ambig_imports;
dout(10) << "sending subtee resolve to mds." << p->first << dendl;
- mds->send_message_mds(p->second, p->first);
+ mds->send_message_mds(m, p->first);
}
resolves_pending = false;
}
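
The "simplify the claimed subtree" loop above folds any claimed subtree that appears as a bound of another claim into its parent, concatenating its bounds, so each resolve message carries a minimal set of top-level claims. The same swallow-until-stable pass can be sketched standalone (illustrative integer keys in place of dirfrags):

    #include <iostream>
    #include <map>
    #include <vector>

    // Sketch of the bound-swallowing pass: if a claimed root b is itself a
    // bound of claim p, p absorbs b's bounds and the claim for b disappears.
    using Claims = std::map<int, std::vector<int>>;

    void simplify(Claims &claims) {
      for (auto p = claims.begin(); p != claims.end(); ++p) {
        size_t i = 0;
        while (i < p->second.size()) {
          int b = p->second[i];
          auto q = claims.find(b);
          if (q != claims.end() && q != p) {
            // swallow: replace bound b with b's own bounds (appended at the
            // end, so they are re-examined by this same loop)
            p->second.insert(p->second.end(), q->second.begin(), q->second.end());
            claims.erase(q);
            p->second.erase(p->second.begin() + i);
          } else {
            ++i;
          }
        }
      }
    }

    int main() {
      Claims c = {{1, {2, 5}}, {2, {3, 4}}};
      simplify(c);                 // claim 1 now bounds {5,3,4}; claim 2 gone
      for (auto &[root, bounds] : c) {
        std::cout << root << ":";
        for (int b : bounds) std::cout << " " << b;
        std::cout << "\n";
      }
    }
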
@@ -9015,7 +9030,7 @@ void MDCache::dispatch_request(MDRequestRef& mdr)
dispatch_fragment_dir(mdr);
break;
case CEPH_MDS_OP_EXPORTDIR:
- migrator->dispatch_export_dir(mdr);
+ migrator->dispatch_export_dir(mdr, 0);
break;
case CEPH_MDS_OP_ENQUEUE_SCRUB:
enqueue_scrub_work(mdr);
@@ -10626,8 +10641,10 @@ void MDCache::adjust_dir_fragments(CInode *diri,
subtrees[parent_subtree].erase(dir);
for (list<CDir*>::iterator p = resultfrags.begin();
p != resultfrags.end();
- ++p)
+ ++p) {
+ assert((*p)->is_subtree_root());
subtrees[parent_subtree].insert(*p);
+ }
}
// adjust my bounds.
@@ -10681,6 +10698,7 @@ void MDCache::adjust_dir_fragments(CInode *diri,
diri->add_dirfrag(f);
if (was_subtree) {
+ assert(f->is_subtree_root());
subtrees[f].swap(new_bounds);
if (parent_subtree)
subtrees[parent_subtree].insert(f);
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index cd0afbe9d09..28ebc48ec8a 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -250,7 +250,8 @@ public:
// -- subtrees --
protected:
- map<CDir*,set<CDir*> > subtrees; // nested bounds on subtrees.
+ /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
+ map<CDir*,set<CDir*> > subtrees;
map<CInode*,list<pair<CDir*,CDir*> > > projected_subtree_renames; // renamed ino -> target dir
// adjust subtree auth specification
@@ -1170,6 +1171,10 @@ public:
Formatter *f, Context *fin);
void repair_inode_stats(CInode *diri);
void repair_dirfrag_stats(CDir *dir);
+
+public:
+ /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
+ std::set<CInode *> export_pin_queue;
};
class C_MDS_RetryRequest : public MDSInternalContext {
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc
index 7b2826e9700..f02489fd176 100644
--- a/src/mds/MDLog.cc
+++ b/src/mds/MDLog.cc
@@ -33,8 +33,6 @@
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
-#undef DOUT_COND
-#define DOUT_COND(cct, l) l<=cct->_conf->debug_mds || l <= cct->_conf->debug_mds_log
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".log "
diff --git a/src/mds/MDSDaemon.h b/src/mds/MDSDaemon.h
index 835fce67d01..2abcd87fe06 100644
--- a/src/mds/MDSDaemon.h
+++ b/src/mds/MDSDaemon.h
@@ -40,7 +40,7 @@
#include "Beacon.h"
-#define CEPH_MDS_PROTOCOL 28 /* cluster internal */
+#define CEPH_MDS_PROTOCOL 29 /* cluster internal */
class MonClient;
diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc
index 10fc2baa617..7aa53dcc046 100644
--- a/src/mds/MDSRank.cc
+++ b/src/mds/MDSRank.cc
@@ -16,7 +16,9 @@
#include "common/errno.h"
#include "messages/MClientRequestForward.h"
+#include "messages/MMDSLoadTargets.h"
#include "messages/MMDSMap.h"
+#include "messages/MMDSTableRequest.h"
#include "messages/MCommand.h"
#include "messages/MCommandReply.h"
@@ -25,7 +27,6 @@
#include "SnapClient.h"
#include "SnapServer.h"
#include "MDBalancer.h"
-#include "messages/MMDSTableRequest.h"
#include "Locker.h"
#include "Server.h"
#include "InoTable.h"
@@ -180,6 +181,62 @@ void MDSRankDispatcher::init()
finisher->start();
}
+void MDSRank::update_targets(utime_t now)
+{
+ // get MonMap's idea of my export_targets
+ const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets;
+
+ dout(20) << "updating export targets, currently " << map_targets.size() << " ranks are targets" << dendl;
+
+ bool send = false;
+ set<mds_rank_t> new_map_targets;
+
+ auto it = export_targets.begin();
+ while (it != export_targets.end()) {
+ mds_rank_t rank = it->first;
+ double val = it->second.get(now);
+ dout(20) << "export target mds." << rank << " value is " << val << " @ " << now << dendl;
+
+ if (val <= 0.01) {
+ dout(15) << "export target mds." << rank << " is no longer an export target" << dendl;
+ export_targets.erase(it++);
+ send = true;
+ continue;
+ }
+ if (!map_targets.count(rank)) {
+ dout(15) << "export target mds." << rank << " not in map's export_targets" << dendl;
+ send = true;
+ }
+ new_map_targets.insert(rank);
+ it++;
+ }
+ if (new_map_targets.size() < map_targets.size()) {
+ dout(15) << "export target map holds stale targets, sending update" << dendl;
+ send = true;
+ }
+
+ if (send) {
+ dout(15) << "updating export_targets, now " << new_map_targets.size() << " ranks are targets" << dendl;
+ MMDSLoadTargets* m = new MMDSLoadTargets(mds_gid_t(monc->get_global_id()), new_map_targets);
+ monc->send_mon_message(m);
+ }
+}
+
+void MDSRank::hit_export_target(utime_t now, mds_rank_t rank, double amount)
+{
+ double rate = g_conf->mds_bal_target_decay;
+ if (amount < 0.0) {
+    amount = 100.0/g_conf->mds_bal_target_decay; /* a good default for "I am trying to keep this export_target active" */
+ }
+ auto em = export_targets.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple(now, DecayRate(rate)));
+ if (em.second) {
+ dout(15) << "hit export target (new) " << amount << " @ " << now << dendl;
+ } else {
+ dout(15) << "hit export target " << amount << " @ " << now << dendl;
+ }
+ em.first->second.hit(now, amount);
+}
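
hit_export_target() keeps one DecayCounter per destination rank: every export-related event bumps the counter (with a keepalive-sized default when no amount is given), and update_targets() reads the decayed value on each tick, dropping a rank once it falls to 0.01 or below. Targets therefore age out on their own, replacing the old check_targets()/old_prev_targets epoch bookkeeping removed earlier in this diff. A self-contained sketch of the exponential-decay idea, with an assumed half-life:

    #include <cmath>
    #include <iostream>
    #include <map>

    // Sketch of a decaying hit counter: the value halves every `half_life`
    // seconds; ranks whose value decays under a threshold stop being targets.
    struct Decaying {
      double value = 0.0;
      double last = 0.0;           // time of last update, in seconds
      double half_life;
      explicit Decaying(double hl = 10.0) : half_life(hl) {}
      void hit(double now, double amount) {
        decay(now);
        value += amount;
      }
      double get(double now) { decay(now); return value; }
    private:
      void decay(double now) {
        value *= std::exp((last - now) * std::log(2.0) / half_life);
        last = now;
      }
    };

    int main() {
      std::map<int, Decaying> targets;      // rank -> recent export activity
      targets[1].hit(0.0, 100.0);
      for (double t : {0.0, 10.0, 50.0, 120.0})
        std::cout << "t=" << t << " v=" << targets[1].get(t) << "\n";
      // ~100, 50, 3.1, ~0.02 -- the rank would be expired near t=120
    }
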
+
void MDSRankDispatcher::tick()
{
heartbeat_reset();
@@ -206,8 +263,7 @@ void MDSRankDispatcher::tick()
}
// log
- utime_t now = ceph_clock_now();
- mds_load_t load = balancer->get_load(now);
+ mds_load_t load = balancer->get_load(ceph_clock_now());
if (logger) {
logger->set(l_mds_load_cent, 100 * load.mds_load());
@@ -234,6 +290,10 @@ void MDSRankDispatcher::tick()
snapserver->check_osd_map(false);
}
+ if (is_active() || is_stopping()) {
+ update_targets(ceph_clock_now());
+ }
+
// shut down?
if (is_stopping()) {
mdlog->trim();
@@ -1671,9 +1731,6 @@ void MDSRankDispatcher::handle_mds_map(
mdcache->migrator->handle_mds_failure_or_stop(*p);
}
- if (!is_any_replay())
- balancer->try_rebalance();
-
{
map<epoch_t,list<MDSInternalContextBase*> >::iterator p = waiting_for_mdsmap.begin();
while (p != waiting_for_mdsmap.end() && p->first <= mdsmap->get_epoch()) {
@@ -2429,43 +2486,60 @@ void MDSRank::create_logger()
{
PerfCountersBuilder mds_plb(g_ceph_context, "mds", l_mds_first, l_mds_last);
- mds_plb.add_u64_counter(l_mds_request, "request", "Requests");
+ mds_plb.add_u64_counter(
+ l_mds_request, "request", "Requests", "req",
+ PerfCountersBuilder::PRIO_CRITICAL);
mds_plb.add_u64_counter(l_mds_reply, "reply", "Replies");
- mds_plb.add_time_avg(l_mds_reply_latency, "reply_latency",
- "Reply latency", "rlat");
- mds_plb.add_u64_counter(l_mds_forward, "forward", "Forwarding request");
-
+ mds_plb.add_time_avg(
+ l_mds_reply_latency, "reply_latency", "Reply latency", "rlat",
+ PerfCountersBuilder::PRIO_CRITICAL);
+ mds_plb.add_u64_counter(
+ l_mds_forward, "forward", "Forwarding request", "fwd",
+ PerfCountersBuilder::PRIO_INTERESTING);
mds_plb.add_u64_counter(l_mds_dir_fetch, "dir_fetch", "Directory fetch");
mds_plb.add_u64_counter(l_mds_dir_commit, "dir_commit", "Directory commit");
mds_plb.add_u64_counter(l_mds_dir_split, "dir_split", "Directory split");
mds_plb.add_u64_counter(l_mds_dir_merge, "dir_merge", "Directory merge");
mds_plb.add_u64(l_mds_inode_max, "inode_max", "Max inodes, cache size");
- mds_plb.add_u64(l_mds_inodes, "inodes", "Inodes", "inos");
+ mds_plb.add_u64(l_mds_inodes, "inodes", "Inodes", "inos",
+ PerfCountersBuilder::PRIO_CRITICAL);
mds_plb.add_u64(l_mds_inodes_top, "inodes_top", "Inodes on top");
mds_plb.add_u64(l_mds_inodes_bottom, "inodes_bottom", "Inodes on bottom");
- mds_plb.add_u64(l_mds_inodes_pin_tail, "inodes_pin_tail", "Inodes on pin tail");
+ mds_plb.add_u64(
+ l_mds_inodes_pin_tail, "inodes_pin_tail", "Inodes on pin tail");
mds_plb.add_u64(l_mds_inodes_pinned, "inodes_pinned", "Inodes pinned");
mds_plb.add_u64(l_mds_inodes_expired, "inodes_expired", "Inodes expired");
- mds_plb.add_u64(l_mds_inodes_with_caps, "inodes_with_caps", "Inodes with capabilities");
- mds_plb.add_u64(l_mds_caps, "caps", "Capabilities", "caps");
+ mds_plb.add_u64(
+ l_mds_inodes_with_caps, "inodes_with_caps", "Inodes with capabilities");
+ mds_plb.add_u64(l_mds_caps, "caps", "Capabilities", "caps",
+ PerfCountersBuilder::PRIO_INTERESTING);
mds_plb.add_u64(l_mds_subtrees, "subtrees", "Subtrees");
mds_plb.add_u64_counter(l_mds_traverse, "traverse", "Traverses");
mds_plb.add_u64_counter(l_mds_traverse_hit, "traverse_hit", "Traverse hits");
- mds_plb.add_u64_counter(l_mds_traverse_forward, "traverse_forward", "Traverse forwards");
- mds_plb.add_u64_counter(l_mds_traverse_discover, "traverse_discover", "Traverse directory discovers");
- mds_plb.add_u64_counter(l_mds_traverse_dir_fetch, "traverse_dir_fetch", "Traverse incomplete directory content fetchings");
- mds_plb.add_u64_counter(l_mds_traverse_remote_ino, "traverse_remote_ino", "Traverse remote dentries");
- mds_plb.add_u64_counter(l_mds_traverse_lock, "traverse_lock", "Traverse locks");
+ mds_plb.add_u64_counter(l_mds_traverse_forward, "traverse_forward",
+ "Traverse forwards");
+ mds_plb.add_u64_counter(l_mds_traverse_discover, "traverse_discover",
+ "Traverse directory discovers");
+ mds_plb.add_u64_counter(l_mds_traverse_dir_fetch, "traverse_dir_fetch",
+ "Traverse incomplete directory content fetchings");
+ mds_plb.add_u64_counter(l_mds_traverse_remote_ino, "traverse_remote_ino",
+ "Traverse remote dentries");
+ mds_plb.add_u64_counter(l_mds_traverse_lock, "traverse_lock",
+ "Traverse locks");
mds_plb.add_u64(l_mds_load_cent, "load_cent", "Load per cent");
mds_plb.add_u64(l_mds_dispatch_queue_len, "q", "Dispatch queue length");
mds_plb.add_u64_counter(l_mds_exported, "exported", "Exports");
- mds_plb.add_u64_counter(l_mds_exported_inodes, "exported_inodes", "Exported inodes");
+ mds_plb.add_u64_counter(
+ l_mds_exported_inodes, "exported_inodes", "Exported inodes", "exi",
+ PerfCountersBuilder::PRIO_INTERESTING);
mds_plb.add_u64_counter(l_mds_imported, "imported", "Imports");
- mds_plb.add_u64_counter(l_mds_imported_inodes, "imported_inodes", "Imported inodes");
+ mds_plb.add_u64_counter(
+ l_mds_imported_inodes, "imported_inodes", "Imported inodes", "imi",
+ PerfCountersBuilder::PRIO_INTERESTING);
logger = mds_plb.create_perf_counters();
g_ceph_context->get_perfcounters_collection()->add(logger);
}
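
The counter registrations above gain an optional short nick ("req", "rlat", "exi", ...) and a priority (PRIO_CRITICAL, PRIO_INTERESTING) so that consumers can surface only the important counters by default. A sketch of the design in plain C++, not the Ceph PerfCountersBuilder API itself, with invented names:

    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    // Sketch: every counter carries a priority; dashboards dump only the
    // counters at or above a chosen priority.
    enum class Prio { Useful, Interesting, Critical };

    struct Counter {
      std::string name, desc, nick;
      Prio prio;
      uint64_t value = 0;
    };

    class Registry {
      std::vector<Counter> counters;
    public:
      size_t add(std::string name, std::string desc, std::string nick, Prio p) {
        counters.push_back({std::move(name), std::move(desc), std::move(nick), p});
        return counters.size() - 1;
      }
      void inc(size_t i, uint64_t n = 1) { counters[i].value += n; }
      void dump(Prio at_least) const {
        for (const auto &c : counters)
          if (c.prio >= at_least)
            std::cout << c.nick << " (" << c.desc << ") = " << c.value << "\n";
      }
    };

    int main() {
      Registry r;
      size_t req = r.add("request", "Requests", "req", Prio::Critical);
      size_t fwd = r.add("forward", "Forwarding request", "fwd", Prio::Interesting);
      r.inc(req, 3); r.inc(fwd);
      r.dump(Prio::Interesting);  // shows both; Prio::Critical would show only req
    }
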
diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h
index 5fc6376b6b8..af46f4c2f3e 100644
--- a/src/mds/MDSRank.h
+++ b/src/mds/MDSRank.h
@@ -15,9 +15,10 @@
#ifndef MDS_RANK_H_
#define MDS_RANK_H_
-#include "common/TrackedOp.h"
+#include "common/DecayCounter.h"
#include "common/LogClient.h"
#include "common/Timer.h"
+#include "common/TrackedOp.h"
#include "messages/MCommand.h"
@@ -271,6 +272,8 @@ class MDSRank {
void bcast_mds_map(); // to mounted clients
epoch_t last_client_mdsmap_bcast;
+ map<mds_rank_t,DecayCounter> export_targets; /* targets this MDS is exporting to or wants/tries to */
+
void create_logger();
public:
@@ -397,6 +400,12 @@ class MDSRank {
void dump_status(Formatter *f) const;
+ void hit_export_target(utime_t now, mds_rank_t rank, double amount=-1.0);
+ bool is_export_target(mds_rank_t rank) {
+ const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets;
+ return map_targets.count(rank);
+ }
+
protected:
void dump_clientreplay_status(Formatter *f) const;
void command_scrub_path(Formatter *f, const string& path, vector<string>& scrubop_vec);
@@ -488,6 +497,9 @@ class MDSRank {
void handle_mds_recovery(mds_rank_t who);
void handle_mds_failure(mds_rank_t who);
// <<<
+
+ /* Update MDSMap export_targets for this rank. Called on ::tick(). */
+ void update_targets(utime_t now);
};
/* This expects to be given a reference which it is responsible for.
diff --git a/src/mds/Mantle.cc b/src/mds/Mantle.cc
index df1adab7734..18d1bc697e1 100644
--- a/src/mds/Mantle.cc
+++ b/src/mds/Mantle.cc
@@ -16,144 +16,115 @@
#include "MDSRank.h"
#include "Mantle.h"
#include "msg/Messenger.h"
+#include "common/Clock.h"
+#include "CInode.h"
#include <fstream>
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds_balancer
-#undef DOUT_COND
-#define DOUT_COND(cct, l) l<=cct->_conf->debug_mds || l <= cct->_conf->debug_mds_balancer
#undef dout_prefix
#define dout_prefix *_dout << "mds.mantle "
+#define mantle_dout(lvl) \
+ do {\
+ auto subsys = ceph_subsys_mds;\
+ if ((dout_context)->_conf->subsys.should_gather(ceph_subsys_mds_balancer, lvl)) {\
+ subsys = ceph_subsys_mds_balancer;\
+ }\
+ dout_impl(dout_context, subsys, lvl) dout_prefix
-int dout_wrapper(lua_State *L)
-{
- #undef dout_prefix
- #define dout_prefix *_dout << "lua.balancer "
-
- /* Lua indexes the stack from the bottom up */
- int bottom = -1 * lua_gettop(L);
- if (!lua_isinteger(L, bottom) || bottom == 0) {
- dout(0) << "WARNING: BAL_LOG has no message" << dendl;
- return -EINVAL;
- }
-
- /* bottom of the stack is the log level */
- int level = lua_tointeger(L, bottom);
-
- /* rest of the stack is the message */
- string s = "";
- for (int i = bottom + 1; i < 0; i++)
- lua_isstring(L, i) ? s.append(lua_tostring(L, i)) : s.append("<empty>");
+#define mantle_dendl dendl; } while (0)
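
The mantle_dout()/mantle_dendl pair replaces the removed DOUT_COND override: the do { ... } while (0) is split across the two macros, and the level check prefers the mds_balancer subsystem when it would gather the message, falling back to the plain mds subsystem (and its own gather check) otherwise -- the same either-debug-setting-wins behavior as before, without redefining a global macro. Usage stays dout-like, as elsewhere in this diff:

    mantle_dout(0) << "WARNING: mantle could not load balancer" << mantle_dendl;
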
- dout(level) << s << dendl;
- return 0;
-}
-int Mantle::start()
+static int dout_wrapper(lua_State *L)
{
- /* build lua vm state */
- L = luaL_newstate();
- if (!L) {
- dout(0) << "WARNING: mantle could not load Lua state" << dendl;
- return -ENOEXEC;
- }
-
- /* balancer policies can use basic Lua functions */
- luaopen_base(L);
-
- /* setup debugging */
- lua_register(L, "BAL_LOG", dout_wrapper);
-
+ int level = luaL_checkinteger(L, 1);
+ lua_concat(L, lua_gettop(L)-1);
+ mantle_dout(level) << lua_tostring(L, 2) << mantle_dendl;
return 0;
}
-int Mantle::execute(const string &script)
+int Mantle::balance(const std::string &script,
+ mds_rank_t whoami,
+ const std::vector<std::map<std::string, double>> &metrics,
+ std::map<mds_rank_t, double> &my_targets)
{
- if (L == NULL) {
- dout(0) << "ERROR: mantle was not started" << dendl;
- return -ENOENT;
- }
+ lua_settop(L, 0); /* clear the stack */
/* load the balancer */
if (luaL_loadstring(L, script.c_str())) {
- dout(0) << "WARNING: mantle could not load balancer: "
- << lua_tostring(L, -1) << dendl;
+ mantle_dout(0) << "WARNING: mantle could not load balancer: "
+ << lua_tostring(L, -1) << mantle_dendl;
return -EINVAL;
}
- /* compile/execute balancer */
- int ret = lua_pcall(L, 0, LUA_MULTRET, 0);
-
- if (ret) {
- dout(0) << "WARNING: mantle could not execute script: "
- << lua_tostring(L, -1) << dendl;
- return -EINVAL;
- }
-
- return 0;
-}
-
-int Mantle::balance(const string &script,
- mds_rank_t whoami,
- const vector < map<string, double> > &metrics,
- map<mds_rank_t,double> &my_targets)
-{
- if (start() != 0)
- return -ENOEXEC;
-
/* tell the balancer which mds is making the decision */
- lua_pushinteger(L, int(whoami));
- lua_setfield(L, -2, "whoami");
+ lua_pushinteger(L, (lua_Integer)whoami);
+ lua_setglobal(L, "whoami");
/* global mds metrics to hold all dictionaries */
lua_newtable(L);
/* push name of mds (i) and its metrics onto Lua stack */
- for (unsigned i=0; i < metrics.size(); i++) {
- lua_pushinteger(L, i);
+ for (size_t i=0; i < metrics.size(); i++) {
lua_newtable(L);
/* push values into this mds's table; setfield assigns key/pops val */
- for (map<string, double>::const_iterator it = metrics[i].begin();
- it != metrics[i].end();
- ++it) {
- lua_pushnumber(L, it->second);
- lua_setfield(L, -2, it->first.c_str());
+ for (const auto &it : metrics[i]) {
+ lua_pushnumber(L, it.second);
+ lua_setfield(L, -2, it.first.c_str());
}
/* in global mds table at stack[-3], set k=stack[-1] to v=stack[-2] */
- lua_rawset(L, -3);
+ lua_seti(L, -2, i);
}
/* set the name of the global mds table */
lua_setglobal(L, "mds");
- int ret = execute(script);
- if (ret != 0) {
- lua_close(L);
- return ret;
+ assert(lua_gettop(L) == 1);
+ if (lua_pcall(L, 0, 1, 0) != LUA_OK) {
+ mantle_dout(0) << "WARNING: mantle could not execute script: "
+ << lua_tostring(L, -1) << mantle_dendl;
+ return -EINVAL;
}
/* parse response by iterating over Lua stack */
if (lua_istable(L, -1) == 0) {
- dout(0) << "WARNING: mantle script returned a malformed response" << dendl;
- lua_close(L);
+ mantle_dout(0) << "WARNING: mantle script returned a malformed response" << mantle_dendl;
return -EINVAL;
}
/* fill in return value */
- mds_rank_t it = mds_rank_t(0);
for (lua_pushnil(L); lua_next(L, -2); lua_pop(L, 1)) {
- if (!lua_isnumber(L, -1)) {
- dout(0) << "WARNING: mantle script returned a malformed response" << dendl;
- lua_close(L);
+ if (!lua_isinteger(L, -2) || !lua_isnumber(L, -1)) {
+ mantle_dout(0) << "WARNING: mantle script returned a malformed response" << mantle_dendl;
return -EINVAL;
}
- my_targets[it] = (lua_tonumber(L, -1));
- it++;
+ mds_rank_t rank(lua_tointeger(L, -2));
+ my_targets[rank] = lua_tonumber(L, -1);
}
- lua_close(L);
return 0;
}
+
+Mantle::Mantle (void)
+{
+ /* build lua vm state */
+ L = luaL_newstate();
+ if (!L) {
+ mantle_dout(0) << "WARNING: mantle could not load Lua state" << mantle_dendl;
+ throw std::bad_alloc();
+ }
+
+ /* balancer policies can use basic Lua functions */
+ luaopen_base(L);
+ luaopen_coroutine(L);
+ luaopen_string(L);
+ luaopen_math(L);
+ luaopen_table(L);
+ luaopen_utf8(L);
+
+ /* setup debugging */
+ lua_register(L, "BAL_LOG", dout_wrapper);
+}
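
Mantle now builds its Lua state once in the constructor, opening a restricted set of standard libraries individually (base, coroutine, string, math, table, utf8 -- no io or os) rather than calling luaL_openlibs. balance() then resets the stack, publishes whoami and the mds metrics table as globals, pcalls the script, and walks the returned table with lua_next(). A minimal embedding sketch of that call pattern against the stock Lua 5.3 C API, with a trivial inline script standing in for a real balancer:

    // Compile with -llua (or the distro's lua5.3 package). Error handling
    // is trimmed to the essentials; this mirrors the call sequence only.
    #include <lua.hpp>
    #include <iostream>
    #include <map>

    int main() {
      lua_State *L = luaL_newstate();
      luaopen_base(L);                       // opt-in libraries, as in Mantle
      lua_settop(L, 0);                      // clear whatever luaopen left
      lua_pushinteger(L, 0);
      lua_setglobal(L, "whoami");            // publish inputs as globals
      const char *script = "return {[0]=0.0, [1]=0.5}";  // stand-in balancer
      if (luaL_loadstring(L, script) || lua_pcall(L, 0, 1, 0)) {
        std::cerr << lua_tostring(L, -1) << "\n";
        lua_close(L);
        return 1;
      }
      std::map<int, double> targets;
      if (lua_istable(L, -1)) {
        // iterate the result table: key at -2, value at -1
        for (lua_pushnil(L); lua_next(L, -2); lua_pop(L, 1)) {
          if (lua_isinteger(L, -2) && lua_isnumber(L, -1))
            targets[lua_tointeger(L, -2)] = lua_tonumber(L, -1);
        }
      }
      for (auto &[rank, amt] : targets)
        std::cout << "mds." << rank << " -> " << amt << "\n";
      lua_close(L);
    }
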
diff --git a/src/mds/Mantle.h b/src/mds/Mantle.h
index 5f693fc6d1f..7970562846f 100644
--- a/src/mds/Mantle.h
+++ b/src/mds/Mantle.h
@@ -23,18 +23,16 @@
#include "mdstypes.h"
class Mantle {
- protected:
- lua_State *L;
- map<mds_rank_t, mds_load_t> mds_load;
-
public:
- Mantle() : L(NULL) {};
- int start();
- int execute(const string &script);
- int balance(const string &script,
+ Mantle();
+ ~Mantle() { if (L) lua_close(L); }
+ int balance(const std::string &script,
mds_rank_t whoami,
- const vector < map<string, double> > &metrics,
- map<mds_rank_t,double> &my_targets);
+ const std::vector <std::map<std::string, double>> &metrics,
+ std::map<mds_rank_t,double> &my_targets);
+
+ protected:
+ lua_State *L;
};
#endif
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index e55a3b266db..67796cf5a21 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -78,8 +78,6 @@
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
-#undef DOUT_COND
-#define DOUT_COND(cct, l) (l <= cct->_conf->debug_mds || l <= cct->_conf->debug_mds_migrator)
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".migrator "
@@ -757,6 +755,18 @@ void Migrator::get_export_lock_set(CDir *dir, set<SimpleLock*>& locks)
}
+class C_M_ExportTargetWait : public MigratorContext {
+ MDRequestRef mdr;
+ int count;
+public:
+ C_M_ExportTargetWait(Migrator *m, MDRequestRef mdr, int count)
+ : MigratorContext(m), mdr(mdr), count(count) {}
+ void finish(int r) override {
+ mig->dispatch_export_dir(mdr, count);
+ }
+};
+
+
/** export_dir(dir, dest)
* public method to initiate an export.
* will fail if the directory is freezing, frozen, unpinnable, or root.
@@ -781,8 +791,8 @@ void Migrator::export_dir(CDir *dir, mds_rank_t dest)
return;
}
- if (!dir->inode->is_base() && dir->get_parent_dir()->get_inode()->is_stray() &&
- dir->get_parent_dir()->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest)) {
+ if (!dir->inode->is_base() && dir->inode->get_projected_parent_dir()->inode->is_stray() &&
+ dir->inode->get_projected_parent_dir()->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest)) {
dout(7) << "i won't export anything in stray" << dendl;
return;
}
@@ -797,6 +807,44 @@ void Migrator::export_dir(CDir *dir, mds_rank_t dest)
return;
}
+ if (!mds->is_stopping() && !dir->inode->is_exportable(dest)) {
+ dout(7) << "dir is export pinned" << dendl;
+ return;
+ }
+
+ if (dest == mds->get_nodeid() || !mds->mdsmap->is_up(dest)) {
+ dout(7) << "cannot export: dest " << dest << " is me or is not active" << dendl;
+ return;
+ }
+
+ if (g_conf->mds_thrash_exports) {
+ // create random subtree bound (which will not be exported)
+ list<CDir*> ls;
+ for (auto p = dir->begin(); p != dir->end(); ++p) {
+ auto dn = p->second;
+ CDentry::linkage_t *dnl= dn->get_linkage();
+ if (dnl->is_primary()) {
+ CInode *in = dnl->get_inode();
+ if (in->is_dir())
+ in->get_nested_dirfrags(ls);
+ }
+ }
+ if (ls.size() > 0) {
+ int n = rand() % ls.size();
+ auto p = ls.begin();
+ while (n--) ++p;
+ CDir *bd = *p;
+ if (!(bd->is_frozen() || bd->is_freezing())) {
+ assert(bd->is_auth());
+ dir->state_set(CDir::STATE_AUXSUBTREE);
+ mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
+ dout(0) << "export_dir: create aux subtree " << *bd << " under " << *dir << dendl;
+ }
+ }
+ }
+
+ mds->hit_export_target(ceph_clock_now(), dest, -1);
+
dir->auth_pin(this);
dir->state_set(CDir::STATE_EXPORTING);
@@ -810,14 +858,14 @@ void Migrator::export_dir(CDir *dir, mds_rank_t dest)
stat.tid = mdr->reqid.tid;
stat.mut = mdr;
- dispatch_export_dir(mdr);
+ return mds->mdcache->dispatch_request(mdr);
}
-void Migrator::dispatch_export_dir(MDRequestRef& mdr)
+void Migrator::dispatch_export_dir(MDRequestRef& mdr, int count)
{
dout(7) << "dispatch_export_dir " << *mdr << dendl;
- CDir *dir = mdr->more()->export_dir;
+ CDir *dir = mdr->more()->export_dir;
map<CDir*,export_state_t>::iterator it = export_state.find(dir);
if (it == export_state.end() || it->second.tid != mdr->reqid.tid) {
// export must have aborted.
@@ -827,6 +875,25 @@ void Migrator::dispatch_export_dir(MDRequestRef& mdr)
}
assert(it->second.state == EXPORT_LOCKING);
+ mds_rank_t dest = it->second.peer;
+
+ if (!mds->is_export_target(dest)) {
+ dout(7) << "dest is not yet an export target" << dendl;
+ if (count > 3) {
+ dout(5) << "dest has not been added as export target after three MDSMap epochs, canceling export" << dendl;
+ export_try_cancel(dir);
+ return;
+ }
+ mds->wait_for_mdsmap(mds->mdsmap->get_epoch(), new C_M_ExportTargetWait(this, mdr, count+1));
+ return;
+ }
+
+ if (!dir->inode->get_parent_dn()) {
+ dout(7) << "waiting for dir to become stable before export: " << *dir << dendl;
+ dir->add_waiter(CDir::WAIT_CREATED, new C_M_ExportTargetWait(this, mdr, 1));
+ return;
+ }
+
if (mdr->aborted || dir->is_frozen() || dir->is_freezing()) {
dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl;
export_try_cancel(dir);
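
dispatch_export_dir() now refuses to proceed until the monitors have acknowledged the destination as an export target: it re-queues itself via wait_for_mdsmap() with an incremented count, and gives up once count exceeds 3. The shape of that bounded, callback-driven retry, abstracted away from the MDS context classes (names here are illustrative):

    #include <functional>
    #include <iostream>
    #include <queue>

    // Sketch: an operation re-arms itself on the "next epoch" queue until a
    // predicate holds, with a hard cap on retries. In the MDS, the queue is
    // wait_for_mdsmap() and the predicate is is_export_target(dest).
    std::queue<std::function<void()>> next_epoch;

    void dispatch(int count, bool &ready) {
      if (!ready) {
        if (count > 3) {
          std::cout << "giving up after " << count << " attempts\n";  // cancel
          return;
        }
        next_epoch.push([count, &ready] { dispatch(count + 1, ready); });
        return;
      }
      std::cout << "proceeding on attempt " << count << "\n";
    }

    int main() {
      bool ready = false;
      dispatch(0, ready);
      for (int epoch = 1; epoch <= 5 && !next_epoch.empty(); ++epoch) {
        if (epoch == 2) ready = true;       // target shows up in epoch 2
        auto f = std::move(next_epoch.front());
        next_epoch.pop();
        f();
      }
    }
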
@@ -867,7 +934,7 @@ void Migrator::dispatch_export_dir(MDRequestRef& mdr)
MExportDirDiscover *discover = new MExportDirDiscover(dir->dirfrag(), path,
mds->get_nodeid(),
it->second.tid);
- mds->send_message_mds(discover, it->second.peer);
+ mds->send_message_mds(discover, dest);
assert(g_conf->mds_kill_export_at != 2);
it->second.last_cum_auth_pins_change = ceph_clock_now();
@@ -887,15 +954,19 @@ void Migrator::dispatch_export_dir(MDRequestRef& mdr)
void Migrator::handle_export_discover_ack(MExportDirDiscoverAck *m)
{
CDir *dir = cache->get_dirfrag(m->get_dirfrag());
+ mds_rank_t dest(m->get_source().num());
+ utime_t now = ceph_clock_now();
assert(dir);
dout(7) << "export_discover_ack from " << m->get_source()
<< " on " << *dir << dendl;
+ mds->hit_export_target(now, dest, -1);
+
map<CDir*,export_state_t>::iterator it = export_state.find(dir);
if (it == export_state.end() ||
it->second.tid != m->get_tid() ||
- it->second.peer != mds_rank_t(m->get_source().num())) {
+ it->second.peer != dest) {
dout(7) << "must have aborted" << dendl;
} else {
assert(it->second.state == EXPORT_DISCOVERING);
@@ -1150,10 +1221,14 @@ void Migrator::get_export_client_set(CInode *in, set<client_t>& client_set)
void Migrator::handle_export_prep_ack(MExportDirPrepAck *m)
{
CDir *dir = cache->get_dirfrag(m->get_dirfrag());
+ mds_rank_t dest(m->get_source().num());
+ utime_t now = ceph_clock_now();
assert(dir);
dout(7) << "export_prep_ack " << *dir << dendl;
+ mds->hit_export_target(now, dest, -1);
+
map<CDir*,export_state_t>::iterator it = export_state.find(dir);
if (it == export_state.end() ||
it->second.tid != m->get_tid() ||
@@ -1238,7 +1313,6 @@ void Migrator::export_go(CDir *dir)
void Migrator::export_go_synced(CDir *dir, uint64_t tid)
{
-
map<CDir*,export_state_t>::iterator it = export_state.find(dir);
if (it == export_state.end() ||
it->second.state == EXPORT_CANCELLING ||
@@ -1248,8 +1322,9 @@ void Migrator::export_go_synced(CDir *dir, uint64_t tid)
return;
}
assert(it->second.state == EXPORT_WARNING);
+ mds_rank_t dest = it->second.peer;
- dout(7) << "export_go_synced " << *dir << " to " << it->second.peer << dendl;
+ dout(7) << "export_go_synced " << *dir << " to " << dest << dendl;
cache->show_subtrees();
@@ -1260,7 +1335,7 @@ void Migrator::export_go_synced(CDir *dir, uint64_t tid)
assert(dir->get_cum_auth_pins() == 0);
// set ambiguous auth
- cache->adjust_subtree_auth(dir, mds->get_nodeid(), it->second.peer);
+ cache->adjust_subtree_auth(dir, mds->get_nodeid(), dest);
// take away the popularity we're sending.
utime_t now = ceph_clock_now();
@@ -1269,7 +1344,7 @@ void Migrator::export_go_synced(CDir *dir, uint64_t tid)
// fill export message with cache data
MExportDir *req = new MExportDir(dir->dirfrag(), it->second.tid);
map<client_t,entity_inst_t> exported_client_map;
- int num_exported_inodes = encode_export_dir(req->export_data,
+ uint64_t num_exported_inodes = encode_export_dir(req->export_data,
dir, // recur start point
exported_client_map,
now);
@@ -1285,9 +1360,11 @@ void Migrator::export_go_synced(CDir *dir, uint64_t tid)
req->add_export((*p)->dirfrag());
// send
- mds->send_message_mds(req, it->second.peer);
+ mds->send_message_mds(req, dest);
assert(g_conf->mds_kill_export_at != 8);
+ mds->hit_export_target(now, dest, num_exported_inodes+1);
+
// stats
if (mds->logger) mds->logger->inc(l_mds_exported);
if (mds->logger) mds->logger->inc(l_mds_exported_inodes, num_exported_inodes);
@@ -1430,12 +1507,12 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, mds_rank_t peer,
}
-int Migrator::encode_export_dir(bufferlist& exportbl,
+uint64_t Migrator::encode_export_dir(bufferlist& exportbl,
CDir *dir,
map<client_t,entity_inst_t>& exported_client_map,
utime_t now)
{
- int num_exported = 0;
+ uint64_t num_exported = 0;
dout(7) << "encode_export_dir " << *dir << " " << dir->get_num_head_items() << " head items" << dendl;
@@ -1494,7 +1571,7 @@ int Migrator::encode_export_dir(bufferlist& exportbl,
::encode(d_type, exportbl);
continue;
}
-
+
// primary link
// -- inode
exportbl.append("I", 1); // inode dentry
@@ -1590,12 +1667,16 @@ public:
void Migrator::handle_export_ack(MExportDirAck *m)
{
CDir *dir = cache->get_dirfrag(m->get_dirfrag());
+ mds_rank_t dest(m->get_source().num());
+ utime_t now = ceph_clock_now();
assert(dir);
assert(dir->is_frozen_tree_root()); // i'm exporting!
// yay!
dout(7) << "handle_export_ack " << *dir << dendl;
+ mds->hit_export_target(now, dest, -1);
+
map<CDir*,export_state_t>::iterator it = export_state.find(dir);
assert(it != export_state.end());
assert(it->second.state == EXPORT_EXPORTING);
@@ -1700,10 +1781,7 @@ void Migrator::export_reverse(CDir *dir)
}
// unpin bounds
- for (set<CDir*>::iterator p = bounds.begin();
- p != bounds.end();
- ++p) {
- CDir *bd = *p;
+ for (const auto &bd : bounds) {
bd->put(CDir::PIN_EXPORTBOUND);
bd->state_clear(CDir::STATE_EXPORTBOUND);
}
@@ -1800,9 +1878,13 @@ void Migrator::export_logged_finish(CDir *dir)
void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m)
{
CDir *dir = cache->get_dirfrag(m->get_dirfrag());
+ mds_rank_t dest(m->get_source().num());
+ utime_t now = ceph_clock_now();
assert(dir);
mds_rank_t from = mds_rank_t(m->get_source().num());
+ mds->hit_export_target(now, dest, -1);
+
auto export_state_entry = export_state.find(dir);
if (export_state_entry != export_state.end()) {
export_state_t& stat = export_state_entry->second;
@@ -1887,6 +1969,9 @@ void Migrator::export_finish(CDir *dir)
bd->state_clear(CDir::STATE_EXPORTBOUND);
}
+ if (dir->state_test(CDir::STATE_AUXSUBTREE))
+ dir->state_clear(CDir::STATE_AUXSUBTREE);
+
// adjust auth, with possible subtree merge.
// (we do this _after_ removing EXPORTBOUND pins, to allow merges)
cache->adjust_subtree_auth(dir, it->second.peer);
@@ -2862,7 +2947,6 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp,
in->add_replica(oldauth, CInode::EXPORT_NONCE);
if (in->is_replica(mds->get_nodeid()))
in->remove_replica(mds->get_nodeid());
-
}
void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap,
@@ -3032,6 +3116,8 @@ int Migrator::decode_import_dir(bufferlist::iterator& blp,
dir->verify_fragstat();
#endif
+ dir->inode->maybe_export_pin();
+
dout(7) << "decode_import_dir done " << *dir << dendl;
return num_imported;
}
diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
index 85ae9cf4410..d14561920a4 100644
--- a/src/mds/Migrator.h
+++ b/src/mds/Migrator.h
@@ -50,11 +50,6 @@ class MGatherCaps;
class EImportStart;
class Migrator {
-private:
- MDSRank *mds;
- MDCache *cache;
-
- // -- exports --
public:
// export stages. used to clean up intelligently if there's a failure.
const static int EXPORT_CANCELLED = 0; // cancelled
@@ -82,31 +77,7 @@ public:
}
}
-protected:
- // export fun
- struct export_state_t {
- int state;
- mds_rank_t peer;
- uint64_t tid;
- set<mds_rank_t> warning_ack_waiting;
- set<mds_rank_t> notify_ack_waiting;
- map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
- MutationRef mut;
- // for freeze tree deadlock detection
- utime_t last_cum_auth_pins_change;
- int last_cum_auth_pins;
- int num_remote_waiters; // number of remote authpin waiters
- export_state_t() : state(0), peer(0), tid(0), mut(),
- last_cum_auth_pins(0), num_remote_waiters(0) {}
- };
-
- map<CDir*, export_state_t> export_state;
-
- list<pair<dirfrag_t,mds_rank_t> > export_queue;
-
-
// -- imports --
-public:
const static int IMPORT_DISCOVERING = 1; // waiting for prep
const static int IMPORT_DISCOVERED = 2; // waiting for prep
const static int IMPORT_PREPPING = 3; // opening dirs on bounds
@@ -115,7 +86,6 @@ public:
const static int IMPORT_ACKING = 6; // logged EImportStart, sent ack, waiting for finish
const static int IMPORT_FINISHING = 7; // sent cap imports, waiting for finish
const static int IMPORT_ABORTING = 8; // notifying bystanders of an abort before unfreezing
-
static const char *get_import_statename(int s) {
switch (s) {
case IMPORT_DISCOVERING: return "discovering";
@@ -130,7 +100,34 @@ public:
}
}
+ // -- cons --
+ Migrator(MDSRank *m, MDCache *c) : mds(m), cache(c) {}
+
+
+
protected:
+ // export fun
+ struct export_state_t {
+ int state;
+ mds_rank_t peer;
+ uint64_t tid;
+ set<mds_rank_t> warning_ack_waiting;
+ set<mds_rank_t> notify_ack_waiting;
+ map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
+ MutationRef mut;
+ // for freeze tree deadlock detection
+ utime_t last_cum_auth_pins_change;
+ int last_cum_auth_pins;
+ int num_remote_waiters; // number of remote authpin waiters
+ export_state_t() : state(0), peer(0), tid(0), mut(),
+ last_cum_auth_pins(0), num_remote_waiters(0) {}
+ };
+
+ map<CDir*, export_state_t> export_state;
+
+ list<pair<dirfrag_t,mds_rank_t> > export_queue;
+
+ // import fun
struct import_state_t {
int state;
mds_rank_t peer;
@@ -146,9 +143,66 @@ protected:
map<dirfrag_t, import_state_t> import_state;
+ void handle_export_discover_ack(MExportDirDiscoverAck *m);
+ void export_frozen(CDir *dir, uint64_t tid);
+ void handle_export_prep_ack(MExportDirPrepAck *m);
+ void export_sessions_flushed(CDir *dir, uint64_t tid);
+ void export_go(CDir *dir);
+ void export_go_synced(CDir *dir, uint64_t tid);
+ void export_try_cancel(CDir *dir, bool notify_peer=true);
+ void export_cancel_finish(CDir *dir);
+ void export_reverse(CDir *dir);
+ void export_notify_abort(CDir *dir, set<CDir*>& bounds);
+ void handle_export_ack(MExportDirAck *m);
+ void export_logged_finish(CDir *dir);
+ void handle_export_notify_ack(MExportDirNotifyAck *m);
+ void export_finish(CDir *dir);
+
+ void handle_gather_caps(MGatherCaps *m);
+
+ friend class C_MDC_ExportFreeze;
+ friend class C_MDS_ExportFinishLogged;
+ friend class C_M_ExportGo;
+ friend class C_M_ExportSessionsFlushed;
+ friend class MigratorContext;
+ friend class MigratorLogContext;
+
+ // importer
+ void handle_export_discover(MExportDirDiscover *m);
+ void handle_export_cancel(MExportDirCancel *m);
+ void handle_export_prep(MExportDirPrep *m);
+ void handle_export_dir(MExportDir *m);
+
+ void import_reverse_discovering(dirfrag_t df);
+ void import_reverse_discovered(dirfrag_t df, CInode *diri);
+ void import_reverse_prepping(CDir *dir);
+ void import_remove_pins(CDir *dir, set<CDir*>& bounds);
+ void import_reverse_unfreeze(CDir *dir);
+ void import_reverse_final(CDir *dir);
+ void import_notify_abort(CDir *dir, set<CDir*>& bounds);
+ void import_notify_finish(CDir *dir, set<CDir*>& bounds);
+ void import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
+ map<client_t,entity_inst_t> &imported_client_map,
+ map<client_t,uint64_t>& sseqmap);
+ void handle_export_finish(MExportDirFinish *m);
+
+ void handle_export_caps(MExportCaps *m);
+ void logged_import_caps(CInode *in,
+ mds_rank_t from,
+ map<CInode*, map<client_t,Capability::Export> >& cap_imports,
+ map<client_t,entity_inst_t>& client_map,
+ map<client_t,uint64_t>& sseqmap);
+
+
+ friend class C_MDS_ImportDirLoggedStart;
+ friend class C_MDS_ImportDirLoggedFinish;
+ friend class C_M_LoggedImportCaps;
+
+ // bystander
+ void handle_export_notify(MExportDirNotify *m);
+
+
public:
- // -- cons --
- Migrator(MDSRank *m, MDCache *c) : mds(m), cache(c) {}
void dispatch(Message*);
@@ -230,8 +284,7 @@ public:
// -- import/export --
// exporter
- public:
- void dispatch_export_dir(MDRequestRef& mdr);
+ void dispatch_export_dir(MDRequestRef& mdr, int count);
void export_dir(CDir *dir, mds_rank_t dest);
void export_empty_import(CDir *dir);
@@ -256,7 +309,7 @@ public:
map<client_t,Capability::Import>& peer_imported);
- int encode_export_dir(bufferlist& exportbl,
+ uint64_t encode_export_dir(bufferlist& exportbl,
CDir *dir,
map<client_t,entity_inst_t>& exported_client_map,
utime_t now);
@@ -268,38 +321,6 @@ public:
void export_caps(CInode *in);
- protected:
- void handle_export_discover_ack(MExportDirDiscoverAck *m);
- void export_frozen(CDir *dir, uint64_t tid);
- void handle_export_prep_ack(MExportDirPrepAck *m);
- void export_sessions_flushed(CDir *dir, uint64_t tid);
- void export_go(CDir *dir);
- void export_go_synced(CDir *dir, uint64_t tid);
- void export_try_cancel(CDir *dir, bool notify_peer=true);
- void export_cancel_finish(CDir *dir);
- void export_reverse(CDir *dir);
- void export_notify_abort(CDir *dir, set<CDir*>& bounds);
- void handle_export_ack(MExportDirAck *m);
- void export_logged_finish(CDir *dir);
- void handle_export_notify_ack(MExportDirNotifyAck *m);
- void export_finish(CDir *dir);
-
- void handle_gather_caps(MGatherCaps *m);
-
- friend class C_MDC_ExportFreeze;
- friend class C_MDS_ExportFinishLogged;
- friend class C_M_ExportGo;
- friend class C_M_ExportSessionsFlushed;
- friend class MigratorContext;
- friend class MigratorLogContext;
-
- // importer
- void handle_export_discover(MExportDirDiscover *m);
- void handle_export_cancel(MExportDirCancel *m);
- void handle_export_prep(MExportDirPrep *m);
- void handle_export_dir(MExportDir *m);
-
-public:
void decode_import_inode(CDentry *dn, bufferlist::iterator& blp,
mds_rank_t oldauth, LogSegment *ls,
map<CInode*, map<client_t,Capability::Export> >& cap_imports,
@@ -317,41 +338,13 @@ public:
map<CInode*, map<client_t,Capability::Export> >& cap_imports,
list<ScatterLock*>& updated_scatterlocks, utime_t now);
-public:
void import_reverse(CDir *dir);
-protected:
- void import_reverse_discovering(dirfrag_t df);
- void import_reverse_discovered(dirfrag_t df, CInode *diri);
- void import_reverse_prepping(CDir *dir);
- void import_remove_pins(CDir *dir, set<CDir*>& bounds);
- void import_reverse_unfreeze(CDir *dir);
- void import_reverse_final(CDir *dir);
- void import_notify_abort(CDir *dir, set<CDir*>& bounds);
- void import_notify_finish(CDir *dir, set<CDir*>& bounds);
- void import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
- map<client_t,entity_inst_t> &imported_client_map,
- map<client_t,uint64_t>& sseqmap);
- void handle_export_finish(MExportDirFinish *m);
-public:
- void import_finish(CDir *dir, bool notify, bool last=true);
-protected:
-
- void handle_export_caps(MExportCaps *m);
- void logged_import_caps(CInode *in,
- mds_rank_t from,
- map<CInode*, map<client_t,Capability::Export> >& cap_imports,
- map<client_t,entity_inst_t>& client_map,
- map<client_t,uint64_t>& sseqmap);
-
- friend class C_MDS_ImportDirLoggedStart;
- friend class C_MDS_ImportDirLoggedFinish;
- friend class C_M_LoggedImportCaps;
-
- // bystander
- void handle_export_notify(MExportDirNotify *m);
+ void import_finish(CDir *dir, bool notify, bool last=true);
+private:
+ MDSRank *mds;
+ MDCache *cache;
};
-
#endif
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index cd92b4edd60..500fd3ec629 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -58,7 +58,6 @@
#include "osd/OSDMap.h"
#include <errno.h>
-#include <fcntl.h>
#include <list>
#include <iostream>
@@ -119,6 +118,62 @@ void Server::create_logger()
"Client session messages", "hcs");
plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request", "Client requests dispatched");
plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request", "Server requests dispatched");
+ plb.add_u64_counter(l_mdss_req_lookuphash, "req_lookuphash",
+ "Request type lookup hash of inode");
+ plb.add_u64_counter(l_mdss_req_lookupino, "req_lookupino",
+ "Request type lookup inode");
+ plb.add_u64_counter(l_mdss_req_lookupparent, "req_lookupparent",
+ "Request type lookup parent");
+ plb.add_u64_counter(l_mdss_req_lookupname, "req_lookupname",
+ "Request type lookup name");
+ plb.add_u64_counter(l_mdss_req_lookup, "req_lookup",
+ "Request type lookup");
+ plb.add_u64_counter(l_mdss_req_lookupsnap, "req_lookupsnap",
+ "Request type lookup snapshot");
+ plb.add_u64_counter(l_mdss_req_getattr, "req_getattr",
+ "Request type get attribute");
+ plb.add_u64_counter(l_mdss_req_setattr, "req_setattr",
+ "Request type set attribute");
+ plb.add_u64_counter(l_mdss_req_setlayout, "req_setlayout",
+ "Request type set file layout");
+ plb.add_u64_counter(l_mdss_req_setdirlayout, "req_setdirlayout",
+ "Request type set directory layout");
+ plb.add_u64_counter(l_mdss_req_setxattr, "req_setxattr",
+ "Request type set extended attribute");
+ plb.add_u64_counter(l_mdss_req_rmxattr, "req_rmxattr",
+ "Request type remove extended attribute");
+ plb.add_u64_counter(l_mdss_req_readdir, "req_readdir",
+ "Request type read directory");
+ plb.add_u64_counter(l_mdss_req_setfilelock, "req_setfilelock",
+ "Request type set file lock");
+ plb.add_u64_counter(l_mdss_req_getfilelock, "req_getfilelock",
+ "Request type get file lock");
+ plb.add_u64_counter(l_mdss_req_create, "req_create",
+ "Request type create");
+ plb.add_u64_counter(l_mdss_req_open, "req_open",
+ "Request type open");
+ plb.add_u64_counter(l_mdss_req_mknod, "req_mknod",
+ "Request type make node");
+ plb.add_u64_counter(l_mdss_req_link, "req_link",
+ "Request type link");
+ plb.add_u64_counter(l_mdss_req_unlink, "req_unlink",
+ "Request type unlink");
+ plb.add_u64_counter(l_mdss_req_rmdir, "req_rmdir",
+ "Request type remove directory");
+ plb.add_u64_counter(l_mdss_req_rename, "req_rename",
+ "Request type rename");
+ plb.add_u64_counter(l_mdss_req_mkdir, "req_mkdir",
+ "Request type make directory");
+ plb.add_u64_counter(l_mdss_req_symlink, "req_symlink",
+ "Request type symbolic link");
+ plb.add_u64_counter(l_mdss_req_lssnap, "req_lssnap",
+ "Request type list snapshot");
+ plb.add_u64_counter(l_mdss_req_mksnap, "req_mksnap",
+ "Request type make snapshot");
+ plb.add_u64_counter(l_mdss_req_rmsnap, "req_rmsnap",
+ "Request type remove snapshot");
+ plb.add_u64_counter(l_mdss_req_renamesnap, "req_renamesnap",
+ "Request type rename snapshot");
logger = plb.create_perf_counters();
g_ceph_context->get_perfcounters_collection()->add(logger);
}
@@ -1039,6 +1094,93 @@ void Server::respond_to_request(MDRequestRef& mdr, int r)
{
if (mdr->client_request) {
reply_client_request(mdr, new MClientReply(mdr->client_request, r));
+
+ // add here to avoid counting ops multiple times (e.g., locks, loading)
+ switch(mdr->client_request->get_op()) {
+ case CEPH_MDS_OP_LOOKUPHASH:
+ logger->inc(l_mdss_req_lookuphash);
+ break;
+ case CEPH_MDS_OP_LOOKUPINO:
+ logger->inc(l_mdss_req_lookupino);
+ break;
+ case CEPH_MDS_OP_LOOKUPPARENT:
+ logger->inc(l_mdss_req_lookupparent);
+ break;
+ case CEPH_MDS_OP_LOOKUPNAME:
+ logger->inc(l_mdss_req_lookupname);
+ break;
+ case CEPH_MDS_OP_LOOKUP:
+ logger->inc(l_mdss_req_lookup);
+ break;
+ case CEPH_MDS_OP_LOOKUPSNAP:
+ logger->inc(l_mdss_req_lookupsnap);
+ break;
+ case CEPH_MDS_OP_GETATTR:
+ logger->inc(l_mdss_req_getattr);
+ break;
+ case CEPH_MDS_OP_SETATTR:
+ logger->inc(l_mdss_req_setattr);
+ break;
+ case CEPH_MDS_OP_SETLAYOUT:
+ logger->inc(l_mdss_req_setlayout);
+ break;
+ case CEPH_MDS_OP_SETDIRLAYOUT:
+ logger->inc(l_mdss_req_setdirlayout);
+ break;
+ case CEPH_MDS_OP_SETXATTR:
+ logger->inc(l_mdss_req_setxattr);
+ break;
+ case CEPH_MDS_OP_RMXATTR:
+ logger->inc(l_mdss_req_rmxattr);
+ break;
+ case CEPH_MDS_OP_READDIR:
+ logger->inc(l_mdss_req_readdir);
+ break;
+ case CEPH_MDS_OP_SETFILELOCK:
+ logger->inc(l_mdss_req_setfilelock);
+ break;
+ case CEPH_MDS_OP_GETFILELOCK:
+ logger->inc(l_mdss_req_getfilelock);
+ break;
+ case CEPH_MDS_OP_CREATE:
+ logger->inc(l_mdss_req_create);
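+      // falls through: a create is also counted as an open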
+ case CEPH_MDS_OP_OPEN:
+ logger->inc(l_mdss_req_open);
+ break;
+ case CEPH_MDS_OP_MKNOD:
+ logger->inc(l_mdss_req_mknod);
+ break;
+ case CEPH_MDS_OP_LINK:
+ logger->inc(l_mdss_req_link);
+ break;
+ case CEPH_MDS_OP_UNLINK:
+ logger->inc(l_mdss_req_unlink);
+ break;
+ case CEPH_MDS_OP_RMDIR:
+ logger->inc(l_mdss_req_rmdir);
+ break;
+ case CEPH_MDS_OP_RENAME:
+ logger->inc(l_mdss_req_rename);
+ break;
+ case CEPH_MDS_OP_MKDIR:
+ logger->inc(l_mdss_req_mkdir);
+ break;
+ case CEPH_MDS_OP_SYMLINK:
+ logger->inc(l_mdss_req_symlink);
+ break;
+ case CEPH_MDS_OP_LSSNAP:
+ logger->inc(l_mdss_req_lssnap);
+ break;
+ case CEPH_MDS_OP_MKSNAP:
+ logger->inc(l_mdss_req_mksnap);
+ break;
+ case CEPH_MDS_OP_RMSNAP:
+ logger->inc(l_mdss_req_rmsnap);
+ break;
+ case CEPH_MDS_OP_RENAMESNAP:
+ logger->inc(l_mdss_req_renamesnap);
+ break;
+ }
} else if (mdr->internal_op > -1) {
dout(10) << "respond_to_request on internal request " << mdr << dendl;
if (!mdr->internal_op_finish)
@@ -2952,7 +3094,7 @@ void Server::handle_client_open(MDRequestRef& mdr)
return;
}
- bool need_auth = !file_mode_is_readonly(cmode) || (flags & O_TRUNC);
+ bool need_auth = !file_mode_is_readonly(cmode) || (flags & CEPH_O_TRUNC);
if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
dout(7) << "read-only FS" << dendl;
@@ -2977,8 +3119,8 @@ void Server::handle_client_open(MDRequestRef& mdr)
// can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
cmode = CEPH_FILE_MODE_PIN;
// the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
- if (cur->inode.is_symlink() && !(flags & O_NOFOLLOW))
- flags &= ~O_TRUNC;
+ if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
+ flags &= ~CEPH_O_TRUNC;
}
dout(10) << "open flags = " << flags
@@ -2992,13 +3134,13 @@ void Server::handle_client_open(MDRequestRef& mdr)
respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
return;
}*/
- if ((flags & O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
+ if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
respond_to_request(mdr, -EINVAL);
return;
}
- if ((flags & O_TRUNC) && !cur->inode.is_file()) {
+ if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
// we should return -EISDIR for directory, return -EINVAL for other non-regular
respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
@@ -3036,7 +3178,7 @@ void Server::handle_client_open(MDRequestRef& mdr)
}
// O_TRUNC
- if ((flags & O_TRUNC) && !mdr->has_completed) {
+ if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
assert(cur->is_auth());
xlocks.insert(&cur->filelock);
@@ -3174,7 +3316,7 @@ void Server::handle_client_openc(MDRequestRef& mdr)
return;
}
- if (!(req->head.args.open.flags & O_EXCL)) {
+ if (!(req->head.args.open.flags & CEPH_O_EXCL)) {
int r = mdcache->path_traverse(mdr, NULL, NULL, req->get_filepath(),
&mdr->dn[0], NULL, MDS_TRAVERSE_FORWARD);
if (r > 0) return;
@@ -3197,7 +3339,7 @@ void Server::handle_client_openc(MDRequestRef& mdr)
// r == -ENOENT
}
- bool excl = (req->head.args.open.flags & O_EXCL);
+ bool excl = (req->head.args.open.flags & CEPH_O_EXCL);
set<SimpleLock*> rdlocks, wrlocks, xlocks;
file_layout_t *dir_layout = NULL;
CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks,
@@ -3271,7 +3413,7 @@ void Server::handle_client_openc(MDRequestRef& mdr)
if (!dnl->is_null()) {
// it existed.
- assert(req->head.args.open.flags & O_EXCL);
+ assert(req->head.args.open.flags & CEPH_O_EXCL);
dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl;
mdr->tracei = dnl->get_inode();
mdr->tracedn = dn;
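
This hunk (and the handle_client_open() hunks above it) swaps the platform O_* open flags for CEPH_O_* constants, so flag checks on the MDS compare against the values the client put on the wire rather than whatever the server's libc defines. A hypothetical translation helper of the kind a client needs -- the CEPH_O_* names come from the diff, but the numeric values and the helper itself are illustrative, not Ceph's actual definitions (those live in ceph_fs.h):

    #include <fcntl.h>

    // Illustrative wire-flag constants; real values live in ceph_fs.h.
    enum {
      CEPH_O_TRUNC     = 0x00200,
      CEPH_O_EXCL      = 0x00400,
      CEPH_O_DIRECTORY = 0x08000,
    };

    // Hypothetical host->wire mapping: each side translates to the shared
    // constants, so platforms with different O_TRUNC values still agree.
    static inline int host_to_wire_flags(int flags) {
      int wire = 0;
      if (flags & O_TRUNC)     wire |= CEPH_O_TRUNC;
      if (flags & O_EXCL)      wire |= CEPH_O_EXCL;
    #ifdef O_DIRECTORY
      if (flags & O_DIRECTORY) wire |= CEPH_O_DIRECTORY;
    #endif
      return wire;
    }
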
@@ -4417,6 +4559,28 @@ void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur,
pi = cur->project_inode();
pi->quota = quota;
+ } else if (name.find("ceph.dir.pin") == 0) {
+ if (!cur->is_dir() || cur->is_root()) {
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+
+ mds_rank_t rank;
+ try {
+ rank = boost::lexical_cast<mds_rank_t>(value);
+ if (rank < 0) rank = MDS_RANK_NONE;
+ } catch (boost::bad_lexical_cast const&) {
+ dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
+ respond_to_request(mdr, -EINVAL);
+ return;
+ }
+
+ xlocks.insert(&cur->policylock);
+ if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
+ return;
+
+ pi = cur->project_inode();
+ cur->set_export_pin(rank);
} else {
dout(10) << " unknown vxattr " << name << dendl;
respond_to_request(mdr, -EINVAL);
@@ -4791,9 +4955,6 @@ void Server::handle_client_mknod(MDRequestRef& mdr)
newi->filelock.set_state(LOCK_EXCL);
newi->authlock.set_state(LOCK_EXCL);
newi->xattrlock.set_state(LOCK_EXCL);
- cap->issue_norevoke(CEPH_CAP_AUTH_EXCL|CEPH_CAP_AUTH_SHARED|
- CEPH_CAP_XATTR_EXCL|CEPH_CAP_XATTR_SHARED|
- CEPH_CAP_ANY_FILE_WR);
}
}
@@ -4890,8 +5051,6 @@ void Server::handle_client_mkdir(MDRequestRef& mdr)
newi->filelock.set_state(LOCK_EXCL);
newi->authlock.set_state(LOCK_EXCL);
newi->xattrlock.set_state(LOCK_EXCL);
- cap->issue_norevoke(CEPH_CAP_AUTH_EXCL|CEPH_CAP_AUTH_SHARED|
- CEPH_CAP_XATTR_EXCL|CEPH_CAP_XATTR_SHARED);
}
// make sure this inode gets into the journal
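[Editor's note on the new `ceph.dir.pin` vxattr above: the MDS parses the string value into an `mds_rank_t`, folding any negative rank into `MDS_RANK_NONE` (unpin) and answering `-EINVAL` on malformed input. A minimal standalone sketch of that parse-and-validate step; it uses `std::stol` in place of the patch's `boost::lexical_cast`, and `parse_export_pin` is an illustrative name, not Ceph's.]

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>

using mds_rank_t = int32_t;
constexpr mds_rank_t MDS_RANK_NONE = -1;

// Parse a "ceph.dir.pin" value: any negative input means "unpin".
// Returns false on malformed input (the MDS would respond -EINVAL).
bool parse_export_pin(const std::string& value, mds_rank_t& rank) {
  try {
    std::size_t pos = 0;
    long v = std::stol(value, &pos);
    if (pos != value.size())
      return false;               // trailing garbage, e.g. "2abc"
    rank = v < 0 ? MDS_RANK_NONE : static_cast<mds_rank_t>(v);
    return true;
  } catch (const std::exception&) {
    return false;                 // empty / non-numeric / out of range
  }
}

int main() {
  mds_rank_t r;
  std::cout << parse_export_pin("2", r)  << " rank=" << r << "\n";  // 1 rank=2
  std::cout << parse_export_pin("-5", r) << " rank=" << r << "\n";  // 1 rank=-1
  std::cout << parse_export_pin("oops", r) << "\n";                 // 0
}
```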
diff --git a/src/mds/Server.h b/src/mds/Server.h
index bf6e05bae00..7d1aaeab745 100644
--- a/src/mds/Server.h
+++ b/src/mds/Server.h
@@ -35,6 +35,34 @@ enum {
l_mdss_handle_slave_request,
l_mdss_handle_client_session,
l_mdss_dispatch_client_request,
+ l_mdss_req_lookuphash,
+ l_mdss_req_lookupino,
+ l_mdss_req_lookupparent,
+ l_mdss_req_lookupname,
+ l_mdss_req_lookup,
+ l_mdss_req_lookupsnap,
+ l_mdss_req_getattr,
+ l_mdss_req_setattr,
+ l_mdss_req_setlayout,
+ l_mdss_req_setdirlayout,
+ l_mdss_req_setxattr,
+ l_mdss_req_rmxattr,
+ l_mdss_req_readdir,
+ l_mdss_req_setfilelock,
+ l_mdss_req_getfilelock,
+ l_mdss_req_create,
+ l_mdss_req_open,
+ l_mdss_req_mknod,
+ l_mdss_req_link,
+ l_mdss_req_unlink,
+ l_mdss_req_rmdir,
+ l_mdss_req_rename,
+ l_mdss_req_mkdir,
+ l_mdss_req_symlink,
+ l_mdss_req_lssnap,
+ l_mdss_req_mksnap,
+ l_mdss_req_rmsnap,
+ l_mdss_req_renamesnap,
l_mdss_dispatch_slave_request,
l_mdss_last,
};
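[Editor's note: the new `l_mdss_req_*` keys give every client request type its own counter, incremented from the switch on the op code in `Server::respond_to_request` (the `*SNAP` cases are visible at the top of this patch). Registration with Ceph's PerfCounters machinery happens elsewhere and is not shown here. A self-contained toy model of the pattern, with made-up op codes and a plain array standing in for the real `logger`:]

```cpp
#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>

// Stand-ins for the real enums; values are illustrative only.
enum { OP_LOOKUP, OP_OPEN, OP_MKSNAP, OP_RENAMESNAP, OP_MAX };
enum { l_req_lookup, l_req_open, l_req_mksnap, l_req_renamesnap, l_last };

struct Counters {
  std::array<uint64_t, l_last> v{};
  void inc(std::size_t key) { ++v[key]; }
};

// Mirrors the switch in respond_to_request(): map op -> counter key.
void count_op(Counters& logger, int op) {
  switch (op) {
  case OP_LOOKUP:     logger.inc(l_req_lookup); break;
  case OP_OPEN:       logger.inc(l_req_open); break;
  case OP_MKSNAP:     logger.inc(l_req_mksnap); break;
  case OP_RENAMESNAP: logger.inc(l_req_renamesnap); break;
  }
}

int main() {
  Counters c;
  count_op(c, OP_OPEN);
  count_op(c, OP_OPEN);
  count_op(c, OP_RENAMESNAP);
  std::cout << "open=" << c.v[l_req_open]
            << " renamesnap=" << c.v[l_req_renamesnap] << "\n"; // open=2 renamesnap=1
}
```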
diff --git a/src/mds/balancers/greedyspill.lua b/src/mds/balancers/greedyspill.lua
index c3e38fa4a14..20576cdb803 100644
--- a/src/mds/balancers/greedyspill.lua
+++ b/src/mds/balancers/greedyspill.lua
@@ -1,48 +1,49 @@
-metrics = {"auth.meta_load", "all.meta_load", "req_rate", "queue_len", "cpu_load_avg"}
+local metrics = {"auth.meta_load", "all.meta_load", "req_rate", "queue_len", "cpu_load_avg"}
-- Metric for balancing is the workload; also dumps metrics
-function mds_load()
- for i=0, #mds do
- s = "MDS"..i..": < "
- for j=1, #metrics do
- s = s..metrics[j].."="..mds[i][metrics[j]].." "
+local function mds_load()
+ for rank, mds in pairs(mds) do
+ local s = "MDS"..rank..": < "
+ for _, metric in ipairs(metrics) do
+ s = s..metric.."="..mds[metric].." "
end
- mds[i]["load"] = mds[i]["all.meta_load"]
- BAL_LOG(0, s.."> load="..mds[i]["load"])
+ mds.load = mds["all.meta_load"]
+ BAL_LOG(5, s.."> load="..mds.load)
end
end
-- Shed load when you have load and your neighbor doesn't
-function when()
+local function when()
+ if not mds[whoami+1] then
+ -- i'm the last rank
+ BAL_LOG(5, "when: not migrating! I am the last rank, nothing to spill to.");
+ return false
+ end
my_load = mds[whoami]["load"]
his_load = mds[whoami+1]["load"]
if my_load > 0.01 and his_load < 0.01 then
- BAL_LOG(2, "when: migrating! my_load="..my_load.." hisload="..his_load)
+ BAL_LOG(5, "when: migrating! my_load="..my_load.." hisload="..his_load)
return true
end
- BAL_LOG(2, "when: not migrating! my_load="..my_load.." hisload="..his_load)
+ BAL_LOG(5, "when: not migrating! my_load="..my_load.." hisload="..his_load)
return false
end
-- Shed half your load to your neighbor
-- neighbor=whoami+1 now that the mds table is keyed directly by rank
-function where()
- targets = {}
- for i=1, #mds+1 do
- targets[i] = 0
- end
-
- targets[whoami+2] = mds[whoami]["load"]/2
+local function where(targets)
+ targets[whoami+1] = mds[whoami]["load"]/2
return targets
end
+local targets = {}
+for rank in pairs(mds) do
+ targets[rank] = 0
+end
+
mds_load()
if when() then
- return where()
+ where(targets)
end
-targets = {}
-for i=1, #mds+1 do
- targets[i] = 0
-end
return targets
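[Editor's note: the balancer rewrite above fixes two bugs in one pass: `where()` previously wrote to `targets[whoami+2]`, and the last rank would index past the end of the `mds` table. The new flow is: zero-fill `targets` for every known rank, then spill half of this rank's load to rank `whoami+1` only when we are loaded, the neighbor is idle, and the neighbor exists. A compact C++ rendering of that decision logic, keeping the script's 0.01 threshold; all names are illustrative.]

```cpp
#include <iostream>
#include <map>

using Rank = int;
using Load = double;

// Spill half of whoami's load to rank whoami+1 when this rank is loaded,
// the neighbor is idle, and the neighbor actually exists.
std::map<Rank, Load> greedy_spill(const std::map<Rank, Load>& load, Rank whoami) {
  std::map<Rank, Load> targets;
  for (const auto& kv : load)
    targets[kv.first] = 0;                 // always return a full target map

  auto next = load.find(whoami + 1);
  if (next == load.end())
    return targets;                        // last rank: nothing to spill to
  Load mine = load.at(whoami);
  if (mine > 0.01 && next->second < 0.01)
    targets[whoami + 1] = mine / 2;
  return targets;
}

int main() {
  std::map<Rank, Load> load{{0, 3.0}, {1, 0.0}, {2, 0.0}};
  for (const auto& t : greedy_spill(load, 0))
    std::cout << "mds." << t.first << " -> " << t.second << "\n";  // mds.1 -> 1.5
}
```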
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
index db8efeba0c6..a2849154721 100644
--- a/src/mds/journal.cc
+++ b/src/mds/journal.cc
@@ -54,9 +54,6 @@
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
-#undef DOUT_COND
-#define DOUT_COND(cct, l) (l<=cct->_conf->debug_mds || l <= cct->_conf->debug_mds_log \
- || l <= cct->_conf->debug_mds_log_expire)
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".journal "
@@ -298,9 +295,6 @@ void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int o
}
}
-#undef DOUT_COND
-#define DOUT_COND(cct, l) (l<=cct->_conf->debug_mds || l <= cct->_conf->debug_mds_log)
-
// -----------------------
// EMetaBlob
@@ -335,13 +329,20 @@ void EMetaBlob::add_dir_context(CDir *dir, int mode)
if (mode == TO_AUTH_SUBTREE_ROOT) {
// subtree root?
- if (dir->is_subtree_root() && !dir->state_test(CDir::STATE_EXPORTBOUND)) {
- if (dir->is_auth() && !dir->is_ambiguous_auth()) {
- // it's an auth subtree, we don't need maybe (if any), and we're done.
- dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe
- << " at " << *dir << dendl;
- maybe.clear();
- break;
+ if (dir->is_subtree_root() &&
+ !dir->state_test(CDir::STATE_EXPORTBOUND)) {
+ if (dir->is_auth() && !dir->is_ambiguous_auth()) {
+ if (dir->state_test(CDir::STATE_AUXSUBTREE) &&
+ dir->get_dir_auth().first == diri->authority().first) {
+ // auxiliary subtree: treat it as a normal dirfrag
+ dout(20) << "EMetaBlob::add_dir_context(" << dir << ") auxiliary subtree " << dendl;
+ } else {
+ // it's an auth subtree, we don't need maybe (if any), and we're done.
+ dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached unambig auth subtree, don't need " << maybe
+ << " at " << *dir << dendl;
+ maybe.clear();
+ break;
+ }
} else {
dout(20) << "EMetaBlob::add_dir_context(" << dir << ") reached ambig or !auth subtree, need " << maybe
<< " at " << *dir << dendl;
diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc
index 5a300d2fc03..71c5f11d3c2 100644
--- a/src/mds/mdstypes.cc
+++ b/src/mds/mdstypes.cc
@@ -244,7 +244,7 @@ void inline_data_t::decode(bufferlist::iterator &p)
*/
void inode_t::encode(bufferlist &bl, uint64_t features) const
{
- ENCODE_START(14, 6, bl);
+ ENCODE_START(15, 6, bl);
::encode(ino, bl);
::encode(rdev, bl);
@@ -294,12 +294,14 @@ void inode_t::encode(bufferlist &bl, uint64_t features) const
::encode(btime, bl);
::encode(change_attr, bl);
+ ::encode(export_pin, bl);
+
ENCODE_FINISH(bl);
}
void inode_t::decode(bufferlist::iterator &p)
{
- DECODE_START_LEGACY_COMPAT_LEN(14, 6, 6, p);
+ DECODE_START_LEGACY_COMPAT_LEN(15, 6, 6, p);
::decode(ino, p);
::decode(rdev, p);
@@ -380,6 +382,12 @@ void inode_t::decode(bufferlist::iterator &p)
change_attr = 0;
}
+ if (struct_v >= 15) {
+ ::decode(export_pin, p);
+ } else {
+ export_pin = MDS_RANK_NONE;
+ }
+
DECODE_FINISH(p);
}
@@ -416,6 +424,7 @@ void inode_t::dump(Formatter *f) const
f->dump_stream("atime") << atime;
f->dump_unsigned("time_warp_seq", time_warp_seq);
f->dump_unsigned("change_attr", change_attr);
+ f->dump_int("export_pin", export_pin);
f->open_array_section("client_ranges");
for (map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin(); p != client_ranges.end(); ++p) {
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
index ed7067fc25b..59f53520e0a 100644
--- a/src/mds/mdstypes.h
+++ b/src/mds/mdstypes.h
@@ -74,8 +74,6 @@
typedef int32_t mds_rank_t;
typedef int32_t fs_cluster_id_t;
-
-
BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
extern const mds_gid_t MDS_GID_NONE;
constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = {-1};
@@ -508,6 +506,8 @@ struct inode_t {
nest_info_t accounted_rstat; // protected by parent's nestlock
quota_info_t quota;
+
+ mds_rank_t export_pin;
// special stuff
version_t version; // auth only
@@ -529,6 +529,7 @@ struct inode_t {
truncate_seq(0), truncate_size(0), truncate_from(0),
truncate_pending(0),
time_warp_seq(0), change_attr(0),
+ export_pin(MDS_RANK_NONE),
version(0), file_data_version(0), xattr_version(0),
last_scrub_version(0), backtrace_version(0) {
clear_layout();
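[Editor's note: the `inode_t` changes in mdstypes.{cc,h} above follow Ceph's standard struct-versioning idiom: bump the encode version (14 to 15), append the new field at the end, and on decode default it when the blob predates the field. A toy standalone version of the idiom; a raw byte vector stands in for `bufferlist`, the explicit version byte stands in for `ENCODE_START`/`DECODE_START_LEGACY_COMPAT_LEN`, and the field layout is illustrative.]

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

constexpr int32_t MDS_RANK_NONE = -1;

struct Inode {
  uint64_t ino = 0;
  int32_t export_pin = MDS_RANK_NONE;  // new in v15
};

template <typename T>
void put(std::vector<uint8_t>& bl, const T& v) {
  const uint8_t* p = reinterpret_cast<const uint8_t*>(&v);
  bl.insert(bl.end(), p, p + sizeof(T));
}
template <typename T>
void get(const std::vector<uint8_t>& bl, std::size_t& off, T& v) {
  std::memcpy(&v, bl.data() + off, sizeof(T));
  off += sizeof(T);
}

void encode(const Inode& in, std::vector<uint8_t>& bl, uint8_t struct_v = 15) {
  put(bl, struct_v);              // ENCODE_START(15, ...)
  put(bl, in.ino);
  if (struct_v >= 15)
    put(bl, in.export_pin);       // field appended at the end
}

void decode(Inode& in, const std::vector<uint8_t>& bl) {
  std::size_t off = 0;
  uint8_t struct_v;
  get(bl, off, struct_v);
  get(bl, off, in.ino);
  if (struct_v >= 15)
    get(bl, off, in.export_pin);
  else
    in.export_pin = MDS_RANK_NONE;  // default for old encodings
}

int main() {
  std::vector<uint8_t> old_bl, new_bl;
  encode(Inode{42, 3}, old_bl, 14);  // pre-export_pin peer
  encode(Inode{42, 3}, new_bl, 15);
  Inode a, b;
  decode(a, old_bl);
  decode(b, new_bl);
  std::cout << a.export_pin << " " << b.export_pin << "\n";  // -1 3
}
```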
diff --git a/src/messages/MBackfillReserve.h b/src/messages/MBackfillReserve.h
index 41f33557a18..f70c2946b30 100644
--- a/src/messages/MBackfillReserve.h
+++ b/src/messages/MBackfillReserve.h
@@ -19,7 +19,7 @@
class MBackfillReserve : public Message {
static const int HEAD_VERSION = 3;
- static const int COMPAT_VERSION = 1;
+ static const int COMPAT_VERSION = 3;
public:
spg_t pgid;
epoch_t query_epoch;
@@ -68,15 +68,8 @@ public:
::decode(pgid.pgid, p);
::decode(query_epoch, p);
::decode(type, p);
- if (header.version > 1)
- ::decode(priority, p);
- else
- priority = 0;
- if (header.version >= 3)
- ::decode(pgid.shard, p);
- else
- pgid.shard = shard_id_t::NO_SHARD;
-
+ ::decode(priority, p);
+ ::decode(pgid.shard, p);
}
void encode_payload(uint64_t features) override {
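[Editor's note: MBackfillReserve is the first of many messages in this patch (MForward, MMDSBeacon, MMonElection, MOSDBoot, MOSDFailure, ...) that raise COMPAT_VERSION to the current layout. The messenger refuses messages from peers that cannot satisfy the compat floor, so every `if (header.version >= N)` branch and its fallback default can simply be deleted from the decoder. A standalone model of the before/after; the field names and the integer-vector payload are stand-ins.]

```cpp
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

struct Msg { int header_version; std::vector<int> payload; };

constexpr int COMPAT_VERSION = 3;  // raised from 1

// Old decoder: every field added after v1 needs a branch and a default.
void decode_v1_compat(const Msg& m, int& priority, int& shard) {
  std::size_t i = 0;
  priority = (m.header_version > 1)  ? m.payload[i++] : 0;
  shard    = (m.header_version >= 3) ? m.payload[i++] : -1 /*NO_SHARD*/;
}

// New decoder: the messenger already refused pre-v3 peers, so the
// branches (and their defaults) disappear.
void decode_v3_compat(const Msg& m, int& priority, int& shard) {
  assert(m.header_version >= COMPAT_VERSION);
  std::size_t i = 0;
  priority = m.payload[i++];
  shard    = m.payload[i++];
}

int main() {
  Msg m{3, {7, 1}};
  int prio, shard;
  decode_v3_compat(m, prio, shard);
  std::cout << prio << " " << shard << "\n";  // 7 1
}
```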
diff --git a/src/messages/MCommand.h b/src/messages/MCommand.h
index f1077644f3f..d6f59fe4c08 100644
--- a/src/messages/MCommand.h
+++ b/src/messages/MCommand.h
@@ -25,7 +25,7 @@ class MCommand : public Message {
std::vector<string> cmd;
MCommand()
- : Message(MSG_MON_COMMAND) {}
+ : Message(MSG_COMMAND) {}
MCommand(const uuid_d &f)
: Message(MSG_COMMAND),
fsid(f) { }
diff --git a/src/messages/MForward.h b/src/messages/MForward.h
index 5f464a1bb69..333475e4662 100644
--- a/src/messages/MForward.h
+++ b/src/messages/MForward.h
@@ -35,7 +35,7 @@ struct MForward : public Message {
string msg_desc; // for operator<< only
static const int HEAD_VERSION = 3;
- static const int COMPAT_VERSION = 1;
+ static const int COMPAT_VERSION = 3;
MForward() : Message(MSG_FORWARD, HEAD_VERSION, COMPAT_VERSION),
tid(0), con_features(0), msg(NULL) {}
@@ -91,20 +91,8 @@ public:
::decode(client, p);
::decode(client_caps, p);
msg = (PaxosServiceMessage *)decode_message(NULL, 0, p);
- if (header.version >= 2) {
- ::decode(con_features, p);
- } else {
- con_features = 0;
- }
- if (header.version >= 3) {
- ::decode(entity_name, p);
- } else {
- // we are able to know the entity type, obtaining it from the
- // entity_name_t on 'client', but we have no idea about the
- // entity name, so we'll just use a friendly '?' instead.
- entity_name.set(client.name.type(), "?");
- }
-
+ ::decode(con_features, p);
+ ::decode(entity_name, p);
}
PaxosServiceMessage *claim_message() {
diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h
index 0f31dfec8af..31febe50a35 100644
--- a/src/messages/MMDSBeacon.h
+++ b/src/messages/MMDSBeacon.h
@@ -125,7 +125,7 @@ WRITE_CLASS_ENCODER(MDSHealth)
class MMDSBeacon : public PaxosServiceMessage {
static const int HEAD_VERSION = 7;
- static const int COMPAT_VERSION = 2;
+ static const int COMPAT_VERSION = 6;
uuid_d fsid;
mds_gid_t global_id;
@@ -225,21 +225,13 @@ public:
::decode(name, p);
::decode(standby_for_rank, p);
::decode(standby_for_name, p);
- if (header.version >= 2)
- ::decode(compat, p);
- if (header.version >= 3) {
- ::decode(health, p);
- }
- if (state == MDSMap::STATE_BOOT &&
- header.version >= 4) {
+ ::decode(compat, p);
+ ::decode(health, p);
+ if (state == MDSMap::STATE_BOOT) {
::decode(sys_info, p);
}
- if (header.version >= 5) {
- ::decode(mds_features, p);
- }
- if (header.version >= 6) {
- ::decode(standby_for_fscid, p);
- }
+ ::decode(mds_features, p);
+ ::decode(standby_for_fscid, p);
if (header.version >= 7) {
::decode(standby_replay, p);
}
diff --git a/src/messages/MMDSLoadTargets.h b/src/messages/MMDSLoadTargets.h
index 9fc93525945..128b1d1d034 100644
--- a/src/messages/MMDSLoadTargets.h
+++ b/src/messages/MMDSLoadTargets.h
@@ -16,6 +16,7 @@
#define CEPH_MMDSLoadTargets_H
#include "msg/Message.h"
+#include "mds/mdstypes.h"
#include "messages/PaxosServiceMessage.h"
#include "include/types.h"
diff --git a/src/messages/MMonElection.h b/src/messages/MMonElection.h
index 89f10a81a2b..79503875e26 100644
--- a/src/messages/MMonElection.h
+++ b/src/messages/MMonElection.h
@@ -23,7 +23,7 @@
class MMonElection : public Message {
static const int HEAD_VERSION = 6;
- static const int COMPAT_VERSION = 2;
+ static const int COMPAT_VERSION = 5;
public:
static const int OP_PROPOSE = 1;
@@ -103,24 +103,15 @@ public:
}
void decode_payload() override {
bufferlist::iterator p = payload.begin();
- if (header.version >= 2)
- ::decode(fsid, p);
- else
- memset(&fsid, 0, sizeof(fsid));
+ ::decode(fsid, p);
::decode(op, p);
::decode(epoch, p);
::decode(monmap_bl, p);
::decode(quorum, p);
- if (header.version >= 3)
- ::decode(quorum_features, p);
- else
- quorum_features = 0;
- if (header.version >= 4) {
- ::decode(defunct_one, p);
- ::decode(defunct_two, p);
- }
- if (header.version >= 5)
- ::decode(sharing_bl, p);
+ ::decode(quorum_features, p);
+ ::decode(defunct_one, p);
+ ::decode(defunct_two, p);
+ ::decode(sharing_bl, p);
if (header.version >= 6)
::decode(mon_features, p);
}
diff --git a/src/messages/MMonPaxos.h b/src/messages/MMonPaxos.h
index 328fd6e3c3f..8c709ddb8e6 100644
--- a/src/messages/MMonPaxos.h
+++ b/src/messages/MMonPaxos.h
@@ -114,8 +114,7 @@ public:
::decode(pn, p);
::decode(uncommitted_pn, p);
::decode(lease_timestamp, p);
- if (header.version >= 1)
- ::decode(sent_timestamp, p);
+ ::decode(sent_timestamp, p);
::decode(latest_version, p);
::decode(latest_value, p);
::decode(values, p);
diff --git a/src/messages/MMonScrub.h b/src/messages/MMonScrub.h
index 2b9d8c9c172..b66a34d7e1d 100644
--- a/src/messages/MMonScrub.h
+++ b/src/messages/MMonScrub.h
@@ -19,7 +19,7 @@
class MMonScrub : public Message
{
static const int HEAD_VERSION = 2;
- static const int COMPAT_VERSION = 1;
+ static const int COMPAT_VERSION = 2;
public:
typedef enum {
@@ -79,10 +79,8 @@ public:
op = (op_type_t)o;
::decode(version, p);
::decode(result, p);
- if (header.version >= 2) {
- ::decode(num_keys, p);
- ::decode(key, p);
- }
+ ::decode(num_keys, p);
+ ::decode(key, p);
}
};
diff --git a/src/messages/MOSDBoot.h b/src/messages/MOSDBoot.h
index 72f126ec0aa..66f5bf079b6 100644
--- a/src/messages/MOSDBoot.h
+++ b/src/messages/MOSDBoot.h
@@ -23,7 +23,7 @@
class MOSDBoot : public PaxosServiceMessage {
static const int HEAD_VERSION = 6;
- static const int COMPAT_VERSION = 2;
+ static const int COMPAT_VERSION = 6;
public:
OSDSuperblock sb;
@@ -77,18 +77,11 @@ public:
paxos_decode(p);
::decode(sb, p);
::decode(hb_back_addr, p);
- if (header.version >= 2)
- ::decode(cluster_addr, p);
- if (header.version >= 3)
- ::decode(boot_epoch, p);
- if (header.version >= 4)
- ::decode(hb_front_addr, p);
- if (header.version >= 5)
- ::decode(metadata, p);
- if (header.version >= 6)
- ::decode(osd_features, p);
- else
- osd_features = 0;
+ ::decode(cluster_addr, p);
+ ::decode(boot_epoch, p);
+ ::decode(hb_front_addr, p);
+ ::decode(metadata, p);
+ ::decode(osd_features, p);
}
};
diff --git a/src/messages/MOSDECSubOpRead.h b/src/messages/MOSDECSubOpRead.h
index 689685b74e6..de9317e45a6 100644
--- a/src/messages/MOSDECSubOpRead.h
+++ b/src/messages/MOSDECSubOpRead.h
@@ -19,12 +19,12 @@
#include "osd/ECMsgTypes.h"
class MOSDECSubOpRead : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 2;
+ static const int HEAD_VERSION = 3;
static const int COMPAT_VERSION = 1;
public:
spg_t pgid;
- epoch_t map_epoch;
+ epoch_t map_epoch, min_epoch;
ECSubRead op;
int get_cost() const override {
@@ -33,6 +33,9 @@ public:
epoch_t get_map_epoch() const override {
return map_epoch;
}
+ epoch_t get_min_epoch() const override {
+ return min_epoch;
+ }
spg_t get_spg() const override {
return pgid;
}
@@ -46,19 +49,27 @@ public:
::decode(pgid, p);
::decode(map_epoch, p);
::decode(op, p);
+ if (header.version >= 3) {
+ ::decode(min_epoch, p);
+ decode_trace(p);
+ } else {
+ min_epoch = map_epoch;
+ }
}
void encode_payload(uint64_t features) override {
::encode(pgid, payload);
::encode(map_epoch, payload);
::encode(op, payload, features);
+ ::encode(min_epoch, payload);
+ encode_trace(payload, features);
}
const char *get_type_name() const override { return "MOSDECSubOpRead"; }
void print(ostream& out) const override {
out << "MOSDECSubOpRead(" << pgid
- << " " << map_epoch
+ << " " << map_epoch << "/" << min_epoch
<< " " << op;
out << ")";
}
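[Editor's note: MOSDECSubOpRead and the sibling EC sub-op messages below (and, further down, the PG push/pull messages) all gain `min_epoch` plus trace data the same way: bump HEAD_VERSION, append the new fields in `encode_payload()`, and on decode fall back to `min_epoch = map_epoch` when the sender's `header.version` predates them. A standalone model of that append-with-fallback pattern, with an epoch vector standing in for the payload:]

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

using epoch_t = unsigned;

struct Msg {
  int header_version;
  std::vector<epoch_t> payload;
};

constexpr int HEAD_VERSION = 3;  // bumped when min_epoch was appended

Msg encode(epoch_t map_epoch, epoch_t min_epoch) {
  Msg m{HEAD_VERSION, {}};
  m.payload.push_back(map_epoch);  // pre-existing field
  m.payload.push_back(min_epoch);  // appended in v3
  return m;
}

void decode(const Msg& m, epoch_t& map_epoch, epoch_t& min_epoch) {
  std::size_t i = 0;
  map_epoch = m.payload[i++];
  if (m.header_version >= 3)
    min_epoch = m.payload[i++];
  else
    min_epoch = map_epoch;  // old sender: safe, conservative fallback
}

int main() {
  epoch_t map_e, min_e;
  decode(encode(120, 100), map_e, min_e);
  std::cout << map_e << "/" << min_e << "\n";  // 120/100
  decode(Msg{2, {120}}, map_e, min_e);         // legacy peer
  std::cout << map_e << "/" << min_e << "\n";  // 120/120
}
```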
diff --git a/src/messages/MOSDECSubOpReadReply.h b/src/messages/MOSDECSubOpReadReply.h
index 3903db68628..7ad931445c9 100644
--- a/src/messages/MOSDECSubOpReadReply.h
+++ b/src/messages/MOSDECSubOpReadReply.h
@@ -19,12 +19,12 @@
#include "osd/ECMsgTypes.h"
class MOSDECSubOpReadReply : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 1;
+ static const int HEAD_VERSION = 2;
static const int COMPAT_VERSION = 1;
public:
spg_t pgid;
- epoch_t map_epoch;
+ epoch_t map_epoch, min_epoch;
ECSubReadReply op;
int get_cost() const override {
@@ -33,6 +33,9 @@ public:
epoch_t get_map_epoch() const override {
return map_epoch;
}
+ epoch_t get_min_epoch() const override {
+ return min_epoch;
+ }
spg_t get_spg() const override {
return pgid;
}
@@ -46,19 +49,27 @@ public:
::decode(pgid, p);
::decode(map_epoch, p);
::decode(op, p);
+ if (header.version >= 2) {
+ ::decode(min_epoch, p);
+ decode_trace(p);
+ } else {
+ min_epoch = map_epoch;
+ }
}
void encode_payload(uint64_t features) override {
::encode(pgid, payload);
::encode(map_epoch, payload);
::encode(op, payload);
+ ::encode(min_epoch, payload);
+ encode_trace(payload, features);
}
const char *get_type_name() const override { return "MOSDECSubOpReadReply"; }
void print(ostream& out) const override {
out << "MOSDECSubOpReadReply(" << pgid
- << " " << map_epoch
+ << " " << map_epoch << "/" << min_epoch
<< " " << op;
out << ")";
}
diff --git a/src/messages/MOSDECSubOpWrite.h b/src/messages/MOSDECSubOpWrite.h
index 679b7445843..72a272c6ab0 100644
--- a/src/messages/MOSDECSubOpWrite.h
+++ b/src/messages/MOSDECSubOpWrite.h
@@ -19,12 +19,12 @@
#include "osd/ECMsgTypes.h"
class MOSDECSubOpWrite : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 1;
+ static const int HEAD_VERSION = 2;
static const int COMPAT_VERSION = 1;
public:
spg_t pgid;
- epoch_t map_epoch;
+ epoch_t map_epoch, min_epoch;
ECSubWrite op;
int get_cost() const override {
@@ -33,6 +33,9 @@ public:
epoch_t get_map_epoch() const override {
return map_epoch;
}
+ epoch_t get_min_epoch() const override {
+ return min_epoch;
+ }
spg_t get_spg() const override {
return pgid;
}
@@ -50,19 +53,27 @@ public:
::decode(pgid, p);
::decode(map_epoch, p);
::decode(op, p);
+ if (header.version >= 2) {
+ ::decode(min_epoch, p);
+ decode_trace(p);
+ } else {
+ min_epoch = map_epoch;
+ }
}
void encode_payload(uint64_t features) override {
::encode(pgid, payload);
::encode(map_epoch, payload);
::encode(op, payload);
+ ::encode(min_epoch, payload);
+ encode_trace(payload, features);
}
const char *get_type_name() const override { return "MOSDECSubOpWrite"; }
void print(ostream& out) const override {
out << "MOSDECSubOpWrite(" << pgid
- << " " << map_epoch
+ << " " << map_epoch << "/" << min_epoch
<< " " << op;
out << ")";
}
diff --git a/src/messages/MOSDECSubOpWriteReply.h b/src/messages/MOSDECSubOpWriteReply.h
index dd653983ebf..778a13d7984 100644
--- a/src/messages/MOSDECSubOpWriteReply.h
+++ b/src/messages/MOSDECSubOpWriteReply.h
@@ -19,12 +19,12 @@
#include "osd/ECMsgTypes.h"
class MOSDECSubOpWriteReply : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 1;
+ static const int HEAD_VERSION = 2;
static const int COMPAT_VERSION = 1;
public:
spg_t pgid;
- epoch_t map_epoch;
+ epoch_t map_epoch, min_epoch;
ECSubWriteReply op;
int get_cost() const override {
@@ -33,6 +33,9 @@ public:
epoch_t get_map_epoch() const override {
return map_epoch;
}
+ epoch_t get_min_epoch() const override {
+ return min_epoch;
+ }
spg_t get_spg() const override {
return pgid;
}
@@ -46,19 +49,27 @@ public:
::decode(pgid, p);
::decode(map_epoch, p);
::decode(op, p);
+ if (header.version >= 2) {
+ ::decode(min_epoch, p);
+ decode_trace(p);
+ } else {
+ min_epoch = map_epoch;
+ }
}
void encode_payload(uint64_t features) override {
::encode(pgid, payload);
::encode(map_epoch, payload);
::encode(op, payload);
+ ::encode(min_epoch, payload);
+ encode_trace(payload, features);
}
const char *get_type_name() const override { return "MOSDECSubOpWriteReply"; }
void print(ostream& out) const override {
out << "MOSDECSubOpWriteReply(" << pgid
- << " " << map_epoch
+ << " " << map_epoch << "/" << min_epoch
<< " " << op;
out << ")";
}
diff --git a/src/messages/MOSDFailure.h b/src/messages/MOSDFailure.h
index 9766522ef3e..81a9230c530 100644
--- a/src/messages/MOSDFailure.h
+++ b/src/messages/MOSDFailure.h
@@ -22,6 +22,7 @@
class MOSDFailure : public PaxosServiceMessage {
static const int HEAD_VERSION = 3;
+ static const int COMPAT_VERSION = 3;
public:
enum {
@@ -38,13 +39,13 @@ class MOSDFailure : public PaxosServiceMessage {
MOSDFailure() : PaxosServiceMessage(MSG_OSD_FAILURE, 0, HEAD_VERSION) { }
MOSDFailure(const uuid_d &fs, const entity_inst_t& f, int duration, epoch_t e)
- : PaxosServiceMessage(MSG_OSD_FAILURE, e, HEAD_VERSION),
+ : PaxosServiceMessage(MSG_OSD_FAILURE, e, HEAD_VERSION, COMPAT_VERSION),
fsid(fs), target_osd(f),
flags(FLAG_FAILED),
epoch(e), failed_for(duration) { }
MOSDFailure(const uuid_d &fs, const entity_inst_t& f, int duration,
epoch_t e, __u8 extra_flags)
- : PaxosServiceMessage(MSG_OSD_FAILURE, e, HEAD_VERSION),
+ : PaxosServiceMessage(MSG_OSD_FAILURE, e, HEAD_VERSION, COMPAT_VERSION),
fsid(fs), target_osd(f),
flags(extra_flags),
epoch(e), failed_for(duration) { }
@@ -67,14 +68,8 @@ public:
::decode(fsid, p);
::decode(target_osd, p);
::decode(epoch, p);
- if (header.version >= 2)
- ::decode(flags, p);
- else
- flags = FLAG_FAILED;
- if (header.version >= 3)
- ::decode(failed_for, p);
- else
- failed_for = 0;
+ ::decode(flags, p);
+ ::decode(failed_for, p);
}
void encode_payload(uint64_t features) override {
diff --git a/src/messages/MOSDFastDispatchOp.h b/src/messages/MOSDFastDispatchOp.h
index 6babd16f796..1eaa2c37f9a 100644
--- a/src/messages/MOSDFastDispatchOp.h
+++ b/src/messages/MOSDFastDispatchOp.h
@@ -10,6 +10,9 @@
class MOSDFastDispatchOp : public Message {
public:
virtual epoch_t get_map_epoch() const = 0;
+ virtual epoch_t get_min_epoch() const {
+ return get_map_epoch();
+ }
virtual spg_t get_spg() const = 0;
MOSDFastDispatchOp(int t, int version, int compat_version)
diff --git a/src/messages/MOSDMarkMeDown.h b/src/messages/MOSDMarkMeDown.h
index d48f2367b86..e2c948c3ed8 100644
--- a/src/messages/MOSDMarkMeDown.h
+++ b/src/messages/MOSDMarkMeDown.h
@@ -19,8 +19,8 @@
class MOSDMarkMeDown : public PaxosServiceMessage {
- static const int COMPAT_VERSION = 1;
static const int HEAD_VERSION = 2;
+ static const int COMPAT_VERSION = 2;
public:
uuid_d fsid;
@@ -50,8 +50,6 @@ public:
::decode(target_osd, p);
::decode(epoch, p);
::decode(request_ack, p);
- if (header.version < 2)
- request_ack = true; // assume true for older clients
}
void encode_payload(uint64_t features) override {
diff --git a/src/messages/MOSDOp.h b/src/messages/MOSDOp.h
index 4ca66396961..b6b87f131de 100755
--- a/src/messages/MOSDOp.h
+++ b/src/messages/MOSDOp.h
@@ -361,6 +361,10 @@ struct ceph_osd_request_head {
::encode(osdmap_epoch, payload);
::encode(flags, payload);
::encode(reqid, payload);
+ encode_trace(payload, features);
+
+ // -- fields above are decoded up front; fields below are decoded after dispatch --
+
::encode(client_inc, payload);
::encode(mtime, payload);
::encode(get_object_locator(), payload);
@@ -393,6 +397,7 @@ struct ceph_osd_request_head {
::decode(osdmap_epoch, p);
::decode(flags, p);
::decode(reqid, p);
+ decode_trace(p);
} else if (header.version == 7) {
::decode(pgid.pgid, p); // raw pgid
hobj.set_hash(pgid.pgid.ps());
diff --git a/src/messages/MOSDOpReply.h b/src/messages/MOSDOpReply.h
index 115d83b9a29..ef701fda5bf 100644
--- a/src/messages/MOSDOpReply.h
+++ b/src/messages/MOSDOpReply.h
@@ -32,7 +32,7 @@
class MOSDOpReply : public Message {
- static const int HEAD_VERSION = 7;
+ static const int HEAD_VERSION = 8;
static const int COMPAT_VERSION = 2;
object_t oid;
@@ -204,6 +204,7 @@ public:
::encode(redirect, payload);
}
}
+ encode_trace(payload, features);
}
}
void decode_payload() override {
@@ -294,6 +295,9 @@ public:
::decode(redirect, p);
}
}
+ if (header.version >= 8) {
+ decode_trace(p);
+ }
}
}
diff --git a/src/messages/MOSDPGBackfill.h b/src/messages/MOSDPGBackfill.h
index bc001118db3..7f637b9ef83 100644
--- a/src/messages/MOSDPGBackfill.h
+++ b/src/messages/MOSDPGBackfill.h
@@ -19,7 +19,7 @@
class MOSDPGBackfill : public MOSDFastDispatchOp {
static const int HEAD_VERSION = 3;
- static const int COMPAT_VERSION = 1;
+ static const int COMPAT_VERSION = 3;
public:
enum {
OP_BACKFILL_PROGRESS = 2,
@@ -39,12 +39,14 @@ public:
epoch_t map_epoch, query_epoch;
spg_t pgid;
hobject_t last_backfill;
- bool compat_stat_sum;
pg_stat_t stats;
epoch_t get_map_epoch() const override {
return map_epoch;
}
+ epoch_t get_min_epoch() const override {
+ return query_epoch;
+ }
spg_t get_spg() const override {
return pgid;
}
@@ -60,20 +62,13 @@ public:
// For compatibility with version 1
::decode(stats.stats, p);
- if (header.version >= 2) {
- ::decode(stats, p);
- } else {
- compat_stat_sum = true;
- }
+ ::decode(stats, p);
// Handle hobject_t format change
if (!last_backfill.is_max() &&
last_backfill.pool == -1)
last_backfill.pool = pgid.pool();
- if (header.version >= 3)
- ::decode(pgid.shard, p);
- else
- pgid.shard = shard_id_t::NO_SHARD;
+ ::decode(pgid.shard, p);
}
void encode_payload(uint64_t features) override {
@@ -92,14 +87,12 @@ public:
}
MOSDPGBackfill()
- : MOSDFastDispatchOp(MSG_OSD_PG_BACKFILL, HEAD_VERSION, COMPAT_VERSION),
- compat_stat_sum(false) {}
+ : MOSDFastDispatchOp(MSG_OSD_PG_BACKFILL, HEAD_VERSION, COMPAT_VERSION) {}
MOSDPGBackfill(__u32 o, epoch_t e, epoch_t qe, spg_t p)
: MOSDFastDispatchOp(MSG_OSD_PG_BACKFILL, HEAD_VERSION, COMPAT_VERSION),
op(o),
map_epoch(e), query_epoch(e),
- pgid(p),
- compat_stat_sum(false) {}
+ pgid(p) {}
private:
~MOSDPGBackfill() override {}
diff --git a/src/messages/MOSDPGCreate.h b/src/messages/MOSDPGCreate.h
index e6f7aba8950..01072c267fd 100644
--- a/src/messages/MOSDPGCreate.h
+++ b/src/messages/MOSDPGCreate.h
@@ -26,8 +26,7 @@
struct MOSDPGCreate : public Message {
const static int HEAD_VERSION = 3;
- // At head_version 2 the unspecified compat_version was set to 2
- const static int COMPAT_VERSION = 2;
+ const static int COMPAT_VERSION = 3;
version_t epoch;
map<pg_t,pg_create_t> mkpg;
@@ -52,32 +51,8 @@ public:
void decode_payload() override {
bufferlist::iterator p = payload.begin();
::decode(epoch, p);
- if (header.version >= 2) {
- ::decode(mkpg, p);
- } else {
- __u32 n;
- ::decode(n, p);
- while (n--) {
- pg_t pgid;
- epoch_t created; // epoch pg created
- pg_t parent; // split from parent (if != pg_t())
- __s32 split_bits;
- ::decode(pgid, p);
- ::decode(created, p);
- ::decode(parent, p);
- ::decode(split_bits, p);
- mkpg[pgid] = pg_create_t(created, parent, split_bits);
- }
- }
- if (header.version >= 3) {
- ::decode(ctimes, p);
- } else {
- // To make other code simpler create map with time of 0,0 for each pg
- for (map<pg_t,pg_create_t>::const_iterator i = mkpg.begin();
- i != mkpg.end(); ++i) {
- ctimes[i->first] = utime_t();
- }
- }
+ ::decode(mkpg, p);
+ ::decode(ctimes, p);
}
void print(ostream& out) const override {
diff --git a/src/messages/MOSDPGInfo.h b/src/messages/MOSDPGInfo.h
index b78dd57a46b..810de9cea29 100644
--- a/src/messages/MOSDPGInfo.h
+++ b/src/messages/MOSDPGInfo.h
@@ -20,13 +20,13 @@
#include "osd/osd_types.h"
class MOSDPGInfo : public Message {
- static const int HEAD_VERSION = 4;
+ static const int HEAD_VERSION = 5;
static const int COMPAT_VERSION = 1;
epoch_t epoch;
public:
- vector<pair<pg_notify_t,pg_interval_map_t> > pg_list;
+ vector<pair<pg_notify_t,PastIntervals> > pg_list;
epoch_t get_epoch() const { return epoch; }
@@ -45,96 +45,111 @@ private:
public:
const char *get_type_name() const override { return "pg_info"; }
void print(ostream& out) const override {
- out << "pg_info(" << pg_list.size() << " pgs e" << epoch << ":";
-
- for (vector<pair<pg_notify_t,pg_interval_map_t> >::const_iterator i = pg_list.begin();
+ out << "pg_info(";
+ for (auto i = pg_list.begin();
i != pg_list.end();
++i) {
if (i != pg_list.begin())
- out << ",";
- out << i->first.info.pgid;
- if (i->second.size())
- out << "(" << i->second.size() << ")";
+ out << " ";
+ out << i->first << "=" << i->second;
}
-
- out << ")";
+ out << " epoch " << epoch
+ << ")";
}
void encode_payload(uint64_t features) override {
- ::encode(epoch, payload);
+ if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ header.version = 4;
+
+ // for kraken+jewel only
+ ::encode(epoch, payload);
+
+ // v1 was vector<pg_info_t>
+ __u32 n = pg_list.size();
+ ::encode(n, payload);
+ for (auto p = pg_list.begin();
+ p != pg_list.end();
+ p++)
+ ::encode(p->first.info, payload);
+
+ // v2 needs the PastIntervals for each record
+ for (auto p = pg_list.begin();
+ p != pg_list.end();
+ p++) {
+ p->second.encode_classic(payload);
+ }
- // v1 was vector<pg_info_t>
- __u32 n = pg_list.size();
- ::encode(n, payload);
- for (vector<pair<pg_notify_t,pg_interval_map_t> >::iterator p = pg_list.begin();
- p != pg_list.end();
- p++)
- ::encode(p->first.info, payload);
-
- // v2 needs the pg_interval_map_t for each record
- for (vector<pair<pg_notify_t,pg_interval_map_t> >::iterator p = pg_list.begin();
- p != pg_list.end();
- p++)
- ::encode(p->second, payload);
-
- // v3 needs epoch_sent, query_epoch
- for (vector<pair<pg_notify_t,pg_interval_map_t> >::iterator p = pg_list.begin();
- p != pg_list.end();
- p++)
- ::encode(pair<epoch_t, epoch_t>(
- p->first.epoch_sent, p->first.query_epoch), payload);
-
- // v4 needs from, to
- for (vector<pair<pg_notify_t, pg_interval_map_t> >::iterator p = pg_list.begin();
- p != pg_list.end();
- ++p) {
- ::encode(p->first.from, payload);
- ::encode(p->first.to, payload);
+ // v3 needs epoch_sent, query_epoch
+ for (auto p = pg_list.begin();
+ p != pg_list.end();
+ p++)
+ ::encode(pair<epoch_t, epoch_t>(
+ p->first.epoch_sent, p->first.query_epoch), payload);
+
+ // v4 needs from, to
+ for (auto p = pg_list.begin();
+ p != pg_list.end();
+ ++p) {
+ ::encode(p->first.from, payload);
+ ::encode(p->first.to, payload);
+ }
+ return;
}
+ ::encode(epoch, payload);
+ ::encode(pg_list, payload);
}
void decode_payload() override {
bufferlist::iterator p = payload.begin();
- ::decode(epoch, p);
-
- // decode pg_info_t portion of the vector
- __u32 n;
- ::decode(n, p);
- pg_list.resize(n);
- for (unsigned i=0; i<n; i++) {
- ::decode(pg_list[i].first.info, p);
- }
+ if (header.version < 5) {
+ ::decode(epoch, p);
- if (header.version >= 2) {
- // get the pg_interval_map_t portion
+ // decode pg_info_t portion of the vector
+ __u32 n;
+ ::decode(n, p);
+ pg_list.resize(n);
for (unsigned i=0; i<n; i++) {
- ::decode(pg_list[i].second, p);
+ ::decode(pg_list[i].first.info, p);
}
- }
- // v3 needs epoch_sent, query_epoch
- for (vector<pair<pg_notify_t,pg_interval_map_t> >::iterator i = pg_list.begin();
- i != pg_list.end();
- i++) {
- if (header.version >= 3) {
- pair<epoch_t, epoch_t> dec;
- ::decode(dec, p);
- i->first.epoch_sent = dec.first;
- i->first.query_epoch = dec.second;
- } else {
- i->first.epoch_sent = epoch;
- i->first.query_epoch = epoch;
+ if (header.version >= 2) {
+ // get the PastIntervals portion
+ for (unsigned i=0; i<n; i++) {
+ if (header.version >= 5) {
+ ::decode(pg_list[i].second, p);
+ } else {
+ pg_list[i].second.decode_classic(p);
+ }
+ }
}
- }
- // v4 needs from and to
- if (header.version >= 4) {
- for (vector<pair<pg_notify_t, pg_interval_map_t> >::iterator i = pg_list.begin();
+ // v3 needs epoch_sent, query_epoch
+ for (auto i = pg_list.begin();
i != pg_list.end();
- i++) {
- ::decode(i->first.from, p);
- ::decode(i->first.to, p);
+ i++) {
+ if (header.version >= 3) {
+ pair<epoch_t, epoch_t> dec;
+ ::decode(dec, p);
+ i->first.epoch_sent = dec.first;
+ i->first.query_epoch = dec.second;
+ } else {
+ i->first.epoch_sent = epoch;
+ i->first.query_epoch = epoch;
+ }
+ }
+
+ // v4 needs from and to
+ if (header.version >= 4) {
+ for (auto i = pg_list.begin();
+ i != pg_list.end();
+ i++) {
+ ::decode(i->first.from, p);
+ ::decode(i->first.to, p);
+ }
}
+ return;
}
+ ::decode(epoch, p);
+ ::decode(pg_list, p);
}
};
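[Editor's note: MOSDPGInfo shows the other compatibility idiom used throughout this patch (also in MOSDPGLog, MOSDPGNotify, MOSDPGQuery, MOSDPGRemove): when the new `PastIntervals` encoding can't be sent to a pre-luminous peer, `encode_payload()` checks the peer's feature bits, rewrites `header.version` down to the legacy value, and emits the old layout, while `decode_payload()` branches purely on `header.version`. A standalone model; the feature-bit value and the length-prefixed "classic" layout are illustrative only.]

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

constexpr uint64_t SERVER_LUMINOUS = 1ull << 40;  // illustrative bit only

struct Msg { int header_version; std::vector<int> payload; };

// Encode for a specific peer: if it lacks the luminous feature bit,
// drop header.version back to the legacy value and emit the old layout.
Msg encode(uint64_t peer_features, const std::vector<int>& pg_list) {
  Msg m{5, {}};
  if (!(peer_features & SERVER_LUMINOUS)) {
    m.header_version = 4;                        // legacy framing
    m.payload.push_back((int)pg_list.size());
    for (int v : pg_list) m.payload.push_back(v);  // "classic" encoding
    return m;
  }
  for (int v : pg_list) m.payload.push_back(v);    // new encoding
  return m;
}

std::vector<int> decode(const Msg& m) {
  if (m.header_version < 5) {                    // legacy: length-prefixed
    int n = m.payload.at(0);
    return std::vector<int>(m.payload.begin() + 1, m.payload.begin() + 1 + n);
  }
  return m.payload;                              // new: container encodes itself
}

int main() {
  std::vector<int> pgs{10, 11, 12};
  for (uint64_t f : {0ull, SERVER_LUMINOUS})
    std::cout << decode(encode(f, pgs)).size() << "\n";  // 3, then 3
}
```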
diff --git a/src/messages/MOSDPGLog.h b/src/messages/MOSDPGLog.h
index 62d4b204213..beeda6b7a34 100644
--- a/src/messages/MOSDPGLog.h
+++ b/src/messages/MOSDPGLog.h
@@ -20,7 +20,7 @@
class MOSDPGLog : public Message {
- static const int HEAD_VERSION = 4;
+ static const int HEAD_VERSION = 5;
static const int COMPAT_VERSION = 2;
epoch_t epoch;
@@ -36,7 +36,7 @@ public:
pg_info_t info;
pg_log_t log;
pg_missing_t missing;
- pg_interval_map_t past_intervals;
+ PastIntervals past_intervals;
epoch_t get_epoch() const { return epoch; }
spg_t get_pgid() const { return spg_t(info.pgid.pgid, to); }
@@ -71,6 +71,7 @@ public:
// swapped out by OSD code.
out << "pg_log(" << info.pgid << " epoch " << epoch
<< " log " << log
+ << " pi " << past_intervals
<< " query_epoch " << query_epoch << ")";
}
@@ -80,7 +81,12 @@ public:
::encode(log, payload);
::encode(missing, payload);
::encode(query_epoch, payload);
- ::encode(past_intervals, payload);
+ if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ ::encode(past_intervals, payload);
+ } else {
+ header.version = 4;
+ past_intervals.encode_classic(payload);
+ }
::encode(to, payload);
::encode(from, payload);
}
@@ -94,7 +100,11 @@ public:
::decode(query_epoch, p);
}
if (header.version >= 3) {
- ::decode(past_intervals, p);
+ if (header.version >= 5) {
+ ::decode(past_intervals, p);
+ } else {
+ past_intervals.decode_classic(p);
+ }
}
if (header.version >= 4) {
::decode(to, p);
diff --git a/src/messages/MOSDPGNotify.h b/src/messages/MOSDPGNotify.h
index ae725f2b542..a0b118aceb8 100644
--- a/src/messages/MOSDPGNotify.h
+++ b/src/messages/MOSDPGNotify.h
@@ -25,7 +25,7 @@
class MOSDPGNotify : public Message {
- static const int HEAD_VERSION = 5;
+ static const int HEAD_VERSION = 6;
static const int COMPAT_VERSION = 2;
epoch_t epoch;
@@ -33,11 +33,11 @@ class MOSDPGNotify : public Message {
/// the current epoch if this is not being sent in response to a
/// query. This allows the recipient to disregard responses to old
/// queries.
- vector<pair<pg_notify_t,pg_interval_map_t> > pg_list; // pgid -> version
+ vector<pair<pg_notify_t,PastIntervals> > pg_list; // pgid -> version
public:
version_t get_epoch() const { return epoch; }
- const vector<pair<pg_notify_t,pg_interval_map_t> >& get_pg_list() const {
+ const vector<pair<pg_notify_t,PastIntervals> >& get_pg_list() const {
return pg_list;
}
@@ -45,7 +45,7 @@ class MOSDPGNotify : public Message {
: Message(MSG_OSD_PG_NOTIFY, HEAD_VERSION, COMPAT_VERSION) {
set_priority(CEPH_MSG_PRIO_HIGH);
}
- MOSDPGNotify(epoch_t e, vector<pair<pg_notify_t,pg_interval_map_t> >& l)
+ MOSDPGNotify(epoch_t e, vector<pair<pg_notify_t,PastIntervals> >& l)
: Message(MSG_OSD_PG_NOTIFY, HEAD_VERSION, COMPAT_VERSION),
epoch(e) {
pg_list.swap(l);
@@ -58,102 +58,118 @@ public:
const char *get_type_name() const override { return "PGnot"; }
void encode_payload(uint64_t features) override {
- // Use query_epoch for first entry for backwards compatibility
- epoch_t query_epoch = epoch;
- if (pg_list.size())
- query_epoch = pg_list.begin()->first.query_epoch;
+ if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ // for jewel+kraken compat only
+ header.version = 5;
+
+ // Use query_epoch for first entry for backwards compatibility
+ epoch_t query_epoch = epoch;
+ if (pg_list.size())
+ query_epoch = pg_list.begin()->first.query_epoch;
- ::encode(epoch, payload);
+ ::encode(epoch, payload);
+
+ // v2 was vector<pg_info_t>
+ __u32 n = pg_list.size();
+ ::encode(n, payload);
+ for (auto p = pg_list.begin();
+ p != pg_list.end();
+ p++)
+ ::encode(p->first.info, payload);
+
+ ::encode(query_epoch, payload);
+
+ // v3 needs the PastIntervals for each record
+ for (auto p = pg_list.begin();
+ p != pg_list.end();
+ p++) {
+ p->second.encode_classic(payload);
+ }
- // v2 was vector<pg_info_t>
- __u32 n = pg_list.size();
- ::encode(n, payload);
- for (vector<pair<pg_notify_t,pg_interval_map_t> >::iterator p = pg_list.begin();
- p != pg_list.end();
- p++)
- ::encode(p->first.info, payload);
-
- ::encode(query_epoch, payload);
-
- // v3 needs the pg_interval_map_t for each record
- for (vector<pair<pg_notify_t,pg_interval_map_t> >::iterator p = pg_list.begin();
- p != pg_list.end();
- p++)
- ::encode(p->second, payload);
-
- // v4 needs epoch_sent, query_epoch
- for (vector<pair<pg_notify_t,pg_interval_map_t> >::iterator p = pg_list.begin();
- p != pg_list.end();
- p++)
- ::encode(pair<epoch_t, epoch_t>(
- p->first.epoch_sent, p->first.query_epoch),
- payload);
-
- // v5 needs from, to
- for (vector<pair<pg_notify_t, pg_interval_map_t> >::iterator p = pg_list.begin();
- p != pg_list.end();
- ++p) {
- ::encode(p->first.from, payload);
- ::encode(p->first.to, payload);
+ // v4 needs epoch_sent, query_epoch
+ for (vector<pair<pg_notify_t,PastIntervals> >::iterator p = pg_list.begin();
+ p != pg_list.end();
+ p++)
+ ::encode(pair<epoch_t, epoch_t>(
+ p->first.epoch_sent, p->first.query_epoch),
+ payload);
+
+ // v5 needs from, to
+ for (vector<pair<pg_notify_t, PastIntervals> >::iterator p = pg_list.begin();
+ p != pg_list.end();
+ ++p) {
+ ::encode(p->first.from, payload);
+ ::encode(p->first.to, payload);
+ }
+ return;
}
+
+ ::encode(epoch, payload);
+ ::encode(pg_list, payload);
}
+
void decode_payload() override {
- epoch_t query_epoch;
bufferlist::iterator p = payload.begin();
- ::decode(epoch, p);
-
- // decode pg_info_t portion of the vector
- __u32 n;
- ::decode(n, p);
- pg_list.resize(n);
- for (unsigned i=0; i<n; i++) {
- ::decode(pg_list[i].first.info, p);
- }
-
- ::decode(query_epoch, p);
-
- if (header.version >= 3) {
- // get the pg_interval_map_t portion
+ if (header.version < 6) {
+ // for kraken+jewel compat only
+ epoch_t query_epoch;
+ ::decode(epoch, p);
+
+ // decode pg_info_t portion of the vector
+ __u32 n;
+ ::decode(n, p);
+ pg_list.resize(n);
for (unsigned i=0; i<n; i++) {
- ::decode(pg_list[i].second, p);
+ ::decode(pg_list[i].first.info, p);
}
- }
- // v3 needs epoch_sent, query_epoch
- for (vector<pair<pg_notify_t,pg_interval_map_t> >::iterator i = pg_list.begin();
- i != pg_list.end();
- i++) {
- if (header.version >= 4) {
- pair<epoch_t, epoch_t> dec;
- ::decode(dec, p);
- i->first.epoch_sent = dec.first;
- i->first.query_epoch = dec.second;
- } else {
- i->first.epoch_sent = epoch;
- i->first.query_epoch = query_epoch;
+ ::decode(query_epoch, p);
+
+ if (header.version >= 3) {
+ // get the PastIntervals portion
+ for (unsigned i=0; i<n; i++) {
+ pg_list[i].second.decode_classic(p);
+ }
}
- }
- // v5 needs from and to
- if (header.version >= 5) {
- for (vector<pair<pg_notify_t, pg_interval_map_t> >::iterator i = pg_list.begin();
+ // v3 needs epoch_sent, query_epoch
+ for (auto i = pg_list.begin();
i != pg_list.end();
i++) {
- ::decode(i->first.from, p);
- ::decode(i->first.to, p);
+ if (header.version >= 4) {
+ pair<epoch_t, epoch_t> dec;
+ ::decode(dec, p);
+ i->first.epoch_sent = dec.first;
+ i->first.query_epoch = dec.second;
+ } else {
+ i->first.epoch_sent = epoch;
+ i->first.query_epoch = query_epoch;
+ }
}
+
+ // v5 needs from and to
+ if (header.version >= 5) {
+ for (auto i = pg_list.begin();
+ i != pg_list.end();
+ i++) {
+ ::decode(i->first.from, p);
+ ::decode(i->first.to, p);
+ }
+ }
+ return;
}
+
+ ::decode(epoch, p);
+ ::decode(pg_list, p);
}
void print(ostream& out) const override {
out << "pg_notify(";
- for (vector<pair<pg_notify_t,pg_interval_map_t> >::const_iterator i = pg_list.begin();
+ for (auto i = pg_list.begin();
i != pg_list.end();
++i) {
if (i != pg_list.begin())
- out << ",";
- out << i->first.info.pgid;
- if (i->second.size())
- out << "(" << i->second.size() << ")";
+ out << " ";
+ out << i->first << "=" << i->second;
}
out << " epoch " << epoch
<< ")";
diff --git a/src/messages/MOSDPGPull.h b/src/messages/MOSDPGPull.h
index df9c110f32f..281b26f20db 100644
--- a/src/messages/MOSDPGPull.h
+++ b/src/messages/MOSDPGPull.h
@@ -18,20 +18,23 @@
#include "MOSDFastDispatchOp.h"
class MOSDPGPull : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 2;
- static const int COMPAT_VERSION = 1;
+ static const int HEAD_VERSION = 3;
+ static const int COMPAT_VERSION = 2;
vector<PullOp> pulls;
public:
pg_shard_t from;
spg_t pgid;
- epoch_t map_epoch;
+ epoch_t map_epoch, min_epoch;
uint64_t cost;
epoch_t get_map_epoch() const override {
return map_epoch;
}
+ epoch_t get_min_epoch() const override {
+ return min_epoch;
+ }
spg_t get_spg() const override {
return pgid;
}
@@ -67,12 +70,12 @@ public:
::decode(map_epoch, p);
::decode(pulls, p);
::decode(cost, p);
- if (header.version >= 2) {
- ::decode(pgid.shard, p);
- ::decode(from, p);
+ ::decode(pgid.shard, p);
+ ::decode(from, p);
+ if (header.version >= 3) {
+ ::decode(min_epoch, p);
} else {
- pgid.shard = shard_id_t::NO_SHARD;
- from = pg_shard_t(get_source().num(), shard_id_t::NO_SHARD);
+ min_epoch = map_epoch;
}
}
@@ -83,13 +86,14 @@ public:
::encode(cost, payload);
::encode(pgid.shard, payload);
::encode(from, payload);
+ ::encode(min_epoch, payload);
}
const char *get_type_name() const override { return "MOSDPGPull"; }
void print(ostream& out) const override {
out << "MOSDPGPull(" << pgid
- << " e" << map_epoch
+ << " e" << map_epoch << "/" << min_epoch
<< " cost " << cost
<< ")";
}
diff --git a/src/messages/MOSDPGPush.h b/src/messages/MOSDPGPush.h
index 9842ef06945..4f45048412c 100644
--- a/src/messages/MOSDPGPush.h
+++ b/src/messages/MOSDPGPush.h
@@ -18,14 +18,13 @@
#include "MOSDFastDispatchOp.h"
class MOSDPGPush : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 2;
- static const int COMPAT_VERSION = 1;
-
+ static const int HEAD_VERSION = 3;
+ static const int COMPAT_VERSION = 2;
public:
pg_shard_t from;
spg_t pgid;
- epoch_t map_epoch;
+ epoch_t map_epoch, min_epoch;
vector<PushOp> pushes;
private:
@@ -48,6 +47,9 @@ public:
epoch_t get_map_epoch() const override {
return map_epoch;
}
+ epoch_t get_min_epoch() const override {
+ return min_epoch;
+ }
spg_t get_spg() const override {
return pgid;
}
@@ -67,12 +69,12 @@ public:
::decode(map_epoch, p);
::decode(pushes, p);
::decode(cost, p);
- if (header.version >= 2) {
- ::decode(pgid.shard, p);
- ::decode(from, p);
+ ::decode(pgid.shard, p);
+ ::decode(from, p);
+ if (header.version >= 3) {
+ ::decode(min_epoch, p);
} else {
- pgid.shard = shard_id_t::NO_SHARD;
- from = pg_shard_t(get_source().num(), shard_id_t::NO_SHARD);
+ min_epoch = map_epoch;
}
}
@@ -83,13 +85,14 @@ public:
::encode(cost, payload);
::encode(pgid.shard, payload);
::encode(from, payload);
+ ::encode(min_epoch, payload);
}
const char *get_type_name() const override { return "MOSDPGPush"; }
void print(ostream& out) const override {
out << "MOSDPGPush(" << pgid
- << " " << map_epoch
+ << " " << map_epoch << "/" << min_epoch
<< " " << pushes;
out << ")";
}
diff --git a/src/messages/MOSDPGPushReply.h b/src/messages/MOSDPGPushReply.h
index 180a201eb9e..c60007c8187 100644
--- a/src/messages/MOSDPGPushReply.h
+++ b/src/messages/MOSDPGPushReply.h
@@ -18,19 +18,22 @@
#include "MOSDFastDispatchOp.h"
class MOSDPGPushReply : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 2;
- static const int COMPAT_VERSION = 1;
+ static const int HEAD_VERSION = 3;
+ static const int COMPAT_VERSION = 2;
public:
pg_shard_t from;
spg_t pgid;
- epoch_t map_epoch;
+ epoch_t map_epoch, min_epoch;
vector<PushReplyOp> replies;
uint64_t cost;
epoch_t get_map_epoch() const override {
return map_epoch;
}
+ epoch_t get_min_epoch() const override {
+ return min_epoch;
+ }
spg_t get_spg() const override {
return pgid;
}
@@ -59,13 +62,12 @@ public:
::decode(map_epoch, p);
::decode(replies, p);
::decode(cost, p);
-
- if (header.version >= 2) {
- ::decode(pgid.shard, p);
- ::decode(from, p);
+ ::decode(pgid.shard, p);
+ ::decode(from, p);
+ if (header.version >= 3) {
+ ::decode(min_epoch, p);
} else {
- pgid.shard = shard_id_t::NO_SHARD;
- from = pg_shard_t(get_source().num(), shard_id_t::NO_SHARD);
+ min_epoch = map_epoch;
}
}
@@ -76,11 +78,12 @@ public:
::encode(cost, payload);
::encode(pgid.shard, payload);
::encode(from, payload);
+ ::encode(min_epoch, payload);
}
void print(ostream& out) const override {
out << "MOSDPGPushReply(" << pgid
- << " " << map_epoch
+ << " " << map_epoch << "/" << min_epoch
<< " " << replies;
out << ")";
}
diff --git a/src/messages/MOSDPGQuery.h b/src/messages/MOSDPGQuery.h
index d6029d0d9f7..8b58bc9a9f7 100644
--- a/src/messages/MOSDPGQuery.h
+++ b/src/messages/MOSDPGQuery.h
@@ -24,9 +24,10 @@
*/
class MOSDPGQuery : public Message {
- static const int HEAD_VERSION = 3;
- static const int COMPAT_VERSION = 1;
- version_t epoch;
+ static const int HEAD_VERSION = 4;
+ static const int COMPAT_VERSION = 3;
+
+ version_t epoch;
public:
version_t get_epoch() const { return epoch; }
@@ -62,44 +63,47 @@ public:
}
void encode_payload(uint64_t features) override {
- ::encode(epoch, payload);
- vector<pair<pg_t, pg_query_t> > _pg_list;
- _pg_list.reserve(pg_list.size());
- vector<shard_id_t> _shard_list;
- _shard_list.reserve(pg_list.size());
- for (map<spg_t, pg_query_t>::iterator i = pg_list.begin();
- i != pg_list.end();
- ++i) {
- _pg_list.push_back(make_pair(i->first.pgid, i->second));
- _shard_list.push_back(i->first.shard);
+ if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ // for kraken/jewel only
+ header.version = 3;
+ ::encode(epoch, payload);
+ vector<pair<pg_t, pg_query_t> > _pg_list;
+ _pg_list.reserve(pg_list.size());
+ vector<shard_id_t> _shard_list;
+ _shard_list.reserve(pg_list.size());
+ for (map<spg_t, pg_query_t>::iterator i = pg_list.begin();
+ i != pg_list.end();
+ ++i) {
+ _pg_list.push_back(make_pair(i->first.pgid, i->second));
+ _shard_list.push_back(i->first.shard);
+ }
+ ::encode(_pg_list, payload, features);
+ ::encode(_shard_list, payload);
+ return;
}
- ::encode(_pg_list, payload, features);
- ::encode(_shard_list, payload);
+ ::encode(epoch, payload);
+ ::encode(pg_list, payload, features);
}
void decode_payload() override {
bufferlist::iterator p = payload.begin();
- ::decode(epoch, p);
- vector<pair<pg_t, pg_query_t> > _pg_list;
- ::decode(_pg_list, p);
- vector<shard_id_t> _shard_list(_pg_list.size(), shard_id_t::NO_SHARD);
- if (header.version >= 3) {
+ if (header.version < 4) {
+ // for kraken/jewel only
+ ::decode(epoch, p);
+ vector<pair<pg_t, pg_query_t> > _pg_list;
+ ::decode(_pg_list, p);
+ vector<shard_id_t> _shard_list(_pg_list.size(), shard_id_t::NO_SHARD);
_shard_list.clear();
::decode(_shard_list, p);
- }
- assert(_pg_list.size() == _shard_list.size());
- for (unsigned i = 0; i < _pg_list.size(); ++i) {
- pg_list.insert(
- make_pair(
- spg_t(_pg_list[i].first, _shard_list[i]), _pg_list[i].second));
- }
-
- if (header.version < 2) {
- for (map<spg_t, pg_query_t>::iterator i = pg_list.begin();
- i != pg_list.end();
- ++i) {
- i->second.epoch_sent = epoch;
+ assert(_pg_list.size() == _shard_list.size());
+ for (unsigned i = 0; i < _pg_list.size(); ++i) {
+ pg_list.insert(
+ make_pair(
+ spg_t(_pg_list[i].first, _shard_list[i]), _pg_list[i].second));
}
+ return;
}
+ ::decode(epoch, p);
+ ::decode(pg_list, p);
}
};
diff --git a/src/messages/MOSDPGRemove.h b/src/messages/MOSDPGRemove.h
index dcb44fffb3f..5a716b4700d 100644
--- a/src/messages/MOSDPGRemove.h
+++ b/src/messages/MOSDPGRemove.h
@@ -22,8 +22,8 @@
class MOSDPGRemove : public Message {
- static const int HEAD_VERSION = 2;
- static const int COMPAT_VERSION = 1;
+ static const int HEAD_VERSION = 3;
+ static const int COMPAT_VERSION = 2;
epoch_t epoch;
@@ -46,35 +46,46 @@ public:
const char *get_type_name() const override { return "PGrm"; }
void encode_payload(uint64_t features) override {
- ::encode(epoch, payload);
+ if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ // for jewel+kraken
+ header.version = 2;
+ ::encode(epoch, payload);
- vector<pg_t> _pg_list;
- _pg_list.reserve(pg_list.size());
- vector<shard_id_t> _shard_list;
- _shard_list.reserve(pg_list.size());
- for (vector<spg_t>::iterator i = pg_list.begin(); i != pg_list.end(); ++i) {
- _pg_list.push_back(i->pgid);
- _shard_list.push_back(i->shard);
+ vector<pg_t> _pg_list;
+ _pg_list.reserve(pg_list.size());
+ vector<shard_id_t> _shard_list;
+ _shard_list.reserve(pg_list.size());
+ for (auto i = pg_list.begin(); i != pg_list.end(); ++i) {
+ _pg_list.push_back(i->pgid);
+ _shard_list.push_back(i->shard);
+ }
+ ::encode(_pg_list, payload);
+ ::encode(_shard_list, payload);
+ return;
}
- ::encode(_pg_list, payload);
- ::encode(_shard_list, payload);
+ ::encode(epoch, payload);
+ ::encode(pg_list, payload);
}
void decode_payload() override {
bufferlist::iterator p = payload.begin();
- ::decode(epoch, p);
- vector<pg_t> _pg_list;
- ::decode(_pg_list, p);
+ if (header.version == 2) {
+ // jewel/kraken
+ ::decode(epoch, p);
+ vector<pg_t> _pg_list;
+ ::decode(_pg_list, p);
- vector<shard_id_t> _shard_list(_pg_list.size(), shard_id_t::NO_SHARD);
- if (header.version >= 2) {
+ vector<shard_id_t> _shard_list(_pg_list.size(), shard_id_t::NO_SHARD);
_shard_list.clear();
::decode(_shard_list, p);
+ assert(_shard_list.size() == _pg_list.size());
+ pg_list.reserve(_shard_list.size());
+ for (unsigned i = 0; i < _shard_list.size(); ++i) {
+ pg_list.push_back(spg_t(_pg_list[i], _shard_list[i]));
+ }
+ return;
}
- assert(_shard_list.size() == _pg_list.size());
- pg_list.reserve(_shard_list.size());
- for (unsigned i = 0; i < _shard_list.size(); ++i) {
- pg_list.push_back(spg_t(_pg_list[i], _shard_list[i]));
- }
+ ::decode(epoch, p);
+ ::decode(pg_list, p);
}
void print(ostream& out) const override {
out << "osd pg remove(" << "epoch " << epoch << "; ";
diff --git a/src/messages/MOSDPGScan.h b/src/messages/MOSDPGScan.h
index 49f85ed2349..3c01b406fd8 100644
--- a/src/messages/MOSDPGScan.h
+++ b/src/messages/MOSDPGScan.h
@@ -20,7 +20,7 @@
class MOSDPGScan : public MOSDFastDispatchOp {
static const int HEAD_VERSION = 2;
- static const int COMPAT_VERSION = 1;
+ static const int COMPAT_VERSION = 2;
public:
enum {
@@ -44,6 +44,9 @@ public:
epoch_t get_map_epoch() const override {
return map_epoch;
}
+ epoch_t get_min_epoch() const override {
+ return query_epoch;
+ }
spg_t get_spg() const override {
return pgid;
}
@@ -63,15 +66,8 @@ public:
if (!end.is_max() && end.pool == -1)
end.pool = pgid.pool();
- if (header.version >= 2) {
- ::decode(from, p);
- ::decode(pgid.shard, p);
- } else {
- from = pg_shard_t(
- get_source().num(),
- shard_id_t::NO_SHARD);
- pgid.shard = shard_id_t::NO_SHARD;
- }
+ ::decode(from, p);
+ ::decode(pgid.shard, p);
}
void encode_payload(uint64_t features) override {
diff --git a/src/messages/MOSDPGTrim.h b/src/messages/MOSDPGTrim.h
index 175579e1e3e..030da7c2c8d 100644
--- a/src/messages/MOSDPGTrim.h
+++ b/src/messages/MOSDPGTrim.h
@@ -20,7 +20,7 @@
class MOSDPGTrim : public Message {
static const int HEAD_VERSION = 2;
- static const int COMPAT_VERSION = 1;
+ static const int COMPAT_VERSION = 2;
public:
epoch_t epoch;
@@ -53,10 +53,7 @@ public:
::decode(epoch, p);
::decode(pgid.pgid, p);
::decode(trim_to, p);
- if (header.version >= 2)
- ::decode(pgid.shard, p);
- else
- pgid.shard = shard_id_t::NO_SHARD;
+ ::decode(pgid.shard, p);
}
};
diff --git a/src/messages/MOSDPGUpdateLogMissing.h b/src/messages/MOSDPGUpdateLogMissing.h
index bdb486c0818..facb65bc854 100644
--- a/src/messages/MOSDPGUpdateLogMissing.h
+++ b/src/messages/MOSDPGUpdateLogMissing.h
@@ -20,12 +20,12 @@
class MOSDPGUpdateLogMissing : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 1;
+ static const int HEAD_VERSION = 2;
static const int COMPAT_VERSION = 1;
public:
- epoch_t map_epoch;
+ epoch_t map_epoch, min_epoch;
spg_t pgid;
shard_id_t from;
ceph_tid_t rep_tid;
@@ -35,9 +35,13 @@ public:
spg_t get_pgid() const { return pgid; }
epoch_t get_query_epoch() const { return map_epoch; }
ceph_tid_t get_tid() const { return rep_tid; }
+
epoch_t get_map_epoch() const override {
return map_epoch;
}
+ epoch_t get_min_epoch() const override {
+ return min_epoch;
+ }
spg_t get_spg() const override {
return pgid;
}
@@ -50,10 +54,12 @@ public:
spg_t pgid,
shard_id_t from,
epoch_t epoch,
+ epoch_t min_epoch,
ceph_tid_t rep_tid)
: MOSDFastDispatchOp(MSG_OSD_PG_UPDATE_LOG_MISSING, HEAD_VERSION,
COMPAT_VERSION),
map_epoch(epoch),
+ min_epoch(min_epoch),
pgid(pgid),
from(from),
rep_tid(rep_tid),
@@ -66,6 +72,7 @@ public:
const char *get_type_name() const override { return "PGUpdateLogMissing"; }
void print(ostream& out) const override {
out << "pg_update_log_missing(" << pgid << " epoch " << map_epoch
+ << "/" << min_epoch
<< " rep_tid " << rep_tid
<< " entries " << entries << ")";
}
@@ -76,6 +83,7 @@ public:
::encode(from, payload);
::encode(rep_tid, payload);
::encode(entries, payload);
+ ::encode(min_epoch, payload);
}
void decode_payload() override {
bufferlist::iterator p = payload.begin();
@@ -84,6 +92,11 @@ public:
::decode(from, p);
::decode(rep_tid, p);
::decode(entries, p);
+ if (header.version >= 2) {
+ ::decode(min_epoch, p);
+ } else {
+ min_epoch = map_epoch;
+ }
}
};
diff --git a/src/messages/MOSDPGUpdateLogMissingReply.h b/src/messages/MOSDPGUpdateLogMissingReply.h
index 58d7de0b40f..8df6d9be5c8 100644
--- a/src/messages/MOSDPGUpdateLogMissingReply.h
+++ b/src/messages/MOSDPGUpdateLogMissingReply.h
@@ -20,12 +20,12 @@
class MOSDPGUpdateLogMissingReply : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 1;
+ static const int HEAD_VERSION = 2;
static const int COMPAT_VERSION = 1;
public:
- epoch_t map_epoch;
+ epoch_t map_epoch, min_epoch;
spg_t pgid;
shard_id_t from;
ceph_tid_t rep_tid;
@@ -40,6 +40,9 @@ public:
epoch_t get_map_epoch() const override {
return map_epoch;
}
+ epoch_t get_min_epoch() const override {
+ return min_epoch;
+ }
spg_t get_spg() const override {
return pgid;
}
@@ -54,12 +57,14 @@ public:
spg_t pgid,
shard_id_t from,
epoch_t epoch,
+ epoch_t min_epoch,
ceph_tid_t rep_tid)
: MOSDFastDispatchOp(
MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY,
HEAD_VERSION,
COMPAT_VERSION),
map_epoch(epoch),
+ min_epoch(min_epoch),
pgid(pgid),
from(from),
rep_tid(rep_tid)
@@ -72,6 +77,7 @@ public:
const char *get_type_name() const override { return "PGUpdateLogMissingReply"; }
void print(ostream& out) const override {
out << "pg_update_log_missing_reply(" << pgid << " epoch " << map_epoch
+ << "/" << min_epoch
<< " rep_tid " << rep_tid << ")";
}
@@ -80,6 +86,7 @@ public:
::encode(pgid, payload);
::encode(from, payload);
::encode(rep_tid, payload);
+ ::encode(min_epoch, payload);
}
void decode_payload() override {
bufferlist::iterator p = payload.begin();
@@ -87,6 +94,11 @@ public:
::decode(pgid, p);
::decode(from, p);
::decode(rep_tid, p);
+ if (header.version >= 2) {
+ ::decode(min_epoch, p);
+ } else {
+ min_epoch = map_epoch;
+ }
}
};
diff --git a/src/messages/MOSDPing.h b/src/messages/MOSDPing.h
index ce75ad84857..3bb54c5ae0f 100644
--- a/src/messages/MOSDPing.h
+++ b/src/messages/MOSDPing.h
@@ -24,7 +24,7 @@
class MOSDPing : public Message {
static const int HEAD_VERSION = 2;
- static const int COMPAT_VERSION = 1;
+ static const int COMPAT_VERSION = 2;
public:
enum {
@@ -71,8 +71,7 @@ public:
::decode(peer_as_of_epoch, p);
::decode(op, p);
::decode(peer_stat, p);
- if (header.version >= 2)
- ::decode(stamp, p);
+ ::decode(stamp, p);
}
void encode_payload(uint64_t features) override {
::encode(fsid, payload);
diff --git a/src/messages/MOSDRepOp.h b/src/messages/MOSDRepOp.h
index e4a4ef8d009..8b01131fec4 100644
--- a/src/messages/MOSDRepOp.h
+++ b/src/messages/MOSDRepOp.h
@@ -24,11 +24,11 @@
class MOSDRepOp : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 1;
+ static const int HEAD_VERSION = 2;
static const int COMPAT_VERSION = 1;
public:
- epoch_t map_epoch;
+ epoch_t map_epoch, min_epoch;
// metadata from original request
osd_reqid_t reqid;
@@ -66,6 +66,9 @@ public:
epoch_t get_map_epoch() const override {
return map_epoch;
}
+ epoch_t get_min_epoch() const override {
+ return min_epoch;
+ }
spg_t get_spg() const override {
return pgid;
}
@@ -78,6 +81,12 @@ public:
p = payload.begin();
// split into partial and final decode
::decode(map_epoch, p);
+ if (header.version >= 2) {
+ ::decode(min_epoch, p);
+ decode_trace(p);
+ } else {
+ min_epoch = map_epoch;
+ }
::decode(reqid, p);
::decode(pgid, p);
}
@@ -105,6 +114,12 @@ public:
void encode_payload(uint64_t features) override {
::encode(map_epoch, payload);
+ if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ ::encode(min_epoch, payload);
+ encode_trace(payload, features);
+ } else {
+ header.version = 1;
+ }
::encode(reqid, payload);
::encode(pgid, payload);
::encode(poid, payload);
@@ -127,9 +142,10 @@ public:
final_decode_needed(true), acks_wanted (0) {}
MOSDRepOp(osd_reqid_t r, pg_shard_t from,
spg_t p, const hobject_t& po, int aw,
- epoch_t mape, ceph_tid_t rtid, eversion_t v)
+ epoch_t mape, epoch_t min_epoch, ceph_tid_t rtid, eversion_t v)
: MOSDFastDispatchOp(MSG_OSD_REPOP, HEAD_VERSION, COMPAT_VERSION),
map_epoch(mape),
+ min_epoch(min_epoch),
reqid(r),
pgid(p),
final_decode_needed(false),
@@ -146,9 +162,9 @@ public:
const char *get_type_name() const override { return "osd_repop"; }
void print(ostream& out) const override {
out << "osd_repop(" << reqid
- << " " << pgid;
+ << " " << pgid << " e" << map_epoch << "/" << min_epoch;
if (!final_decode_needed) {
- out << " " << poid << " v " << version;
+ out << " " << poid << " v " << version;
if (updated_hit_set_history)
out << ", has_updated_hit_set_history";
}
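MOSDRepOp combines both techniques with a feature check: the new fields are only written when the connection advertises SERVER_LUMINOUS, and the advertised header version is downgraded otherwise so an old peer decodes a well-formed v1 message. A minimal sketch of that encode-side branch, assuming the HAVE_FEATURE macro and the payload/header members seen in the hunk above:

    void encode_payload(uint64_t features) override {
      ::encode(map_epoch, payload);
      if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
        ::encode(min_epoch, payload);     // v2-only fields
        encode_trace(payload, features);
      } else {
        header.version = 1;               // old peer sees a clean v1 message
      }
      // ...fields common to both versions follow...
    }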
diff --git a/src/messages/MOSDRepOpReply.h b/src/messages/MOSDRepOpReply.h
index 04a5707dfbe..c3e5e010042 100644
--- a/src/messages/MOSDRepOpReply.h
+++ b/src/messages/MOSDRepOpReply.h
@@ -29,10 +29,10 @@
*/
class MOSDRepOpReply : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 1;
+ static const int HEAD_VERSION = 2;
static const int COMPAT_VERSION = 1;
public:
- epoch_t map_epoch;
+ epoch_t map_epoch, min_epoch;
// subop metadata
osd_reqid_t reqid;
@@ -53,6 +53,9 @@ public:
epoch_t get_map_epoch() const override {
return map_epoch;
}
+ epoch_t get_min_epoch() const override {
+ return min_epoch;
+ }
spg_t get_spg() const override {
return pgid;
}
@@ -60,6 +63,12 @@ public:
void decode_payload() override {
p = payload.begin();
::decode(map_epoch, p);
+ if (header.version >= 2) {
+ ::decode(min_epoch, p);
+ decode_trace(p);
+ } else {
+ min_epoch = map_epoch;
+ }
::decode(reqid, p);
::decode(pgid, p);
}
@@ -76,6 +85,12 @@ public:
}
void encode_payload(uint64_t features) override {
::encode(map_epoch, payload);
+ if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ ::encode(min_epoch, payload);
+ encode_trace(payload, features);
+ } else {
+ header.version = 1;
+ }
::encode(reqid, payload);
::encode(pgid, payload);
::encode(ack_type, payload);
@@ -84,8 +99,6 @@ public:
::encode(from, payload);
}
- epoch_t get_map_epoch() { return map_epoch; }
-
spg_t get_pg() { return pgid; }
int get_ack_type() { return ack_type; }
@@ -99,9 +112,11 @@ public:
public:
MOSDRepOpReply(
- const MOSDRepOp *req, pg_shard_t from, int result_, epoch_t e, int at) :
+ const MOSDRepOp *req, pg_shard_t from, int result_, epoch_t e, epoch_t mine,
+ int at) :
MOSDFastDispatchOp(MSG_OSD_REPOPREPLY, HEAD_VERSION, COMPAT_VERSION),
map_epoch(e),
+ min_epoch(mine),
reqid(req->reqid),
from(from),
pgid(req->pgid.pgid, req->from.shard),
@@ -113,6 +128,7 @@ public:
MOSDRepOpReply()
: MOSDFastDispatchOp(MSG_OSD_REPOPREPLY, HEAD_VERSION, COMPAT_VERSION),
map_epoch(0),
+ min_epoch(0),
ack_type(0), result(0),
final_decode_needed(true) {}
private:
@@ -123,7 +139,7 @@ public:
void print(ostream& out) const override {
out << "osd_repop_reply(" << reqid
- << " " << pgid;
+ << " " << pgid << " e" << map_epoch << "/" << min_epoch;
if (!final_decode_needed) {
if (ack_type & CEPH_OSD_FLAG_ONDISK)
out << " ondisk";
diff --git a/src/messages/MOSDRepScrub.h b/src/messages/MOSDRepScrub.h
index a1134bac39d..8b230427854 100644
--- a/src/messages/MOSDRepScrub.h
+++ b/src/messages/MOSDRepScrub.h
@@ -24,13 +24,13 @@
struct MOSDRepScrub : public MOSDFastDispatchOp {
- static const int HEAD_VERSION = 6;
- static const int COMPAT_VERSION = 2;
+ static const int HEAD_VERSION = 7;
+ static const int COMPAT_VERSION = 6;
spg_t pgid; // PG to scrub
eversion_t scrub_from; // only scrub log entries after scrub_from
eversion_t scrub_to; // last_update_applied when message sent
- epoch_t map_epoch;
+ epoch_t map_epoch, min_epoch;
bool chunky; // true for chunky scrubs
hobject_t start; // lower bound of scrub, inclusive
hobject_t end; // upper bound of scrub, exclusive
@@ -40,6 +40,9 @@ struct MOSDRepScrub : public MOSDFastDispatchOp {
epoch_t get_map_epoch() const override {
return map_epoch;
}
+ epoch_t get_min_epoch() const override {
+ return min_epoch;
+ }
spg_t get_spg() const override {
return pgid;
}
@@ -50,12 +53,13 @@ struct MOSDRepScrub : public MOSDFastDispatchOp {
deep(false),
seed(0) { }
- MOSDRepScrub(spg_t pgid, eversion_t scrub_to, epoch_t map_epoch,
+ MOSDRepScrub(spg_t pgid, eversion_t scrub_to, epoch_t map_epoch, epoch_t min_epoch,
hobject_t start, hobject_t end, bool deep, uint32_t seed)
: MOSDFastDispatchOp(MSG_OSD_REP_SCRUB, HEAD_VERSION, COMPAT_VERSION),
pgid(pgid),
scrub_to(scrub_to),
map_epoch(map_epoch),
+ min_epoch(min_epoch),
chunky(true),
start(start),
end(end),
@@ -71,7 +75,8 @@ public:
void print(ostream& out) const override {
out << "replica scrub(pg: ";
out << pgid << ",from:" << scrub_from << ",to:" << scrub_to
- << ",epoch:" << map_epoch << ",start:" << start << ",end:" << end
+ << ",epoch:" << map_epoch << "/" << min_epoch
+ << ",start:" << start << ",end:" << end
<< ",chunky:" << chunky
<< ",deep:" << deep
<< ",seed:" << seed
@@ -90,6 +95,7 @@ public:
::encode(deep, payload);
::encode(pgid.shard, payload);
::encode(seed, payload);
+ ::encode(min_epoch, payload);
}
void decode_payload() override {
bufferlist::iterator p = payload.begin();
@@ -97,30 +103,16 @@ public:
::decode(scrub_from, p);
::decode(scrub_to, p);
::decode(map_epoch, p);
-
- if (header.version >= 3) {
- ::decode(chunky, p);
- ::decode(start, p);
- ::decode(end, p);
- if (header.version >= 4) {
- ::decode(deep, p);
- } else {
- deep = false;
- }
- } else { // v2 scrub: non-chunky
- chunky = false;
- deep = false;
- }
-
- if (header.version >= 5) {
- ::decode(pgid.shard, p);
- } else {
- pgid.shard = shard_id_t::NO_SHARD;
- }
- if (header.version >= 6) {
- ::decode(seed, p);
+ ::decode(chunky, p);
+ ::decode(start, p);
+ ::decode(end, p);
+ ::decode(deep, p);
+ ::decode(pgid.shard, p);
+ ::decode(seed, p);
+ if (header.version >= 7) {
+ ::decode(min_epoch, p);
} else {
- seed = 0;
+ min_epoch = map_epoch;
}
}
};
diff --git a/src/messages/MOSDScrub.h b/src/messages/MOSDScrub.h
index 8c52b043fa6..af8281e7c46 100644
--- a/src/messages/MOSDScrub.h
+++ b/src/messages/MOSDScrub.h
@@ -25,7 +25,7 @@
struct MOSDScrub : public Message {
static const int HEAD_VERSION = 2;
- static const int COMPAT_VERSION = 1;
+ static const int COMPAT_VERSION = 2;
uuid_d fsid;
vector<pg_t> scrub_pgs;
@@ -68,11 +68,7 @@ public:
::decode(fsid, p);
::decode(scrub_pgs, p);
::decode(repair, p);
- if (header.version >= 2) {
- ::decode(deep, p);
- } else {
- deep = false;
- }
+ ::decode(deep, p);
}
};
diff --git a/src/messages/MRecoveryReserve.h b/src/messages/MRecoveryReserve.h
index c0e975004d8..82bfe868940 100644
--- a/src/messages/MRecoveryReserve.h
+++ b/src/messages/MRecoveryReserve.h
@@ -19,7 +19,7 @@
class MRecoveryReserve : public Message {
static const int HEAD_VERSION = 2;
- static const int COMPAT_VERSION = 1;
+ static const int COMPAT_VERSION = 2;
public:
spg_t pgid;
epoch_t query_epoch;
@@ -66,10 +66,7 @@ public:
::decode(pgid.pgid, p);
::decode(query_epoch, p);
::decode(type, p);
- if (header.version >= 2)
- ::decode(pgid.shard, p);
- else
- pgid.shard = shard_id_t::NO_SHARD;
+ ::decode(pgid.shard, p);
}
void encode_payload(uint64_t features) override {
diff --git a/src/messages/MRoute.h b/src/messages/MRoute.h
index 053d2768f26..179cf598597 100644
--- a/src/messages/MRoute.h
+++ b/src/messages/MRoute.h
@@ -23,7 +23,7 @@
struct MRoute : public Message {
static const int HEAD_VERSION = 3;
- static const int COMPAT_VERSION = 2;
+ static const int COMPAT_VERSION = 3;
uint64_t session_mon_tid;
Message *msg;
@@ -58,17 +58,11 @@ public:
bufferlist::iterator p = payload.begin();
::decode(session_mon_tid, p);
::decode(dest, p);
- if (header.version >= 2) {
- bool m;
- ::decode(m, p);
- if (m)
- msg = decode_message(NULL, 0, p);
- } else {
+ bool m;
+ ::decode(m, p);
+ if (m)
msg = decode_message(NULL, 0, p);
- }
- if (header.version >= 3) {
- ::decode(send_osdmap_first, p);
- }
+ ::decode(send_osdmap_first, p);
}
void encode_payload(uint64_t features) override {
::encode(session_mon_tid, payload);
diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc
index f54d903dbd2..bb1927e25d6 100644
--- a/src/mgr/DaemonServer.cc
+++ b/src/mgr/DaemonServer.cc
@@ -30,6 +30,7 @@
#define dout_prefix *_dout << "mgr.server " << __func__ << " "
DaemonServer::DaemonServer(MonClient *monc_,
+ Finisher &finisher_,
DaemonStateIndex &daemon_state_,
ClusterState &cluster_state_,
PyModules &py_modules_,
@@ -54,6 +55,7 @@ DaemonServer::DaemonServer(MonClient *monc_,
g_conf->mgr_mon_messages)),
msgr(nullptr),
monc(monc_),
+ finisher(finisher_),
daemon_state(daemon_state_),
cluster_state(cluster_state_),
py_modules(py_modules_),
@@ -371,40 +373,82 @@ bool DaemonServer::_allowed_command(
return capable;
}
-class ReplyOnFinish : public Context {
- DaemonServer* mgr;
- MCommand *m;
- bufferlist odata;
-
-public:
- bufferlist from_mon;
- string outs;
-
- ReplyOnFinish(DaemonServer* mgr, MCommand *m, bufferlist&& odata)
- : mgr(mgr), m(m), odata(std::move(odata))
- {}
- void finish(int r) override {
- odata.claim_append(from_mon);
- mgr->_reply(m, r, outs, odata);
- }
-};
-
bool DaemonServer::handle_command(MCommand *m)
{
int r = 0;
std::stringstream ss;
- bufferlist odata;
std::string prefix;
assert(lock.is_locked_by_me());
- cmdmap_t cmdmap;
+ /**
+ * The working data for processing an MCommand. This lives in
+ * a class to enable passing it into other threads for processing
+ * outside of the thread/locks that called handle_command.
+ */
+ class CommandContext
+ {
+ public:
+ MCommand *m;
+ bufferlist odata;
+ cmdmap_t cmdmap;
+
+ CommandContext(MCommand *m_)
+ : m(m_)
+ {
+ }
+
+ ~CommandContext()
+ {
+ m->put();
+ }
+
+ void reply(int r, const std::stringstream &ss)
+ {
+ reply(r, ss.str());
+ }
+
+ void reply(int r, const std::string &rs)
+ {
+ // Let the connection drop as soon as we've sent our response
+ ConnectionRef con = m->get_connection();
+ if (con) {
+ con->mark_disposable();
+ }
+
+ dout(1) << "do_command r=" << r << " " << rs << dendl;
+ if (con) {
+ MCommandReply *reply = new MCommandReply(r, rs);
+ reply->set_tid(m->get_tid());
+ reply->set_data(odata);
+ con->send_message(reply);
+ }
+ }
+ };
+
+ /**
+ * A context for receiving a bufferlist/error string from a background
+ * function and then calling back to a CommandContext when it's done
+ */
+ class ReplyOnFinish : public Context {
+ std::shared_ptr<CommandContext> cmdctx;
+
+ public:
+ bufferlist from_mon;
+ string outs;
+
+ ReplyOnFinish(std::shared_ptr<CommandContext> cmdctx_)
+ : cmdctx(cmdctx_)
+ {}
+ void finish(int r) override {
+ cmdctx->odata.claim_append(from_mon);
+ cmdctx->reply(r, outs);
+ }
+ };
- // TODO background the call into python land so that we don't
- // block a messenger thread on python code.
+ std::shared_ptr<CommandContext> cmdctx = std::make_shared<CommandContext>(m);
- ConnectionRef con = m->get_connection();
- MgrSessionRef session(static_cast<MgrSession*>(con->get_priv()));
+ MgrSessionRef session(static_cast<MgrSession*>(m->get_connection()->get_priv()));
if (!session) {
return true;
}
@@ -412,23 +456,23 @@ bool DaemonServer::handle_command(MCommand *m)
if (session->inst.name == entity_name_t())
session->inst.name = m->get_source();
- string format;
+ std::string format;
boost::scoped_ptr<Formatter> f;
- const MgrCommand *mgr_cmd;
map<string,string> param_str_map;
- if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
- return _reply(m, -EINVAL, ss.str(), odata);
+ if (!cmdmap_from_json(m->cmd, &(cmdctx->cmdmap), ss)) {
+ cmdctx->reply(-EINVAL, ss);
+ return true;
}
{
- cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
+ cmd_getval(g_ceph_context, cmdctx->cmdmap, "format", format, string("plain"));
f.reset(Formatter::create(format));
}
- dout(4) << "decoded " << cmdmap.size() << dendl;
- cmd_getval(cct, cmdmap, "prefix", prefix);
+ cmd_getval(cct, cmdctx->cmdmap, "prefix", prefix);
+ dout(4) << "decoded " << cmdctx->cmdmap.size() << dendl;
dout(4) << "prefix=" << prefix << dendl;
if (prefix == "get_command_descriptions") {
@@ -460,26 +504,36 @@ bool DaemonServer::handle_command(MCommand *m)
}
#endif
f.close_section(); // command_descriptions
- f.flush(odata);
- return _reply(m, r, ss.str(), odata);
+ f.flush(cmdctx->odata);
+ cmdctx->reply(0, ss);
+ return true;
}
// lookup command
- mgr_cmd = _get_mgrcommand(prefix, mgr_commands,
+ const MgrCommand *mgr_cmd = _get_mgrcommand(prefix, mgr_commands,
ARRAY_SIZE(mgr_commands));
- _generate_command_map(cmdmap, param_str_map);
+ _generate_command_map(cmdctx->cmdmap, param_str_map);
if (!mgr_cmd) {
- return _reply(m, -EINVAL, "command not supported", odata);
- }
-
- // validate user's permissions for requested command
- if (!_allowed_command(session.get(), mgr_cmd->module, prefix, cmdmap,
- param_str_map, mgr_cmd)) {
- dout(1) << __func__ << " access denied" << dendl;
- audit_clog->info() << "from='" << session->inst << "' "
- << "entity='" << session->entity_name << "' "
- << "cmd=" << m->cmd << ": access denied";
- return _reply(m, -EACCES, "access denied", odata);
+ MgrCommand py_command = {"", "", "py", "rw", "cli"};
+ if (!_allowed_command(session.get(), py_command.module, prefix, cmdctx->cmdmap,
+ param_str_map, &py_command)) {
+ dout(1) << " access denied" << dendl;
+ ss << "access denied";
+ cmdctx->reply(-EACCES, ss);
+ return true;
+ }
+ } else {
+ // validate user's permissions for requested command
+ if (!_allowed_command(session.get(), mgr_cmd->module, prefix, cmdctx->cmdmap,
+ param_str_map, mgr_cmd)) {
+ dout(1) << " access denied" << dendl;
+ audit_clog->info() << "from='" << session->inst << "' "
+ << "entity='" << session->entity_name << "' "
+ << "cmd=" << m->cmd << ": access denied";
+ ss << "access denied";
+ cmdctx->reply(-EACCES, ss);
+ return true;
+ }
}
audit_clog->debug()
@@ -496,10 +550,11 @@ bool DaemonServer::handle_command(MCommand *m)
string scrubop = prefix.substr(3, string::npos);
pg_t pgid;
string pgidstr;
- cmd_getval(g_ceph_context, cmdmap, "pgid", pgidstr);
+ cmd_getval(g_ceph_context, cmdctx->cmdmap, "pgid", pgidstr);
if (!pgid.parse(pgidstr.c_str())) {
ss << "invalid pgid '" << pgidstr << "'";
- return _reply(m, -EINVAL, ss.str(), odata);
+ cmdctx->reply(-EINVAL, ss);
+ return true;
}
bool pg_exists = false;
cluster_state.with_osdmap([&](const OSDMap& osdmap) {
@@ -507,7 +562,8 @@ bool DaemonServer::handle_command(MCommand *m)
});
if (!pg_exists) {
ss << "pg " << pgid << " dne";
- return _reply(m, -ENOENT, ss.str(), odata);
+ cmdctx->reply(-ENOENT, ss);
+ return true;
}
int acting_primary = -1;
entity_inst_t inst;
@@ -519,7 +575,8 @@ bool DaemonServer::handle_command(MCommand *m)
});
if (acting_primary == -1) {
ss << "pg " << pgid << " has no primary osd";
- return _reply(m, -EAGAIN, ss.str(), odata);
+ cmdctx->reply(-EAGAIN, ss);
+ return true;
}
vector<pg_t> pgs = { pgid };
msgr->send_message(new MOSDScrub(monc->get_fsid(),
@@ -529,7 +586,8 @@ bool DaemonServer::handle_command(MCommand *m)
inst);
ss << "instructing pg " << pgid << " on osd." << acting_primary
<< " (" << inst << ") to " << scrubop;
- return _reply(m, 0, ss.str(), odata);
+ cmdctx->reply(0, ss);
+ return true;
} else if (prefix == "osd reweight-by-pg" ||
prefix == "osd reweight-by-utilization" ||
prefix == "osd test-reweight-by-pg" ||
@@ -540,10 +598,10 @@ bool DaemonServer::handle_command(MCommand *m)
prefix == "osd test-reweight-by-pg" ||
prefix == "osd test-reweight-by-utilization";
int64_t oload;
- cmd_getval(g_ceph_context, cmdmap, "oload", oload, int64_t(120));
+ cmd_getval(g_ceph_context, cmdctx->cmdmap, "oload", oload, int64_t(120));
set<int64_t> pools;
vector<string> poolnames;
- cmd_getval(g_ceph_context, cmdmap, "pools", poolnames);
+ cmd_getval(g_ceph_context, cmdctx->cmdmap, "pools", poolnames);
cluster_state.with_osdmap([&](const OSDMap& osdmap) {
for (const auto& poolname : poolnames) {
int64_t pool = osdmap.lookup_pg_pool_name(poolname);
@@ -555,24 +613,27 @@ bool DaemonServer::handle_command(MCommand *m)
}
});
if (r) {
- return _reply(m, r, ss.str(), odata);
+ cmdctx->reply(r, ss);
+ return true;
}
double max_change = g_conf->mon_reweight_max_change;
- cmd_getval(g_ceph_context, cmdmap, "max_change", max_change);
+ cmd_getval(g_ceph_context, cmdctx->cmdmap, "max_change", max_change);
if (max_change <= 0.0) {
ss << "max_change " << max_change << " must be positive";
- return _reply(m, -EINVAL, ss.str(), odata);
+ cmdctx->reply(-EINVAL, ss);
+ return true;
}
int64_t max_osds = g_conf->mon_reweight_max_osds;
- cmd_getval(g_ceph_context, cmdmap, "max_osds", max_osds);
+ cmd_getval(g_ceph_context, cmdctx->cmdmap, "max_osds", max_osds);
if (max_osds <= 0) {
ss << "max_osds " << max_osds << " must be positive";
- return _reply(m, -EINVAL, ss.str(), odata);
+ cmdctx->reply(-EINVAL, ss);
+ return true;
}
string no_increasing;
- cmd_getval(g_ceph_context, cmdmap, "no_increasing", no_increasing);
+ cmd_getval(g_ceph_context, cmdctx->cmdmap, "no_increasing", no_increasing);
string out_str;
- map<int32_t, uint32_t> new_weights;
+ mempool::osdmap::map<int32_t, uint32_t> new_weights;
r = cluster_state.with_pgmap([&](const PGMap& pgmap) {
return cluster_state.with_osdmap([&](const OSDMap& osdmap) {
return reweight::by_utilization(osdmap, pgmap,
@@ -589,16 +650,19 @@ bool DaemonServer::handle_command(MCommand *m)
if (r >= 0) {
dout(10) << "reweight::by_utilization: finished with " << out_str << dendl;
}
- if (f)
- f->flush(odata);
- else
- odata.append(out_str);
+ if (f) {
+ f->flush(cmdctx->odata);
+ } else {
+ cmdctx->odata.append(out_str);
+ }
if (r < 0) {
ss << "FAILED reweight-by-pg";
- return _reply(m, r, ss.str(), odata);
+ cmdctx->reply(r, ss);
+ return true;
} else if (r == 0 || dry_run) {
ss << "no change";
- return _reply(m, r, ss.str(), odata);
+ cmdctx->reply(r, ss);
+ return true;
} else {
json_spirit::Object json_object;
for (const auto& osd_weight : new_weights) {
@@ -613,7 +677,7 @@ bool DaemonServer::handle_command(MCommand *m)
"\"prefix\": \"osd reweightn\", "
"\"weights\": \"" + s + "\""
"}";
- auto on_finish = new ReplyOnFinish(this, m, std::move(odata));
+ auto on_finish = new ReplyOnFinish(cmdctx);
monc->start_mon_command({cmd}, {},
&on_finish->from_mon, &on_finish->outs, on_finish);
return true;
@@ -621,63 +685,46 @@ bool DaemonServer::handle_command(MCommand *m)
} else {
r = cluster_state.with_pgmap([&](const PGMap& pg_map) {
return cluster_state.with_osdmap([&](const OSDMap& osdmap) {
- return process_pg_map_command(prefix, cmdmap, pg_map, osdmap,
- f.get(), &ss, &odata);
+ return process_pg_map_command(prefix, cmdctx->cmdmap, pg_map, osdmap,
+ f.get(), &ss, &cmdctx->odata);
});
});
- }
- if (r != -EOPNOTSUPP)
- return _reply(m, r, ss.str(), odata);
- // fall back to registered python handlers
- else {
- // Let's find you a handler!
- MgrPyModule *handler = nullptr;
- auto py_commands = py_modules.get_commands();
- for (const auto &pyc : py_commands) {
- auto pyc_prefix = cmddesc_get_prefix(pyc.cmdstring);
- dout(1) << "pyc_prefix: '" << pyc_prefix << "'" << dendl;
- if (pyc_prefix == prefix) {
- handler = pyc.handler;
- break;
- }
- }
- if (handler == nullptr) {
- ss << "No handler found for '" << prefix << "'";
- dout(4) << "No handler found for '" << prefix << "'" << dendl;
- return _reply(m, -EINVAL, ss.str(), odata);
+ if (r != -EOPNOTSUPP) {
+ cmdctx->reply(r, ss);
+ return true;
}
+ }
- // FIXME: go run this python part in another thread, not inline
- // with a ms_dispatch, so that the python part can block if it
- // wants to.
- dout(4) << "passing through " << cmdmap.size() << dendl;
- stringstream ds;
- r = handler->handle_command(cmdmap, &ds, &ss);
- odata.append(ds);
- return _reply(m, 0, ss.str(), odata);
+ // None of the special native commands matched; fall back to python handlers
+ MgrPyModule *handler = nullptr;
+ auto py_commands = py_modules.get_commands();
+ for (const auto &pyc : py_commands) {
+ auto pyc_prefix = cmddesc_get_prefix(pyc.cmdstring);
+ dout(1) << "pyc_prefix: '" << pyc_prefix << "'" << dendl;
+ if (pyc_prefix == prefix) {
+ handler = pyc.handler;
+ break;
+ }
}
-}
-bool DaemonServer::_reply(MCommand* m,
- int ret,
- const std::string& s,
- const bufferlist& payload)
-{
- dout(1) << __func__ << " r=" << ret << " " << s << dendl;
- auto con = m->get_connection();
- if (!con) {
- dout(10) << __func__ << " connection dropped for command" << dendl;
- m->put();
+ if (handler == nullptr) {
+ ss << "No handler found for '" << prefix << "'";
+ dout(4) << "No handler found for '" << prefix << "'" << dendl;
+ cmdctx->reply(-EINVAL, ss);
+ return true;
+ } else {
+ // Okay, now we have a handler to call, but we must not call it
+ // in this thread, because the python handlers can do anything,
+ // including blocking, and including calling back into mgr.
+ dout(4) << "passing through " << cmdctx->cmdmap.size() << dendl;
+ finisher.queue(new FunctionContext([cmdctx, handler](int r_) {
+ std::stringstream ds;
+ std::stringstream ss;
+ int r = handler->handle_command(cmdctx->cmdmap, &ds, &ss);
+ cmdctx->odata.append(ds);
+ cmdctx->reply(r, ss);
+ }));
return true;
}
- // Let the connection drop as soon as we've sent our response
- con->mark_disposable();
-
- auto response = new MCommandReply(ret, s);
- response->set_tid(m->get_tid());
- response->set_data(payload);
- con->send_message(response);
- m->put();
- return true;
}
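The net effect of this DaemonServer refactor is that command state now lives in a shared_ptr<CommandContext> rather than on the dispatch stack, so the reply can be issued from whichever thread finishes the work, and python handlers are queued onto the Finisher instead of running inline in the messenger thread. A minimal sketch of that hand-off, assuming the CommandContext class defined in the hunk above:

    auto cmdctx = std::make_shared<CommandContext>(m);
    finisher.queue(new FunctionContext([cmdctx, handler](int) {
      // Runs in the finisher thread: free to block or call back into
      // the mgr without stalling message dispatch.
      std::stringstream ds, ss;
      int r = handler->handle_command(cmdctx->cmdmap, &ds, &ss);
      cmdctx->odata.append(ds);
      cmdctx->reply(r, ss);  // cmdctx (and the MCommand) released after this
    }));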
diff --git a/src/mgr/DaemonServer.h b/src/mgr/DaemonServer.h
index af6d383ba50..0ef4a5655ee 100644
--- a/src/mgr/DaemonServer.h
+++ b/src/mgr/DaemonServer.h
@@ -54,6 +54,7 @@ protected:
Messenger *msgr;
MonClient *monc;
+ Finisher &finisher;
DaemonStateIndex &daemon_state;
ClusterState &cluster_state;
PyModules &py_modules;
@@ -85,6 +86,7 @@ public:
entity_addr_t get_myaddr() const;
DaemonServer(MonClient *monc_,
+ Finisher &finisher_,
DaemonStateIndex &daemon_state_,
ClusterState &cluster_state_,
PyModules &py_modules_,
diff --git a/src/mgr/DaemonState.cc b/src/mgr/DaemonState.cc
index f83f9749de6..290fde65134 100644
--- a/src/mgr/DaemonState.cc
+++ b/src/mgr/DaemonState.cc
@@ -30,18 +30,20 @@ void DaemonStateIndex::insert(DaemonStatePtr dm)
all[dm->key] = dm;
}
-void DaemonStateIndex::_erase(DaemonKey dmk)
+void DaemonStateIndex::_erase(const DaemonKey& dmk)
{
assert(lock.is_locked_by_me());
- const auto dm = all.at(dmk);
+ const auto to_erase = all.find(dmk);
+ assert(to_erase != all.end());
+ const auto dm = to_erase->second;
auto &server_collection = by_server[dm->hostname];
server_collection.erase(dm->key);
if (server_collection.empty()) {
by_server.erase(dm->hostname);
}
- all.erase(dmk);
+ all.erase(to_erase);
}
DaemonStateCollection DaemonStateIndex::get_by_type(uint8_t type) const
@@ -85,25 +87,25 @@ DaemonStatePtr DaemonStateIndex::get(const DaemonKey &key)
}
void DaemonStateIndex::cull(entity_type_t daemon_type,
- std::set<std::string> names_exist)
+ const std::set<std::string>& names_exist)
{
- Mutex::Locker l(lock);
-
- std::set<DaemonKey> victims;
-
- for (const auto &i : all) {
- if (i.first.first != daemon_type) {
- continue;
- }
+ std::vector<string> victims;
- if (names_exist.count(i.first.second) == 0) {
- victims.insert(i.first);
+ Mutex::Locker l(lock);
+ auto begin = all.lower_bound({daemon_type, ""});
+ auto end = all.end();
+ for (auto &i = begin; i != end; ++i) {
+ const auto& daemon_key = i->first;
+ if (daemon_key.first != daemon_type)
+ break;
+ if (names_exist.count(daemon_key.second) == 0) {
+ victims.push_back(daemon_key.second);
}
}
- for (const auto &i : victims) {
+ for (auto &i : victims) {
dout(4) << "Removing data for " << i << dendl;
- _erase(i);
+ _erase({daemon_type, i});
}
}
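cull() previously scanned every daemon of every type under the lock; it now exploits the fact that the index is a std::map keyed by (type, name) pairs, which sort lexicographically, so lower_bound on (type, "") jumps straight to the first entry of the wanted type. A self-contained sketch of the ranged scan:

    #include <map>
    #include <string>

    using DaemonKey = std::pair<uint8_t, std::string>;
    std::map<DaemonKey, int> all;

    void scan(uint8_t daemon_type) {
      // Pairs compare first on type, then on name, so this lands on the
      // first key of daemon_type (or past the end if none exist).
      auto it = all.lower_bound({daemon_type, ""});
      for (; it != all.end() && it->first.first == daemon_type; ++it) {
        // visit only entries of daemon_type
      }
    }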
diff --git a/src/mgr/DaemonState.h b/src/mgr/DaemonState.h
index 78fd036525f..91160d7f082 100644
--- a/src/mgr/DaemonState.h
+++ b/src/mgr/DaemonState.h
@@ -141,7 +141,7 @@ class DaemonStateIndex
PerfCounterTypes types;
void insert(DaemonStatePtr dm);
- void _erase(DaemonKey dmk);
+ void _erase(const DaemonKey& dmk);
bool exists(const DaemonKey &key) const;
DaemonStatePtr get(const DaemonKey &key);
@@ -164,7 +164,7 @@ class DaemonStateIndex
* a cluster map and want to ensure that anything absent in the map
* is also absent in this class.
*/
- void cull(entity_type_t daemon_type, std::set<std::string> names_exist);
+ void cull(entity_type_t daemon_type, const std::set<std::string>& names_exist);
};
#endif
diff --git a/src/mgr/Mgr.cc b/src/mgr/Mgr.cc
index 6be7216b2e4..9df32239bd0 100644
--- a/src/mgr/Mgr.cc
+++ b/src/mgr/Mgr.cc
@@ -14,6 +14,7 @@
#include <Python.h>
#include "osdc/Objecter.h"
+#include "client/Client.h"
#include "common/errno.h"
#include "mon/MonClient.h"
#include "include/stringify.h"
@@ -39,16 +40,19 @@
Mgr::Mgr(MonClient *monc_, Messenger *clientm_, Objecter *objecter_,
- LogChannelRef clog_, LogChannelRef audit_clog_) :
+ Client* client_, LogChannelRef clog_, LogChannelRef audit_clog_) :
monc(monc_),
objecter(objecter_),
+ client(client_),
client_messenger(clientm_),
lock("Mgr::lock"),
timer(g_ceph_context, lock),
finisher(g_ceph_context, "Mgr", "mgr-fin"),
- py_modules(daemon_state, cluster_state, *monc, finisher),
+ py_modules(daemon_state, cluster_state, *monc, *objecter, *client,
+ finisher),
cluster_state(monc, nullptr),
- server(monc, daemon_state, cluster_state, py_modules, clog_, audit_clog_),
+ server(monc, finisher, daemon_state, cluster_state, py_modules,
+ clog_, audit_clog_),
initialized(false),
initializing(false)
{
@@ -70,6 +74,8 @@ class MetadataUpdate : public Context
DaemonStateIndex &daemon_state;
DaemonKey key;
+ std::map<std::string, std::string> defaults;
+
public:
bufferlist outbl;
std::string outs;
@@ -77,6 +83,11 @@ public:
MetadataUpdate(DaemonStateIndex &daemon_state_, const DaemonKey &key_)
: daemon_state(daemon_state_), key(key_) {}
+ void set_default(const std::string &k, const std::string &v)
+ {
+ defaults[k] = v;
+ }
+
void finish(int r) override
{
daemon_state.clear_updating(key);
@@ -94,6 +105,13 @@ public:
json_spirit::mObject daemon_meta = json_result.get_obj();
+ // Apply any defaults
+ for (const auto &i : defaults) {
+ if (daemon_meta.find(i.first) == daemon_meta.end()) {
+ daemon_meta[i.first] = i.second;
+ }
+ }
+
DaemonStatePtr state;
if (daemon_state.exists(key)) {
state = daemon_state.get(key);
@@ -339,15 +357,19 @@ void Mgr::load_config()
void Mgr::shutdown()
{
- // FIXME: pre-empt init() if it is currently running, so that it will
- // give up the lock for us.
- Mutex::Locker l(lock);
-
- // First stop the server so that we're not taking any more incoming requests
- server.shutdown();
-
- // after the messenger is stopped, signal modules to shutdown via finisher
- py_modules.shutdown();
+ finisher.queue(new FunctionContext([&](int) {
+ {
+ Mutex::Locker l(lock);
+ monc->sub_unwant("log-info");
+ monc->sub_unwant("mgrdigest");
+ monc->sub_unwant("fsmap");
+ // First stop the server so that we're not taking any more incoming
+ // requests
+ server.shutdown();
+ }
+ // after the messenger is stopped, signal modules to shutdown via finisher
+ py_modules.shutdown();
+ }));
// Then stop the finisher to ensure its enqueued contexts aren't going
// to touch references to the things we're about to tear down
@@ -421,7 +443,7 @@ void Mgr::handle_osd_map()
cluster_state.notify_osdmap(osd_map);
});
- // TODO: same culling for MonMap and FSMap
+ // TODO: same culling for MonMap
daemon_state.cull(CEPH_ENTITY_TYPE_OSD, names_exist);
}
@@ -430,6 +452,8 @@ void Mgr::handle_log(MLog *m)
for (const auto &e : m->entries) {
py_modules.notify_all(e);
}
+
+ m->put();
}
bool Mgr::ms_dispatch(Message *m)
@@ -453,11 +477,12 @@ bool Mgr::ms_dispatch(Message *m)
ceph_abort();
py_modules.notify_all("mon_map", "");
+ m->put();
break;
case CEPH_MSG_FS_MAP:
py_modules.notify_all("fs_map", "");
handle_fs_map((MFSMap*)m);
- m->put();
+ return false; // let this pass through to the Client dispatcher
break;
case CEPH_MSG_OSD_MAP:
handle_osd_map();
@@ -471,7 +496,6 @@ bool Mgr::ms_dispatch(Message *m)
break;
case MSG_LOG:
handle_log(static_cast<MLog *>(m));
- m->put();
break;
default:
@@ -485,6 +509,8 @@ void Mgr::handle_fs_map(MFSMap* m)
{
assert(lock.is_locked_by_me());
+ std::set<std::string> names_exist;
+
const FSMap &new_fsmap = m->get_fsmap();
fs_map_cond.Signal();
@@ -500,6 +526,13 @@ void Mgr::handle_fs_map(MFSMap* m)
for (const auto &i : mds_info) {
const auto &info = i.second;
+ if (!new_fsmap.gid_exists(i.first)){
+ continue;
+ }
+
+ // Remember which MDS exists so that we can cull any that don't
+ names_exist.insert(info.name);
+
const auto k = DaemonKey(CEPH_ENTITY_TYPE_MDS, info.name);
if (daemon_state.is_updating(k)) {
continue;
@@ -508,9 +541,6 @@ void Mgr::handle_fs_map(MFSMap* m)
bool update = false;
if (daemon_state.exists(k)) {
auto metadata = daemon_state.get(k);
- // FIXME: nothing stopping old daemons being here, they won't have
- // addr: need to handle case of pre-ceph-mgr daemons that don't have
- // the fields we expect
if (metadata->metadata.empty() ||
metadata->metadata.count("addr") == 0) {
update = true;
@@ -530,6 +560,11 @@ void Mgr::handle_fs_map(MFSMap* m)
if (update) {
daemon_state.notify_updating(k);
auto c = new MetadataUpdate(daemon_state, k);
+
+ // Older MDS daemons don't have addr in the metadata, so
+ // fake it if the returned metadata doesn't have the field.
+ c->set_default("addr", stringify(info.addr));
+
std::ostringstream cmd;
cmd << "{\"prefix\": \"mds metadata\", \"who\": \""
<< info.name << "\"}";
@@ -538,6 +573,7 @@ void Mgr::handle_fs_map(MFSMap* m)
{}, &c->outbl, &c->outs, c);
}
}
+ daemon_state.cull(CEPH_ENTITY_TYPE_MDS, names_exist);
}
@@ -553,6 +589,7 @@ void Mgr::handle_mgr_digest(MMgrDigest* m)
// the pgmap might have changed since last time we were here.
py_modules.notify_all("pg_summary", "");
dout(10) << "done." << dendl;
+
m->put();
}
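The m->put() adjustments above follow the dispatcher contract: a dispatcher that returns true has consumed the message and owes the final put(), while returning false hands the still-referenced message to the next dispatcher (here, the Client also needs to see CEPH_MSG_FS_MAP). A minimal sketch of that convention:

    bool ms_dispatch(Message *m) {
      switch (m->get_type()) {
      case CEPH_MSG_FS_MAP:
        handle_fs_map(static_cast<MFSMap*>(m));
        return false;          // not consumed: Client dispatcher runs next
      case MSG_LOG:
        handle_log(static_cast<MLog*>(m));  // handle_log() calls m->put()
        return true;           // consumed
      default:
        return false;
      }
    }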
diff --git a/src/mgr/Mgr.h b/src/mgr/Mgr.h
index 652fff94f61..2efb2683f8a 100644
--- a/src/mgr/Mgr.h
+++ b/src/mgr/Mgr.h
@@ -40,6 +40,7 @@ class MCommand;
class MMgrDigest;
class MLog;
class Objecter;
+class Client;
class MgrPyModule;
@@ -48,6 +49,7 @@ class Mgr {
protected:
MonClient *monc;
Objecter *objecter;
+ Client *client;
Messenger *client_messenger;
Mutex lock;
@@ -71,7 +73,7 @@ protected:
public:
Mgr(MonClient *monc_, Messenger *clientm_, Objecter *objecter_,
- LogChannelRef clog_, LogChannelRef audit_clog_);
+ Client *client_, LogChannelRef clog_, LogChannelRef audit_clog_);
~Mgr();
bool is_initialized() const {return initialized;}
diff --git a/src/mgr/MgrClient.h b/src/mgr/MgrClient.h
index adc35fe406f..f9385b8be6d 100644
--- a/src/mgr/MgrClient.h
+++ b/src/mgr/MgrClient.h
@@ -95,6 +95,7 @@ public:
void set_pgstats_cb(std::function<MPGStats*()> cb_)
{
+ Mutex::Locker l(lock);
pgstats_cb = cb_;
}
diff --git a/src/mgr/MgrPyModule.cc b/src/mgr/MgrPyModule.cc
index 2997be1b27b..078f7f50350 100644
--- a/src/mgr/MgrPyModule.cc
+++ b/src/mgr/MgrPyModule.cc
@@ -50,7 +50,7 @@ std::string handle_pyerror()
#define dout_prefix *_dout << "mgr " << __func__ << " "
MgrPyModule::MgrPyModule(const std::string &module_name_)
- : module_name(module_name_), pModule(nullptr), pClass(nullptr),
+ : module_name(module_name_),
pClassInstance(nullptr)
{}
@@ -60,8 +60,6 @@ MgrPyModule::~MgrPyModule()
gstate = PyGILState_Ensure();
Py_XDECREF(pClassInstance);
- Py_XDECREF(pClass);
- Py_XDECREF(pModule);
PyGILState_Release(gstate);
}
@@ -70,7 +68,7 @@ int MgrPyModule::load()
{
// Load the module
PyObject *pName = PyString_FromString(module_name.c_str());
- pModule = PyImport_Import(pName);
+ auto pModule = PyImport_Import(pName);
Py_DECREF(pName);
if (pModule == nullptr) {
derr << "Module not found: '" << module_name << "'" << dendl;
@@ -79,7 +77,8 @@ int MgrPyModule::load()
// Find the class
// TODO: let them call it what they want instead of just 'Module'
- pClass = PyObject_GetAttrString(pModule, (const char*)"Module");
+ auto pClass = PyObject_GetAttrString(pModule, (const char*)"Module");
+ Py_DECREF(pModule);
if (pClass == nullptr) {
derr << "Class not found in module '" << module_name << "'" << dendl;
return -EINVAL;
@@ -91,6 +90,7 @@ int MgrPyModule::load()
auto pyHandle = PyString_FromString(module_name.c_str());
auto pArgs = PyTuple_Pack(1, pyHandle);
pClassInstance = PyObject_CallObject(pClass, pArgs);
+ Py_DECREF(pClass);
Py_DECREF(pyHandle);
Py_DECREF(pArgs);
if (pClassInstance == nullptr) {
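Dropping the pModule/pClass members works because each PyObject* returned here is an independently owned reference: once the next step holds what it needs, the intermediate references can be released immediately (and for heap types, the instance itself keeps its class alive). A minimal sketch of the hand-off, using the same Python 2 C API calls as the hunk above:

    PyObject *pName = PyString_FromString("mymodule");
    PyObject *pModule = PyImport_Import(pName);
    Py_DECREF(pName);                   // no longer needed
    PyObject *pClass = PyObject_GetAttrString(pModule, "Module");
    Py_DECREF(pModule);                 // pClass is its own strong reference
    PyObject *pInstance = PyObject_CallObject(pClass, nullptr);
    Py_DECREF(pClass);                  // pInstance keeps the class alive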
diff --git a/src/mgr/MgrPyModule.h b/src/mgr/MgrPyModule.h
index b466d1ac20f..7d91275eacb 100644
--- a/src/mgr/MgrPyModule.h
+++ b/src/mgr/MgrPyModule.h
@@ -43,8 +43,6 @@ class MgrPyModule
{
private:
const std::string module_name;
- PyObject *pModule;
- PyObject *pClass;
PyObject *pClassInstance;
std::vector<ModuleCommand> commands;
diff --git a/src/mgr/MgrStandby.cc b/src/mgr/MgrStandby.cc
index af176fd9b24..1bfcadd71b4 100644
--- a/src/mgr/MgrStandby.cc
+++ b/src/mgr/MgrStandby.cc
@@ -14,7 +14,7 @@
#include <Python.h>
#include "common/errno.h"
-#include "mon/MonClient.h"
+
#include "include/stringify.h"
#include "global/global_context.h"
#include "global/signal_handler.h"
@@ -35,10 +35,11 @@
MgrStandby::MgrStandby() :
Dispatcher(g_ceph_context),
- monc(new MonClient(g_ceph_context)),
+ monc{g_ceph_context},
client_messenger(Messenger::create_client_messenger(g_ceph_context, "mgr")),
- objecter(new Objecter(g_ceph_context, client_messenger, monc, NULL, 0, 0)),
- log_client(g_ceph_context, client_messenger, &monc->monmap, LogClient::NO_FLAGS),
+ objecter{g_ceph_context, client_messenger.get(), &monc, NULL, 0, 0},
+ client{client_messenger.get(), &monc, &objecter},
+ log_client(g_ceph_context, client_messenger.get(), &monc.monmap, LogClient::NO_FLAGS),
clog(log_client.create_channel(CLOG_CHANNEL_CLUSTER)),
audit_clog(log_client.create_channel(CLOG_CHANNEL_AUDIT)),
lock("MgrStandby::lock"),
@@ -47,13 +48,7 @@ MgrStandby::MgrStandby() :
{
}
-
-MgrStandby::~MgrStandby()
-{
- delete objecter;
- delete monc;
- delete client_messenger;
-}
+MgrStandby::~MgrStandby() = default;
const char** MgrStandby::get_tracked_conf_keys() const
{
@@ -97,48 +92,48 @@ int MgrStandby::init()
// Initialize Messenger
client_messenger->add_dispatcher_tail(this);
+ client_messenger->add_dispatcher_head(&objecter);
+ client_messenger->add_dispatcher_tail(&client);
client_messenger->start();
// Initialize MonClient
- if (monc->build_initial_monmap() < 0) {
+ if (monc.build_initial_monmap() < 0) {
client_messenger->shutdown();
client_messenger->wait();
return -1;
}
- monc->sub_want("mgrmap", 0, 0);
+ monc.sub_want("mgrmap", 0, 0);
- monc->set_want_keys(CEPH_ENTITY_TYPE_MON|CEPH_ENTITY_TYPE_OSD
+ monc.set_want_keys(CEPH_ENTITY_TYPE_MON|CEPH_ENTITY_TYPE_OSD
|CEPH_ENTITY_TYPE_MDS|CEPH_ENTITY_TYPE_MGR);
- monc->set_messenger(client_messenger);
- int r = monc->init();
+ monc.set_messenger(client_messenger.get());
+ int r = monc.init();
if (r < 0) {
- monc->shutdown();
+ monc.shutdown();
client_messenger->shutdown();
client_messenger->wait();
return r;
}
- r = monc->authenticate();
+ r = monc.authenticate();
if (r < 0) {
derr << "Authentication failed, did you specify a mgr ID with a valid keyring?" << dendl;
- monc->shutdown();
+ monc.shutdown();
client_messenger->shutdown();
client_messenger->wait();
return r;
}
- client_t whoami = monc->get_global_id();
+ client_t whoami = monc.get_global_id();
client_messenger->set_myname(entity_name_t::CLIENT(whoami.v));
-
- monc->set_log_client(&log_client);
+ monc.set_log_client(&log_client);
_update_log_config();
-
- objecter->set_client_incarnation(0);
- objecter->init();
- client_messenger->add_dispatcher_head(objecter);
- objecter->start();
-
+ objecter.set_client_incarnation(0);
+ objecter.init();
+ objecter.start();
+ client.init();
timer.init();
+
send_beacon();
dout(4) << "Complete." << dendl;
@@ -149,17 +144,17 @@ void MgrStandby::send_beacon()
{
assert(lock.is_locked_by_me());
dout(1) << state_str() << dendl;
- dout(10) << "sending beacon as gid " << monc->get_global_id() << dendl;
+ dout(10) << "sending beacon as gid " << monc.get_global_id() << dendl;
bool available = active_mgr != nullptr && active_mgr->is_initialized();
auto addr = available ? active_mgr->get_server_addr() : entity_addr_t();
- MMgrBeacon *m = new MMgrBeacon(monc->get_fsid(),
- monc->get_global_id(),
+ MMgrBeacon *m = new MMgrBeacon(monc.get_fsid(),
+ monc.get_global_id(),
g_conf->name.get_id(),
addr,
available);
- monc->send_mon_message(m);
+ monc.send_mon_message(m);
timer.add_event_after(g_conf->mgr_beacon_period, new FunctionContext(
[this](int r){
send_beacon();
@@ -180,15 +175,19 @@ void MgrStandby::shutdown()
// Expect already to be locked as we're called from signal handler
assert(lock.is_locked_by_me());
+ // stop sending beacons first; they use monc to talk to the monitors
+ timer.shutdown();
+ // client uses monc and objecter
+ client.shutdown();
+ // stop monc, so mon won't be able to instruct me to shutdown/activate after
+ // the active_mgr is stopped
+ monc.shutdown();
if (active_mgr) {
active_mgr->shutdown();
}
-
- objecter->shutdown();
-
- timer.shutdown();
-
- monc->shutdown();
+ // objecter is used by monc and active_mgr
+ objecter.shutdown();
+ // client_messenger is used by all of them, so stop it last
client_messenger->shutdown();
}
@@ -223,13 +222,14 @@ void MgrStandby::handle_mgr_map(MMgrMap* mmap)
{
auto map = mmap->get_map();
dout(4) << "received map epoch " << map.get_epoch() << dendl;
- const bool active_in_map = map.active_gid == monc->get_global_id();
+ const bool active_in_map = map.active_gid == monc.get_global_id();
dout(4) << "active in map: " << active_in_map
<< " active is " << map.active_gid << dendl;
if (active_in_map) {
if (!active_mgr) {
dout(1) << "Activating!" << dendl;
- active_mgr.reset(new Mgr(monc, client_messenger, objecter, clog, audit_clog));
+ active_mgr.reset(new Mgr(&monc, client_messenger.get(), &objecter,
+ &client, clog, audit_clog));
active_mgr->background_init();
dout(1) << "I am now active" << dendl;
} else {
@@ -242,6 +242,8 @@ void MgrStandby::handle_mgr_map(MMgrMap* mmap)
active_mgr.reset();
}
}
+
+ mmap->put();
}
bool MgrStandby::ms_dispatch(Message *m)
@@ -256,13 +258,14 @@ bool MgrStandby::ms_dispatch(Message *m)
default:
if (active_mgr) {
- return active_mgr->ms_dispatch(m);
+ lock.Unlock();
+ active_mgr->ms_dispatch(m);
+ lock.Lock();
} else {
return false;
}
}
- m->put();
return true;
}
@@ -274,11 +277,11 @@ bool MgrStandby::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer,
return true;
if (force_new) {
- if (monc->wait_auth_rotating(10) < 0)
+ if (monc.wait_auth_rotating(10) < 0)
return false;
}
- *authorizer = monc->build_authorizer(dest_type);
+ *authorizer = monc.build_authorizer(dest_type);
return *authorizer != NULL;
}
@@ -289,7 +292,7 @@ bool MgrStandby::ms_handle_refused(Connection *con)
}
// A reference for use by the signal handler
-MgrStandby *signal_mgr = nullptr;
+static MgrStandby *signal_mgr = nullptr;
static void handle_mgr_signal(int signum)
{
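MgrStandby now owns its MonClient, Objecter and Client by value (and the Messenger through a unique_ptr), replacing the hand-written deletes with declaration-order lifetime: members construct top to bottom and destruct in reverse, which lines up with the shutdown ordering spelled out in shutdown() above. A sketch of the idea, with hypothetical member types:

    class Owner {
      A a;                    // constructed first, destroyed last
      std::unique_ptr<B> b;   // heap-owned, still released automatically
      C c;                    // constructed last, destroyed first
    public:
      ~Owner() = default;     // no manual deletes needed
    };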
diff --git a/src/mgr/MgrStandby.h b/src/mgr/MgrStandby.h
index 89ede8cee2c..d33c633dc42 100644
--- a/src/mgr/MgrStandby.h
+++ b/src/mgr/MgrStandby.h
@@ -20,6 +20,10 @@
#include "common/Timer.h"
#include "common/LogClient.h"
+#include "client/Client.h"
+#include "mon/MonClient.h"
+#include "osdc/Objecter.h"
+
#include "DaemonServer.h"
#include "PyModules.h"
@@ -38,9 +42,10 @@ public:
const std::set <std::string> &changed) override;
protected:
- MonClient *monc;
- Messenger *client_messenger;
- Objecter *objecter;
+ MonClient monc;
+ std::unique_ptr<Messenger> client_messenger;
+ Objecter objecter;
+ Client client;
LogClient log_client;
LogChannelRef clog, audit_clog;
diff --git a/src/mgr/PyModules.cc b/src/mgr/PyModules.cc
index 45078262b65..3e00822fc28 100644
--- a/src/mgr/PyModules.cc
+++ b/src/mgr/PyModules.cc
@@ -61,8 +61,11 @@ namespace {
#define dout_prefix *_dout << "mgr " << __func__ << " "
PyModules::PyModules(DaemonStateIndex &ds, ClusterState &cs, MonClient &mc,
+ Objecter &objecter_, Client &client_,
Finisher &f)
- : daemon_state(ds), cluster_state(cs), monc(mc), finisher(f)
+ : daemon_state(ds), cluster_state(cs), monc(mc),
+ objecter(objecter_), client(client_),
+ finisher(f)
{}
// we cannot have the default destructor in the header, because ServeThread is
@@ -360,7 +363,8 @@ int PyModules::init()
global_handle = this;
// Set up global python interpreter
- Py_Initialize();
+ Py_SetProgramName(const_cast<char*>(PYTHON_EXECUTABLE));
+ Py_InitializeEx(0);
// Some python modules do not cope with an unpopulated argv, so lets
// fake one. This step also picks up site-packages into sys.path.
@@ -458,22 +462,15 @@ void PyModules::start()
void PyModules::shutdown()
{
Mutex::Locker locker(lock);
+ assert(global_handle);
// Signal modules to drop out of serve() and/or tear down resources
- C_SaferCond shutdown_called;
- C_GatherBuilder gather(g_ceph_context);
for (auto &i : modules) {
auto module = i.second.get();
- auto shutdown_cb = gather.new_sub();
- finisher.queue(new FunctionContext([module, shutdown_cb](int r){
- module->shutdown();
- shutdown_cb->complete(0);
- }));
- }
-
- if (gather.has_subs()) {
- gather.set_finisher(&shutdown_called);
- gather.activate();
+ const auto& name = i.first;
+ dout(10) << "waiting for module " << name << " to shutdown" << dendl;
+ module->shutdown();
+ dout(10) << "module " << name << " shutdown" << dendl;
}
// For modules implementing serve(), finish the threads where we
@@ -485,17 +482,13 @@ void PyModules::shutdown()
}
serve_threads.clear();
- // Wait for the module's shutdown() to complete before
- // we proceed to destroy the module.
- if (!modules.empty()) {
- dout(4) << "waiting for module shutdown calls" << dendl;
- shutdown_called.wait();
- }
-
modules.clear();
PyGILState_Ensure();
Py_Finalize();
+
+ // nobody needs the global handle anymore.
+ global_handle = nullptr;
}
void PyModules::notify_all(const std::string &notify_type,
@@ -576,8 +569,14 @@ void PyModules::set_config(const std::string &handle,
}
set_cmd.wait();
- // FIXME: is config-key put ever allowed to fail?
- assert(set_cmd.r == 0);
+ if (set_cmd.r != 0) {
+ // config-key put will fail if mgr's auth key has insufficient
+ // permission to set config keys
+ // FIXME: should this somehow raise an exception back into Python land?
+ dout(0) << "`config-key put " << global_key << " " << val << "` failed: "
+ << cpp_strerror(set_cmd.r) << dendl;
+ dout(0) << "mon returned " << set_cmd.r << ": " << set_cmd.outs << dendl;
+ }
}
std::vector<ModuleCommand> PyModules::get_commands()
@@ -661,3 +660,16 @@ PyObject* PyModules::get_counter_python(
return f.get();
}
+PyObject *PyModules::get_context()
+{
+ PyThreadState *tstate = PyEval_SaveThread();
+ Mutex::Locker l(lock);
+ PyEval_RestoreThread(tstate);
+
+ // Construct a capsule containing ceph context.
+ // Not incrementing/decrementing ref count on the context because
+ // it's the global one and it has process lifetime.
+ auto capsule = PyCapsule_New(g_ceph_context, nullptr, nullptr);
+ return capsule;
+}
+
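get_context() above illustrates the lock-ordering discipline for mixing the GIL with a C++ mutex: release the GIL before blocking on the mutex, then take the GIL back once the mutex is held, so a thread waiting for PyModules::lock never does so while holding the GIL. A minimal sketch of the sequence:

    PyThreadState *tstate = PyEval_SaveThread(); // release the GIL
    Mutex::Locker l(lock);                       // may block; GIL is free
    PyEval_RestoreThread(tstate);                // reacquire GIL, lock held
    // ...safe to touch both python state and lock-protected state...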
diff --git a/src/mgr/PyModules.h b/src/mgr/PyModules.h
index 6c6361a8125..8e929113053 100644
--- a/src/mgr/PyModules.h
+++ b/src/mgr/PyModules.h
@@ -20,6 +20,9 @@
#include "common/Mutex.h"
#include "common/Thread.h"
+#include "osdc/Objecter.h"
+#include "client/Client.h"
+
#include "DaemonState.h"
#include "ClusterState.h"
@@ -32,6 +35,8 @@ class PyModules
DaemonStateIndex &daemon_state;
ClusterState &cluster_state;
MonClient &monc;
+ Objecter &objecter;
+ Client &client;
Finisher &finisher;
mutable Mutex lock{"PyModules"};
@@ -42,12 +47,15 @@ public:
static constexpr auto config_prefix = "mgr.";
PyModules(DaemonStateIndex &ds, ClusterState &cs, MonClient &mc,
+ Objecter &objecter_, Client &client_,
Finisher &f);
~PyModules();
// FIXME: wrap for send_command?
MonClient &get_monc() {return monc;}
+ Objecter &get_objecter() {return objecter;}
+ Client &get_client() {return client;}
PyObject *get_python(const std::string &what);
PyObject *get_server_python(const std::string &hostname);
@@ -57,6 +65,7 @@ public:
PyObject *get_counter_python(std::string const &handle,
entity_type_t svc_type, const std::string &svc_id,
const std::string &path);
+ PyObject *get_context();
std::map<std::string, std::string> config_cache;
diff --git a/src/mgr/PyState.cc b/src/mgr/PyState.cc
index 1836282f270..ca62c762b94 100644
--- a/src/mgr/PyState.cc
+++ b/src/mgr/PyState.cc
@@ -82,11 +82,18 @@ static PyObject*
ceph_send_command(PyObject *self, PyObject *args)
{
char *handle = nullptr;
+
+ // Target type, e.g. "mon", "osd" or "mds"
+ char *type = nullptr;
+
+ // Target name, e.g. "23" for an OSD or "myid" for an MDS
+ char *name = nullptr;
+
char *cmd_json = nullptr;
char *tag = nullptr;
PyObject *completion = nullptr;
- if (!PyArg_ParseTuple(args, "sOss:ceph_send_command",
- &handle, &completion, &cmd_json, &tag)) {
+ if (!PyArg_ParseTuple(args, "sOssss:ceph_send_command",
+ &handle, &completion, &type, &name, &cmd_json, &tag)) {
return nullptr;
}
@@ -99,12 +106,49 @@ ceph_send_command(PyObject *self, PyObject *args)
Py_DECREF(set_fn);
auto c = new MonCommandCompletion(completion, tag);
- global_handle->get_monc().start_mon_command(
- {cmd_json},
- {},
- &c->outbl,
- &c->outs,
- c);
+ if (std::string(type) == "mon") {
+ global_handle->get_monc().start_mon_command(
+ {cmd_json},
+ {},
+ &c->outbl,
+ &c->outs,
+ c);
+ } else if (std::string(type) == "osd") {
+ std::string err;
+ uint64_t osd_id = strict_strtoll(name, 10, &err);
+ if (!err.empty()) {
+ // TODO: raise exception
+ return nullptr;
+ }
+
+ ceph_tid_t tid;
+ global_handle->get_objecter().osd_command(
+ osd_id,
+ {cmd_json},
+ {},
+ &tid,
+ &c->outbl,
+ &c->outs,
+ c);
+ } else if (std::string(type) == "mds") {
+ int r = global_handle->get_client().mds_command(
+ name,
+ {cmd_json},
+ {},
+ &c->outbl,
+ &c->outs,
+ c);
+ if (r != 0) {
+ // TODO: raise exception
+ return nullptr;
+ }
+ } else if (std::string(type) == "pg") {
+ // TODO: expose objecter::pg_command
+ return nullptr;
+ } else {
+ // TODO: raise exception
+ return nullptr;
+ }
Py_RETURN_NONE;
}
@@ -229,6 +273,12 @@ ceph_get_version(PyObject *self, PyObject *args)
return PyString_FromString(pretty_version_to_str().c_str());
}
+static PyObject *
+ceph_get_context(PyObject *self, PyObject *args)
+{
+ return global_handle->get_context();
+}
+
static PyObject*
get_counter(PyObject *self, PyObject *args)
{
@@ -270,6 +320,8 @@ PyMethodDef CephStateMethods[] = {
"Emit a (local) log message"},
{"get_version", ceph_get_version, METH_VARARGS,
"Get the ceph version of this process"},
+ {"get_context", ceph_get_context, METH_NOARGS,
+ "Get a CephContext* in a python capsule"},
{NULL, NULL, 0, NULL}
};
diff --git a/src/mon/ConfigKeyService.cc b/src/mon/ConfigKeyService.cc
index 8d083199331..f685daa05a6 100644
--- a/src/mon/ConfigKeyService.cc
+++ b/src/mon/ConfigKeyService.cc
@@ -83,6 +83,21 @@ void ConfigKeyService::store_list(stringstream &ss)
f.flush(ss);
}
+void ConfigKeyService::store_dump(stringstream &ss)
+{
+ KeyValueDB::Iterator iter =
+ mon->store->get_iterator(STORE_PREFIX);
+
+ JSONFormatter f(true);
+ f.open_object_section("config-key store");
+
+ while (iter->valid()) {
+ f.dump_string(iter->key().c_str(), iter->value().to_str());
+ iter->next();
+ }
+ f.close_section();
+ f.flush(ss);
+}
bool ConfigKeyService::service_dispatch(MonOpRequestRef op)
{
@@ -187,6 +202,12 @@ bool ConfigKeyService::service_dispatch(MonOpRequestRef op)
store_list(tmp_ss);
rdata.append(tmp_ss);
ret = 0;
+
+ } else if (prefix == "config-key dump") {
+ stringstream tmp_ss;
+ store_dump(tmp_ss);
+ rdata.append(tmp_ss);
+ ret = 0;
}
out:
diff --git a/src/mon/ConfigKeyService.h b/src/mon/ConfigKeyService.h
index 523bafbdf31..34c70342a98 100644
--- a/src/mon/ConfigKeyService.h
+++ b/src/mon/ConfigKeyService.h
@@ -30,6 +30,7 @@ class ConfigKeyService : public QuorumService
void store_put(const string &key, bufferlist &bl, Context *cb = NULL);
void store_delete(const string &key, Context *cb = NULL);
void store_list(stringstream &ss);
+ void store_dump(stringstream &ss);
bool store_exists(const string &key);
static const string STORE_PREFIX;
diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc
index c3c8fd8845f..303510530b5 100644
--- a/src/mon/Elector.cc
+++ b/src/mon/Elector.cc
@@ -42,8 +42,7 @@ void Elector::init()
void Elector::shutdown()
{
- if (expire_event)
- mon->timer.cancel_event(expire_event);
+ cancel_timer();
}
void Elector::bump_epoch(epoch_t e)
@@ -127,6 +126,8 @@ void Elector::defer(int who)
void Elector::reset_timer(double plus)
{
+ // set the timer
+ cancel_timer();
/**
* This class is used as the callback when the expire_event timer fires up.
*
@@ -140,17 +141,9 @@ void Elector::reset_timer(double plus)
* as far as we know, we may even be dead); so, just propose ourselves as the
* Leader.
*/
- class C_ElectionExpire : public Context {
- Elector *elector;
- public:
- explicit C_ElectionExpire(Elector *e) : elector(e) { }
- void finish(int r) override {
- elector->expire();
- }
- };
- // set the timer
- cancel_timer();
- expire_event = new C_ElectionExpire(this);
+ expire_event = new C_MonContext(mon, [this](int) {
+ expire();
+ });
mon->timer.add_event_after(g_conf->mon_election_timeout + plus,
expire_event);
}
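The one-off Context subclasses removed here (and elsewhere in this diff) are replaced by C_MonContext, whose finish() appears in the Monitor.cc hunk below: a FunctionContext that silently drops the callback if the monitor is already shutting down, so timer events can no longer fire into torn-down state. A sketch of the class shape, assuming a constructor that forwards to FunctionContext (the actual declaration lives in Monitor.h, which is not part of this diff):

    class C_MonContext final : public FunctionContext {
      Monitor *mon;
    public:
      template<typename F>
      C_MonContext(Monitor *m, F&& cb)
        : FunctionContext(std::forward<F>(cb)), mon(m) {}
      void finish(int r) override {
        if (mon->is_shutdown())
          return;                    // never run on a dying monitor
        FunctionContext::finish(r);
      }
    };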
diff --git a/src/mon/Elector.h b/src/mon/Elector.h
index 02097510f81..2e407d29058 100644
--- a/src/mon/Elector.h
+++ b/src/mon/Elector.h
@@ -65,7 +65,7 @@ class Elector {
* Event callback responsible for dealing with an expired election once a
* timer runs out and fires up.
*/
- Context *expire_event;
+ Context *expire_event = nullptr;
/**
* Resets the expire_event timer, by cancelling any existing one and
@@ -335,7 +335,6 @@ class Elector {
* @param m A Monitor instance
*/
explicit Elector(Monitor *m) : mon(m),
- expire_event(0),
epoch(0),
participating(true),
electing_me(false),
diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc
index 16e45529525..92a3215e6cb 100644
--- a/src/mon/MDSMonitor.cc
+++ b/src/mon/MDSMonitor.cc
@@ -53,19 +53,19 @@ static ostream& _prefix(std::ostream *_dout, Monitor *mon, FSMap const& fsmap) {
* out strongly-typedef'd types
*/
template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
- std::string k, mds_gid_t &val)
+ const std::string& k, mds_gid_t &val)
{
return cmd_getval(cct, cmdmap, k, (int64_t&)val);
}
template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
- std::string k, mds_rank_t &val)
+ const std::string& k, mds_rank_t &val)
{
return cmd_getval(cct, cmdmap, k, (int64_t&)val);
}
template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
- std::string k, MDSMap::DaemonState &val)
+ const std::string& k, MDSMap::DaemonState &val)
{
return cmd_getval(cct, cmdmap, k, (int64_t&)val);
}
diff --git a/src/mon/MgrMap.h b/src/mon/MgrMap.h
index 1fd0500131c..e522ab4ba0f 100644
--- a/src/mon/MgrMap.h
+++ b/src/mon/MgrMap.h
@@ -126,7 +126,12 @@ public:
dump(f);
} else {
if (get_active_gid() != 0) {
- *ss << "active: " << get_active_name() << " ";
+ *ss << "active: " << get_active_name();
+ if (!available) {
+ // If the daemon hasn't gone active yet, indicate that.
+ *ss << "(starting)";
+ }
+ *ss << " ";
} else {
*ss << "no daemons active ";
}
diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc
index 5d6bbb931a6..aa816d4a495 100644
--- a/src/mon/MgrMonitor.cc
+++ b/src/mon/MgrMonitor.cc
@@ -187,7 +187,7 @@ bool MgrMonitor::prepare_beacon(MonOpRequestRef op)
}
}
- last_beacon[m->get_gid()] = ceph_clock_now();
+ last_beacon[m->get_gid()] = ceph::coarse_mono_clock::now();
// Track whether we modified pending_map
bool updated = false;
@@ -263,7 +263,9 @@ void MgrMonitor::check_sub(Subscription *sub)
}
} else {
assert(sub->type == "mgrdigest");
- send_digests();
+ if (digest_event == nullptr) {
+ send_digests();
+ }
}
}
@@ -273,7 +275,11 @@ void MgrMonitor::check_sub(Subscription *sub)
*/
void MgrMonitor::send_digests()
{
- digest_callback = nullptr;
+ cancel_timer();
+
+ if (!is_active()) {
+ return;
+ }
const std::string type = "mgrdigest";
if (mon->session_map.subs.count(type) == 0)
@@ -296,10 +302,18 @@ void MgrMonitor::send_digests()
sub->session->con->send_message(mdigest);
}
- digest_callback = new FunctionContext([this](int r){
+ digest_event = new C_MonContext(mon, [this](int){
send_digests();
});
- mon->timer.add_event_after(g_conf->mon_mgr_digest_period, digest_callback);
+ mon->timer.add_event_after(g_conf->mon_mgr_digest_period, digest_event);
+}
+
+void MgrMonitor::cancel_timer()
+{
+ if (digest_event) {
+ mon->timer.cancel_event(digest_event);
+ digest_event = nullptr;
+ }
}
void MgrMonitor::on_active()
@@ -343,9 +357,8 @@ void MgrMonitor::tick()
if (!is_active() || !mon->is_leader())
return;
- const utime_t now = ceph_clock_now();
- utime_t cutoff = now;
- cutoff -= g_conf->mon_mgr_beacon_grace;
+ const auto now = ceph::coarse_mono_clock::now();
+ const auto cutoff = now - std::chrono::seconds(g_conf->mon_mgr_beacon_grace);
// Populate any missing beacons (i.e. no beacon since MgrMonitor
// instantiation) with the current time, so that they will
@@ -536,13 +549,12 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op)
void MgrMonitor::init()
{
- send_digests(); // To get it to schedule its own event
+ if (digest_event == nullptr) {
+ send_digests(); // To get it to schedule its own event
+ }
}
void MgrMonitor::on_shutdown()
{
- if (digest_callback) {
- mon->timer.cancel_event(digest_callback);
- }
+ cancel_timer();
}
-
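Switching last_beacon from utime_t to ceph::coarse_mono_clock decouples mgr liveness from the wall clock: an NTP step or manual clock reset can no longer make every beacon look stale (or fresh) at once, because the deadline arithmetic now runs on a monotonic timeline. A minimal sketch of the staleness test under that clock, assuming ceph::coarse_mono_clock from include/ceph_time.h:

    using clock = ceph::coarse_mono_clock;
    std::map<uint64_t, clock::time_point> last_beacon;

    bool is_stale(uint64_t gid, int grace_secs) {
      const auto cutoff = clock::now() - std::chrono::seconds(grace_secs);
      return last_beacon[gid] < cutoff;   // immune to wall-clock jumps
    }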
diff --git a/src/mon/MgrMonitor.h b/src/mon/MgrMonitor.h
index a582d95b1f6..af3c1aeaffc 100644
--- a/src/mon/MgrMonitor.h
+++ b/src/mon/MgrMonitor.h
@@ -24,7 +24,7 @@ class MgrMonitor : public PaxosService
utime_t first_seen_inactive;
- std::map<uint64_t, utime_t> last_beacon;
+ std::map<uint64_t, ceph::coarse_mono_clock::time_point> last_beacon;
/**
* If a standby is available, make it active, given that
@@ -36,13 +36,14 @@ class MgrMonitor : public PaxosService
void drop_active();
void drop_standby(uint64_t gid);
- Context *digest_callback;
+ Context *digest_event = nullptr;
+ void cancel_timer();
bool check_caps(MonOpRequestRef op, const uuid_d& fsid);
public:
MgrMonitor(Monitor *mn, Paxos *p, const string& service_name)
- : PaxosService(mn, p, service_name), digest_callback(nullptr)
+ : PaxosService(mn, p, service_name)
{}
void init() override;
diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc
index 5d00487a010..da843269ac9 100644
--- a/src/mon/MonClient.cc
+++ b/src/mon/MonClient.cc
@@ -463,6 +463,9 @@ int MonClient::authenticate(double timeout)
if (active_con) {
ldout(cct, 5) << __func__ << " success, global_id "
<< active_con->get_global_id() << dendl;
+ // active_con should not have been set if there was an error
+ assert(authenticate_err == 0);
+ authenticated = true;
}
if (authenticate_err < 0 && no_keyring_disabled_cephx) {
diff --git a/src/mon/MonClient.h b/src/mon/MonClient.h
index 126032b7b2f..9656db0ee71 100644
--- a/src/mon/MonClient.h
+++ b/src/mon/MonClient.h
@@ -194,6 +194,7 @@ private:
uint64_t global_id = 0;
Cond auth_cond;
int authenticate_err = 0;
+ bool authenticated = false;
list<Message*> waiting_for_session;
utime_t last_rotating_renew_sent;
@@ -219,6 +220,7 @@ public:
int wait_auth_rotating(double timeout);
int authenticate(double timeout=0.0);
+ bool is_authenticated() const {return authenticated;}
/**
* Try to flush as many log messages as we can in a single
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index b64ba8d5ff1..c7b932690b9 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -512,6 +512,12 @@ COMMAND("osd crush move " \
"name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]", \
"move existing entry for <name> to location <args>", \
"osd", "rw", "cli,rest")
+COMMAND("osd crush swap-bucket " \
+ "name=source,type=CephString,goodchars=[A-Za-z0-9-_.] " \
+ "name=dest,type=CephString,goodchars=[A-Za-z0-9-_.] " \
+ "name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
+ "swap existing bucket contents between (orphan) bucket <source> and <dest>", \
+ "osd", "rw", "cli,rest")
COMMAND("osd crush link " \
"name=name,type=CephString " \
"name=args,type=CephString,n=N,goodchars=[A-Za-z0-9-_.=]", \
@@ -603,6 +609,10 @@ COMMAND("osd set-nearfull-ratio " \
"name=ratio,type=CephFloat,range=0.0|1.0", \
"set usage ratio at which OSDs are marked near-full",
"osd", "rw", "cli,rest")
+COMMAND("osd set-require-min-compat-client " \
+ "name=version,type=CephString",
+ "set the minimum client version we will maintain compatibility with",
+ "osd", "rw", "cli,rest")
COMMAND("osd pause", "pause osd", "osd", "rw", "cli,rest")
COMMAND("osd unpause", "unpause osd", "osd", "rw", "cli,rest")
COMMAND("osd erasure-code-profile set " \
@@ -727,7 +737,7 @@ COMMAND("osd pool delete " \
COMMAND("osd pool rm " \
"name=pool,type=CephPoolname " \
"name=pool2,type=CephPoolname,req=false " \
- "name=sure,type=CephChoices,strings=--yes-i-really-really-mean-it,req=false", \
+ "name=sure,type=CephString,req=false", \
"remove pool", \
"osd", "rw", "cli,rest")
COMMAND("osd pool rename " \
@@ -824,6 +834,7 @@ COMMAND("config-key exists " \
"name=key,type=CephString", \
"check for <key>'s existence", "config-key", "r", "cli,rest")
COMMAND("config-key list ", "list keys", "config-key", "r", "cli,rest")
+COMMAND("config-key dump", "dump keys and values", "config-key", "r", "cli,rest")
/*
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index af44df8e392..2a1e6ad111e 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -147,6 +147,12 @@ long parse_pos_long(const char *s, ostream *pss)
return r;
}
+void C_MonContext::finish(int r) {
+ if (mon->is_shutdown())
+ return;
+ FunctionContext::finish(r);
+}
+
Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s,
Messenger *m, Messenger *mgr_m, MonMap *map) :
Dispatcher(cct_),
@@ -199,12 +205,8 @@ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s,
timecheck_rounds_since_clean(0),
timecheck_event(NULL),
- probe_timeout_event(NULL),
-
paxos_service(PAXOS_NUM),
admin_hook(NULL),
- health_tick_event(NULL),
- health_interval_event(NULL),
routed_request_tid(0),
op_tracker(cct, true, 1)
{
@@ -1260,7 +1262,9 @@ void Monitor::sync_reset_timeout()
dout(10) << __func__ << dendl;
if (sync_timeout_event)
timer.cancel_event(sync_timeout_event);
- sync_timeout_event = new C_SyncTimeout(this);
+ sync_timeout_event = new C_MonContext(this, [this](int) {
+ sync_timeout();
+ });
timer.add_event_after(g_conf->mon_sync_timeout, sync_timeout_event);
}
@@ -1598,7 +1602,9 @@ void Monitor::cancel_probe_timeout()
void Monitor::reset_probe_timeout()
{
cancel_probe_timeout();
- probe_timeout_event = new C_ProbeTimeout(this);
+ probe_timeout_event = new C_MonContext(this, [this](int r) {
+ probe_timeout(r);
+ });
double t = g_conf->mon_probe_timeout;
timer.add_event_after(t, probe_timeout_event);
dout(10) << "reset_probe_timeout " << probe_timeout_event << " after " << t << " seconds" << dendl;
@@ -2257,8 +2263,12 @@ void Monitor::health_tick_start()
dout(15) << __func__ << dendl;
health_tick_stop();
- health_tick_event = new C_HealthToClogTick(this);
-
+ health_tick_event = new C_MonContext(this, [this](int r) {
+ if (r < 0)
+ return;
+ do_health_to_clog();
+ health_tick_start();
+ });
timer.add_event_after(cct->_conf->mon_health_to_clog_tick_interval,
health_tick_event);
}
@@ -2302,7 +2312,11 @@ void Monitor::health_interval_start()
health_interval_stop();
utime_t next = health_interval_calc_next_update();
- health_interval_event = new C_HealthToClogInterval(this);
+ health_interval_event = new C_MonContext(this, [this](int r) {
+ if (r < 0)
+ return;
+ do_health_to_clog_interval();
+ });
timer.add_event_at(next, health_interval_event);
}
@@ -4112,7 +4126,9 @@ void Monitor::timecheck_reset_event()
<< " rounds_since_clean " << timecheck_rounds_since_clean
<< dendl;
- timecheck_event = new C_TimeCheck(this);
+ timecheck_event = new C_MonContext(this, [this](int) {
+ timecheck_start_round();
+ });
timer.add_event_after(delay, timecheck_event);
}
@@ -4952,15 +4968,9 @@ void Monitor::scrub_event_start()
return;
}
- struct C_Scrub : public Context {
- Monitor *mon;
- explicit C_Scrub(Monitor *m) : mon(m) { }
- void finish(int r) override {
- mon->scrub_start();
- }
- };
-
- scrub_event = new C_Scrub(this);
+ scrub_event = new C_MonContext(this, [this](int) {
+ scrub_start();
+ });
timer.add_event_after(cct->_conf->mon_scrub_interval, scrub_event);
}
@@ -4986,33 +4996,18 @@ void Monitor::scrub_reset_timeout()
dout(15) << __func__ << " reset timeout event" << dendl;
scrub_cancel_timeout();
- struct C_ScrubTimeout : public Context {
- Monitor *mon;
- explicit C_ScrubTimeout(Monitor *m) : mon(m) { }
- void finish(int r) override {
- mon->scrub_timeout();
- }
- };
-
- scrub_timeout_event = new C_ScrubTimeout(this);
+ scrub_timeout_event = new C_MonContext(this, [this](int) {
+ scrub_timeout();
+ });
timer.add_event_after(g_conf->mon_scrub_timeout, scrub_timeout_event);
}
/************ TICK ***************/
-
-class C_Mon_Tick : public Context {
- Monitor *mon;
-public:
- explicit C_Mon_Tick(Monitor *m) : mon(m) {}
- void finish(int r) override {
- mon->tick();
- }
-};
-
void Monitor::new_tick()
{
- C_Mon_Tick *ctx = new C_Mon_Tick(this);
- timer.add_event_after(g_conf->mon_tick_interval, ctx);
+ timer.add_event_after(g_conf->mon_tick_interval, new C_MonContext(this, [this](int) {
+ tick();
+ }));
}
void Monitor::tick()
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index 285997e18aa..3f8b5950888 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -114,6 +114,14 @@ struct MonCommand;
#define COMPAT_SET_LOC "feature_set"
+class C_MonContext final : public FunctionContext {
+ const Monitor *mon;
+public:
+ explicit C_MonContext(Monitor *m, boost::function<void(int)>&& callback)
+ : FunctionContext(std::move(callback)), mon(m) {}
+ void finish(int r) override;
+};
+
class Monitor : public Dispatcher,
public md_config_obs_t {
public:
@@ -163,7 +171,6 @@ public:
private:
void new_tick();
- friend class C_Mon_Tick;
// -- local storage --
public:
@@ -336,14 +343,6 @@ private:
*/
version_t sync_last_committed_floor;
- struct C_SyncTimeout : public Context {
- Monitor *mon;
- explicit C_SyncTimeout(Monitor *m) : mon(m) {}
- void finish(int r) override {
- mon->sync_timeout();
- }
- };
-
/**
* Obtain the synchronization target prefixes in set form.
*
@@ -503,14 +502,6 @@ private:
*/
Context *timecheck_event;
- struct C_TimeCheck : public Context {
- Monitor *mon;
- explicit C_TimeCheck(Monitor *m) : mon(m) { }
- void finish(int r) override {
- mon->timecheck_start_round();
- }
- };
-
void timecheck_start();
void timecheck_finish();
void timecheck_start_round();
@@ -546,15 +537,7 @@ private:
*/
void handle_ping(MonOpRequestRef op);
- Context *probe_timeout_event; // for probing
-
- struct C_ProbeTimeout : public Context {
- Monitor *mon;
- explicit C_ProbeTimeout(Monitor *m) : mon(m) {}
- void finish(int r) override {
- mon->probe_timeout(r);
- }
- };
+ Context *probe_timeout_event = nullptr; // for probing
void reset_probe_timeout();
void cancel_probe_timeout();
@@ -713,29 +696,8 @@ public:
}
} health_status_cache;
- struct C_HealthToClogTick : public Context {
- Monitor *mon;
- explicit C_HealthToClogTick(Monitor *m) : mon(m) { }
- void finish(int r) override {
- if (r < 0)
- return;
- mon->do_health_to_clog();
- mon->health_tick_start();
- }
- };
-
- struct C_HealthToClogInterval : public Context {
- Monitor *mon;
- explicit C_HealthToClogInterval(Monitor *m) : mon(m) { }
- void finish(int r) override {
- if (r < 0)
- return;
- mon->do_health_to_clog_interval();
- }
- };
-
- Context *health_tick_event;
- Context *health_interval_event;
+ Context *health_tick_event = nullptr;
+ Context *health_interval_event = nullptr;
void health_tick_start();
void health_tick_stop();
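
[editorial sketch] The Monitor.cc and Monitor.h hunks above replace a family of single-purpose Context subclasses with one C_MonContext that wraps a callback and drops it once the monitor is shutting down. A self-contained sketch of that shape, using std::function in place of boost::function/FunctionContext and a bare flag in place of the real Monitor:

    #include <cstdio>
    #include <functional>
    struct Context {
      virtual ~Context() {}
      virtual void finish(int r) = 0;
      void complete(int r) { finish(r); delete this; }
    };
    struct FunctionContext : Context {
      std::function<void(int)> fn;
      explicit FunctionContext(std::function<void(int)> &&f) : fn(std::move(f)) {}
      void finish(int r) override { fn(r); }
    };
    struct Monitor { bool shutdown = false; bool is_shutdown() const { return shutdown; } };
    struct C_MonContext final : FunctionContext {
      const Monitor *mon;
      C_MonContext(Monitor *m, std::function<void(int)> &&cb)
        : FunctionContext(std::move(cb)), mon(m) {}
      void finish(int r) override {
        if (mon->is_shutdown())
          return;                       // suppress callbacks after shutdown
        FunctionContext::finish(r);
      }
    };
    int main() {
      Monitor mon;
      Context *c = new C_MonContext(&mon, [](int) { std::puts("tick"); });
      mon.shutdown = true;
      c->complete(0);                   // prints nothing: guard suppressed it
    }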
diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h
index e6732384c20..707d635af55 100644
--- a/src/mon/MonitorDBStore.h
+++ b/src/mon/MonitorDBStore.h
@@ -629,9 +629,13 @@ class MonitorDBStore
int open(ostream &out) {
string kv_type;
int r = read_meta("kv_backend", &kv_type);
- if (r < 0 || kv_type.length() == 0)
+ if (r < 0 || kv_type.empty()) {
+ // assume old monitors that did not mark the type were leveldb.
kv_type = "leveldb";
-
+ r = write_meta("kv_backend", kv_type);
+ if (r < 0)
+ return r;
+ }
_open(kv_type);
r = db->open(out);
if (r < 0)
@@ -646,8 +650,7 @@ class MonitorDBStore
string kv_type;
int r = read_meta("kv_backend", &kv_type);
if (r < 0) {
- // assume old monitors that did not mark the type were leveldb.
- kv_type = "leveldb";
+ kv_type = g_conf->mon_keyvaluedb;
r = write_meta("kv_backend", kv_type);
if (r < 0)
return r;
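
[editorial sketch] With this change, open() persists the backend it inferred, so legacy stores gain an explicit kv_backend key on first open, while mkfs (below) defaults to mon_keyvaluedb. A sketch of that read-or-default-then-persist pattern, with a std::map standing in for the meta store:

    #include <cassert>
    #include <map>
    #include <string>
    static std::map<std::string, std::string> meta;
    int read_meta(const std::string &k, std::string *v) {
      auto it = meta.find(k);
      if (it == meta.end()) return -2;  // stands in for -ENOENT
      *v = it->second;
      return 0;
    }
    int write_meta(const std::string &k, const std::string &v) {
      meta[k] = v;
      return 0;
    }
    int main() {
      std::string kv_type;
      if (read_meta("kv_backend", &kv_type) < 0 || kv_type.empty()) {
        kv_type = "leveldb";            // legacy monitors predate the key
        if (write_meta("kv_backend", kv_type) < 0)
          return 1;
      }
      std::string again;
      assert(read_meta("kv_backend", &again) == 0 && again == "leveldb");
    }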
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index e740677f4ad..33ed0e40988 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -80,13 +80,21 @@ void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
if (epoch_by_pg.size() <= ps) {
epoch_by_pg.resize(ps + 1, 0);
}
- if (epoch_by_pg[ps] >= last_epoch_clean) {
+ const auto old_lec = epoch_by_pg[ps];
+ if (old_lec >= last_epoch_clean) {
// stale lec
return;
}
epoch_by_pg[ps] = last_epoch_clean;
if (last_epoch_clean < floor) {
floor = last_epoch_clean;
+ } else if (last_epoch_clean > floor) {
+ if (old_lec == floor) {
+ // the pg that pinned the floor moved up; recompute the floor
+ auto new_floor = std::min_element(std::begin(epoch_by_pg),
+ std::end(epoch_by_pg));
+ floor = *new_floor;
+ }
}
if (ps != next_missing) {
return;
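
[editorial sketch] A compilable distillation of the floor maintenance above: when the entry that defined the floor is raised, the minimum must be recomputed (a plain vector of unsigned stands in for the mempool types):

    #include <algorithm>
    #include <cassert>
    #include <vector>
    int main() {
      std::vector<unsigned> epoch_by_pg = {5, 7, 9};
      unsigned floor = 5;
      // raise pg 0 from 5 (the floor) to 8: the old minimum is gone
      unsigned ps = 0, lec = 8;
      unsigned old = epoch_by_pg[ps];
      epoch_by_pg[ps] = lec;
      if (lec < floor) {
        floor = lec;
      } else if (lec > floor && old == floor) {
        floor = *std::min_element(epoch_by_pg.begin(), epoch_by_pg.end());
      }
      assert(floor == 7);
    }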
@@ -222,6 +230,7 @@ void OSDMonitor::create_initial()
if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
newmap.nearfull_ratio = g_conf->mon_osd_nearfull_ratio;
if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;
+ newmap.require_min_compat_client = g_conf->mon_osd_initial_require_min_compat_client;
}
// encode into pending incremental
@@ -303,8 +312,7 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
if (mon->monmap->get_required_features().contains_all(
ceph::features::mon::FEATURE_LUMINOUS)) {
bufferlist bl;
- mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl);
- if (bl.length()) {
+ if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
auto p = bl.begin();
std::lock_guard<std::mutex> l(creating_pgs_lock);
creating_pgs.decode(p);
@@ -422,9 +430,6 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
}
// XXX: need to trim MonSession connected with a osd whose id > max_osd?
- /** we don't have any of the feature bit infrastructure in place for
- * supporting primary_temp mappings without breaking old clients/OSDs.*/
- assert(g_conf->mon_osd_allow_primary_temp || osdmap.primary_temp->empty());
if (mon->is_leader()) {
// kick pgmon, make sure it's seen the latest map
mon->pgmon()->check_osd_map(osdmap.epoch);
@@ -918,6 +923,7 @@ OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc)
pending_creatings.created_pools.insert(pg.pool());
pending_creatings.pgs.erase(pg);
}
+ pending_created_pgs.clear();
// PAXOS_PGMAP is less than PAXOS_OSDMAP, so PGMonitor::update_from_paxos()
// should have prepared the latest pgmap if any
const auto& pgm = mon->pgmon()->pg_map;
@@ -928,7 +934,7 @@ OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc)
assert(st != pgm.pg_stat.end());
auto created = make_pair(st->second.created, st->second.last_scrub_stamp);
// no need to add the pg, if it already exists in creating_pgs
- creating_pgs.pgs.emplace(pgid, created);
+ pending_creatings.pgs.emplace(pgid, created);
}
}
for (auto old_pool : inc.old_pools) {
@@ -1060,15 +1066,18 @@ void OSDMonitor::prime_pg_temp(
{
if (mon->monmap->get_required_features().contains_all(
ceph::features::mon::FEATURE_LUMINOUS)) {
- if (!creating_pgs.pgs.count(pgid)) {
+ if (creating_pgs.pgs.count(pgid)) {
return;
}
} else {
const auto& pg_map = mon->pgmon()->pg_map;
- if (!pg_map.creating_pgs.count(pgid)) {
+ if (pg_map.creating_pgs.count(pgid)) {
return;
}
}
+ if (!osdmap.pg_exists(pgid)) {
+ return;
+ }
vector<int> up, acting;
mapping.get(pgid, &up, nullptr, &acting, nullptr);
@@ -1093,7 +1102,9 @@ void OSDMonitor::prime_pg_temp(
{
Mutex::Locker l(prime_pg_temp_lock);
// do not touch a mapping if a change is pending
- pending_inc.new_pg_temp.emplace(pgid, acting);
+ pending_inc.new_pg_temp.emplace(
+ pgid,
+ mempool::osdmap::vector<int>(acting.begin(), acting.end()));
}
}
@@ -1118,7 +1129,7 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
<< mapping_job.get() << " did not complete, "
<< mapping_job->shards << " left" << dendl;
mapping_job->abort();
- } else if (mapping.get_epoch() == osdmap.get_epoch()) {
+ } else if (mapping.get_epoch() < osdmap.get_epoch()) {
dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
<< mapping_job.get() << " is prior epoch "
<< mapping.get_epoch() << dendl;
@@ -1135,13 +1146,13 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
bufferlist bl;
- // set or clear full/nearfull?
{
OSDMap tmp;
tmp.deepish_copy_from(osdmap);
tmp.apply_incremental(pending_inc);
if (tmp.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
+ // set or clear full/nearfull?
int full, backfill, nearfull;
tmp.count_full_nearfull_osds(&full, &backfill, &nearfull);
if (full > 0) {
@@ -1167,6 +1178,16 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
}
}
}
+
+ // min_compat_client?
+ if (tmp.require_min_compat_client.empty()) {
+ auto mv = tmp.get_min_compat_client();
+ dout(1) << __func__ << " setting require_min_compat_client to current " << mv
+ << dendl;
+ mon->clog->info() << "setting require_min_compat_client to currently required "
+ << mv;
+ pending_inc.new_require_min_compat_client = mv.first;
+ }
}
}
@@ -1821,23 +1842,23 @@ bool OSDMonitor::can_mark_up(int i)
bool OSDMonitor::can_mark_out(int i)
{
if (osdmap.test_flag(CEPH_OSDMAP_NOOUT)) {
- dout(5) << "can_mark_out NOOUT flag set, will not mark osds out" << dendl;
+ dout(5) << __func__ << " NOOUT flag set, will not mark osds out" << dendl;
return false;
}
int num_osds = osdmap.get_num_osds();
if (num_osds == 0) {
- dout(5) << "can_mark_out no osds" << dendl;
+ dout(5) << __func__ << " no osds" << dendl;
return false;
}
int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
float in_ratio = (float)in / (float)num_osds;
if (in_ratio < g_conf->mon_osd_min_in_ratio) {
if (i >= 0)
- dout(5) << "can_mark_down current in_ratio " << in_ratio << " < min "
+ dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
<< g_conf->mon_osd_min_in_ratio
<< ", will not mark osd." << i << " out" << dendl;
else
- dout(5) << "can_mark_down current in_ratio " << in_ratio << " < min "
+ dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
<< g_conf->mon_osd_min_in_ratio
<< ", will not mark osds out" << dendl;
return false;
@@ -2631,7 +2652,7 @@ bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
dout(10) << "preprocess_pgtemp " << *m << dendl;
- vector<int> empty;
+ mempool::osdmap::vector<int> empty;
int from = m->get_orig_source().num();
size_t ignore_cnt = 0;
@@ -2651,7 +2672,7 @@ bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
goto ignore;
}
- for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
+ for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
dout(20) << " " << p->first
<< (osdmap.pg_temp->count(p->first) ? (*osdmap.pg_temp)[p->first] : empty)
<< " -> " << p->second << dendl;
@@ -2695,9 +2716,10 @@ bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
// change?
// NOTE: we assume that this will clear pg_primary, so consider
// an existing pg_primary field to imply a change
- if (p->second.size() && (osdmap.pg_temp->count(p->first) == 0 ||
- (*osdmap.pg_temp)[p->first] != p->second ||
- osdmap.primary_temp->count(p->first)))
+ if (p->second.size() &&
+ (osdmap.pg_temp->count(p->first) == 0 ||
+ !vectors_equal((*osdmap.pg_temp)[p->first], p->second) ||
+ osdmap.primary_temp->count(p->first)))
return false;
}
@@ -2744,7 +2766,8 @@ bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
<< ": pool has been removed" << dendl;
continue;
}
- pending_inc.new_pg_temp[p->first] = p->second;
+ pending_inc.new_pg_temp[p->first] =
+ mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
// unconditionally clear pg_primary (until this message can encode
// a change for that, too.. at which point we need to also fix
@@ -3136,10 +3159,11 @@ void OSDMonitor::check_pg_creates_sub(Subscription *sub)
}
}
-void OSDMonitor::scan_for_creating_pgs(const map<int64_t,pg_pool_t>& pools,
- const set<int64_t>& removed_pools,
- utime_t modified,
- creating_pgs_t* creating_pgs) const
+void OSDMonitor::scan_for_creating_pgs(
+ const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
+ const mempool::osdmap::set<int64_t>& removed_pools,
+ utime_t modified,
+ creating_pgs_t* creating_pgs) const
{
for (auto& p : pools) {
int64_t poolid = p.first;
@@ -3188,7 +3212,7 @@ void OSDMonitor::update_creating_pgs()
for (auto& pg : creating_pgs.pgs) {
int acting_primary = -1;
auto pgid = pg.first;
- auto& created = pg.second.first;
+ auto mapped = pg.second.first;
mapping.get(pgid, nullptr, nullptr, nullptr, &acting_primary);
// check the previous creating_pgs, look for the target to whom the pg was
// previously mapped
@@ -3196,15 +3220,14 @@ void OSDMonitor::update_creating_pgs()
const auto last_acting_primary = pgs_by_epoch.first;
for (auto& pgs: pgs_by_epoch.second) {
if (pgs.second.count(pgid)) {
- if (last_acting_primary != acting_primary) {
+ if (last_acting_primary == acting_primary) {
+ mapped = pgs.first;
+ } else {
dout(20) << __func__ << " " << pgid << " "
<< " acting_primary:" << last_acting_primary
<< " -> " << acting_primary << dendl;
// note epoch if the target of the create message changed.
- // creating_pgs is updated here instead of in
- // scan_for_creating_pgs() because we don't have the updated pg
- // mapping by then.
- created = mapping.get_epoch();
+ mapped = mapping.get_epoch();
}
break;
}
@@ -3212,7 +3235,7 @@ void OSDMonitor::update_creating_pgs()
}
dout(10) << __func__ << " will instruct osd." << acting_primary
<< " to create " << pgid << dendl;
- new_pgs_by_osd_epoch[acting_primary][created].insert(pgid);
+ new_pgs_by_osd_epoch[acting_primary][mapped].insert(pgid);
}
creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
creating_pgs_epoch = mapping.get_epoch();
@@ -3222,6 +3245,7 @@ epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next)
{
dout(30) << __func__ << " osd." << osd << " next=" << next
<< " " << creating_pgs_by_osd_epoch << dendl;
+ std::lock_guard<std::mutex> l(creating_pgs_lock);
auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
return next;
@@ -3242,8 +3266,8 @@ epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next)
// Need the create time from the monitor using its clock to set
// last_scrub_stamp upon pg creation.
const auto& creation = creating_pgs.pgs[pg];
- m->mkpg[pg] = pg_create_t{creation.first, pg, 0};
- m->ctimes[pg] = creation.second;
+ m->mkpg.emplace(pg, pg_create_t{creation.first, pg, 0});
+ m->ctimes.emplace(pg, creation.second);
dout(20) << __func__ << " will create " << pg
<< " at " << creation.first << dendl;
}
@@ -3457,6 +3481,13 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
int num_in_osds = 0;
int num_down_in_osds = 0;
set<int> osds;
+ set<int> down_in_osds;
+ set<int> up_in_osds;
+ set<int> subtree_up;
+ unordered_map<int, set<int> > subtree_type_down;
+ unordered_map<int, int> num_osds_subtree;
+ int max_type = osdmap.crush->get_num_type_names() - 1;
+
for (int i = 0; i < osdmap.get_max_osd(); i++) {
if (!osdmap.exists(i)) {
if (osdmap.crush->item_exists(i)) {
@@ -3467,22 +3498,98 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
if (osdmap.is_out(i))
continue;
++num_in_osds;
+ if (down_in_osds.count(i) || up_in_osds.count(i))
+ continue;
if (!osdmap.is_up(i)) {
- ++num_down_in_osds;
- if (detail) {
- const osd_info_t& info = osdmap.get_info(i);
- ostringstream ss;
- ss << "osd." << i << " is down since epoch " << info.down_at
- << ", last address " << osdmap.get_addr(i);
- detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+ down_in_osds.insert(i);
+ int parent_id = 0;
+ int current = i;
+ for (int type = 0; type <= max_type; type++) {
+ int r = osdmap.crush->get_immediate_parent_id(current, &parent_id);
+ if (r == -ENOENT)
+ break;
+ // break early if this parent is already marked as up
+ if (subtree_up.count(parent_id))
+ break;
+ type = osdmap.crush->get_bucket_type(parent_id);
+ if (!osdmap.subtree_type_is_down(g_ceph_context, parent_id, type, &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
+ break;
+ current = parent_id;
+ }
+ }
+ }
+
+ // calculate the number of down osds in each down subtree and store it in num_osds_subtree
+ for (int type = 1; type <= max_type; type++) {
+ for (auto j = subtree_type_down[type].begin(); j != subtree_type_down[type].end(); ++j) {
+ if (type == 1) {
+ list<int> children;
+ int num = osdmap.crush->get_children(*j, &children);
+ num_osds_subtree[*j] = num;
+ } else {
+ list<int> children;
+ int num = 0;
+ int num_children = osdmap.crush->get_children(*j, &children);
+ if (num_children == 0)
+ continue;
+ for (auto l = children.begin(); l != children.end(); ++l) {
+ if (num_osds_subtree[*l] > 0) {
+ num = num + num_osds_subtree[*l];
+ }
+ }
+ num_osds_subtree[*j] = num;
}
}
}
+ num_down_in_osds = down_in_osds.size();
assert(num_down_in_osds <= num_in_osds);
if (num_down_in_osds > 0) {
ostringstream ss;
- ss << num_down_in_osds << "/" << num_in_osds << " in osds are down";
+ ss << "\n";
+ // summary of down subtree types and osds
+ for (int type = max_type; type > 0; type--) {
+ if (subtree_type_down[type].size() > 0) {
+ ss << subtree_type_down[type].size() << " " << osdmap.crush->get_type_name(type);
+ if (subtree_type_down[type].size() > 1) {
+ ss << "s";
+ }
+ int sum_down_osds = 0;
+ for (auto j = subtree_type_down[type].begin(); j != subtree_type_down[type].end(); ++j) {
+ sum_down_osds = sum_down_osds + num_osds_subtree[*j];
+ }
+ ss << " (" << sum_down_osds << " osds) down\n";
+ }
+ }
+ ss << down_in_osds.size() << " osds down\n";
summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+
+ if (detail) {
+ ostringstream ss;
+ // details of down subtree types
+ for (int type = max_type; type > 0; type--) {
+ for (auto j = subtree_type_down[type].rbegin(); j != subtree_type_down[type].rend(); ++j) {
+ ss << osdmap.crush->get_type_name(type);
+ ss << " ";
+ ss << osdmap.crush->get_item_name(*j);
+ // at the top level, do not print location
+ if (type != max_type) {
+ ss << " (";
+ ss << osdmap.crush->get_full_location_ordered_string(*j);
+ ss << ")";
+ }
+ int num = num_osds_subtree[*j];
+ ss << " (" << num << " osds)";
+ ss << " is down\n";
+ }
+ }
+ // details of down osds
+ for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
+ ss << "osd." << *it << " (";
+ ss << osdmap.crush->get_full_location_ordered_string(*it);
+ ss << ") is down\n";
+ }
+ detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+ }
}
if (!osds.empty()) {
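
[editorial sketch] The counting pass above sizes each down subtree bottom-up: type-1 buckets count their leaf children directly, higher types sum their children's totals. A toy version over a hard-coded tree (bucket ids and types are made up):

    #include <cassert>
    #include <map>
    #include <vector>
    int main() {
      // type 1 = host, type 2 = rack; hosts -1,-2 hold 2 and 3 osds,
      // rack -10 holds both hosts
      std::map<int, std::vector<int>> children = {
        {-1, {0, 1}}, {-2, {2, 3, 4}}, {-10, {-1, -2}}};
      std::map<int, int> num_osds_subtree;
      for (int host : {-1, -2})              // type 1: count leaves directly
        num_osds_subtree[host] = (int)children[host].size();
      int sum = 0;                           // type 2: sum children's totals
      for (int child : children[-10])
        sum += num_osds_subtree[child];
      num_osds_subtree[-10] = sum;
      assert(num_osds_subtree[-10] == 5);
    }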
@@ -4793,13 +4900,10 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
f->close_section();
f->flush(rdata);
} else if (prefix == "osd erasure-code-profile ls") {
- const map<string,map<string,string> > &profiles =
- osdmap.get_erasure_code_profiles();
+ const auto &profiles = osdmap.get_erasure_code_profiles();
if (f)
f->open_array_section("erasure-code-profiles");
- for(map<string,map<string,string> >::const_iterator i = profiles.begin();
- i != profiles.end();
- ++i) {
+ for (auto i = profiles.begin(); i != profiles.end(); ++i) {
if (f)
f->dump_string("profile", i->first.c_str());
else
@@ -4863,10 +4967,8 @@ bool OSDMonitor::update_pools_status()
bool ret = false;
- const map<int64_t,pg_pool_t>& pools = osdmap.get_pools();
- for (map<int64_t,pg_pool_t>::const_iterator it = pools.begin();
- it != pools.end();
- ++it) {
+ auto& pools = osdmap.get_pools();
+ for (auto it = pools.begin(); it != pools.end(); ++it) {
if (!mon->pgmon()->pg_map.pg_pool_sum.count(it->first))
continue;
pool_stat_t& stats = mon->pgmon()->pg_map.pg_pool_sum[it->first];
@@ -4914,9 +5016,8 @@ void OSDMonitor::get_pools_health(
list<pair<health_status_t,string> >& summary,
list<pair<health_status_t,string> > *detail) const
{
- const map<int64_t,pg_pool_t>& pools = osdmap.get_pools();
- for (map<int64_t,pg_pool_t>::const_iterator it = pools.begin();
- it != pools.end(); ++it) {
+ auto& pools = osdmap.get_pools();
+ for (auto it = pools.begin(); it != pools.end(); ++it) {
if (!mon->pgmon()->pg_map.pg_pool_sum.count(it->first))
continue;
pool_stat_t& stats = mon->pgmon()->pg_map.pg_pool_sum[it->first];
@@ -5233,22 +5334,35 @@ bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
newmap.deepish_copy_from(osdmap);
newmap.apply_incremental(new_pending);
+ // client compat
+ if (newmap.require_min_compat_client.length()) {
+ auto mv = newmap.get_min_compat_client();
+ if (mv.first > newmap.require_min_compat_client) {
+ ss << "new crush map requires client version " << mv
+ << " but require_min_compat_client is "
+ << newmap.require_min_compat_client;
+ return false;
+ }
+ }
+
+ // osd compat
uint64_t features =
newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
-
stringstream features_ss;
int r = check_cluster_features(features, features_ss);
- if (!r)
- return true;
+ if (r) {
+ ss << "Could not change CRUSH: " << features_ss.str();
+ return false;
+ }
- ss << "Could not change CRUSH: " << features_ss.str();
- return false;
+ return true;
}
-bool OSDMonitor::erasure_code_profile_in_use(const map<int64_t, pg_pool_t> &pools,
- const string &profile,
- ostream *ss)
+bool OSDMonitor::erasure_code_profile_in_use(
+ const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
+ const string &profile,
+ ostream *ss)
{
bool found = false;
for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
@@ -5963,7 +6077,8 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
return -EINVAL;
}
stringstream err;
- if (!is_pool_currently_all_bluestore(pool, p, &err)) {
+ if (!g_conf->mon_debug_no_require_bluestore_for_ec_overwrites &&
+ !is_pool_currently_all_bluestore(pool, p, &err)) {
ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
return -EINVAL;
}
@@ -6285,9 +6400,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
goto reply;
}
- const map<int64_t,pg_pool_t> &osdmap_pools = osdmap.get_pools();
- map<int64_t,pg_pool_t>::const_iterator pit;
- for (pit = osdmap_pools.begin(); pit != osdmap_pools.end(); ++pit) {
+ const auto& osdmap_pools = osdmap.get_pools();
+ for (auto pit = osdmap_pools.begin(); pit != osdmap_pools.end(); ++pit) {
const int64_t pool_id = pit->first;
const pg_pool_t &pool = pit->second;
int ruleno = pool.get_crush_ruleset();
@@ -6655,7 +6769,52 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
err = 0;
}
} while (false);
-
+ } else if (prefix == "osd crush swap-bucket") {
+ string source, dest, force;
+ cmd_getval(g_ceph_context, cmdmap, "source", source);
+ cmd_getval(g_ceph_context, cmdmap, "dest", dest);
+ cmd_getval(g_ceph_context, cmdmap, "force", force);
+ CrushWrapper newcrush;
+ _get_pending_crush(newcrush);
+ if (!newcrush.name_exists(source)) {
+ ss << "source item " << source << " does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+ if (!newcrush.name_exists(dest)) {
+ ss << "dest item " << dest << " does not exist";
+ err = -ENOENT;
+ goto reply;
+ }
+ int sid = newcrush.get_item_id(source);
+ int did = newcrush.get_item_id(dest);
+ int sparent;
+ if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 &&
+ force != "--yes-i-really-mean-it") {
+ ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
+ err = -EPERM;
+ goto reply;
+ }
+ if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
+ force != "--yes-i-really-mean-it") {
+ ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
+ << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
+ << "; pass --yes-i-really-mean-it to proceed anyway";
+ err = -EPERM;
+ goto reply;
+ }
+ int r = newcrush.swap_bucket(g_ceph_context, sid, did);
+ if (r < 0) {
+ ss << "failed to swap bucket contents: " << cpp_strerror(r);
+ goto reply;
+ }
+ ss << "swapped bucket of " << source << " to " << dest;
+ pending_inc.crush.clear();
+ newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
+ wait_for_finished_proposal(op,
+ new Monitor::C_Command(mon, op, err, ss.str(),
+ get_last_committed() + 1));
+ return true;
} else if (prefix == "osd crush link") {
// osd crush link <name> <loc1> [<loc2> ...]
string name;
@@ -7258,6 +7417,39 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
get_last_committed() + 1));
return true;
+ } else if (prefix == "osd set-require-min-compat-client") {
+ if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
+ ss << "you must complete the upgrade and set require_luminous_osds before"
+ << " using the new interface";
+ err = -EPERM;
+ goto reply;
+ }
+ string v;
+ cmd_getval(g_ceph_context, cmdmap, "version", v);
+ if (v != "luminous" && v != "kraken" && v != "jewel" && v != "infernalis" &&
+ v != "hammer" && v != "giant" && v != "firefly" && v != "emperor" &&
+ v != "dumpling" && v != "cuttlefish" && v != "bobtail" && v != "argonaut") {
+ ss << "version " << v << " is not recognized";
+ err = -EINVAL;
+ goto reply;
+ }
+ OSDMap newmap;
+ newmap.deepish_copy_from(osdmap);
+ newmap.apply_incremental(pending_inc);
+ newmap.require_min_compat_client = v;
+ auto mv = newmap.get_min_compat_client();
+ if (v < mv.first) {
+ ss << "osdmap current utilizes features that require " << mv
+ << "; cannot set require_min_compat_client below that to " << v;
+ err = -EPERM;
+ goto reply;
+ }
+ ss << "set require_min_compat_client to " << v;
+ pending_inc.new_require_min_compat_client = v;
+ getline(ss, rs);
+ wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
+ get_last_committed() + 1));
+ return true;
} else if (prefix == "osd pause") {
return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
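
[editorial note] The plain string comparisons in the branch above (v < mv.first) and in the pg-upmap/primary-affinity guards below (require_min_compat_client < "luminous") work because Ceph release names are chosen in alphabetical order, so lexicographic comparison matches release order:

    // Ceph release names are alphabetical by design, so string
    // comparison orders them correctly (argonaut < ... < luminous)
    #include <cassert>
    #include <string>
    int main() {
      const std::string releases[] = {"argonaut", "bobtail", "cuttlefish",
        "dumpling", "emperor", "firefly", "giant", "hammer", "infernalis",
        "jewel", "kraken", "luminous"};
      for (size_t i = 0; i + 1 < sizeof(releases) / sizeof(releases[0]); ++i)
        assert(releases[i] < releases[i + 1]);
    }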
@@ -7504,7 +7696,8 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
new_pg_temp.push_back(osd);
}
- pending_inc.new_pg_temp[pgid] = new_pg_temp;
+ pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
+ new_pg_temp.begin(), new_pg_temp.end());
ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
goto update;
} else if (prefix == "osd primary-temp") {
@@ -7540,7 +7733,13 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
goto reply;
}
- if (!g_conf->mon_osd_allow_primary_temp) {
+ if (osdmap.require_min_compat_client.length() &&
+ osdmap.require_min_compat_client < "firefly") {
+ ss << "require_min_compat_client " << osdmap.require_min_compat_client
+ << " < firefly, which is required for primary-temp";
+ err = -EPERM;
+ goto reply;
+ } else if (!g_conf->mon_osd_allow_primary_temp) {
ss << "you must enable 'mon osd allow primary temp = true' on the mons before you can set primary_temp mappings. note that this is for developers only: older clients/OSDs will break and there is no feature bit infrastructure in place.";
err = -EPERM;
goto reply;
@@ -7550,17 +7749,18 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
ss << "set " << pgid << " primary_temp mapping to " << osd;
goto update;
} else if (prefix == "osd pg-upmap") {
- if (!g_conf->mon_osd_allow_pg_upmap) {
- ss << "you must enable 'mon osd allow pg upmap = true' on the mons before you can adjust pg_upmap. note that pre-luminous clients will no longer be able to communicate with the cluster.";
+ if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
+ ss << "you must set the require_luminous_osds flag to use this feature";
err = -EPERM;
goto reply;
}
- if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
- ss << "you must set the require_luminous_osds flag to use this feature";
+ if (osdmap.require_min_compat_client < "luminous") {
+ ss << "min_compat_client " << osdmap.require_min_compat_client
+ << " < luminous, which is required for pg-upmap";
err = -EPERM;
goto reply;
}
- err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_REMAP, ss);
+ err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
if (err == -EAGAIN)
goto wait;
if (err < 0)
@@ -7606,21 +7806,23 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
new_pg_upmap.push_back(osd);
}
- pending_inc.new_pg_upmap[pgid] = new_pg_upmap;
+ pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
+ new_pg_upmap.begin(), new_pg_upmap.end());
ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
goto update;
} else if (prefix == "osd rm-pg-upmap") {
- if (!g_conf->mon_osd_allow_pg_upmap) {
- ss << "you must enable 'mon osd allow pg upmap = true' on the mons before you can adjust pg_upmap. note that pre-luminous clients will no longer be able to communicate with the cluster.";
+ if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
+ ss << "you must set the require_luminous_osds flag to use this feature";
err = -EPERM;
goto reply;
}
- if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
- ss << "you must set the require_luminous_osds flag to use this feature";
+ if (osdmap.require_min_compat_client < "luminous") {
+ ss << "require_min_compat_client " << osdmap.require_min_compat_client
+ << " < luminous, which is required for pg-upmap";
err = -EPERM;
goto reply;
}
- err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_REMAP, ss);
+ err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
if (err == -EAGAIN)
goto wait;
if (err < 0)
@@ -7654,17 +7856,18 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
ss << "clear " << pgid << " pg_upmap mapping";
goto update;
} else if (prefix == "osd pg-upmap-items") {
- if (!g_conf->mon_osd_allow_pg_upmap) {
- ss << "you must enable 'mon osd allow pg upmap = true' on the mons before you can adjust pg_upmap. note that pre-luminous clients will no longer be able to communicate with the cluster.";
+ if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
+ ss << "you must set the require_luminous_osds flag to use this feature";
err = -EPERM;
goto reply;
}
- if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
- ss << "you must set the require_luminous_osds flag to use this feature";
+ if (osdmap.require_min_compat_client < "luminous") {
+ ss << "require_min_compat_client " << osdmap.require_min_compat_client
+ << " < luminous, which is required for pg-upmap";
err = -EPERM;
goto reply;
}
- err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_REMAP, ss);
+ err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
if (err == -EAGAIN)
goto wait;
if (err < 0)
@@ -7722,21 +7925,24 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
new_pg_upmap_items.push_back(make_pair(from, to));
}
- pending_inc.new_pg_upmap_items[pgid] = new_pg_upmap_items;
+ pending_inc.new_pg_upmap_items[pgid] =
+ mempool::osdmap::vector<pair<int32_t,int32_t>>(
+ new_pg_upmap_items.begin(), new_pg_upmap_items.end());
ss << "set " << pgid << " pg_upmap_items mapping to " << new_pg_upmap_items;
goto update;
} else if (prefix == "osd rm-pg-upmap-items") {
- if (!g_conf->mon_osd_allow_pg_upmap) {
- ss << "you must enable 'mon osd allow pg upmap = true' on the mons before you can adjust pg_upmap. note that pre-luminous clients will no longer be able to communicate with the cluster.";
+ if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
+ ss << "you must set the require_luminous_osds flag to use this feature";
err = -EPERM;
goto reply;
}
- if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
- ss << "you must set the require_luminous_osds flag to use this feature";
+ if (osdmap.require_min_compat_client < "luminous") {
+ ss << "require_min_compat_client " << osdmap.require_min_compat_client
+ << " < luminous, which is required for pg-upmap";
err = -EPERM;
goto reply;
}
- err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_REMAP, ss);
+ err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
if (err == -EAGAIN)
goto wait;
if (err < 0)
@@ -7790,7 +7996,13 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
err = -EINVAL;
goto reply;
}
- if (!g_conf->mon_osd_allow_primary_affinity) {
+ if (osdmap.require_min_compat_client.length() &&
+ osdmap.require_min_compat_client < "firefly") {
+ ss << "require_min_compat_client " << osdmap.require_min_compat_client
+ << " < firefly, which is required for primary-affinity";
+ err = -EPERM;
+ goto reply;
+ } else if (!g_conf->mon_osd_allow_primary_affinity) {
ss << "you must enable 'mon osd allow primary affinity = true' on the mons before you can adjust primary-affinity. note that older clients will no longer be able to communicate with the cluster.";
err = -EPERM;
goto reply;
@@ -7853,7 +8065,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
<< cmd_vartype_stringify(cmdmap["weights"]) << "'";
goto reply;
}
- pending_inc.new_weight = std::move(weights);
+ pending_inc.new_weight.insert(weights.begin(), weights.end());
wait_for_finished_proposal(
op,
new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
@@ -8294,14 +8506,16 @@ done:
goto reply;
}
- if (poolstr2 != poolstr || sure != "--yes-i-really-really-mean-it") {
+ bool force_no_fake = sure == "--yes-i-really-really-mean-it-not-faking";
+ if (poolstr2 != poolstr ||
+ (sure != "--yes-i-really-really-mean-it" && !force_no_fake)) {
ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
<< ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
<< "followed by --yes-i-really-really-mean-it.";
err = -EPERM;
goto reply;
}
- err = _prepare_remove_pool(pool, &ss);
+ err = _prepare_remove_pool(pool, &ss, force_no_fake);
if (err == -EAGAIN) {
wait_for_finished_proposal(op, new C_RetryMessage(this, op));
return true;
@@ -8917,7 +9131,7 @@ done:
string no_increasing;
cmd_getval(g_ceph_context, cmdmap, "no_increasing", no_increasing);
string out_str;
- map<int32_t, uint32_t> new_weights;
+ mempool::osdmap::map<int32_t, uint32_t> new_weights;
err = reweight::by_utilization(osdmap,
mon->pgmon()->pg_map,
oload,
@@ -9365,7 +9579,8 @@ bool OSDMonitor::_check_remove_tier(
return true;
}
-int OSDMonitor::_prepare_remove_pool(int64_t pool, ostream *ss)
+int OSDMonitor::_prepare_remove_pool(
+ int64_t pool, ostream *ss, bool no_fake)
{
dout(10) << "_prepare_remove_pool " << pool << dendl;
const pg_pool_t *p = osdmap.get_pg_pool(pool);
@@ -9389,11 +9604,20 @@ int OSDMonitor::_prepare_remove_pool(int64_t pool, ostream *ss)
return 0;
}
+ if (g_conf->mon_fake_pool_delete && !no_fake) {
+ string old_name = osdmap.get_pool_name(pool);
+ string new_name = old_name + "." + stringify(pool) + ".DELETED";
+ dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
+ << old_name << " -> " << new_name << dendl;
+ pending_inc.new_pool_names[pool] = new_name;
+ return 0;
+ }
+
// remove
pending_inc.old_pools.insert(pool);
// remove any pg_temp mappings for this pool too
- for (map<pg_t,vector<int32_t> >::iterator p = osdmap.pg_temp->begin();
+ for (auto p = osdmap.pg_temp->begin();
p != osdmap.pg_temp->end();
++p) {
if (p->first.pool() == (uint64_t)pool) {
@@ -9402,7 +9626,7 @@ int OSDMonitor::_prepare_remove_pool(int64_t pool, ostream *ss)
pending_inc.new_pg_temp[p->first].clear();
}
}
- for (map<pg_t,int32_t>::iterator p = osdmap.primary_temp->begin();
+ for (auto p = osdmap.primary_temp->begin();
p != osdmap.primary_temp->end();
++p) {
if (p->first.pool() == (uint64_t)pool) {
@@ -9438,7 +9662,7 @@ bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
op->mark_osdmon_event(__func__);
MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
ostringstream ss;
- int ret = _prepare_remove_pool(m->pool, &ss);
+ int ret = _prepare_remove_pool(m->pool, &ss, false);
if (ret == -EAGAIN) {
wait_for_finished_proposal(op, new C_RetryMessage(this, op));
return true;
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index 4674241592f..44f013f27f7 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -299,7 +299,7 @@ private:
int64_t base_pool_id, const pg_pool_t *base_pool, const pg_pool_t *tier_pool,
int *err, ostream *ss) const;
- int _prepare_remove_pool(int64_t pool, ostream *ss);
+ int _prepare_remove_pool(int64_t pool, ostream *ss, bool no_fake);
int _prepare_rename_pool(int64_t pool, string newname);
bool preprocess_pool_op (MonOpRequestRef op);
@@ -331,9 +331,10 @@ private:
const string &ruleset_name,
int *crush_ruleset,
ostream *ss);
- bool erasure_code_profile_in_use(const map<int64_t, pg_pool_t> &pools,
- const string &profile,
- ostream *ss);
+ bool erasure_code_profile_in_use(
+ const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
+ const string &profile,
+ ostream *ss);
int parse_erasure_code_profile(const vector<string> &erasure_code_profile,
map<string,string> *erasure_code_profile_map,
ostream *ss);
@@ -452,10 +453,11 @@ private:
creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc);
void trim_creating_pgs(creating_pgs_t *creating_pgs, const PGMap& pgm);
- void scan_for_creating_pgs(const std::map<int64_t,pg_pool_t>& pools,
- const std::set<int64_t>& removed_pools,
- utime_t modified,
- creating_pgs_t* creating_pgs) const;
+ void scan_for_creating_pgs(
+ const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
+ const mempool::osdmap::set<int64_t>& removed_pools,
+ utime_t modified,
+ creating_pgs_t* creating_pgs) const;
pair<int32_t, pg_t> get_parent_pg(pg_t pgid) const;
void update_creating_pgs();
void check_pg_creates_subs();
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index 8b4a1010ab2..ac7ba802bdd 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -337,11 +337,13 @@ void PGMap::calc_stats()
{
num_pg_by_state.clear();
num_pg = 0;
+ num_pg_active = 0;
num_osd = 0;
pg_pool_sum.clear();
pg_sum = pool_stat_t();
osd_sum = osd_stat_t();
pg_by_osd.clear();
+ num_primary_pg_by_osd.clear();
for (ceph::unordered_map<pg_t,pg_stat_t>::iterator p = pg_stat.begin();
p != pg_stat.end();
@@ -457,6 +459,10 @@ void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
}
}
+ if (s.state & PG_STATE_ACTIVE) {
+ ++num_pg_active;
+ }
+
if (sameosds)
return;
@@ -470,6 +476,9 @@ void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
pg_by_osd[*p].insert(pgid);
for (vector<int>::const_iterator p = s.up.begin(); p != s.up.end(); ++p)
pg_by_osd[*p].insert(pgid);
+
+ if (s.up_primary >= 0)
+ num_primary_pg_by_osd[s.up_primary]++;
}
void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
@@ -500,6 +509,10 @@ void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
}
}
+ if (s.state & PG_STATE_ACTIVE) {
+ --num_pg_active;
+ }
+
if (sameosds)
return;
@@ -525,6 +538,12 @@ void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
if (oset.empty())
pg_by_osd.erase(*p);
}
+
+ if (s.up_primary >= 0) {
+ auto it = num_primary_pg_by_osd.find(s.up_primary);
+ if (it != num_primary_pg_by_osd.end() && it->second > 0)
+ it->second--;
+ }
}
void PGMap::stat_pg_update(const pg_t pgid, pg_stat_t& s,
@@ -539,6 +558,13 @@ void PGMap::stat_pg_update(const pg_t pgid, pg_stat_t& s,
s.blocked_by == n.blocked_by;
stat_pg_sub(pgid, s, sameosds);
+
+ // if acting_primary has shifted to a just-restored osd and the pg has yet
+ // to finish peering, many attributes in the incoming stats are still stale.
+ // most of them matter little, but a stale last_active would make the
+ // "pg stuck in" check unhappy.
+ if (!(n.state & (PG_STATE_ACTIVE | PG_STATE_PEERED)) &&
+ n.last_active < s.last_active)
+ n.last_active = s.last_active;
s = n;
stat_pg_add(pgid, n, sameosds);
}
@@ -692,7 +718,7 @@ void PGMap::dump_basic(Formatter *f) const
osd_sum.dump(f);
f->close_section();
- f->open_object_section("osd_epochs");
+ f->open_array_section("osd_epochs");
for (ceph::unordered_map<int32_t,epoch_t>::const_iterator p =
osd_epochs.begin(); p != osd_epochs.end(); ++p) {
f->open_object_section("osd");
@@ -956,6 +982,7 @@ void PGMap::dump_osd_stats(ostream& ss) const
tab.define_column("TOTAL", TextTable::LEFT, TextTable::RIGHT);
tab.define_column("HB_PEERS", TextTable::LEFT, TextTable::RIGHT);
tab.define_column("PG_SUM", TextTable::LEFT, TextTable::RIGHT);
+ tab.define_column("PRIMARY_PG_SUM", TextTable::LEFT, TextTable::RIGHT);
for (ceph::unordered_map<int32_t,osd_stat_t>::const_iterator p = osd_stat.begin();
p != osd_stat.end();
@@ -966,6 +993,7 @@ void PGMap::dump_osd_stats(ostream& ss) const
<< si_t(p->second.kb << 10)
<< p->second.hb_peers
<< get_num_pg_by_osd(p->first)
+ << get_num_primary_pg_by_osd(p->first)
<< TextTable::endrow;
}
@@ -1650,6 +1678,18 @@ void PGMap::print_summary(Formatter *f, ostream *out) const
<< kb_t(osd_sum.kb) << " avail\n";
}
+
+ if (num_pg_active < num_pg) {
+ float p = (float)num_pg_active / (float)num_pg;
+ if (f) {
+ f->dump_float("active_pgs_ratio", p);
+ } else {
+ char b[20];
+ snprintf(b, sizeof(b), "%.3lf", (1.0 - p) * 100.0);
+ *out << " " << b << "% pgs inactive\n";
+ }
+ }
+
list<string> sl;
overall_recovery_summary(f, &sl);
if (!f && !sl.empty()) {
@@ -1983,10 +2023,10 @@ void PGMap::dump_pool_stats(const OSDMap &osd_map, stringstream *ss,
break;
case pg_pool_t::TYPE_ERASURE:
{
- const map<string,string>& ecp =
+ auto& ecp =
osd_map.get_erasure_code_profile(pool->erasure_code_profile);
- map<string,string>::const_iterator pm = ecp.find("m");
- map<string,string>::const_iterator pk = ecp.find("k");
+ auto pm = ecp.find("m");
+ auto pk = ecp.find("k");
if (pm != ecp.end() && pk != ecp.end()) {
int k = atoi(pk->second.c_str());
int m = atoi(pm->second.c_str());
@@ -2860,7 +2900,7 @@ int reweight::by_utilization(
int max_osds,
bool by_pg, const set<int64_t> *pools,
bool no_increasing,
- map<int32_t, uint32_t>* new_weights,
+ mempool::osdmap::map<int32_t, uint32_t>* new_weights,
std::stringstream *ss,
std::string *out_str,
Formatter *f)
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h
index 05ccea3f03f..6231407b08e 100644
--- a/src/mon/PGMap.h
+++ b/src/mon/PGMap.h
@@ -117,13 +117,15 @@ public:
// aggregate stats (soft state), generated by calc_stats()
ceph::unordered_map<int,int> num_pg_by_state;
- int64_t num_pg, num_osd;
+ int64_t num_pg = 0, num_osd = 0;
+ int64_t num_pg_active = 0;
ceph::unordered_map<int,pool_stat_t> pg_pool_sum;
pool_stat_t pg_sum;
osd_stat_t osd_sum;
- mutable epoch_t min_last_epoch_clean;
+ mutable epoch_t min_last_epoch_clean = 0;
ceph::unordered_map<int,int> blocked_by_sum;
ceph::unordered_map<int,set<pg_t> > pg_by_osd;
+ ceph::unordered_map<int,int> num_primary_pg_by_osd;
utime_t stamp;
@@ -198,10 +200,7 @@ public:
PGMap()
: version(0),
last_osdmap_epoch(0), last_pg_scan(0),
- full_ratio(0), nearfull_ratio(0),
- num_pg(0),
- num_osd(0),
- min_last_epoch_clean(0)
+ full_ratio(0), nearfull_ratio(0)
{}
void set_full_ratios(float full, float nearfull) {
@@ -253,6 +252,15 @@ public:
return pool_stat_t();
}
+ int get_num_primary_pg_by_osd(int osd) const {
+ assert(osd >= 0);
+ int num = 0;
+ auto it = num_primary_pg_by_osd.find(osd);
+ if (it != num_primary_pg_by_osd.end())
+ num = it->second;
+ return num;
+ }
+
void update_pg(pg_t pgid, bufferlist& bl);
void remove_pg(pg_t pgid);
void update_osd(int osd, bufferlist& bl);
@@ -453,7 +461,7 @@ namespace reweight {
int max_osds,
bool by_pg, const set<int64_t> *pools,
bool no_increasing,
- map<int32_t, uint32_t>* new_weights,
+ mempool::osdmap::map<int32_t, uint32_t>* new_weights,
std::stringstream *ss,
std::string *out_str,
Formatter *f);
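
[editorial sketch] The PGMap hunks above increment num_primary_pg_by_osd in stat_pg_add and decrement it, floored at zero, in stat_pg_sub. The bookkeeping in isolation:

    #include <cassert>
    #include <unordered_map>
    static std::unordered_map<int, int> num_primary_pg_by_osd;
    void pg_add(int up_primary) {
      if (up_primary >= 0)
        num_primary_pg_by_osd[up_primary]++;
    }
    void pg_sub(int up_primary) {
      if (up_primary >= 0) {
        auto it = num_primary_pg_by_osd.find(up_primary);
        if (it != num_primary_pg_by_osd.end() && it->second > 0)
          it->second--;                 // never underflows past zero
      }
    }
    int main() {
      pg_add(3); pg_add(3); pg_sub(3);
      assert(num_primary_pg_by_osd[3] == 1);
      pg_sub(7);                        // unknown osd: no entry is created
      assert(num_primary_pg_by_osd.find(7) == num_primary_pg_by_osd.end());
    }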
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index f7464c3458e..151468a5897 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -1504,8 +1504,8 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
}
// near-target max pools
- const map<int64_t,pg_pool_t>& pools = mon->osdmon()->osdmap.get_pools();
- for (map<int64_t,pg_pool_t>::const_iterator p = pools.begin();
+ auto& pools = mon->osdmon()->osdmap.get_pools();
+ for (auto p = pools.begin();
p != pools.end(); ++p) {
if ((!p->second.target_max_objects && !p->second.target_max_bytes) ||
!pg_map.pg_pool_sum.count(p->first))
@@ -1588,7 +1588,9 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
if (!pi)
continue; // in case osdmap changes haven't propagated to PGMap yet
const string& name = mon->osdmon()->osdmap.get_pool_name(p->first);
- if (pi->get_pg_num() > pi->get_pgp_num()) {
+ if (pi->get_pg_num() > pi->get_pgp_num() &&
+ !(name.find(".DELETED") != string::npos &&
+ g_conf->mon_fake_pool_delete)) {
ostringstream ss;
ss << "pool " << name << " pg_num "
<< pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num();
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index 2e905ac8fb7..8ba73118e0f 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -37,61 +37,6 @@ static ostream& _prefix(std::ostream *_dout, Monitor *mon, const string& name,
<< ") ";
}
-class Paxos::C_CollectTimeout : public Context {
- Paxos *paxos;
-public:
- explicit C_CollectTimeout(Paxos *p) : paxos(p) {}
- void finish(int r) override {
- if (r == -ECANCELED)
- return;
- paxos->collect_timeout();
- }
-};
-
-class Paxos::C_AcceptTimeout : public Context {
- Paxos *paxos;
-public:
- explicit C_AcceptTimeout(Paxos *p) : paxos(p) {}
- void finish(int r) override {
- if (r == -ECANCELED)
- return;
- paxos->accept_timeout();
- }
-};
-
-class Paxos::C_LeaseAckTimeout : public Context {
- Paxos *paxos;
-public:
- explicit C_LeaseAckTimeout(Paxos *p) : paxos(p) {}
- void finish(int r) override {
- if (r == -ECANCELED)
- return;
- paxos->lease_ack_timeout();
- }
-};
-
-class Paxos::C_LeaseTimeout : public Context {
- Paxos *paxos;
-public:
- explicit C_LeaseTimeout(Paxos *p) : paxos(p) {}
- void finish(int r) override {
- if (r == -ECANCELED)
- return;
- paxos->lease_timeout();
- }
-};
-
-class Paxos::C_LeaseRenew : public Context {
- Paxos *paxos;
-public:
- explicit C_LeaseRenew(Paxos *p) : paxos(p) {}
- void finish(int r) override {
- if (r == -ECANCELED)
- return;
- paxos->lease_renew_timeout();
- }
-};
-
class Paxos::C_Trimmed : public Context {
Paxos *paxos;
public:
@@ -249,7 +194,11 @@ void Paxos::collect(version_t oldpn)
}
// set timeout event
- collect_timeout_event = new C_CollectTimeout(this);
+ collect_timeout_event = new C_MonContext(mon, [this](int r) {
+ if (r == -ECANCELED)
+ return;
+ collect_timeout();
+ });
mon->timer.add_event_after(g_conf->mon_accept_timeout_factor *
g_conf->mon_lease,
collect_timeout_event);
@@ -737,7 +686,11 @@ void Paxos::begin(bufferlist& v)
}
// set timeout event
- accept_timeout_event = new C_AcceptTimeout(this);
+ accept_timeout_event = new C_MonContext(mon, [this](int r) {
+ if (r == -ECANCELED)
+ return;
+ accept_timeout();
+ });
mon->timer.add_event_after(g_conf->mon_accept_timeout_factor *
g_conf->mon_lease,
accept_timeout_event);
@@ -1023,14 +976,22 @@ void Paxos::extend_lease()
// set timeout event.
// if old timeout is still in place, leave it.
if (!lease_ack_timeout_event) {
- lease_ack_timeout_event = new C_LeaseAckTimeout(this);
+ lease_ack_timeout_event = new C_MonContext(mon, [this](int r) {
+ if (r == -ECANCELED)
+ return;
+ lease_ack_timeout();
+ });
mon->timer.add_event_after(g_conf->mon_lease_ack_timeout_factor *
g_conf->mon_lease,
lease_ack_timeout_event);
}
// set renew event
- lease_renew_event = new C_LeaseRenew(this);
+ lease_renew_event = new C_MonContext(mon, [this](int r) {
+ if (r == -ECANCELED)
+ return;
+ lease_renew_timeout();
+ });
utime_t at = lease_expire;
at -= g_conf->mon_lease;
at += g_conf->mon_lease_renew_interval_factor * g_conf->mon_lease;
@@ -1213,7 +1174,11 @@ void Paxos::reset_lease_timeout()
dout(20) << "reset_lease_timeout - setting timeout event" << dendl;
if (lease_timeout_event)
mon->timer.cancel_event(lease_timeout_event);
- lease_timeout_event = new C_LeaseTimeout(this);
+ lease_timeout_event = new C_MonContext(mon, [this](int r) {
+ if (r == -ECANCELED)
+ return;
+ lease_timeout();
+ });
mon->timer.add_event_after(g_conf->mon_lease_ack_timeout_factor *
g_conf->mon_lease,
lease_timeout_event);
diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc
index 203be1c817c..a1c9df895a8 100644
--- a/src/mon/PaxosService.cc
+++ b/src/mon/PaxosService.cc
@@ -100,22 +100,15 @@ bool PaxosService::dispatch(MonOpRequestRef op)
* Callback class used to propose the pending value once the proposal_timer
* fires up.
*/
- class C_Propose : public Context {
- PaxosService *ps;
- public:
- explicit C_Propose(PaxosService *p) : ps(p) { }
- void finish(int r) override {
- ps->proposal_timer = 0;
+ proposal_timer = new C_MonContext(mon, [this](int r) {
+ proposal_timer = 0;
if (r >= 0)
- ps->propose_pending();
+ propose_pending();
else if (r == -ECANCELED || r == -EAGAIN)
return;
else
- assert(0 == "bad return value for C_Propose");
- }
- };
-
- proposal_timer = new C_Propose(this);
+ assert(0 == "bad return value for proposal_timer");
+ });
dout(10) << " setting proposal_timer " << proposal_timer << " with delay of " << delay << dendl;
mon->timer.add_event_after(delay, proposal_timer);
} else {
diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h
index ae2a3892e9c..5c6b2872b09 100644
--- a/src/mon/PaxosService.h
+++ b/src/mon/PaxosService.h
@@ -117,8 +117,6 @@ protected:
/**
* @}
*/
- friend class C_Propose;
-
public:
/**
diff --git a/src/mon/QuorumService.h b/src/mon/QuorumService.h
index f1100520a0d..b354c40a77f 100644
--- a/src/mon/QuorumService.h
+++ b/src/mon/QuorumService.h
@@ -25,19 +25,9 @@
class QuorumService
{
- Context *tick_event;
+ Context *tick_event = nullptr;
double tick_period;
- struct C_Tick : public Context {
- QuorumService *s;
- C_Tick(QuorumService *qs) : s(qs) { }
- void finish(int r) override {
- if (r < 0)
- return;
- s->tick();
- }
- };
-
public:
enum {
SERVICE_HEALTH = 0x01,
@@ -50,7 +40,6 @@ protected:
epoch_t epoch;
QuorumService(Monitor *m) :
- tick_event(NULL),
tick_period(g_conf->mon_tick_interval),
mon(m),
epoch(0)
@@ -70,7 +59,11 @@ protected:
if (tick_period <= 0)
return;
- tick_event = new C_Tick(this);
+ tick_event = new C_MonContext(mon, [this](int r) {
+ if (r < 0)
+ return;
+ tick();
+ });
mon->timer.add_event_after(tick_period, tick_event);
}
diff --git a/src/mount/mount.ceph.c b/src/mount/mount.ceph.c
index 5ead9313b7d..e8fdfcd5496 100644
--- a/src/mount/mount.ceph.c
+++ b/src/mount/mount.ceph.c
@@ -3,8 +3,6 @@
#include <stdlib.h>
#include <errno.h>
#include <sys/mount.h>
-#include <sys/types.h>
-#include <sys/wait.h>
#include "common/module.h"
#include "common/secret.h"
diff --git a/src/msg/Message.cc b/src/msg/Message.cc
index e19fd017b1c..5579e34825a 100644
--- a/src/msg/Message.cc
+++ b/src/msg/Message.cc
@@ -276,7 +276,7 @@ Message *decode_message(CephContext *cct, int crcflags,
ceph_msg_header& header,
ceph_msg_footer& footer,
bufferlist& front, bufferlist& middle,
- bufferlist& data)
+ bufferlist& data, Connection* conn)
{
// verify crc
if (crcflags & MSG_CRC_HEADER) {
@@ -812,6 +812,7 @@ Message *decode_message(CephContext *cct, int crcflags,
return 0;
}
+ m->set_connection(conn);
m->set_header(header);
m->set_footer(footer);
m->set_payload(front);
@@ -840,6 +841,42 @@ Message *decode_message(CephContext *cct, int crcflags,
return m;
}
+void Message::encode_trace(bufferlist &bl, uint64_t features) const
+{
+ auto p = trace.get_info();
+ static const blkin_trace_info empty = { 0, 0, 0 };
+ if (!p) {
+ p = &empty;
+ }
+ ::encode(*p, bl);
+}
+
+void Message::decode_trace(bufferlist::iterator &p, bool create)
+{
+ blkin_trace_info info = {};
+ ::decode(info, p);
+
+#ifdef WITH_BLKIN
+ if (!connection)
+ return;
+
+ const auto msgr = connection->get_messenger();
+ const auto endpoint = msgr->get_trace_endpoint();
+ if (info.trace_id) {
+ trace.init(get_type_name(), endpoint, &info, true);
+ trace.event("decoded trace");
+ } else if (create || (msgr->get_myname().is_osd() &&
+ msgr->cct->_conf->osd_blkin_trace_all)) {
+ // create a trace even if we didn't get one on the wire
+ trace.init(get_type_name(), endpoint);
+ trace.event("created trace");
+ }
+ trace.keyval("tid", get_tid());
+ trace.keyval("entity type", get_source().type_str());
+ trace.keyval("entity num", get_source().num());
+#endif
+}
+
// This routine is not used for ordinary messages, but only when encapsulating a message
// for forwarding and routing. It's also used in a backward compatibility test, which only
@@ -890,6 +927,6 @@ Message *decode_message(CephContext *cct, int crcflags, bufferlist::iterator& p)
::decode(fr, p);
::decode(mi, p);
::decode(da, p);
- return decode_message(cct, crcflags, h, f, fr, mi, da);
+ return decode_message(cct, crcflags, h, f, fr, mi, da, nullptr);
}
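
[editorial sketch] encode_trace above always writes a full blkin_trace_info — a zeroed one when no trace is attached — so the wire format stays fixed-size, and decode_trace only initializes a trace when the peer sent one or tracing is forced on. A toy byte-level illustration of that fallback (the real ::encode is Ceph's encoder, not this memcpy):

    #include <cassert>
    #include <cstdint>
    #include <vector>
    struct trace_info { uint64_t trace_id, span_id, parent_span_id; };
    void encode(const trace_info &t, std::vector<uint8_t> &bl) {
      const uint8_t *p = reinterpret_cast<const uint8_t *>(&t);
      bl.insert(bl.end(), p, p + sizeof(t));  // toy fixed-size encode
    }
    int main() {
      std::vector<uint8_t> bl;
      const trace_info *t = nullptr;          // no trace attached
      static const trace_info empty = {0, 0, 0};
      encode(t ? *t : empty, bl);             // always emit the full struct
      assert(bl.size() == sizeof(trace_info));
    }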
diff --git a/src/msg/Message.h b/src/msg/Message.h
index 67c1edce82d..b3a836c12d4 100644
--- a/src/msg/Message.h
+++ b/src/msg/Message.h
@@ -26,6 +26,7 @@
#include "include/types.h"
#include "include/buffer.h"
#include "common/Throttle.h"
+#include "common/zipkin_trace.h"
#include "msg_types.h"
#include "common/RefCountedObj.h"
@@ -240,6 +241,11 @@ protected:
bi::list_member_hook<> dispatch_q;
public:
+ // zipkin tracing
+ ZTracer::Trace trace;
+ void encode_trace(bufferlist &bl, uint64_t features) const;
+ void decode_trace(bufferlist::iterator &p, bool create = false);
+
class CompletionHook : public Context {
protected:
Message *m;
@@ -297,6 +303,7 @@ protected:
if (byte_throttler)
byte_throttler->put(payload.length() + middle.length() + data.length());
release_message_throttle();
+ trace.event("message destructed");
/* call completion hooks (if any) */
if (completion_hook)
completion_hook->complete(0);
@@ -471,8 +478,9 @@ typedef boost::intrusive_ptr<Message> MessageRef;
extern Message *decode_message(CephContext *cct, int crcflags,
ceph_msg_header &header,
ceph_msg_footer& footer, bufferlist& front,
- bufferlist& middle, bufferlist& data);
-inline ostream& operator<<(ostream &out, const Message &m) {
+ bufferlist& middle, bufferlist& data,
+ Connection* conn);
+inline ostream& operator<<(ostream& out, const Message& m) {
m.print(out);
if (m.get_header().version)
out << " v" << m.get_header().version;
diff --git a/src/msg/Messenger.cc b/src/msg/Messenger.cc
index ca00e0bb5ca..6ab2862dc41 100644
--- a/src/msg/Messenger.cc
+++ b/src/msg/Messenger.cc
@@ -2,7 +2,9 @@
// vim: ts=8 sw=2 smarttab
#include <random>
+#include <netdb.h>
#include "include/Spinlock.h"
+
#include "include/types.h"
#include "Messenger.h"
@@ -48,6 +50,27 @@ Messenger *Messenger::create(CephContext *cct, const string &type,
return nullptr;
}
+void Messenger::set_endpoint_addr(const entity_addr_t& a,
+ const entity_name_t &name)
+{
+ size_t hostlen;
+ if (a.get_family() == AF_INET)
+ hostlen = sizeof(struct sockaddr_in);
+ else if (a.get_family() == AF_INET6)
+ hostlen = sizeof(struct sockaddr_in6);
+ else
+ hostlen = 0;
+
+ if (hostlen) {
+ char buf[NI_MAXHOST] = { 0 };
+ getnameinfo(a.get_sockaddr(), hostlen, buf, sizeof(buf),
+ NULL, 0, NI_NUMERICHOST);
+
+ trace_endpoint.copy_ip(buf);
+ }
+ trace_endpoint.set_port(a.get_port());
+}
+
/*
* Pre-calculate desired software CRC settings. CRC computation may
* be disabled by default for some transports (e.g., those with strong
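
set_endpoint_addr boils down to one getnameinfo() call with NI_NUMERICHOST, which formats the sockaddr as a numeric string without ever touching DNS, so it cannot block. A self-contained sketch of that step (port 6789 and the loopback address are arbitrary example values):

#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <cstdio>

int main() {
  sockaddr_in sa{};
  sa.sin_family = AF_INET;
  sa.sin_port = htons(6789);
  sa.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

  char host[NI_MAXHOST] = {0};
  // NI_NUMERICHOST forbids reverse DNS lookups, so this never blocks.
  int r = getnameinfo(reinterpret_cast<sockaddr*>(&sa), sizeof(sa),
                      host, sizeof(host), nullptr, 0, NI_NUMERICHOST);
  if (r == 0)
    printf("trace endpoint ip: %s\n", host);   // prints 127.0.0.1
  return r;
}
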
diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h
index c4df379939a..a186ec3c875 100644
--- a/src/msg/Messenger.h
+++ b/src/msg/Messenger.h
@@ -41,6 +41,10 @@ class Messenger {
private:
list<Dispatcher*> dispatchers;
list <Dispatcher*> fast_dispatchers;
+ ZTracer::Endpoint trace_endpoint;
+
+ void set_endpoint_addr(const entity_addr_t& a,
+ const entity_name_t &name);
protected:
/// the "name" of the local daemon. eg client.99
@@ -136,7 +140,8 @@ public:
* or use the create() function.
*/
Messenger(CephContext *cct_, entity_name_t w)
- : my_inst(),
+ : trace_endpoint("0.0.0.0", 0, "Messenger"),
+ my_inst(),
default_send_priority(CEPH_MSG_PRIO_DEFAULT), started(false),
magic(0),
socket_priority(-1),
@@ -213,9 +218,19 @@ protected:
/**
* set messenger's address
*/
- virtual void set_myaddr(const entity_addr_t& a) { my_inst.addr = a; }
+ virtual void set_myaddr(const entity_addr_t& a) {
+ my_inst.addr = a;
+ set_endpoint_addr(a, my_inst.name);
+ }
public:
/**
+ * @return the zipkin trace endpoint
+ */
+ const ZTracer::Endpoint* get_trace_endpoint() const {
+ return &trace_endpoint;
+ }
+
+ /**
* Retrieve the Messenger's name.
*
* @return A const reference to the name this Messenger
diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc
index 4741294ace9..8eec5968458 100644
--- a/src/msg/async/AsyncConnection.cc
+++ b/src/msg/async/AsyncConnection.cc
@@ -417,7 +417,7 @@ void AsyncConnection::process()
case STATE_OPEN_TAG_ACK:
{
ceph_le64 *seq;
- r = read_until(sizeof(seq), state_buffer);
+ r = read_until(sizeof(*seq), state_buffer);
if (r < 0) {
ldout(async_msgr->cct, 1) << __func__ << " read ack seq failed" << dendl;
goto fail;
@@ -698,7 +698,8 @@ void AsyncConnection::process()
ldout(async_msgr->cct, 20) << __func__ << " got " << front.length() << " + " << middle.length()
<< " + " << data.length() << " byte message" << dendl;
- Message *message = decode_message(async_msgr->cct, async_msgr->crcflags, current_header, footer, front, middle, data);
+ Message *message = decode_message(async_msgr->cct, async_msgr->crcflags, current_header, footer,
+ front, middle, data, this);
if (!message) {
ldout(async_msgr->cct, 1) << __func__ << " decode message failed " << dendl;
goto fail;
@@ -1947,6 +1948,7 @@ int AsyncConnection::send_message(Message *m)
<< " Drop message " << m << dendl;
m->put();
} else {
+ m->trace.event("async enqueueing message");
out_q[m->get_priority()].emplace_back(std::move(bl), m);
ldout(async_msgr->cct, 15) << __func__ << " inline write is denied, reschedule m=" << m << dendl;
if (can_write != WriteStatus::REPLACING)
@@ -2265,6 +2267,7 @@ ssize_t AsyncConnection::write_message(Message *m, bufferlist& bl, bool more)
outcoming_bl.append((char*)&old_footer, sizeof(old_footer));
}
+ m->trace.event("async writing message");
logger->inc(l_msgr_send_bytes, outcoming_bl.length() - original_bl_len);
ldout(async_msgr->cct, 20) << __func__ << " sending " << m->get_seq()
<< " " << m << dendl;
diff --git a/src/msg/async/net_handler.cc b/src/msg/async/net_handler.cc
index 059f9220948..19adb2c83f2 100644
--- a/src/msg/async/net_handler.cc
+++ b/src/msg/async/net_handler.cc
@@ -33,22 +33,26 @@ namespace ceph{
int NetHandler::create_socket(int domain, bool reuse_addr)
{
- int s, on = 1;
+ int s;
+ int r = 0;
if ((s = ::socket(domain, SOCK_STREAM, 0)) == -1) {
- lderr(cct) << __func__ << " couldn't create socket " << cpp_strerror(errno) << dendl;
- return -errno;
+ r = errno;
+ lderr(cct) << __func__ << " couldn't create socket " << cpp_strerror(r) << dendl;
+ return -r;
}
#if !defined(__FreeBSD__)
/* Make sure connection-intensive things like the benchmark
* will be able to close/open sockets a zillion of times */
if (reuse_addr) {
+ int on = 1;
if (::setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) {
+ r = errno;
lderr(cct) << __func__ << " setsockopt SO_REUSEADDR failed: "
- << strerror(errno) << dendl;
+ << strerror(r) << dendl;
close(s);
- return -errno;
+ return -r;
}
}
#endif
@@ -59,17 +63,20 @@ int NetHandler::create_socket(int domain, bool reuse_addr)
int NetHandler::set_nonblock(int sd)
{
int flags;
+ int r = 0;
/* Set the socket nonblocking.
* Note that fcntl(2) for F_GETFL and F_SETFL can't be
* interrupted by a signal. */
if ((flags = fcntl(sd, F_GETFL)) < 0 ) {
- lderr(cct) << __func__ << " fcntl(F_GETFL) failed: " << strerror(errno) << dendl;
- return -errno;
+ r = errno;
+ lderr(cct) << __func__ << " fcntl(F_GETFL) failed: " << cpp_strerror(r) << dendl;
+ return -r;
}
if (fcntl(sd, F_SETFL, flags | O_NONBLOCK) < 0) {
- lderr(cct) << __func__ << " fcntl(F_SETFL,O_NONBLOCK): " << strerror(errno) << dendl;
- return -errno;
+ r = errno;
+ lderr(cct) << __func__ << " fcntl(F_SETFL,O_NONBLOCK): " << cpp_strerror(r) << dendl;
+ return -r;
}
return 0;
@@ -99,14 +106,14 @@ int NetHandler::set_socket_options(int sd, bool nodelay, int size)
int flag = 1;
r = ::setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(flag));
if (r < 0) {
- r = -errno;
+ r = errno;
ldout(cct, 0) << "couldn't set TCP_NODELAY: " << cpp_strerror(r) << dendl;
}
}
if (size) {
r = ::setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (void*)&size, sizeof(size));
if (r < 0) {
- r = -errno;
+ r = errno;
ldout(cct, 0) << "couldn't set SO_RCVBUF to " << size << ": " << cpp_strerror(r) << dendl;
}
}
@@ -116,53 +123,53 @@ int NetHandler::set_socket_options(int sd, bool nodelay, int size)
int val = 1;
r = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&val, sizeof(val));
if (r) {
- r = -errno;
+ r = errno;
ldout(cct,0) << "couldn't set SO_NOSIGPIPE: " << cpp_strerror(r) << dendl;
}
#endif
- return r;
+ return -r;
}
void NetHandler::set_priority(int sd, int prio, int domain)
{
- if (prio >= 0) {
- int r = -1;
+#ifdef SO_PRIORITY
+ if (prio < 0) {
+ return;
+ }
#ifdef IPTOS_CLASS_CS6
- int iptos = IPTOS_CLASS_CS6;
+ int iptos = IPTOS_CLASS_CS6;
+ int r = -1;
+ switch (domain) {
+ case AF_INET:
r = ::setsockopt(sd, IPPROTO_IP, IP_TOS, &iptos, sizeof(iptos));
- if (domain == AF_INET) {
- r = ::setsockopt(sd, IPPROTO_IP, IP_TOS, &iptos, sizeof(iptos));
- r = -errno;
- if (r < 0) {
- ldout(cct,0) << "couldn't set IP_TOS to " << iptos
- << ": " << cpp_strerror(r) << dendl;
- }
- } else if (domain == AF_INET6) {
- r = ::setsockopt(sd, IPPROTO_IPV6, IPV6_TCLASS, &iptos, sizeof(iptos));
- if (r)
- r = -errno;
- if (r < 0) {
- ldout(cct,0) << "couldn't set IPV6_TCLASS to " << iptos
- << ": " << cpp_strerror(r) << dendl;
- }
- } else {
- ldout(cct,0) << "couldn't set ToS of unknown family to " << iptos
- << dendl;
- }
-#endif
-#if defined(SO_PRIORITY)
- // setsockopt(IPTOS_CLASS_CS6) sets the priority of the socket as 0.
- // See http://goo.gl/QWhvsD and http://goo.gl/laTbjT
- // We need to call setsockopt(SO_PRIORITY) after it.
-#if defined(__linux__)
- r = ::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio));
-#endif
- if (r < 0) {
- ldout(cct, 0) << __func__ << " couldn't set SO_PRIORITY to " << prio
- << ": " << cpp_strerror(errno) << dendl;
- }
-#endif
+ break;
+ case AF_INET6:
+ r = ::setsockopt(sd, IPPROTO_IPV6, IPV6_TCLASS, &iptos, sizeof(iptos));
+ break;
+ default:
+ lderr(cct) << "couldn't set ToS of unknown family (" << domain << ")"
+ << " to " << iptos << dendl;
+ return;
}
+ if (r < 0) {
+ r = errno;
+ ldout(cct,0) << "couldn't set TOS to " << iptos
+ << ": " << cpp_strerror(r) << dendl;
+ }
+
+#endif // IPTOS_CLASS_CS6
+ // setsockopt(IPTOS_CLASS_CS6) sets the priority of the socket as 0.
+ // See http://goo.gl/QWhvsD and http://goo.gl/laTbjT
+ // We need to call setsockopt(SO_PRIORITY) after it.
+ r = ::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio));
+ if (r < 0) {
+ r = errno;
+ ldout(cct, 0) << __func__ << " couldn't set SO_PRIORITY to " << prio
+ << ": " << cpp_strerror(r) << dendl;
+ }
+#else
+ return;
+#endif // SO_PRIORITY
}
int NetHandler::generic_connect(const entity_addr_t& addr, const entity_addr_t &bind_addr, bool nonblock)
@@ -188,22 +195,23 @@ int NetHandler::generic_connect(const entity_addr_t& addr, const entity_addr_t &
addr.set_port(0);
ret = ::bind(s, addr.get_sockaddr(), addr.get_sockaddr_len());
if (ret < 0) {
- ret = -errno;
+ ret = errno;
ldout(cct, 2) << __func__ << " client bind error " << ", " << cpp_strerror(ret) << dendl;
close(s);
- return ret;
+ return -ret;
}
}
}
ret = ::connect(s, addr.get_sockaddr(), addr.get_sockaddr_len());
if (ret < 0) {
+ ret = errno;
if (errno == EINPROGRESS && nonblock)
return s;
- ldout(cct, 10) << __func__ << " connect: " << strerror(errno) << dendl;
+ ldout(cct, 10) << __func__ << " connect: " << cpp_strerror(ret) << dendl;
close(s);
- return -errno;
+ return -ret;
}
return s;
@@ -211,13 +219,15 @@ int NetHandler::generic_connect(const entity_addr_t& addr, const entity_addr_t &
int NetHandler::reconnect(const entity_addr_t &addr, int sd)
{
+ int r = 0;
int ret = ::connect(sd, addr.get_sockaddr(), addr.get_sockaddr_len());
if (ret < 0 && errno != EISCONN) {
- ldout(cct, 10) << __func__ << " reconnect: " << strerror(errno) << dendl;
- if (errno == EINPROGRESS || errno == EALREADY)
+ r = errno;
+ ldout(cct, 10) << __func__ << " reconnect: " << strerror(r) << dendl;
+ if (r == EINPROGRESS || r == EALREADY)
return 1;
- return -errno;
+ return -r;
}
return 0;
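
All of these net_handler.cc hunks apply the same idiom: snapshot errno into a local immediately after the failing call, because the stream-logging machinery that runs before the return can itself make system calls and clobber errno. The set_socket_options hunks also stop negating errno mid-function, keeping r positive until the final "return -r". A distilled version of the pattern, with fprintf standing in for lderr:

#include <cerrno>
#include <cstdio>
#include <fcntl.h>

int set_nonblock(int fd) {
  int flags = fcntl(fd, F_GETFL);
  if (flags < 0) {
    int r = errno;                      // capture before logging
    fprintf(stderr, "F_GETFL failed: %d\n", r);
    return -r;                          // negative-errno convention
  }
  if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
    int r = errno;
    fprintf(stderr, "F_SETFL failed: %d\n", r);
    return -r;
  }
  return 0;
}
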
diff --git a/src/msg/async/rdma/RDMAConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
index 34ea63279ee..3014bbbd4be 100644
--- a/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
+++ b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
@@ -111,10 +111,12 @@ RDMAConnectedSocketImpl::~RDMAConnectedSocketImpl()
for (unsigned i=0; i < wc.size(); ++i) {
ret = ibdev->post_chunk(reinterpret_cast<Chunk*>(wc[i].wr_id));
assert(ret == 0);
+ dispatcher->perf_logger->dec(l_msgr_rdma_inqueue_rx_chunks);
}
for (unsigned i=0; i < buffers.size(); ++i) {
ret = ibdev->post_chunk(buffers[i]);
assert(ret == 0);
+ dispatcher->perf_logger->dec(l_msgr_rdma_inqueue_rx_chunks);
}
delete cmgr;
@@ -355,6 +357,7 @@ ssize_t RDMAConnectedSocketImpl::read(char* buf, size_t len)
ldout(cct, 20) << __func__ << " got remote close msg..." << dendl;
}
assert(ibdev->post_chunk(chunk) == 0);
+ dispatcher->perf_logger->dec(l_msgr_rdma_inqueue_rx_chunks);
} else {
if (read == (ssize_t)len) {
buffers.push_back(chunk);
@@ -366,6 +369,7 @@ ssize_t RDMAConnectedSocketImpl::read(char* buf, size_t len)
} else {
read += chunk->read(buf+read, response->byte_len);
assert(ibdev->post_chunk(chunk) == 0);
+ dispatcher->perf_logger->dec(l_msgr_rdma_inqueue_rx_chunks);
}
}
}
@@ -388,6 +392,7 @@ ssize_t RDMAConnectedSocketImpl::read_buffers(char* buf, size_t len)
ldout(cct, 25) << __func__ << " this iter read: " << tmp << " bytes." << " offset: " << (*c)->get_offset() << " ,bound: " << (*c)->get_bound() << ". Chunk:" << *c << dendl;
if ((*c)->over()) {
assert(ibdev->post_chunk(*c) == 0);
+ dispatcher->perf_logger->dec(l_msgr_rdma_inqueue_rx_chunks);
ldout(cct, 25) << __func__ << " one chunk over." << dendl;
}
if (read == len) {
diff --git a/src/msg/async/rdma/RDMAStack.cc b/src/msg/async/rdma/RDMAStack.cc
index 88ce6bf168e..b1dae3bcf5c 100644
--- a/src/msg/async/rdma/RDMAStack.cc
+++ b/src/msg/async/rdma/RDMAStack.cc
@@ -52,6 +52,7 @@ RDMADispatcher::RDMADispatcher(CephContext* c, RDMAStack* s)
plb.add_u64_counter(l_msgr_rdma_polling, "polling", "Whether dispatcher thread is polling");
plb.add_u64_counter(l_msgr_rdma_inflight_tx_chunks, "inflight_tx_chunks", "The number of inflight tx chunks");
+ plb.add_u64_counter(l_msgr_rdma_inqueue_rx_chunks, "inqueue_rx_chunks", "The number of inqueue rx chunks");
plb.add_u64_counter(l_msgr_rdma_tx_total_wc, "tx_total_wc", "The number of tx work completions");
plb.add_u64_counter(l_msgr_rdma_tx_total_wc_errors, "tx_total_wc_errors", "The number of tx errors");
@@ -179,8 +180,10 @@ void RDMADispatcher::polling()
}
}
- for (auto &&i : polled)
+ for (auto &&i : polled) {
+ perf_logger->inc(l_msgr_rdma_inqueue_rx_chunks, i.second.size());
i.first->pass_wc(std::move(i.second));
+ }
polled.clear();
}
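
The new inqueue_rx_chunks counter is a gauge rather than a monotonic counter: the polling thread bumps it by the size of each completion batch it hands to a socket, and every post_chunk() that returns a chunk to the device decrements it, so a persistently high value means receive chunks are sitting unconsumed in socket queues. A toy stand-in for the pairing, with std::atomic replacing the PerfCounters plumbing:

#include <atomic>
#include <vector>

std::atomic<long> inqueue_rx_chunks{0};

// dispatcher side: a batch of work completions queued toward a socket
void pass_wc(const std::vector<int>& wcs) {
  inqueue_rx_chunks += static_cast<long>(wcs.size());
}

// socket side: one chunk posted back to the device
void post_chunk_consumed() {
  --inqueue_rx_chunks;
}
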
diff --git a/src/msg/async/rdma/RDMAStack.h b/src/msg/async/rdma/RDMAStack.h
index 8e91d42f533..ef1188b8023 100644
--- a/src/msg/async/rdma/RDMAStack.h
+++ b/src/msg/async/rdma/RDMAStack.h
@@ -40,6 +40,7 @@ enum {
l_msgr_rdma_polling,
l_msgr_rdma_inflight_tx_chunks,
+ l_msgr_rdma_inqueue_rx_chunks,
l_msgr_rdma_tx_total_wc,
l_msgr_rdma_tx_total_wc_errors,
diff --git a/src/msg/simple/Pipe.cc b/src/msg/simple/Pipe.cc
index fa5df0a184f..cece32f0baa 100644
--- a/src/msg/simple/Pipe.cc
+++ b/src/msg/simple/Pipe.cc
@@ -917,45 +917,48 @@ void Pipe::set_socket_options()
}
#endif
+#ifdef SO_PRIORITY
int prio = msgr->get_socket_priority();
if (prio >= 0) {
int r = -1;
#ifdef IPTOS_CLASS_CS6
int iptos = IPTOS_CLASS_CS6;
-
- if (peer_addr.get_family() == AF_INET) {
+ int addr_family = 0;
+ if (!peer_addr.is_blank_ip()) {
+ addr_family = peer_addr.get_family();
+ } else {
+ addr_family = msgr->get_myaddr().get_family();
+ }
+ switch (addr_family) {
+ case AF_INET:
r = ::setsockopt(sd, IPPROTO_IP, IP_TOS, &iptos, sizeof(iptos));
- if (r < 0) {
- r = -errno;
- ldout(msgr->cct,0) << "couldn't set IP_TOS to " << iptos
- << ": " << cpp_strerror(r) << dendl;
- }
- } else if (peer_addr.get_family() == AF_INET6) {
+ break;
+ case AF_INET6:
r = ::setsockopt(sd, IPPROTO_IPV6, IPV6_TCLASS, &iptos, sizeof(iptos));
- if (r < 0) {
- r = -errno;
- ldout(msgr->cct,0) << "couldn't set IPV6_TCLASS to " << iptos
- << ": " << cpp_strerror(r) << dendl;
- }
- } else {
- ldout(msgr->cct,0) << "couldn't set ToS of unknown family to " << iptos
- << dendl;
+ break;
+ default:
+ lderr(msgr->cct) << "couldn't set ToS of unknown family ("
+ << addr_family << ")"
+ << " to " << iptos << dendl;
+ return;
+ }
+ if (r < 0) {
+ r = -errno;
+ ldout(msgr->cct,0) << "couldn't set TOS to " << iptos
+ << ": " << cpp_strerror(r) << dendl;
}
#endif
-#if defined(SO_PRIORITY)
// setsockopt(IPTOS_CLASS_CS6) sets the priority of the socket as 0.
// See http://goo.gl/QWhvsD and http://goo.gl/laTbjT
// We need to call setsockopt(SO_PRIORITY) after it.
-#if defined(__linux__)
r = ::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio));
-#endif
if (r < 0) {
r = -errno;
ldout(msgr->cct,0) << "couldn't set SO_PRIORITY to " << prio
<< ": " << cpp_strerror(r) << dendl;
}
-#endif
}
+#endif
}
int Pipe::connect()
@@ -1715,6 +1718,8 @@ void Pipe::reader()
continue;
}
+ m->trace.event("pipe read message");
+
if (state == STATE_CLOSED ||
state == STATE_CONNECTING) {
in_q->dispatch_throttle_release(m->get_dispatch_throttle_size());
@@ -1949,6 +1954,8 @@ void Pipe::writer()
pipe_lock.Unlock();
+ m->trace.event("pipe writing message");
+
ldout(msgr->cct,20) << "writer sending " << m->get_seq() << " " << m << dendl;
int rc = write_message(header, footer, blist);
@@ -2193,7 +2200,8 @@ int Pipe::read_message(Message **pm, AuthSessionHandler* auth_handler)
ldout(msgr->cct,20) << "reader got " << front.length() << " + " << middle.length() << " + " << data.length()
<< " byte message" << dendl;
- message = decode_message(msgr->cct, msgr->crcflags, header, footer, front, middle, data);
+ message = decode_message(msgr->cct, msgr->crcflags, header, footer,
+ front, middle, data, connection_state.get());
if (!message) {
ret = -EINVAL;
goto out_dethrottle;
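
The Pipe.cc version mirrors the NetHandler rewrite: guard the whole block on SO_PRIORITY, choose the address family from the peer address (falling back to the local address when the peer is blank), set the DSCP class first, and only then set SO_PRIORITY, because setting the TOS class zeroes the socket priority. Condensed into one hypothetical helper (error handling elided; the real code logs each setsockopt failure):

#include <netinet/in.h>
#include <netinet/ip.h>
#include <sys/socket.h>

void set_dscp_and_priority(int sd, int prio, int family) {
#ifdef SO_PRIORITY
  if (prio < 0)
    return;
#ifdef IPTOS_CLASS_CS6
  int iptos = IPTOS_CLASS_CS6;
  if (family == AF_INET)
    ::setsockopt(sd, IPPROTO_IP, IP_TOS, &iptos, sizeof(iptos));
  else if (family == AF_INET6)
    ::setsockopt(sd, IPPROTO_IPV6, IPV6_TCLASS, &iptos, sizeof(iptos));
#endif
  // Setting the TOS class resets the socket priority to 0, so
  // SO_PRIORITY must be applied afterwards.
  ::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio));
#endif
}
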
diff --git a/src/msg/simple/SimpleMessenger.cc b/src/msg/simple/SimpleMessenger.cc
index 600f26c3f1c..efd8ede137d 100644
--- a/src/msg/simple/SimpleMessenger.cc
+++ b/src/msg/simple/SimpleMessenger.cc
@@ -443,6 +443,7 @@ void SimpleMessenger::submit_message(Message *m, PipeConnection *con,
const entity_addr_t& dest_addr, int dest_type,
bool already_locked)
{
+ m->trace.event("simple submitting message");
if (cct->_conf->ms_dump_on_send) {
m->encode(-1, true);
ldout(cct, 0) << "submit_message " << *m << "\n";
diff --git a/src/msg/xio/XioConnection.cc b/src/msg/xio/XioConnection.cc
index 98eebcc8224..107a489a44e 100644
--- a/src/msg/xio/XioConnection.cc
+++ b/src/msg/xio/XioConnection.cc
@@ -435,9 +435,8 @@ int XioConnection::handle_data_msg(struct xio_session *session,
/* update connection timestamp */
recv.set(tmsg->timestamp);
- Message *m =
- decode_message(msgr->cct, msgr->crcflags, header, footer, payload, middle,
- data);
+ Message *m = decode_message(msgr->cct, msgr->crcflags, header, footer,
+ payload, middle, data, this);
if (m) {
/* completion */
diff --git a/src/msg/xio/XioMessenger.cc b/src/msg/xio/XioMessenger.cc
index 248cf937285..a63f5ffb3d6 100644
--- a/src/msg/xio/XioMessenger.cc
+++ b/src/msg/xio/XioMessenger.cc
@@ -933,7 +933,10 @@ assert(req->out.pdata_iov.nents || !nbuffers);
}
tail->next = NULL;
}
- xcon->portal->enqueue(xcon, xmsg);
+ xmsg->trace = m->trace;
+ m->trace.event("xio portal enqueue for send");
+ m->trace.keyval("xio message segments", xmsg->hdr.msg_cnt);
+ xcon->portal->enqueue_for_send(xcon, xmsg);
return code;
} /* send_message(Message *, Connection *) */
diff --git a/src/msg/xio/XioMsg.cc b/src/msg/xio/XioMsg.cc
index cf7712d3950..8c2d3d8ec06 100644
--- a/src/msg/xio/XioMsg.cc
+++ b/src/msg/xio/XioMsg.cc
@@ -27,6 +27,7 @@ int XioDispatchHook::release_msgs()
/* queue for release */
xcmp = static_cast<XioCompletion *>(rsp_pool.alloc(sizeof(XioCompletion)));
new (xcmp) XioCompletion(xcon, this);
+ xcmp->trace = m->trace;
/* merge with portal traffic */
xcon->portal->enqueue(xcon, xcmp);
diff --git a/src/msg/xio/XioPortal.h b/src/msg/xio/XioPortal.h
index bbe31ff2b7b..b3f21010095 100644
--- a/src/msg/xio/XioPortal.h
+++ b/src/msg/xio/XioPortal.h
@@ -170,6 +170,7 @@ public:
xcmp->xcon->msg_release_fail(msg, code);
msg = next_msg;
}
+ xcmp->trace.event("xio_release_msg");
xcmp->finalize(); /* unconditional finalize */
}
@@ -273,6 +274,7 @@ public:
goto restart;
}
+ xs->trace.event("xio_send_msg");
msg = xsend->get_xio_msg();
code = xio_send_msg(xcon->conn, msg);
/* header trace moved here to capture xio serial# */
diff --git a/src/msg/xio/XioSubmit.h b/src/msg/xio/XioSubmit.h
index dcb6eb14e00..9840ad4a449 100644
--- a/src/msg/xio/XioSubmit.h
+++ b/src/msg/xio/XioSubmit.h
@@ -40,6 +40,7 @@ public:
enum submit_type type;
bi::list_member_hook<> submit_list;
XioConnection *xcon;
+ ZTracer::Trace trace;
XioSubmit(enum submit_type _type, XioConnection *_xcon) :
type(_type), xcon(_xcon)
diff --git a/src/objclass/objclass.h b/src/objclass/objclass.h
index a2418798f32..99be19beb34 100644
--- a/src/objclass/objclass.h
+++ b/src/objclass/objclass.h
@@ -10,6 +10,7 @@
#include "msg/msg_types.h"
#include "common/hobject.h"
#include "common/ceph_time.h"
+#include "include/rados/objclass.h"
struct obj_list_watch_response_t;
@@ -22,40 +23,9 @@ struct obj_list_watch_response_t;
extern "C" {
#endif
-#ifndef BUILDING_FOR_EMBEDDED
-#define CLS_VER(maj,min) \
-int __cls_ver__## maj ## _ ##min = 0; \
-int __cls_ver_maj = maj; \
-int __cls_ver_min = min;
-
-#define CLS_NAME(name) \
-int __cls_name__## name = 0; \
-const char *__cls_name = #name;
-#define CLS_INIT(name) \
-void CEPH_CLS_API __cls_init()
-#else
-#define CLS_VER(maj,min)
-#define CLS_NAME(name)
-#define CLS_INIT(name) \
-void CEPH_CLS_API name##_cls_init()
-#endif
-
-#define CLS_METHOD_RD 0x1 /// method executes read operations
-#define CLS_METHOD_WR 0x2 /// method executes write operations
#define CLS_METHOD_PUBLIC 0x4 /// unused
-#define CLS_METHOD_PROMOTE 0x8 /// method cannot be proxied to base tier
-
-#define CLS_LOG(level, fmt, ...) \
- cls_log(level, "<cls> %s:%d: " fmt, __FILE__, __LINE__, ##__VA_ARGS__)
-#define CLS_ERR(fmt, ...) CLS_LOG(0, fmt, ##__VA_ARGS__)
-
-void __cls_init();
-
-typedef void *cls_handle_t;
-typedef void *cls_method_handle_t;
typedef void *cls_filter_handle_t;
-typedef void *cls_method_context_t;
typedef int (*cls_method_call_t)(cls_method_context_t ctx,
char *indata, int datalen,
char **outdata, int *outdatalen);
@@ -65,8 +35,6 @@ typedef struct {
} cls_deps_t;
/* class utils */
-extern int cls_log(int level, const char *format, ...)
- __attribute__((__format__(printf, 2, 3)));
extern void *cls_alloc(size_t size);
extern void cls_free(void *p);
@@ -85,7 +53,6 @@ extern int cls_get_request_origin(cls_method_context_t hctx,
entity_inst_t *origin);
/* class registration api */
-extern int cls_register(const char *name, cls_handle_t *handle);
extern int cls_unregister(cls_handle_t);
extern int cls_register_method(cls_handle_t hclass, const char *method, int flags,
@@ -113,9 +80,6 @@ extern void class_fini(void);
#ifdef __cplusplus
}
-typedef int (*cls_method_cxx_call_t)(cls_method_context_t ctx,
- class buffer::list *inbl, class buffer::list *outbl);
-
class PGLSFilter {
CephContext* cct;
protected:
@@ -150,30 +114,18 @@ public:
typedef PGLSFilter* (*cls_cxx_filter_factory_t)();
-extern int cls_register_cxx_method(cls_handle_t hclass, const char *method, int flags,
- cls_method_cxx_call_t class_call, cls_method_handle_t *handle);
-
extern int cls_register_cxx_filter(cls_handle_t hclass,
const std::string &filter_name,
cls_cxx_filter_factory_t fn,
cls_filter_handle_t *handle=NULL);
-extern int cls_cxx_create(cls_method_context_t hctx, bool exclusive);
-extern int cls_cxx_remove(cls_method_context_t hctx);
-extern int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime);
extern int cls_cxx_stat2(cls_method_context_t hctx, uint64_t *size, ceph::real_time *mtime);
-extern int cls_cxx_read(cls_method_context_t hctx, int ofs, int len, bufferlist *bl);
extern int cls_cxx_read2(cls_method_context_t hctx, int ofs, int len,
bufferlist *bl, uint32_t op_flags);
-extern int cls_cxx_write(cls_method_context_t hctx, int ofs, int len, bufferlist *bl);
extern int cls_cxx_write2(cls_method_context_t hctx, int ofs, int len,
bufferlist *bl, uint32_t op_flags);
extern int cls_cxx_write_full(cls_method_context_t hctx, bufferlist *bl);
-extern int cls_cxx_getxattr(cls_method_context_t hctx, const char *name,
- bufferlist *outbl);
extern int cls_cxx_getxattrs(cls_method_context_t hctx, map<string, bufferlist> *attrset);
-extern int cls_cxx_setxattr(cls_method_context_t hctx, const char *name,
- bufferlist *inbl);
extern int cls_cxx_replace(cls_method_context_t hctx, int ofs, int len, bufferlist *bl);
extern int cls_cxx_snap_revert(cls_method_context_t hctx, snapid_t snapid);
extern int cls_cxx_map_clear(cls_method_context_t hctx);
@@ -189,10 +141,6 @@ extern int cls_cxx_map_get_vals(cls_method_context_t hctx,
uint64_t max_to_get,
std::map<string, bufferlist> *vals);
extern int cls_cxx_map_read_header(cls_method_context_t hctx, bufferlist *outbl);
-extern int cls_cxx_map_get_val(cls_method_context_t hctx,
- const string &key, bufferlist *outbl);
-extern int cls_cxx_map_set_val(cls_method_context_t hctx,
- const string &key, bufferlist *inbl);
extern int cls_cxx_map_set_vals(cls_method_context_t hctx,
const std::map<string, bufferlist> *map);
extern int cls_cxx_map_write_header(cls_method_context_t hctx, bufferlist *inbl);
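
Everything removed above (CLS_VER, CLS_NAME, CLS_INIT, cls_register, cls_register_cxx_method, and the basic cls_cxx_* I/O calls) now lives in the public SDK header include/rados/objclass.h. A minimal object class against those declarations might look like the sketch below; the "hello"/"say" names are hypothetical, but the macros and registration calls are exactly the ones this patch relocates:

#include "include/rados/objclass.h"

CLS_VER(1, 0)
CLS_NAME(hello)

cls_handle_t h_class;
cls_method_handle_t h_say;

static int say(cls_method_context_t hctx, bufferlist* in, bufferlist* out)
{
  out->append("hello");   // reply payload returned to the client
  return 0;
}

CLS_INIT(hello)
{
  cls_register("hello", &h_class);
  cls_register_cxx_method(h_class, "say",
                          CLS_METHOD_RD | CLS_METHOD_WR,
                          say, &h_say);
}
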
diff --git a/src/os/CMakeLists.txt b/src/os/CMakeLists.txt
index 0fe5a8048d2..b24686d8117 100644
--- a/src/os/CMakeLists.txt
+++ b/src/os/CMakeLists.txt
@@ -24,6 +24,7 @@ set(libos_srcs
kstore/KStore.cc
kstore/kstore_types.cc
fs/FS.cc
+ fs/aio.cc
${libos_xfs_srcs})
if(HAVE_LIBAIO)
@@ -82,7 +83,7 @@ endif()
target_link_libraries(os kv)
add_dependencies(os compressor_plugins)
-add_dependencies(os ceph_crypto_isal)
+add_dependencies(os crypto_plugins)
if(HAVE_LIBAIO)
diff --git a/src/os/bluestore/BlockDevice.cc b/src/os/bluestore/BlockDevice.cc
index f0d6b4d7384..cdc13ff3637 100644
--- a/src/os/bluestore/BlockDevice.cc
+++ b/src/os/bluestore/BlockDevice.cc
@@ -34,14 +34,12 @@ void IOContext::aio_wait()
{
std::unique_lock<std::mutex> l(lock);
// see _aio_thread for waker logic
- ++num_waiting;
- while (num_running.load() > 0 || num_reading.load() > 0) {
+ while (num_running.load() > 0) {
dout(10) << __func__ << " " << this
- << " waiting for " << num_running.load() << " aios and/or "
- << num_reading.load() << " readers to complete" << dendl;
+ << " waiting for " << num_running.load() << " aios to complete"
+ << dendl;
cond.wait(l);
}
- --num_waiting;
dout(20) << __func__ << " " << this << " done" << dendl;
}
diff --git a/src/os/bluestore/BlockDevice.h b/src/os/bluestore/BlockDevice.h
index 613a73afa45..4448b2ef6de 100644
--- a/src/os/bluestore/BlockDevice.h
+++ b/src/os/bluestore/BlockDevice.h
@@ -20,9 +20,10 @@
#include <atomic>
#include <condition_variable>
#include <mutex>
+#include <list>
#include "acconfig.h"
-#include "os/fs/FS.h"
+#include "os/fs/aio.h"
#define SPDK_PREFIX "spdk:"
@@ -38,12 +39,10 @@ struct IOContext {
std::mutex lock;
std::condition_variable cond;
- list<FS::aio_t> pending_aios; ///< not yet submitted
- list<FS::aio_t> running_aios; ///< submitting or submitted
+ std::list<aio_t> pending_aios; ///< not yet submitted
+ std::list<aio_t> running_aios; ///< submitting or submitted
std::atomic_int num_pending = {0};
std::atomic_int num_running = {0};
- std::atomic_int num_reading = {0};
- std::atomic_int num_waiting = {0};
explicit IOContext(CephContext* cct, void *p)
: cct(cct), priv(p)
@@ -60,10 +59,10 @@ struct IOContext {
void aio_wait();
void aio_wake() {
- if (num_waiting.load()) {
- std::lock_guard<std::mutex> l(lock);
- cond.notify_all();
- }
+ std::lock_guard<std::mutex> l(lock);
+ cond.notify_all();
+ --num_running;
+ assert(num_running == 0);
}
};
@@ -73,7 +72,7 @@ public:
CephContext* cct;
private:
std::mutex ioc_reap_lock;
- vector<IOContext*> ioc_reap_queue;
+ std::vector<IOContext*> ioc_reap_queue;
std::atomic_int ioc_reap_count = {0};
protected:
@@ -85,7 +84,7 @@ public:
typedef void (*aio_callback_t)(void *handle, void *aio);
static BlockDevice *create(
- CephContext* cct, const string& path, aio_callback_t cb, void *cbpriv);
+ CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv);
virtual bool supported_bdev_label() { return true; }
virtual bool is_rotational() { return rotational; }
@@ -94,7 +93,7 @@ public:
virtual uint64_t get_size() const = 0;
virtual uint64_t get_block_size() const = 0;
- virtual int collect_metadata(string prefix, map<string,string> *pm) const = 0;
+ virtual int collect_metadata(std::string prefix, std::map<std::string,std::string> *pm) const = 0;
virtual int read(
uint64_t off,
@@ -107,6 +106,10 @@ public:
uint64_t len,
char *buf,
bool buffered) = 0;
+ virtual int write(
+ uint64_t off,
+ bufferlist& bl,
+ bool buffered) = 0;
virtual int aio_read(
uint64_t off,
@@ -125,7 +128,7 @@ public:
// for managing buffered readers/writers
virtual int invalidate_cache(uint64_t off, uint64_t len) = 0;
- virtual int open(const string& path) = 0;
+ virtual int open(const std::string& path) = 0;
virtual void close() = 0;
};
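
With num_reading and num_waiting gone, IOContext is left with a single num_running count: waiters block until it reaches zero, and the aio completion path decrements and signals under the lock. (The patch notifies before decrementing, which is equivalent here since both happen inside the same critical section, and then asserts it performed the final decrement.) A compilable stand-in for the shape:

#include <atomic>
#include <condition_variable>
#include <mutex>

struct io_context {
  std::mutex lock;
  std::condition_variable cond;
  std::atomic_int num_running{0};

  void aio_wait() {
    std::unique_lock<std::mutex> l(lock);
    // predicate is re-checked under the lock on every wakeup
    cond.wait(l, [this] { return num_running.load() == 0; });
  }
  void aio_wake() {
    std::lock_guard<std::mutex> l(lock);
    --num_running;
    cond.notify_all();
  }
};
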
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 4cc34f7f036..db1dad55451 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -60,31 +60,42 @@ void BlueFS::_init_logger()
b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes",
"Bytes reclaimed by BlueStore");
b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
- "Total bytes (main db device)");
+ "Total bytes (main db device)",
+ "b", PerfCountersBuilder::PRIO_USEFUL);
b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
- "Used bytes (main db device)");
+ "Used bytes (main db device)",
+ "u", PerfCountersBuilder::PRIO_USEFUL);
b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
- "Total bytes (wal device)");
+ "Total bytes (wal device)",
+ "walb", PerfCountersBuilder::PRIO_USEFUL);
b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
- "Used bytes (wal device)");
+ "Used bytes (wal device)",
+ "walu", PerfCountersBuilder::PRIO_USEFUL);
b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
- "Total bytes (slow device)");
+ "Total bytes (slow device)",
+ "slob", PerfCountersBuilder::PRIO_USEFUL);
b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
- "Used bytes (slow device)");
- b.add_u64(l_bluefs_num_files, "num_files", "File count");
- b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log");
+ "Used bytes (slow device)",
+ "slou", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_u64(l_bluefs_num_files, "num_files", "File count",
+ "f", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
+ "jlen", PerfCountersBuilder::PRIO_INTERESTING);
b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
"Compactions of the metadata log");
b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
- "Bytes written to the metadata log", "j");
+ "Bytes written to the metadata log", "j",
+ PerfCountersBuilder::PRIO_CRITICAL);
b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
"Files written to WAL");
b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
"Files written to SSTs");
b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
- "Bytes written to WAL", "wal");
+ "Bytes written to WAL", "wal",
+ PerfCountersBuilder::PRIO_CRITICAL);
b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
- "Bytes written to SSTs", "sst");
+ "Bytes written to SSTs", "sst",
+ PerfCountersBuilder::PRIO_CRITICAL);
logger = b.create_perf_counters();
cct->get_perfcounters_collection()->add(logger);
}
@@ -466,9 +477,7 @@ int BlueFS::_write_super()
assert(bl.length() <= get_super_length());
bl.append_zero(get_super_length() - bl.length());
- bdev[BDEV_DB]->aio_write(get_super_offset(), bl, ioc[BDEV_DB], false);
- bdev[BDEV_DB]->aio_submit(ioc[BDEV_DB]);
- ioc[BDEV_DB]->aio_wait();
+ bdev[BDEV_DB]->write(get_super_offset(), bl, false);
dout(20) << __func__ << " v " << super.version
<< " crc 0x" << std::hex << crc
<< " offset 0x" << get_super_offset() << std::dec
@@ -1399,14 +1408,7 @@ int BlueFS::_flush_and_sync_log(std::unique_lock<std::mutex>& l,
log_writer->file->fnode.size = jump_to;
}
- // drop lock while we wait for io
- list<FS::aio_t> completed_ios;
- _claim_completed_aios(log_writer, &completed_ios);
- l.unlock();
- wait_for_aio(log_writer);
- completed_ios.clear();
- flush_bdev();
- l.lock();
+ _flush_bdev_safely(log_writer);
log_flushing = false;
log_cond.notify_all();
@@ -1622,7 +1624,11 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
t.append_zero(zlen);
}
}
- bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered);
+ if (cct->_conf->bluefs_sync_write) {
+ bdev[p->bdev]->write(p->offset + x_off, t, buffered);
+ } else {
+ bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered);
+ }
bloff += x_len;
length -= x_len;
++p;
@@ -1643,7 +1649,7 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
// we need to retire old completed aios so they don't stick around in
// memory indefinitely (along with their bufferlist refs).
-void BlueFS::_claim_completed_aios(FileWriter *h, list<FS::aio_t> *ls)
+void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
{
for (auto p : h->iocv) {
if (p) {
@@ -1742,13 +1748,7 @@ int BlueFS::_fsync(FileWriter *h, std::unique_lock<std::mutex>& l)
return r;
uint64_t old_dirty_seq = h->file->dirty_seq;
- list<FS::aio_t> completed_ios;
- _claim_completed_aios(h, &completed_ios);
- lock.unlock();
- wait_for_aio(h);
- completed_ios.clear();
- flush_bdev();
- lock.lock();
+ _flush_bdev_safely(h);
if (old_dirty_seq) {
uint64_t s = log_seq;
@@ -1761,6 +1761,23 @@ int BlueFS::_fsync(FileWriter *h, std::unique_lock<std::mutex>& l)
return 0;
}
+void BlueFS::_flush_bdev_safely(FileWriter *h)
+{
+ if (!cct->_conf->bluefs_sync_write) {
+ list<aio_t> completed_ios;
+ _claim_completed_aios(h, &completed_ios);
+ lock.unlock();
+ wait_for_aio(h);
+ completed_ios.clear();
+ flush_bdev();
+ lock.lock();
+ } else {
+ lock.unlock();
+ flush_bdev();
+ lock.lock();
+ }
+}
+
void BlueFS::flush_bdev()
{
// NOTE: this is safe to call without a lock.
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
index 030cf1709d2..7229355a10d 100644
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -273,7 +273,7 @@ private:
int _flush(FileWriter *h, bool force);
int _fsync(FileWriter *h, std::unique_lock<std::mutex>& l);
- void _claim_completed_aios(FileWriter *h, list<FS::aio_t> *ls);
+ void _claim_completed_aios(FileWriter *h, list<aio_t> *ls);
void wait_for_aio(FileWriter *h); // safe to call without a lock
int _flush_and_sync_log(std::unique_lock<std::mutex>& l,
@@ -287,6 +287,7 @@ private:
//void _aio_finish(void *priv);
+ void _flush_bdev_safely(FileWriter *h);
void flush_bdev(); // this is safe to call without a lock
int _preallocate(FileRef f, uint64_t off, uint64_t len);
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index b08a7bcab8e..394c46c917f 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -1134,7 +1134,8 @@ void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
}
// adjust warm out list too, if necessary
- while (buffer_warm_out.size() > kout) {
+ int64_t num = buffer_warm_out.size() - kout;
+ while (num-- > 0) {
Buffer *b = &*buffer_warm_out.rbegin();
assert(b->is_empty());
dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
@@ -2853,13 +2854,14 @@ BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
void BlueStore::Onode::flush()
{
- if (flushing_count) {
+ if (flushing_count.load()) {
+ ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
std::unique_lock<std::mutex> l(flush_lock);
- ldout(c->store->cct, 20) << flush_txns << dendl;
- while (!flush_txns.empty())
+ while (flushing_count.load()) {
flush_cond.wait(l);
+ }
}
- ldout(c->store->cct, 20) << "done" << dendl;
+ ldout(c->store->cct, 20) << __func__ << " done" << dendl;
}
// =======================================================
@@ -2887,8 +2889,110 @@ bool BlueStore::WriteContext::has_conflict(
return false;
}
- // =======================================================
-
+// =======================================================
+
+// DeferredBatch
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
+
+void BlueStore::DeferredBatch::prepare_write(
+ CephContext *cct,
+ uint64_t seq, uint64_t offset, uint64_t length,
+ bufferlist::const_iterator& blp)
+{
+ _discard(cct, offset, length);
+ auto i = iomap.insert(make_pair(offset, deferred_io()));
+ assert(i.second); // this should be a new insertion
+ i.first->second.seq = seq;
+ blp.copy(length, i.first->second.bl);
+ dout(20) << __func__ << " seq " << seq
+ << " 0x" << std::hex << offset << "~" << length
+ << " crc " << i.first->second.bl.crc32c(-1)
+ << std::dec << dendl;
+ seq_bytes[seq] += length;
+#ifdef DEBUG_DEFERRED
+ _audit(cct);
+#endif
+}
+
+void BlueStore::DeferredBatch::_discard(
+ CephContext *cct, uint64_t offset, uint64_t length)
+{
+ generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ auto p = iomap.lower_bound(offset);
+ if (p != iomap.begin()) {
+ --p;
+ auto end = p->first + p->second.bl.length();
+ if (end > offset) {
+ bufferlist head;
+ head.substr_of(p->second.bl, 0, offset - p->first);
+ dout(20) << __func__ << " keep head " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << " -> 0x" << head.length() << std::dec << dendl;
+ auto i = seq_bytes.find(p->second.seq);
+ if (end > offset + length) {
+ bufferlist tail;
+ tail.substr_of(p->second.bl, offset + length - p->first,
+ end - (offset + length));
+ dout(20) << __func__ << " keep tail " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << " -> 0x" << tail.length() << std::dec << dendl;
+ auto &n = iomap[offset + length];
+ n.bl.swap(tail);
+ n.seq = p->second.seq;
+ i->second -= length;
+ } else {
+ i->second -= end - offset;
+ }
+ p->second.bl.swap(head);
+ }
+ ++p;
+ }
+ while (p != iomap.end()) {
+ if (p->first >= offset + length) {
+ break;
+ }
+ auto i = seq_bytes.find(p->second.seq);
+ auto end = p->first + p->second.bl.length();
+ if (end > offset + length) {
+ unsigned drop_front = offset + length - p->first;
+ unsigned keep_tail = end - (offset + length);
+ dout(20) << __func__ << " truncate front " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
+ << " to 0x" << (offset + length) << "~" << keep_tail
+ << std::dec << dendl;
+ auto &s = iomap[offset + length];
+ s.seq = p->second.seq;
+ s.bl.substr_of(p->second.bl, drop_front, keep_tail);
+ i->second -= drop_front;
+ } else {
+ dout(20) << __func__ << " drop " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << std::dec << dendl;
+ i->second -= p->second.bl.length();
+ }
+ p = iomap.erase(p);
+ }
+}
+
+void BlueStore::DeferredBatch::_audit(CephContext *cct)
+{
+ map<uint64_t,int> sb;
+ for (auto p : seq_bytes) {
+ sb[p.first] = 0; // make sure we have the same set of keys
+ }
+ uint64_t pos = 0;
+ for (auto& p : iomap) {
+ assert(p.first >= pos);
+ sb[p.second.seq] += p.second.bl.length();
+ pos = p.first + p.second.bl.length();
+ }
+ assert(sb == seq_bytes);
+}
+
+
// Collection
#undef dout_prefix
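
DeferredBatch::prepare_write leans entirely on _discard to keep iomap non-overlapping: before a new extent is inserted, any existing entry that overlaps it is trimmed to a head piece, a tail piece, or dropped, with seq_bytes adjusted to match. The interval arithmetic is easier to see on a toy map of offset to length, with the bufferlists and per-seq accounting stripped out:

#include <cassert>
#include <cstdint>
#include <map>

using iomap_t = std::map<uint64_t, uint64_t>;  // offset -> length

void discard(iomap_t& m, uint64_t off, uint64_t len) {
  auto p = m.lower_bound(off);
  if (p != m.begin()) {
    --p;                                   // entry starting before off
    uint64_t end = p->first + p->second;
    if (end > off) {
      if (end > off + len)
        m[off + len] = end - (off + len);  // keep the tail piece
      p->second = off - p->first;          // keep the head piece
    }
    ++p;
  }
  while (p != m.end() && p->first < off + len) {
    uint64_t end = p->first + p->second;
    if (end > off + len)
      m[off + len] = end - (off + len);    // truncate the front
    p = m.erase(p);                        // fully or partly covered
  }
}

int main() {
  iomap_t m{{0, 16}, {32, 16}};
  discard(m, 8, 32);        // overwrite the middle of both ranges
  assert(m.size() == 2);
  assert(m.at(0) == 8);     // head of the first range survives
  assert(m.at(40) == 8);    // tail of the second range survives
  return 0;
}
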
@@ -3165,15 +3269,17 @@ void *BlueStore::MempoolThread::entry()
static void aio_cb(void *priv, void *priv2)
{
BlueStore *store = static_cast<BlueStore*>(priv);
- store->_txc_aio_finish(priv2);
+ BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
+ c->aio_finish(store);
}
BlueStore::BlueStore(CephContext *cct, const string& path)
: ObjectStore(cct, path),
- throttle_bytes(cct, "bluestore_max_bytes", cct->_conf->bluestore_max_bytes),
- throttle_deferred_bytes(cct, "bluestore_deferred_max_bytes",
- cct->_conf->bluestore_max_bytes +
- cct->_conf->bluestore_deferred_max_bytes),
+ throttle_bytes(cct, "bluestore_throttle_bytes",
+ cct->_conf->bluestore_throttle_bytes),
+ throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
+ cct->_conf->bluestore_throttle_bytes +
+ cct->_conf->bluestore_throttle_deferred_bytes),
kv_sync_thread(this),
mempool_thread(this)
{
@@ -3197,10 +3303,11 @@ BlueStore::BlueStore(CephContext *cct,
const string& path,
uint64_t _min_alloc_size)
: ObjectStore(cct, path),
- throttle_bytes(cct, "bluestore_max_bytes", cct->_conf->bluestore_max_bytes),
- throttle_deferred_bytes(cct, "bluestore_deferred_max_bytes",
- cct->_conf->bluestore_max_bytes +
- cct->_conf->bluestore_deferred_max_bytes),
+ throttle_bytes(cct, "bluestore_throttle_bytes",
+ cct->_conf->bluestore_throttle_bytes),
+ throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes",
+ cct->_conf->bluestore_throttle_bytes +
+ cct->_conf->bluestore_throttle_deferred_bytes),
kv_sync_thread(this),
min_alloc_size(_min_alloc_size),
min_alloc_size_order(ctz(_min_alloc_size)),
@@ -3259,10 +3366,11 @@ const char **BlueStore::get_tracked_conf_keys() const
"bleustore_deferred_batch_ops",
"bleustore_deferred_batch_ops_hdd",
"bleustore_deferred_batch_ops_ssd",
- "bluestore_max_ops",
- "bluestore_max_bytes",
- "bluestore_deferred_max_ops",
- "bluestore_deferred_max_bytes",
+ "bluestore_throttle_bytes",
+ "bluestore_throttle_deferred_bytes",
+ "bluestore_throttle_cost_per_io_hdd",
+ "bluestore_throttle_cost_per_io_ssd",
+ "bluestore_throttle_cost_per_io",
"bluestore_max_blob_size",
"bluestore_max_blob_size_ssd",
"bluestore_max_blob_size_hdd",
@@ -3310,14 +3418,14 @@ void BlueStore::handle_conf_change(const struct md_config_t *conf,
_set_throttle_params();
}
}
- if (changed.count("bluestore_max_bytes")) {
- throttle_bytes.reset_max(conf->bluestore_max_bytes);
+ if (changed.count("bluestore_throttle_bytes")) {
+ throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
throttle_deferred_bytes.reset_max(
- conf->bluestore_max_bytes + conf->bluestore_deferred_max_bytes);
+ conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
}
- if (changed.count("bluestore_deferred_max_bytes")) {
+ if (changed.count("bluestore_throttle_deferred_bytes")) {
throttle_deferred_bytes.reset_max(
- conf->bluestore_max_bytes + conf->bluestore_deferred_max_bytes);
+ conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
}
}
@@ -3421,15 +3529,18 @@ void BlueStore::_init_logger()
PerfCountersBuilder b(cct, "bluestore",
l_bluestore_first, l_bluestore_last);
b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
- "Average kv_thread flush latency", "kflat");
+ "Average kv_thread flush latency",
+ "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
"Average kv_thread commit latency");
b.add_time_avg(l_bluestore_kv_lat, "kv_lat",
- "Average kv_thread sync latency", "klat");
+ "Average kv_thread sync latency",
+ "k_l", PerfCountersBuilder::PRIO_INTERESTING);
b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
"Average prepare state latency");
b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
- "Average aio_wait state latency", "iolat");
+ "Average aio_wait state latency",
+ "io_l", PerfCountersBuilder::PRIO_INTERESTING);
b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
"Average io_done state latency");
b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
@@ -3449,13 +3560,17 @@ void BlueStore::_init_logger()
b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
"Average done state latency");
b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
- "Average submit throttle latency", "tlat");
+ "Average submit throttle latency",
+ "th_l", PerfCountersBuilder::PRIO_CRITICAL);
b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
- "Average submit latency", "slat");
+ "Average submit latency",
+ "s_l", PerfCountersBuilder::PRIO_CRITICAL);
b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
- "Average commit latency", "clat");
+ "Average commit latency",
+ "c_l", PerfCountersBuilder::PRIO_CRITICAL);
b.add_time_avg(l_bluestore_read_lat, "read_lat",
- "Average read latency", "rlat");
+ "Average read latency",
+ "r_l", PerfCountersBuilder::PRIO_CRITICAL);
b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
"Average read onode metadata latency");
b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
@@ -4023,7 +4138,7 @@ bool BlueStore::test_mount_in_use()
return ret;
}
-int BlueStore::_open_db(bool create, bool kv_no_open)
+int BlueStore::_open_db(bool create)
{
int r;
assert(!db);
@@ -4127,9 +4242,11 @@ int BlueStore::_open_db(bool create, bool kv_no_open)
initial = MAX(initial, cct->_conf->bluestore_bluefs_min);
// align to bluefs's alloc_size
initial = P2ROUNDUP(initial, cct->_conf->bluefs_alloc_size);
- initial += cct->_conf->bluefs_alloc_size - SUPER_RESERVED;
- bluefs->add_block_extent(bluefs_shared_bdev, SUPER_RESERVED, initial);
- bluefs_extents.insert(SUPER_RESERVED, initial);
+ // put bluefs in the middle of the device in case it is an HDD
+ uint64_t start = P2ALIGN((bdev->get_size() - initial) / 2,
+ cct->_conf->bluefs_alloc_size);
+ bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
+ bluefs_extents.insert(start, initial);
}
bfn = path + "/block.wal";
@@ -4262,9 +4379,6 @@ int BlueStore::_open_db(bool create, bool kv_no_open)
if (kv_backend == "rocksdb")
options = cct->_conf->bluestore_rocksdb_options;
db->init(options);
- if (kv_no_open) {
- return 0;
- }
if (create)
r = db->create_and_open(err);
else
@@ -4405,7 +4519,7 @@ int BlueStore::_balance_bluefs_freespace(PExtentVector *extents)
if (gift) {
// round up to alloc size
- gift = P2ROUNDUP(gift, min_alloc_size);
+ gift = P2ROUNDUP(gift, cct->_conf->bluefs_alloc_size);
// hard cap to fit into 32 bits
gift = MIN(gift, 1ull<<31);
@@ -4861,7 +4975,7 @@ int BlueStore::_mount(bool kv_only)
if (r < 0)
goto out_fsid;
- r = _open_db(false, kv_only);
+ r = _open_db(false);
if (r < 0)
goto out_bdev;
@@ -5689,9 +5803,9 @@ void BlueStore::_reap_collections()
dout(10) << __func__ << " " << c << " " << c->cid << dendl;
if (c->onode_map.map_any([&](OnodeRef o) {
assert(!o->exists);
- if (!o->flush_txns.empty()) {
+ if (o->flushing_count.load()) {
dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
- << " flush_txns " << o->flush_txns << dendl;
+ << " flush_txns " << o->flushing_count << dendl;
return false;
}
return true;
@@ -6052,6 +6166,7 @@ int BlueStore::_do_read(
return r;
return 0;
});
+ assert(r == 0);
} else {
// read the pieces
for (auto& reg : p.second) {
@@ -6089,8 +6204,7 @@ int BlueStore::_do_read(
return r;
return 0;
});
- if (r < 0)
- return r;
+ assert(r == 0);
assert(reg.bl.length() == r_len);
}
}
@@ -7299,7 +7413,7 @@ BlueStore::TransContext *BlueStore::_txc_create(OpSequencer *osr)
void BlueStore::_txc_calc_cost(TransContext *txc)
{
- // this is about the simplest model for trasnaction cost you can
+ // this is about the simplest model for transaction cost you can
// imagine. there is some fixed overhead cost by saying there is a
// minimum of one "io". and then we have some cost per "io" that is
// a configurable (with different hdd and ssd defaults), and add
@@ -7383,7 +7497,6 @@ void BlueStore::_txc_state_proc(TransContext *txc)
dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
<< dendl;
} else {
- _txc_finalize_kv(txc, txc->t);
txc->state = TransContext::STATE_KV_SUBMITTED;
int r = db->submit_transaction(txc->t);
assert(r == 0);
@@ -7416,11 +7529,6 @@ void BlueStore::_txc_state_proc(TransContext *txc)
txc->state = TransContext::STATE_FINISHING;
break;
- case TransContext::STATE_DEFERRED_AIO_WAIT:
- txc->log_state_latency(logger, l_bluestore_state_deferred_aio_wait_lat);
- _deferred_finish(txc);
- return;
-
case TransContext::STATE_DEFERRED_CLEANUP:
txc->log_state_latency(logger, l_bluestore_state_deferred_cleanup_lat);
txc->state = TransContext::STATE_FINISHING;
@@ -7528,9 +7636,6 @@ void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
<< extent_part << " bytes inline extents)"
<< dendl;
t->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
-
- std::lock_guard<std::mutex> l(o->flush_lock);
- o->flush_txns.insert(txc);
o->flushing_count++;
}
@@ -7538,8 +7643,6 @@ void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
auto p = txc->modified_objects.begin();
while (p != txc->modified_objects.end()) {
if (txc->onodes.count(*p) == 0) {
- std::lock_guard<std::mutex> l((*p)->flush_lock);
- (*p)->flush_txns.insert(txc);
(*p)->flushing_count++;
++p;
} else {
@@ -7630,13 +7733,10 @@ void BlueStore::_txc_applied_kv(TransContext *txc)
{
for (auto ls : { &txc->onodes, &txc->modified_objects }) {
for (auto& o : *ls) {
- std::lock_guard<std::mutex> l(o->flush_lock);
- dout(20) << __func__ << " onode " << o << " had " << o->flush_txns
+ dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
<< dendl;
- assert(o->flush_txns.count(txc));
- o->flush_txns.erase(txc);
- o->flushing_count--;
- if (o->flush_txns.empty()) {
+ if (--o->flushing_count == 0) {
+ std::lock_guard<std::mutex> l(o->flush_lock);
o->flush_cond.notify_all();
}
}
@@ -7684,41 +7784,13 @@ void BlueStore::_txc_finish(TransContext *txc)
}
OpSequencerRef osr = txc->osr;
- {
- std::lock_guard<std::mutex> l(osr->qlock);
- txc->state = TransContext::STATE_DONE;
- }
-
- bool empty = _osr_reap_done(osr.get());
- if (empty && osr->zombie) {
- dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
- osr->_unregister();
- }
-}
-
-void BlueStore::_txc_release_alloc(TransContext *txc)
-{
- // update allocator with full released set
- if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
- dout(10) << __func__ << " " << txc << " " << txc->released << dendl;
- for (interval_set<uint64_t>::iterator p = txc->released.begin();
- p != txc->released.end();
- ++p) {
- alloc->release(p.get_start(), p.get_len());
- }
- }
-
- txc->allocated.clear();
- txc->released.clear();
-}
-
-bool BlueStore::_osr_reap_done(OpSequencer *osr)
-{
CollectionRef c;
bool empty = false;
+ OpSequencer::q_list_t releasing_txc;
{
std::lock_guard<std::mutex> l(osr->qlock);
- dout(20) << __func__ << " osr " << osr << dendl;
+ txc->state = TransContext::STATE_DONE;
+ bool notify = false;
while (!osr->q.empty()) {
TransContext *txc = &osr->q.front();
dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
@@ -7727,23 +7799,19 @@ bool BlueStore::_osr_reap_done(OpSequencer *osr)
if (txc->state == TransContext::STATE_PREPARE &&
deferred_aggressive) {
// for _osr_drain_preceding()
- osr->qcond.notify_all();
+ notify = true;
}
break;
}
- // release to allocator only after all preceding txc's have also
- // finished any deferred writes that potentially land in these
- // blocks
- _txc_release_alloc(txc);
-
if (!c && txc->first_collection) {
c = txc->first_collection;
}
-
osr->q.pop_front();
- txc->log_state_latency(logger, l_bluestore_state_done_lat);
- delete txc;
+ releasing_txc.push_back(*txc);
+ notify = true;
+ }
+ if (notify) {
osr->qcond.notify_all();
}
if (osr->q.empty()) {
@@ -7751,12 +7819,42 @@ bool BlueStore::_osr_reap_done(OpSequencer *osr)
empty = true;
}
}
+ while (!releasing_txc.empty()) {
+ // release to allocator only after all preceding txc's have also
+ // finished any deferred writes that potentially land in these
+ // blocks
+ auto txc = &releasing_txc.front();
+ _txc_release_alloc(txc);
+ releasing_txc.pop_front();
+ txc->log_state_latency(logger, l_bluestore_state_done_lat);
+ delete txc;
+ }
if (c) {
c->trim_cache();
}
- return empty;
+
+ if (empty && osr->zombie) {
+ dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
+ osr->_unregister();
+ }
+}
+
+void BlueStore::_txc_release_alloc(TransContext *txc)
+{
+ // update allocator with full released set
+ if (!cct->_conf->bluestore_debug_no_reuse_blocks) {
+ dout(10) << __func__ << " " << txc << " " << txc->released << dendl;
+ for (interval_set<uint64_t>::iterator p = txc->released.begin();
+ p != txc->released.end();
+ ++p) {
+ alloc->release(p.get_start(), p.get_len());
+ }
+ }
+
+ txc->allocated.clear();
+ txc->released.clear();
}
void BlueStore::_osr_drain_preceding(TransContext *txc)
@@ -7767,8 +7865,8 @@ void BlueStore::_osr_drain_preceding(TransContext *txc)
{
// submit anything pending
std::lock_guard<std::mutex> l(deferred_lock);
- if (!osr->deferred_pending.empty()) {
- _deferred_try_submit(osr);
+ if (osr->deferred_pending) {
+ _deferred_submit(osr);
}
}
{
@@ -7850,12 +7948,11 @@ void BlueStore::_kv_sync_thread()
if (kv_stop)
break;
dout(20) << __func__ << " sleep" << dendl;
- kv_sync_cond.notify_all();
kv_cond.wait(l);
dout(20) << __func__ << " wake" << dendl;
} else {
deque<TransContext*> kv_submitting;
- deque<TransContext*> deferred_done, deferred_stable;
+ deque<DeferredBatch*> deferred_done, deferred_stable;
dout(20) << __func__ << " committing " << kv_queue.size()
<< " submitting " << kv_queue_unsubmitted.size()
<< " deferred done " << deferred_done_queue.size()
@@ -7889,7 +7986,7 @@ void BlueStore::_kv_sync_thread()
if (num_aios) {
force_flush = true;
} else if (kv_committing.empty() && kv_submitting.empty() &&
- deferred_stable.empty()) {
+ deferred_stable.empty()) {
force_flush = true; // there's nothing else to commit!
} else if (deferred_aggressive) {
force_flush = true;
@@ -7939,7 +8036,6 @@ void BlueStore::_kv_sync_thread()
}
for (auto txc : kv_submitting) {
assert(txc->state == TransContext::STATE_KV_QUEUED);
- _txc_finalize_kv(txc, txc->t);
txc->log_state_latency(logger, l_bluestore_state_kv_queued_lat);
int r = db->submit_transaction(txc->t);
assert(r == 0);
@@ -7987,19 +8083,22 @@ void BlueStore::_kv_sync_thread()
}
// cleanup sync deferred keys
- for (auto txc : deferred_stable) {
- bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
- if (!wt.released.empty()) {
- // kraken replay compat only
- txc->released = wt.released;
- dout(10) << __func__ << " deferred txn has released " << txc->released
- << " (we just upgraded from kraken) on " << txc << dendl;
- _txc_finalize_kv(txc, synct);
+ for (auto b : deferred_stable) {
+ for (auto& txc : b->txcs) {
+ bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
+ if (!wt.released.empty()) {
+ // kraken replay compat only
+ txc.released = wt.released;
+ dout(10) << __func__ << " deferred txn has released "
+ << txc.released
+ << " (we just upgraded from kraken) on " << &txc << dendl;
+ _txc_finalize_kv(&txc, synct);
+ }
+ // cleanup the deferred
+ string key;
+ get_deferred_key(wt.seq, &key);
+ synct->rm_single_key(PREFIX_DEFERRED, key);
}
- // cleanup the deferred
- string key;
- get_deferred_key(wt.seq, &key);
- synct->rm_single_key(PREFIX_DEFERRED, key);
}
// submit synct synchronously (block and wait for it to commit)
@@ -8035,10 +8134,14 @@ void BlueStore::_kv_sync_thread()
_txc_state_proc(txc);
kv_committing.pop_front();
}
- while (!deferred_stable.empty()) {
- TransContext *txc = deferred_stable.front();
- _txc_state_proc(txc);
- deferred_stable.pop_front();
+ for (auto b : deferred_stable) {
+ auto p = b->txcs.begin();
+ while (p != b->txcs.end()) {
+ TransContext *txc = &*p;
+ p = b->txcs.erase(p); // unlink here because
+ _txc_state_proc(txc); // this may destroy txc
+ }
+ delete b;
}
if (!deferred_aggressive) {
@@ -8088,17 +8191,30 @@ bluestore_deferred_op_t *BlueStore::_get_deferred_op(
void BlueStore::_deferred_queue(TransContext *txc)
{
- dout(20) << __func__ << " txc " << txc << " on " << txc->osr << dendl;
+ dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
std::lock_guard<std::mutex> l(deferred_lock);
- if (txc->osr->deferred_pending.empty() &&
- txc->osr->deferred_running.empty()) {
+ if (!txc->osr->deferred_pending &&
+ !txc->osr->deferred_running) {
deferred_queue.push_back(*txc->osr);
}
- txc->osr->deferred_pending.push_back(*txc);
+ if (!txc->osr->deferred_pending) {
+ txc->osr->deferred_pending = new DeferredBatch(cct, txc->osr.get());
+ }
++deferred_queue_size;
+ txc->osr->deferred_pending->txcs.push_back(*txc);
+ bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
+ for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
+ const auto& op = *opi;
+ assert(op.op == bluestore_deferred_op_t::OP_WRITE);
+ bufferlist::const_iterator p = op.data.begin();
+ for (auto e : op.extents) {
+ txc->osr->deferred_pending->prepare_write(
+ cct, wt.seq, e.offset, e.length, p);
+ }
+ }
if (deferred_aggressive &&
- txc->osr->deferred_running.empty()) {
- _deferred_try_submit(txc->osr.get());
+ !txc->osr->deferred_running) {
+ _deferred_submit(txc->osr.get());
}
}
@@ -8107,106 +8223,99 @@ void BlueStore::_deferred_try_submit()
dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
<< deferred_queue_size << " txcs" << dendl;
for (auto& osr : deferred_queue) {
- if (osr.deferred_running.empty()) {
- _deferred_try_submit(&osr);
+ if (!osr.deferred_running) {
+ _deferred_submit(&osr);
}
}
}
-void BlueStore::_deferred_try_submit(OpSequencer *osr)
+void BlueStore::_deferred_submit(OpSequencer *osr)
{
- dout(10) << __func__ << " osr " << osr << " " << osr->deferred_pending.size()
- << " pending " << dendl;
- assert(!osr->deferred_pending.empty());
- assert(osr->deferred_running.empty());
+ dout(10) << __func__ << " osr " << osr
+ << " " << osr->deferred_pending->iomap.size() << " ios pending "
+ << dendl;
+ assert(osr->deferred_pending);
+ assert(!osr->deferred_running);
- deferred_queue_size -= osr->deferred_pending.size();
+ auto b = osr->deferred_pending;
+ deferred_queue_size -= b->seq_bytes.size();
assert(deferred_queue_size >= 0);
- osr->deferred_running.swap(osr->deferred_pending);
- // attach all IO to the last in the batch
- TransContext *last = &osr->deferred_running.back();
+ osr->deferred_running = osr->deferred_pending;
+ osr->deferred_pending = nullptr;
- // reverse order
- for (auto i = osr->deferred_running.rbegin();
- i != osr->deferred_running.rend();
- ++i) {
- TransContext *txc = &*i;
- bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
- dout(20) << __func__ << " txc " << txc << " seq " << wt.seq << dendl;
- txc->log_state_latency(logger, l_bluestore_state_deferred_queued_lat);
- txc->state = TransContext::STATE_DEFERRED_AIO_WAIT;
- for (auto opi = wt.ops.rbegin(); opi != wt.ops.rend(); ++opi) {
- assert(opi->op == bluestore_deferred_op_t::OP_WRITE);
- const auto& op = *opi;
- uint64_t bl_offset = 0;
- for (auto e : op.extents) {
- interval_set<uint64_t> cur;
- cur.insert(e.offset, e.length);
- interval_set<uint64_t> overlap;
- overlap.intersection_of(cur, osr->deferred_blocks);
- cur.subtract(overlap);
- dout(20) << __func__ << " txc " << txc << " " << e << std::hex
- << " overlap 0x" << overlap << " new 0x" << cur
- << " from bl_offset 0x" << bl_offset << std::dec << dendl;
- for (auto j = cur.begin(); j != cur.end(); ++j) {
- bufferlist bl;
- bl.substr_of(op.data, bl_offset + j.get_start() - e.offset,
- j.get_len());
- if (!g_conf->bluestore_debug_omit_block_device_write) {
- logger->inc(l_bluestore_deferred_write_ops);
- logger->inc(l_bluestore_deferred_write_bytes, bl.length());
- int r = bdev->aio_write(j.get_start(), bl, &last->ioc, false);
- assert(r == 0);
- }
- txc->osr->deferred_blocks.insert(j.get_start(), j.get_len());
+ uint64_t start = 0, pos = 0;
+ bufferlist bl;
+ auto i = b->iomap.begin();
+ while (true) {
+ if (i == b->iomap.end() || i->first != pos) {
+ if (bl.length()) {
+ dout(20) << __func__ << " write 0x" << std::hex
+ << start << "~" << bl.length()
+ << " crc " << bl.crc32c(-1) << std::dec << dendl;
+ if (!g_conf->bluestore_debug_omit_block_device_write) {
+ logger->inc(l_bluestore_deferred_write_ops);
+ logger->inc(l_bluestore_deferred_write_bytes, bl.length());
+ int r = bdev->aio_write(start, bl, &b->ioc, false);
+ assert(r == 0);
}
- bl_offset += e.length;
}
+ if (i == b->iomap.end()) {
+ break;
+ }
+ start = 0;
+ pos = i->first;
+ bl.clear();
+ }
+ dout(20) << __func__ << " seq " << i->second.seq << " 0x"
+ << std::hex << pos << "~" << i->second.bl.length() << std::dec
+ << dendl;
+ if (!bl.length()) {
+ start = pos;
}
+ pos += i->second.bl.length();
+ bl.claim_append(i->second.bl);
+ ++i;
}
- osr->deferred_txc = last;
- dout(20) << __func__ << " osr " << osr << " deferred_blocks 0x" << std::hex
- << osr->deferred_blocks << std::dec << dendl;
- _txc_aio_submit(last);
+ bdev->aio_submit(&b->ioc);
}
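_deferred_submit() now walks the batch's offset-sorted iomap and coalesces runs of contiguous extents into single large writes, flushing whenever it hits a gap or the end of the map. A stripped-down sketch of that run-merging loop, with std::string standing in for bufferlist:

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <string>

    void submit_write(uint64_t off, const std::string& data) {
      std::cout << "write 0x" << std::hex << off
                << "~" << std::dec << data.size() << "\n";
    }

    void flush_runs(const std::map<uint64_t, std::string>& iomap) {
      uint64_t start = 0, pos = 0;
      std::string run;
      auto i = iomap.begin();
      while (true) {
        if (i == iomap.end() || i->first != pos) {  // gap (or end): flush
          if (!run.empty())
            submit_write(start, run);
          if (i == iomap.end())
            break;
          pos = i->first;                           // restart at the gap
          run.clear();
        }
        if (run.empty())
          start = pos;
        pos += i->second.size();
        run += i->second;                           // contiguous: keep merging
        ++i;
      }
    }

    int main() {
      flush_runs({{0, "aa"}, {2, "bb"}, {8, "cc"}});  // -> 0x0~4 and 0x8~2
    }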
-int BlueStore::_deferred_finish(TransContext *txc)
+void BlueStore::_deferred_aio_finish(OpSequencer *osr)
{
- bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
- dout(20) << __func__ << " txc " << txc << " seq " << wt.seq << dendl;
+ dout(10) << __func__ << " osr " << osr << dendl;
+ assert(osr->deferred_running);
+ DeferredBatch *b = osr->deferred_running;
- OpSequencer::deferred_queue_t finished;
{
std::lock_guard<std::mutex> l(deferred_lock);
- assert(txc->osr->deferred_txc == txc);
- txc->osr->deferred_blocks.clear();
- finished.swap(txc->osr->deferred_running);
- if (txc->osr->deferred_pending.empty()) {
- auto q = deferred_queue.iterator_to(*txc->osr);
+ assert(osr->deferred_running == b);
+ osr->deferred_running = nullptr;
+ if (!osr->deferred_pending) {
+ auto q = deferred_queue.iterator_to(*osr);
deferred_queue.erase(q);
} else if (deferred_aggressive) {
- _deferred_try_submit(txc->osr.get());
+ _deferred_submit(osr);
}
}
- std::lock_guard<std::mutex> l2(txc->osr->qlock);
- std::lock_guard<std::mutex> l(kv_lock);
- for (auto& i : finished) {
- TransContext *txc = &i;
- txc->state = TransContext::STATE_DEFERRED_CLEANUP;
- txc->osr->qcond.notify_all();
- throttle_deferred_bytes.put(txc->cost);
- deferred_done_queue.push_back(txc);
+ {
+ std::lock_guard<std::mutex> l2(osr->qlock);
+ for (auto& i : b->txcs) {
+ TransContext *txc = &i;
+ txc->state = TransContext::STATE_DEFERRED_CLEANUP;
+ txc->osr->qcond.notify_all();
+ throttle_deferred_bytes.put(txc->cost);
+ }
+ std::lock_guard<std::mutex> l(kv_lock);
+ deferred_done_queue.emplace_back(b);
}
- finished.clear();
// in the normal case, do not bother waking up the kv thread; it will
// catch us on the next commit anyway.
if (deferred_aggressive) {
+ std::lock_guard<std::mutex> l(kv_lock);
kv_cond.notify_one();
}
- return 0;
}
int BlueStore::_deferred_replay()
@@ -8308,6 +8417,7 @@ int BlueStore::queue_transactions(
txc->t->set(PREFIX_DEFERRED, key, bl);
}
+ _txc_finalize_kv(txc, txc->t);
if (handle)
handle->suspend_tp_timeout();
@@ -9458,12 +9568,13 @@ int BlueStore::_do_alloc_write(
<< l->length() << std::dec << " write via deferred" << dendl;
bluestore_deferred_op_t *op = _get_deferred_op(txc, o);
op->op = bluestore_deferred_op_t::OP_WRITE;
- b->get_blob().map(
+ int r = b->get_blob().map(
b_off, l->length(),
[&](uint64_t offset, uint64_t length) {
op->extents.emplace_back(bluestore_pextent_t(offset, length));
return 0;
});
+ assert(r == 0);
op->data = *l;
} else {
b->get_blob().map_bl(
@@ -9661,7 +9772,7 @@ int BlueStore::_do_write(
auto order = min_alloc_size_order.load();
if (o->onode.expected_write_size) {
wctx.csum_order = std::max(order,
- (size_t)ctzl(o->onode.expected_write_size));
+ (uint8_t)ctz(o->onode.expected_write_size));
} else {
wctx.csum_order = order;
}
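ctz (count trailing zeros) turns a power-of-two size into its log2, which is exactly what a csum "order" is; pairing it with the uint8_t min_alloc_size_order below keeps the types consistent. A small sketch, assuming ctz behaves like GCC's __builtin_ctzll (Ceph provides a portable equivalent):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    static inline unsigned ctz64(uint64_t v) {
      return __builtin_ctzll(v);  // undefined for v == 0; caller checks first
    }

    int main() {
      uint64_t expected_write_size = 64 * 1024;  // client hint: 64 KiB writes
      uint8_t min_alloc_order = 12;              // e.g. 4 KiB min_alloc_size
      uint8_t csum_order = std::max<uint8_t>(
          min_alloc_order, (uint8_t)ctz64(expected_write_size));
      std::cout << "csum_order " << unsigned(csum_order)
                << " -> csum block " << (1u << csum_order) << " bytes\n";
    }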
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index f2a32b8b2df..50b3df8d70c 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -47,6 +47,7 @@ class FreelistManager;
class BlueFS;
//#define DEBUG_CACHE
+//#define DEBUG_DEFERRED
enum {
l_bluestore_first = 732430,
@@ -134,6 +135,11 @@ public:
struct Collection;
typedef boost::intrusive_ptr<Collection> CollectionRef;
+ struct AioContext {
+ virtual void aio_finish(BlueStore *store) = 0;
+ virtual ~AioContext() {}
+ };
+
/// cached buffer
struct Buffer {
MEMPOOL_CLASS_HELPERS();
@@ -982,7 +988,6 @@ public:
std::atomic<int> flushing_count = {0};
std::mutex flush_lock; ///< protect flush_txns
std::condition_variable flush_cond; ///< wait here for uncommitted txns
- set<TransContext*> flush_txns; ///< unapplied txns
Onode(Collection *c, const ghobject_t& o,
const mempool::bluestore_meta_other::string& k)
@@ -1369,7 +1374,7 @@ public:
class OpSequencer;
typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
- struct TransContext {
+ struct TransContext : public AioContext {
typedef enum {
STATE_PREPARE,
STATE_AIO_WAIT,
@@ -1377,8 +1382,7 @@ public:
STATE_KV_QUEUED, // queued for kv_sync_thread submission
STATE_KV_SUBMITTED, // submitted to kv; not yet synced
STATE_KV_DONE,
- STATE_DEFERRED_QUEUED, // in deferred_queue
- STATE_DEFERRED_AIO_WAIT, // aio in flight, waiting for completion
+ STATE_DEFERRED_QUEUED, // in deferred_queue (pending or running)
STATE_DEFERRED_CLEANUP, // remove deferred kv record
STATE_DEFERRED_DONE,
STATE_FINISHING,
@@ -1396,7 +1400,6 @@ public:
case STATE_KV_SUBMITTED: return "kv_submitted";
case STATE_KV_DONE: return "kv_done";
case STATE_DEFERRED_QUEUED: return "deferred_queued";
- case STATE_DEFERRED_AIO_WAIT: return "deferred_aio_wait";
case STATE_DEFERRED_CLEANUP: return "deferred_cleanup";
case STATE_DEFERRED_DONE: return "deferred_done";
case STATE_FINISHING: return "finishing";
@@ -1415,7 +1418,6 @@ public:
case l_bluestore_state_kv_committing_lat: return "kv_committing";
case l_bluestore_state_kv_done_lat: return "kv_done";
case l_bluestore_state_deferred_queued_lat: return "deferred_queued";
- case l_bluestore_state_deferred_aio_wait_lat: return "deferred_aio_wait";
case l_bluestore_state_deferred_cleanup_lat: return "deferred_cleanup";
case l_bluestore_state_finishing_lat: return "finishing";
case l_bluestore_state_done_lat: return "done";
@@ -1547,6 +1549,45 @@ public:
onodes.erase(o);
modified_objects.erase(o);
}
+
+ void aio_finish(BlueStore *store) override {
+ store->txc_aio_finish(this);
+ }
+ };
+
+ typedef boost::intrusive::list<
+ TransContext,
+ boost::intrusive::member_hook<
+ TransContext,
+ boost::intrusive::list_member_hook<>,
+ &TransContext::deferred_queue_item> > deferred_queue_t;
+
+ struct DeferredBatch : public AioContext {
+ OpSequencer *osr;
+ struct deferred_io {
+ bufferlist bl; ///< data
+ uint64_t seq; ///< deferred transaction seq
+ };
+ map<uint64_t,deferred_io> iomap; ///< map of ios in this batch
+ deferred_queue_t txcs; ///< txcs in this batch
+ IOContext ioc; ///< our aios
+ /// bytes of pending io for each deferred seq (may be 0)
+ map<uint64_t,int> seq_bytes;
+
+ void _discard(CephContext *cct, uint64_t offset, uint64_t length);
+ void _audit(CephContext *cct);
+
+ DeferredBatch(CephContext *cct, OpSequencer *osr)
+ : osr(osr), ioc(cct, this) {}
+
+ /// prepare a write
+ void prepare_write(CephContext *cct,
+ uint64_t seq, uint64_t offset, uint64_t length,
+ bufferlist::const_iterator& p);
+
+ void aio_finish(BlueStore *store) override {
+ store->_deferred_aio_finish(osr);
+ }
};
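With both TransContext and DeferredBatch implementing AioContext, the device completion path can dispatch through a single virtual call without knowing which kind of owner finished. A simplified sketch of that shape (names are illustrative, not the actual Ceph classes):

    #include <iostream>

    struct Store;

    struct AioContext {
      virtual void aio_finish(Store *s) = 0;
      virtual ~AioContext() {}
    };

    struct Store {
      void txc_done()      { std::cout << "txc finished\n"; }
      void deferred_done() { std::cout << "deferred batch finished\n"; }
    };

    struct TransContext : AioContext {
      void aio_finish(Store *s) override { s->txc_done(); }
    };

    struct DeferredBatch : AioContext {
      void aio_finish(Store *s) override { s->deferred_done(); }
    };

    // the device-level completion callback only ever sees AioContext*
    void on_aio_complete(Store *s, AioContext *c) { c->aio_finish(s); }

    int main() {
      Store s;
      TransContext t;
      DeferredBatch b;
      on_aio_complete(&s, &t);
      on_aio_complete(&s, &b);
    }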
class OpSequencer : public Sequencer_impl {
@@ -1561,19 +1602,11 @@ public:
&TransContext::sequencer_item> > q_list_t;
q_list_t q; ///< transactions
- typedef boost::intrusive::list<
- TransContext,
- boost::intrusive::member_hook<
- TransContext,
- boost::intrusive::list_member_hook<>,
- &TransContext::deferred_queue_item> > deferred_queue_t;
- deferred_queue_t deferred_pending; ///< waiting
- deferred_queue_t deferred_running; ///< in flight ios
- interval_set<uint64_t> deferred_blocks; ///< blocks in flight
- TransContext *deferred_txc; ///< txc carrying this batch
-
boost::intrusive::list_member_hook<> deferred_osr_queue_item;
+ DeferredBatch *deferred_running = nullptr;
+ DeferredBatch *deferred_pending = nullptr;
+
Sequencer *parent;
BlueStore *store;
@@ -1601,7 +1634,7 @@ public:
void discard() override {
// Note that we may have txc's in flight when the parent Sequencer
// goes away. Reflect this with zombie==registered==true and let
- // _osr_reap_done or _osr_drain_all clean up later.
+ // _osr_drain_all clean up later.
assert(!zombie);
zombie = true;
parent = nullptr;
@@ -1768,13 +1801,13 @@ private:
KVSyncThread kv_sync_thread;
std::mutex kv_lock;
- std::condition_variable kv_cond, kv_sync_cond;
+ std::condition_variable kv_cond;
bool kv_stop = false;
deque<TransContext*> kv_queue; ///< ready, already submitted
deque<TransContext*> kv_queue_unsubmitted; ///< ready, need submit by kv thread
deque<TransContext*> kv_committing; ///< currently syncing
- deque<TransContext*> deferred_done_queue; ///< deferred ios done
- deque<TransContext*> deferred_stable_queue; ///< deferred ios done + stable
+ deque<DeferredBatch*> deferred_done_queue; ///< deferred ios done
+ deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
PerfCounters *logger = nullptr;
@@ -1795,8 +1828,11 @@ private:
int deferred_batch_ops = 0; ///< deferred batch size
///< bits for min_alloc_size
- std::atomic<size_t> min_alloc_size_order = {0};
-
+ std::atomic<uint8_t> min_alloc_size_order = {0};
+ static_assert(std::numeric_limits<uint8_t>::max() >
+ std::numeric_limits<decltype(min_alloc_size)>::digits,
+ "not enough bits for min_alloc_size");
+
///< size threshold for forced deferred writes
std::atomic<uint64_t> prefer_deferred_size = {0};
@@ -1870,7 +1906,7 @@ private:
int _open_bdev(bool create);
void _close_bdev();
- int _open_db(bool create, bool kv_no_open=false);
+ int _open_db(bool create);
void _close_db();
int _open_fm(bool create);
void _close_fm();
@@ -1916,7 +1952,7 @@ private:
void _txc_state_proc(TransContext *txc);
void _txc_aio_submit(TransContext *txc);
public:
- void _txc_aio_finish(void *p) {
+ void txc_aio_finish(void *p) {
_txc_state_proc(static_cast<TransContext*>(p));
}
private:
@@ -1927,7 +1963,6 @@ private:
void _txc_finish(TransContext *txc);
void _txc_release_alloc(TransContext *txc);
- bool _osr_reap_done(OpSequencer *osr);
void _osr_drain_preceding(TransContext *txc);
void _osr_drain_all();
void _osr_unregister_all();
@@ -1953,8 +1988,8 @@ private:
_deferred_try_submit();
}
void _deferred_try_submit();
- void _deferred_try_submit(OpSequencer *osr);
- int _deferred_finish(TransContext *txc);
+ void _deferred_submit(OpSequencer *osr);
+ void _deferred_aio_finish(OpSequencer *osr);
int _deferred_replay();
public:
diff --git a/src/os/bluestore/KernelDevice.cc b/src/os/bluestore/KernelDevice.cc
index d8b19a1e195..284f21c7619 100644
--- a/src/os/bluestore/KernelDevice.cc
+++ b/src/os/bluestore/KernelDevice.cc
@@ -108,6 +108,18 @@ int KernelDevice::open(const string& p)
derr << __func__ << " fstat got " << cpp_strerror(r) << dendl;
goto out_fail;
}
+
+ // Operate as though the block size is 4 KB. The backing file
+ // blksize doesn't strictly matter except that some file systems may
+ // require a read/modify/write if we write something smaller than
+ // it.
+ block_size = cct->_conf->bdev_block_size;
+ if (block_size != (unsigned)st.st_blksize) {
+ dout(1) << __func__ << " backing device/file reports st_blksize "
+ << st.st_blksize << ", using bdev_block_size "
+ << block_size << " anyway" << dendl;
+ }
+
if (S_ISBLK(st.st_mode)) {
int64_t s;
r = get_block_device_size(fd_direct, &s);
@@ -118,6 +130,7 @@ int KernelDevice::open(const string& p)
} else {
size = st.st_size;
}
+ size &= ~(block_size - 1);
{
char partition[PATH_MAX], devname[PATH_MAX];
@@ -132,15 +145,9 @@ int KernelDevice::open(const string& p)
}
}
- // Operate as though the block size is 4 KB. The backing file
- // blksize doesn't strictly matter except that some file systems may
- // require a read/modify/write if we write something smaller than
- // it.
- block_size = cct->_conf->bdev_block_size;
- if (block_size != (unsigned)st.st_blksize) {
- dout(1) << __func__ << " backing device/file reports st_blksize "
- << st.st_blksize << ", using bdev_block_size "
- << block_size << " anyway" << dendl;
+ r = _aio_start();
+ if (r < 0) {
+ goto out_fail;
}
fs = FS::create_by_fd(fd_direct);
@@ -149,9 +156,6 @@ int KernelDevice::open(const string& p)
// round size down to an even block
size &= ~(block_size - 1);
- r = _aio_start();
- assert(r == 0);
-
dout(1) << __func__
<< " size " << size
<< " (0x" << std::hex << size << std::dec << ", "
@@ -306,7 +310,12 @@ int KernelDevice::_aio_start()
dout(10) << __func__ << dendl;
int r = aio_queue.init();
if (r < 0) {
- derr << __func__ << " failed: " << cpp_strerror(r) << dendl;
+ if (r == -EAGAIN) {
+ derr << __func__ << " io_setup(2) failed with EAGAIN; "
+ << "try increasing /proc/sys/fs/aio-max-nr" << dendl;
+ } else {
+ derr << __func__ << " io_setup(2) failed: " << cpp_strerror(r) << dendl;
+ }
return r;
}
aio_thread.create("bstore_aio");
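io_setup(2) returns EAGAIN when the system-wide limit on outstanding aio contexts is exhausted, which is why the error text points at /proc/sys/fs/aio-max-nr. A tiny Linux-only diagnostic sketch, separate from the patch itself:

    #include <fstream>
    #include <iostream>
    #include <string>

    int main() {
      std::ifstream f("/proc/sys/fs/aio-max-nr");
      std::string v;
      if (f >> v) {
        std::cout << "fs.aio-max-nr = " << v
                  << " (raise with: sysctl fs.aio-max-nr=<larger value>)\n";
      }
    }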
@@ -332,7 +341,7 @@ void KernelDevice::_aio_thread()
while (!aio_stop) {
dout(40) << __func__ << " polling" << dendl;
int max = 16;
- FS::aio_t *aio[max];
+ aio_t *aio[max];
int r = aio_queue.get_next_completed(cct->_conf->bdev_aio_poll_ms,
aio, max);
if (r < 0) {
@@ -363,17 +372,18 @@ void KernelDevice::_aio_thread()
<< " aios left" << dendl;
assert(r >= 0);
- int left = --ioc->num_running;
- // NOTE: once num_running is decremented we can no longer
- // trust aio[] values; they my be freed (e.g., by BlueFS::_fsync)
- if (left == 0) {
- // check waiting count before doing callback (which may
- // destroy this ioc). and avoid ref to ioc after aio_wake()
- // in case that triggers destruction.
- void *priv = ioc->priv;
- ioc->aio_wake();
- if (priv) {
- aio_callback(aio_callback_priv, priv);
+ // NOTE: once we decrement num_running and either call the callback
+ // or aio_wake, we can no longer touch ioc or aio[], as the caller
+ // may free them.
+ if (ioc->priv) {
+ if (--ioc->num_running == 0) {
+ aio_callback(aio_callback_priv, ioc->priv);
+ }
+ } else {
+ if (ioc->num_running == 1) {
+ ioc->aio_wake();
+ } else {
+ --ioc->num_running;
}
}
}
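The restructured completion path encodes a lifetime rule: the final decrement of num_running (or the wake) hands the IOContext back to its owner, so nothing may be read from it afterwards. A simplified sketch of that rule with stub types (aio_wake() is assumed to perform the final decrement and notify under its own lock):

    #include <atomic>
    #include <iostream>

    struct IOContext {
      std::atomic<int> num_running{1};
      void *priv = nullptr;
      // assumed: does the final decrement + notify under its own lock,
      // after which the waiter may free *this
      void aio_wake() { num_running = 0; std::cout << "wake waiter\n"; }
    };

    void on_aio_complete(IOContext *ioc, void (*callback)(void *priv)) {
      if (ioc->priv) {
        void *priv = ioc->priv;      // copy out before the decrement
        if (--ioc->num_running == 0)
          callback(priv);            // may destroy ioc; don't touch it after
      } else {
        if (ioc->num_running == 1)
          ioc->aio_wake();           // last io: wake instead of a bare decrement
        else
          --ioc->num_running;
      }
    }

    int main() {
      IOContext ioc;                 // priv unset -> waiter (aio_wake) path
      on_aio_complete(&ioc, nullptr);
    }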
@@ -433,7 +443,7 @@ void KernelDevice::_aio_log_start(
}
}
-void KernelDevice::debug_aio_link(FS::aio_t& aio)
+void KernelDevice::debug_aio_link(aio_t& aio)
{
if (debug_queue.empty()) {
debug_oldest = &aio;
@@ -441,7 +451,7 @@ void KernelDevice::debug_aio_link(FS::aio_t& aio)
debug_queue.push_back(aio);
}
-void KernelDevice::debug_aio_unlink(FS::aio_t& aio)
+void KernelDevice::debug_aio_unlink(aio_t& aio)
{
if (aio.queue_item.is_linked()) {
debug_queue.erase(debug_queue.iterator_to(aio));
@@ -481,9 +491,9 @@ void KernelDevice::aio_submit(IOContext *ioc)
// move these aside, and get our end iterator position now, as the
// aios might complete as soon as they are submitted and queue more
// wal aio's.
- list<FS::aio_t>::iterator e = ioc->running_aios.begin();
+ list<aio_t>::iterator e = ioc->running_aios.begin();
ioc->running_aios.splice(e, ioc->pending_aios);
- list<FS::aio_t>::iterator p = ioc->running_aios.begin();
+ list<aio_t>::iterator p = ioc->running_aios.begin();
int pending = ioc->num_pending.load();
ioc->num_running += pending;
@@ -492,19 +502,19 @@ void KernelDevice::aio_submit(IOContext *ioc)
bool done = false;
while (!done) {
- FS::aio_t& aio = *p;
+ aio_t& aio = *p;
aio.priv = static_cast<void*>(ioc);
dout(20) << __func__ << " aio " << &aio << " fd " << aio.fd
<< " 0x" << std::hex << aio.offset << "~" << aio.length
<< std::dec << dendl;
- for (vector<iovec>::iterator q = aio.iov.begin(); q != aio.iov.end(); ++q)
- dout(30) << __func__ << " iov " << (void*)q->iov_base
- << " len " << q->iov_len << dendl;
+ for (auto& io : aio.iov)
+ dout(30) << __func__ << " iov " << (void*)io.iov_base
+ << " len " << io.iov_len << dendl;
// be careful: as soon as we submit aio we race with completion.
// since we are holding a ref take care not to dereference txc at
// all after that point.
- list<FS::aio_t>::iterator cur = p;
+ list<aio_t>::iterator cur = p;
++p;
done = (p == e);
@@ -525,6 +535,69 @@ void KernelDevice::aio_submit(IOContext *ioc)
}
}
+int KernelDevice::_sync_write(uint64_t off, bufferlist &bl, bool buffered)
+{
+ uint64_t len = bl.length();
+ dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
+ << std::dec << (buffered ? " (buffered)" : " (direct)") << dendl;
+ if (cct->_conf->bdev_inject_crash &&
+ rand() % cct->_conf->bdev_inject_crash == 0) {
+ derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex
+ << off << "~" << len << std::dec << dendl;
+ ++injecting_crash;
+ return 0;
+ }
+ vector<iovec> iov;
+ bl.prepare_iov(&iov);
+ int r = ::pwritev(buffered ? fd_buffered : fd_direct,
+ &iov[0], iov.size(), off);
+
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " pwritev error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ if (buffered) {
+ // initiate IO (but do not wait)
+ r = ::sync_file_range(fd_buffered, off, len, SYNC_FILE_RANGE_WRITE);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " sync_file_range error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ io_since_flush.store(true);
+
+ return 0;
+}
+
+int KernelDevice::write(
+ uint64_t off,
+ bufferlist &bl,
+ bool buffered)
+{
+ uint64_t len = bl.length();
+ dout(20) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec
+ << (buffered ? " (buffered)" : " (direct)")
+ << dendl;
+ assert(off % block_size == 0);
+ assert(len % block_size == 0);
+ assert(len > 0);
+ assert(off < size);
+ assert(off + len <= size);
+
+ if ((!buffered || bl.get_num_buffers() >= IOV_MAX) &&
+ bl.rebuild_aligned_size_and_memory(block_size, block_size)) {
+ dout(20) << __func__ << " rebuilding buffer to be aligned" << dendl;
+ }
+ dout(40) << "data: ";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+
+ return _sync_write(off, bl, buffered);
+}
+
int KernelDevice::aio_write(
uint64_t off,
bufferlist &bl,
@@ -553,9 +626,9 @@ int KernelDevice::aio_write(
#ifdef HAVE_LIBAIO
if (aio && dio && !buffered) {
- ioc->pending_aios.push_back(FS::aio_t(ioc, fd_direct));
+ ioc->pending_aios.push_back(aio_t(ioc, fd_direct));
++ioc->num_pending;
- FS::aio_t& aio = ioc->pending_aios.back();
+ aio_t& aio = ioc->pending_aios.back();
if (cct->_conf->bdev_inject_crash &&
rand() % cct->_conf->bdev_inject_crash == 0) {
derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex
@@ -579,35 +652,10 @@ int KernelDevice::aio_write(
} else
#endif
{
- dout(5) << __func__ << " 0x" << std::hex << off << "~" << len
- << std::dec << " buffered" << dendl;
- if (cct->_conf->bdev_inject_crash &&
- rand() % cct->_conf->bdev_inject_crash == 0) {
- derr << __func__ << " bdev_inject_crash: dropping io 0x" << std::hex
- << off << "~" << len << std::dec << dendl;
- ++injecting_crash;
- return 0;
- }
- vector<iovec> iov;
- bl.prepare_iov(&iov);
- int r = ::pwritev(buffered ? fd_buffered : fd_direct,
- &iov[0], iov.size(), off);
+ int r = _sync_write(off, bl, buffered);
_aio_log_finish(ioc, off, len);
-
- if (r < 0) {
- r = -errno;
- derr << __func__ << " pwritev error: " << cpp_strerror(r) << dendl;
+ if (r < 0)
return r;
- }
- if (buffered) {
- // initiate IO (but do not wait)
- r = ::sync_file_range(fd_buffered, off, len, SYNC_FILE_RANGE_WRITE);
- if (r < 0) {
- r = -errno;
- derr << __func__ << " sync_file_range error: " << cpp_strerror(r) << dendl;
- return r;
- }
- }
}
return 0;
}
@@ -626,7 +674,6 @@ int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
assert(off + len <= size);
_aio_log_start(ioc, off, len);
- ++ioc->num_reading;
bufferptr p = buffer::create_page_aligned(len);
int r = ::pread(buffered ? fd_buffered : fd_direct,
@@ -644,8 +691,6 @@ int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
out:
_aio_log_finish(ioc, off, len);
- --ioc->num_reading;
- ioc->aio_wake();
return r < 0 ? r : 0;
}
@@ -662,9 +707,9 @@ int KernelDevice::aio_read(
#ifdef HAVE_LIBAIO
if (aio && dio) {
_aio_log_start(ioc, off, len);
- ioc->pending_aios.push_back(FS::aio_t(ioc, fd_direct));
+ ioc->pending_aios.push_back(aio_t(ioc, fd_direct));
++ioc->num_pending;
- FS::aio_t& aio = ioc->pending_aios.back();
+ aio_t& aio = ioc->pending_aios.back();
aio.pread(off, len);
for (unsigned i=0; i<aio.iov.size(); ++i) {
dout(30) << "aio " << i << " " << aio.iov[i].iov_base
diff --git a/src/os/bluestore/KernelDevice.h b/src/os/bluestore/KernelDevice.h
index c89f7c7e621..faccde3d5ba 100644
--- a/src/os/bluestore/KernelDevice.h
+++ b/src/os/bluestore/KernelDevice.h
@@ -18,6 +18,7 @@
#include <atomic>
#include "os/fs/FS.h"
+#include "os/fs/aio.h"
#include "include/interval_set.h"
#include "BlockDevice.h"
@@ -26,7 +27,7 @@ class KernelDevice : public BlockDevice {
int fd_direct, fd_buffered;
uint64_t size;
uint64_t block_size;
- string path;
+ std::string path;
FS *fs;
bool aio, dio;
@@ -36,7 +37,7 @@ class KernelDevice : public BlockDevice {
std::atomic<bool> io_since_flush = {false};
std::mutex flush_mutex;
- FS::aio_queue_t aio_queue;
+ aio_queue_t aio_queue;
aio_callback_t aio_callback;
void *aio_callback_priv;
bool aio_stop;
@@ -59,17 +60,19 @@ class KernelDevice : public BlockDevice {
void _aio_log_start(IOContext *ioc, uint64_t offset, uint64_t length);
void _aio_log_finish(IOContext *ioc, uint64_t offset, uint64_t length);
+ int _sync_write(uint64_t off, bufferlist& bl, bool buffered);
+
int _lock();
int direct_read_unaligned(uint64_t off, uint64_t len, char *buf);
// stalled aio debugging
- FS::aio_list_t debug_queue;
+ aio_list_t debug_queue;
std::mutex debug_queue_lock;
- FS::aio_t *debug_oldest = nullptr;
+ aio_t *debug_oldest = nullptr;
utime_t debug_stall_since;
- void debug_aio_link(FS::aio_t& aio);
- void debug_aio_unlink(FS::aio_t& aio);
+ void debug_aio_link(aio_t& aio);
+ void debug_aio_unlink(aio_t& aio);
public:
KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv);
@@ -83,7 +86,7 @@ public:
return block_size;
}
- int collect_metadata(string prefix, map<string,string> *pm) const override;
+ int collect_metadata(std::string prefix, map<std::string,std::string> *pm) const override;
int read(uint64_t off, uint64_t len, bufferlist *pbl,
IOContext *ioc,
@@ -92,6 +95,7 @@ public:
IOContext *ioc) override;
int read_random(uint64_t off, uint64_t len, char *buf, bool buffered) override;
+ int write(uint64_t off, bufferlist& bl, bool buffered) override;
int aio_write(uint64_t off, bufferlist& bl,
IOContext *ioc,
bool buffered) override;
@@ -99,7 +103,7 @@ public:
// for managing buffered readers/writers
int invalidate_cache(uint64_t off, uint64_t len) override;
- int open(const string& path) override;
+ int open(const std::string& path) override;
void close() override;
};
diff --git a/src/os/bluestore/NVMEDevice.cc b/src/os/bluestore/NVMEDevice.cc
index 0ffdff3dda5..89b8b5552a3 100644
--- a/src/os/bluestore/NVMEDevice.cc
+++ b/src/os/bluestore/NVMEDevice.cc
@@ -811,11 +811,16 @@ void io_complete(void *t, const struct spdk_nvme_cpl *completion)
<< queue->queue_op_seq - queue->completed_op_seq << dendl;
// check waiting count before doing callback (which may
// destroy this ioc).
- if (!--ctx->num_running) {
- ctx->aio_wake();
- if (task->device->aio_callback && ctx->priv) {
+ if (ctx->priv) {
+ if (!--ctx->num_running) {
task->device->aio_callback(task->device->aio_callback_priv, ctx->priv);
}
+ } else {
+ if (ctx->num_running == 1) {
+ ctx->aio_wake();
+ } else {
+ --ctx->num_running;
+ }
}
task->release_segs(queue);
delete task;
@@ -827,16 +832,21 @@ void io_complete(void *t, const struct spdk_nvme_cpl *completion)
task->release_segs(queue);
// read submitted by AIO
if(!task->return_code) {
- if (!--ctx->num_running) {
- ctx->aio_wake();
- if (task->device->aio_callback && ctx->priv) {
+ if (ctx->priv) {
+ if (!--ctx->num_running) {
task->device->aio_callback(task->device->aio_callback_priv, ctx->priv);
- }
+ }
+ } else {
+ if (ctx->num_running == 1) {
+ ctx->aio_wake();
+ } else {
+ --ctx->num_running;
+ }
}
delete task;
} else {
task->return_code = 0;
- if(!--ctx->num_reading) {
+ if (!--ctx->num_running) {
task->io_wake();
}
}
@@ -846,7 +856,6 @@ void io_complete(void *t, const struct spdk_nvme_cpl *completion)
queue->logger->tinc(l_bluestore_nvmedevice_flush_lat, dur);
dout(20) << __func__ << " flush op successfully" << dendl;
task->return_code = 0;
- ctx->aio_wake();
}
}
@@ -1024,6 +1033,15 @@ int NVMEDevice::aio_write(
return 0;
}
+int NVMEDevice::write(uint64_t off, bufferlist &bl, bool buffered)
+{
+ // FIXME: there is presumably a more efficient way to do this...
+ IOContext ioc(NULL);
+ aio_write(off, bl, &ioc, buffered);
+ ioc.aio_wait();
+ return 0;
+}
+
int NVMEDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
IOContext *ioc,
bool buffered)
@@ -1043,7 +1061,7 @@ int NVMEDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
t->fill_cb = [buf, t]() {
t->copy_to_buf(buf, 0, t->len);
};
- ++ioc->num_reading;
+ ++ioc->num_running;
if(queue_id == -1)
queue_id = ceph_gettid();
driver->get_queue(queue_id)->queue_task(t);
@@ -1109,7 +1127,7 @@ int NVMEDevice::read_random(uint64_t off, uint64_t len, char *buf, bool buffered
t->fill_cb = [buf, t, off, len]() {
t->copy_to_buf(buf, off-t->offset, len);
};
- ++ioc.num_reading;
+ ++ioc.num_running;
if(queue_id == -1)
queue_id = ceph_gettid();
driver->get_queue(queue_id)->queue_task(t);
diff --git a/src/os/bluestore/NVMEDevice.h b/src/os/bluestore/NVMEDevice.h
index 318c3415a94..f670e308e43 100644
--- a/src/os/bluestore/NVMEDevice.h
+++ b/src/os/bluestore/NVMEDevice.h
@@ -229,6 +229,7 @@ class NVMEDevice : public BlockDevice {
int aio_write(uint64_t off, bufferlist& bl,
IOContext *ioc,
bool buffered) override;
+ int write(uint64_t off, bufferlist& bl, bool buffered) override;
int flush() override;
int read_random(uint64_t off, uint64_t len, char *buf, bool buffered) override;
diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc
index a340de2f121..e414c8b6d5b 100644
--- a/src/os/bluestore/bluefs_types.cc
+++ b/src/os/bluestore/bluefs_types.cc
@@ -5,28 +5,8 @@
#include "common/Formatter.h"
#include "include/uuid.h"
#include "include/stringify.h"
-#include "include/small_encoding.h"
// bluefs_extent_t
-
-void bluefs_extent_t::encode(bufferlist& bl) const
-{
- ENCODE_START(1, 1, bl);
- small_encode_lba(offset, bl);
- small_encode_varint_lowz(length, bl);
- ::encode(bdev, bl);
- ENCODE_FINISH(bl);
-}
-
-void bluefs_extent_t::decode(bufferlist::iterator& p)
-{
- DECODE_START(1, p);
- small_decode_lba(offset, p);
- small_decode_varint_lowz(length, p);
- ::decode(bdev, p);
- DECODE_FINISH(p);
-}
-
void bluefs_extent_t::dump(Formatter *f) const
{
f->dump_unsigned("offset", offset);
@@ -118,29 +98,6 @@ mempool::bluefs::vector<bluefs_extent_t>::iterator bluefs_fnode_t::seek(
return p;
}
-void bluefs_fnode_t::encode(bufferlist& bl) const
-{
- ENCODE_START(1, 1, bl);
- small_encode_varint(ino, bl);
- small_encode_varint(size, bl);
- ::encode(mtime, bl);
- ::encode(prefer_bdev, bl);
- ::encode(extents, bl);
- ENCODE_FINISH(bl);
-}
-
-void bluefs_fnode_t::decode(bufferlist::iterator& p)
-{
- DECODE_START(1, p);
- small_decode_varint(ino, p);
- small_decode_varint(size, p);
- ::decode(mtime, p);
- ::decode(prefer_bdev, p);
- ::decode(extents, p);
- DECODE_FINISH(p);
- recalc_allocated();
-}
-
void bluefs_fnode_t::dump(Formatter *f) const
{
f->dump_unsigned("ino", ino);
diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h
index 6586f32277f..6a015892d70 100644
--- a/src/os/bluestore/bluefs_types.h
+++ b/src/os/bluestore/bluefs_types.h
@@ -6,6 +6,7 @@
#include "bluestore_types.h"
#include "include/utime.h"
#include "include/encoding.h"
+#include "include/denc.h"
class bluefs_extent_t : public AllocExtent{
public:
@@ -14,12 +15,18 @@ public:
bluefs_extent_t(uint8_t b = 0, uint64_t o = 0, uint32_t l = 0)
: AllocExtent(o, l), bdev(b) {}
- void encode(bufferlist&) const;
- void decode(bufferlist::iterator&);
+ DENC(bluefs_extent_t, v, p) {
+ DENC_START(1, 1, p);
+ denc_lba(v.offset, p);
+ denc_varint_lowz(v.length, p);
+ denc(v.bdev, p);
+ DENC_FINISH(p);
+ }
+
void dump(Formatter *f) const;
static void generate_test_instances(list<bluefs_extent_t*>&);
};
-WRITE_CLASS_ENCODER(bluefs_extent_t)
+WRITE_CLASS_DENC(bluefs_extent_t)
ostream& operator<<(ostream& out, bluefs_extent_t e);
@@ -44,15 +51,23 @@ struct bluefs_fnode_t {
allocated += p.length;
}
+ DENC(bluefs_fnode_t, v, p) {
+ DENC_START(1, 1, p);
+ denc_varint(v.ino, p);
+ denc_varint(v.size, p);
+ denc(v.mtime, p);
+ denc(v.prefer_bdev, p);
+ denc(v.extents, p);
+ DENC_FINISH(p);
+ }
+
mempool::bluefs::vector<bluefs_extent_t>::iterator seek(
uint64_t off, uint64_t *x_off);
- void encode(bufferlist& bl) const;
- void decode(bufferlist::iterator& p);
void dump(Formatter *f) const;
static void generate_test_instances(list<bluefs_fnode_t*>& ls);
};
-WRITE_CLASS_ENCODER(bluefs_fnode_t)
+WRITE_CLASS_DENC(bluefs_fnode_t)
ostream& operator<<(ostream& out, const bluefs_fnode_t& file);
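The DENC bodies collapse the former encode/decode pairs into one definition, and denc_varint/denc_varint_lowz store small integers compactly rather than at full width. Ceph's exact wire format lives in include/denc.h; the general varint idea is just a continuation bit per 7-bit group, as in this generic sketch:

    #include <cstdint>
    #include <iostream>
    #include <string>

    static void put_varint(uint64_t v, std::string *out) {
      while (v >= 0x80) {
        out->push_back(char((v & 0x7f) | 0x80));  // 7 payload bits + continue bit
        v >>= 7;
      }
      out->push_back(char(v));
    }

    int main() {
      std::string buf;
      put_varint(300, &buf);             // encodes in 2 bytes, not 8
      std::cout << buf.size() << " bytes\n";
    }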
diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h
index c38d528c881..f60b53baf1d 100644
--- a/src/os/bluestore/bluestore_types.h
+++ b/src/os/bluestore/bluestore_types.h
@@ -20,7 +20,6 @@
#include "include/types.h"
#include "include/interval_set.h"
#include "include/utime.h"
-#include "include/small_encoding.h"
#include "common/hobject.h"
#include "compressor/Compressor.h"
#include "common/Checksummer.h"
diff --git a/src/os/filestore/FileJournal.cc b/src/os/filestore/FileJournal.cc
index 9956a1f213b..7e6a19cbf0e 100644
--- a/src/os/filestore/FileJournal.cc
+++ b/src/os/filestore/FileJournal.cc
@@ -835,6 +835,8 @@ int FileJournal::prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_
assert(aio_write_queue_bytes >= bytes);
aio_write_queue_bytes -= bytes;
}
+#else
+ (void)bytes;
#endif
}
if (r == -ENOSPC) {
@@ -929,8 +931,11 @@ void FileJournal::queue_completions_thru(uint64_t seq)
}
if (next.finish)
finisher->queue(next.finish);
- if (next.tracked_op)
+ if (next.tracked_op) {
next.tracked_op->mark_event("journaled_completion_queued");
+ next.tracked_op->journal_trace.event("queued completion");
+ next.tracked_op->journal_trace.keyval("completed through", seq);
+ }
items.erase(it++);
}
batch_unpop_completions(items);
@@ -975,8 +980,10 @@ int FileJournal::prepare_single_write(write_item &next_write, bufferlist& bl, of
footerptr.copy_in(post_offset + magic2_offset, sizeof(uint64_t), (char *)&magic2);
bl.claim_append(ebl);
- if (next_write.tracked_op)
+ if (next_write.tracked_op) {
next_write.tracked_op->mark_event("write_thread_in_journal_buffer");
+ next_write.tracked_op->journal_trace.event("prepare_single_write");
+ }
journalq.push_back(pair<uint64_t,off64_t>(seq, queue_pos));
writing_seq = seq;
@@ -1410,7 +1417,10 @@ int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq)
aio_lock.Unlock();
iocb *piocb = &aio.iocb;
- int attempts = 10;
+
+ // delays double from 125us; 16 attempts sum to ~(2^16) * 125us ≈ 8 seconds of sleep
+ int attempts = 16;
+ int delay = 125;
do {
int r = io_submit(aio_ctx, 1, &piocb);
dout(20) << "write_aio_bl io_submit return value: " << r << dendl;
@@ -1418,7 +1428,8 @@ int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq)
derr << "io_submit to " << aio.off << "~" << cur_len
<< " got " << cpp_strerror(r) << dendl;
if (r == -EAGAIN && attempts-- > 0) {
- usleep(500);
+ usleep(delay);
+ delay *= 2;
continue;
}
check_align(pos, tbl);
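This replaces ten fixed 500us sleeps with the same bounded exponential backoff used by the BlueStore aio queue (see src/os/fs/aio.cc below). The retry shape, factored into a hypothetical generic helper:

    #include <unistd.h>
    #include <cerrno>

    // retry an EAGAIN-returning call with doubling sleeps: 125us, 250us, ...
    template <typename F>
    int retry_eagain(F&& submit_once) {
      int attempts = 16;
      int delay_us = 125;
      while (true) {
        int r = submit_once();
        if (r != -EAGAIN || attempts-- <= 0)
          return r;  // success, hard error, or retries exhausted
        usleep(delay_us);
        delay_us *= 2;
      }
    }

    int main() {
      int calls = 0;
      return retry_eagain([&] { return ++calls < 3 ? -EAGAIN : 0; });
    }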
@@ -1610,6 +1621,14 @@ void FileJournal::submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
logger->inc(l_filestore_journal_bytes, e.length());
}
+ if (osd_op) {
+ osd_op->mark_event("commit_queued_for_journal_write");
+ if (osd_op->store_trace) {
+ osd_op->journal_trace.init("journal", &trace_endpoint, &osd_op->store_trace);
+ osd_op->journal_trace.event("submit_entry");
+ osd_op->journal_trace.keyval("seq", seq);
+ }
+ }
{
Mutex::Locker l1(writeq_lock);
#ifdef HAVE_LIBAIO
@@ -1629,6 +1648,8 @@ void FileJournal::submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
if (writeq.empty())
writeq_cond.Signal();
writeq.push_back(write_item(seq, e, orig_len, osd_op));
+ if (osd_op)
+ osd_op->journal_trace.keyval("queue depth", writeq.size());
}
}
diff --git a/src/os/filestore/FileJournal.h b/src/os/filestore/FileJournal.h
index 1cde1876272..ff65cdb33b7 100644
--- a/src/os/filestore/FileJournal.h
+++ b/src/os/filestore/FileJournal.h
@@ -25,7 +25,7 @@ using std::deque;
#include "common/Thread.h"
#include "common/Throttle.h"
#include "JournalThrottle.h"
-
+#include "common/zipkin_trace.h"
#ifdef HAVE_LIBAIO
# include <libaio.h>
@@ -46,8 +46,7 @@ public:
Context *finish;
utime_t start;
TrackedOpRef tracked_op;
- completion_item(uint64_t o, Context *c, utime_t s,
- TrackedOpRef opref)
+ completion_item(uint64_t o, Context *c, utime_t s, TrackedOpRef opref)
: seq(o), finish(c), start(s), tracked_op(opref) {}
completion_item() : seq(0), finish(0), start(0) {}
};
@@ -56,6 +55,7 @@ public:
bufferlist bl;
uint32_t orig_len;
TrackedOpRef tracked_op;
+ ZTracer::Trace trace;
write_item(uint64_t s, bufferlist& b, int ol, TrackedOpRef opref) :
seq(s), orig_len(ol), tracked_op(opref) {
bl.claim(b, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
@@ -386,6 +386,8 @@ private:
return ROUND_UP_TO(sizeof(header), block_size);
}
+ ZTracer::Endpoint trace_endpoint;
+
public:
FileJournal(CephContext* cct, uuid_d fsid, Finisher *fin, Cond *sync_cond,
const char *f, bool dio=false, bool ai=true, bool faio=false) :
@@ -420,7 +422,8 @@ private:
write_stop(true),
aio_stop(true),
write_thread(this),
- write_finish_thread(this) {
+ write_finish_thread(this),
+ trace_endpoint("0.0.0.0", 0, "FileJournal") {
if (aio && !directio) {
lderr(cct) << "FileJournal::_open_any: aio not supported without directio; disabling aio" << dendl;
diff --git a/src/os/filestore/FileStore.cc b/src/os/filestore/FileStore.cc
index 2322732ec67..d262725f65b 100644
--- a/src/os/filestore/FileStore.cc
+++ b/src/os/filestore/FileStore.cc
@@ -547,6 +547,7 @@ FileStore::FileStore(CephContext* cct, const std::string &base,
op_wq(this, cct->_conf->filestore_op_thread_timeout,
cct->_conf->filestore_op_thread_suicide_timeout, &op_tp),
logger(NULL),
+ trace_endpoint("0.0.0.0", 0, "FileStore"),
read_error_lock("FileStore::read_error_lock"),
m_filestore_commit_timeout(cct->_conf->filestore_commit_timeout),
m_filestore_journal_parallel(cct->_conf->filestore_journal_parallel ),
@@ -1963,6 +1964,7 @@ void FileStore::queue_op(OpSequencer *osr, Op *o)
// sequencer, the op order will be preserved.
osr->queue(o);
+ o->trace.event("queued");
logger->inc(l_filestore_ops);
logger->inc(l_filestore_bytes, o->bytes);
@@ -2008,9 +2010,12 @@ void FileStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle)
osr->apply_lock.Lock();
Op *o = osr->peek_queue();
+ o->trace.event("op_apply_start");
apply_manager.op_apply_start(o->op);
dout(5) << "_do_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " start" << dendl;
+ o->trace.event("_do_transactions start");
int r = _do_transactions(o->tls, o->op, &handle);
+ o->trace.event("op_apply_finish");
apply_manager.op_apply_finish(o->op);
dout(10) << "_do_op " << o << " seq " << o->op << " r = " << r
<< ", finisher " << o->onreadable << " " << o->onreadable_sync << dendl;
@@ -2029,6 +2034,7 @@ void FileStore::_finish_op(OpSequencer *osr)
dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " lat " << lat << dendl;
osr->apply_lock.Unlock(); // locked in _do_op
+ o->trace.event("_finish_op");
// called with tp lock held
op_queue_release_throttle(o);
@@ -2100,6 +2106,12 @@ int FileStore::queue_transactions(Sequencer *posr, vector<Transaction>& tls,
(*i).set_osr(osr);
}
+ ZTracer::Trace trace;
+ if (osd_op && osd_op->pg_trace) {
+ osd_op->store_trace.init("filestore op", &trace_endpoint, &osd_op->pg_trace);
+ trace = osd_op->store_trace;
+ }
+
if (journal && journal->is_writeable() && !m_filestore_journal_trailing) {
Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);
@@ -2118,6 +2130,7 @@ int FileStore::queue_transactions(Sequencer *posr, vector<Transaction>& tls,
uint64_t op_num = submit_manager.op_submit_start();
o->op = op_num;
+ trace.keyval("opnum", op_num);
if (m_filestore_do_dump)
dump_transactions(o->tls, o->op, osr);
@@ -2125,15 +2138,20 @@ int FileStore::queue_transactions(Sequencer *posr, vector<Transaction>& tls,
if (m_filestore_journal_parallel) {
dout(5) << "queue_transactions (parallel) " << o->op << " " << o->tls << dendl;
+ trace.keyval("journal mode", "parallel");
+ trace.event("journal started");
_op_journal_transactions(tbl, orig_len, o->op, ondisk, osd_op);
// queue inside submit_manager op submission lock
queue_op(osr, o);
+ trace.event("op queued");
} else if (m_filestore_journal_writeahead) {
dout(5) << "queue_transactions (writeahead) " << o->op << " " << o->tls << dendl;
osr->queue_journal(o->op);
+ trace.keyval("journal mode", "writeahead");
+ trace.event("journal started");
_op_journal_transactions(tbl, orig_len, o->op,
new C_JournaledAhead(this, osr, o, ondisk),
osd_op);
@@ -2165,6 +2183,9 @@ int FileStore::queue_transactions(Sequencer *posr, vector<Transaction>& tls,
dump_transactions(o->tls, o->op, osr);
queue_op(osr, o);
+ trace.keyval("opnum", op_num);
+ trace.keyval("journal mode", "none");
+ trace.event("op queued");
if (ondisk)
apply_manager.add_waiter(op_num, ondisk);
@@ -2187,10 +2208,15 @@ int FileStore::queue_transactions(Sequencer *posr, vector<Transaction>& tls,
if (m_filestore_do_dump)
dump_transactions(tls, op, osr);
+ trace.event("op_apply_start");
+ trace.keyval("opnum", op);
+ trace.keyval("journal mode", "trailing");
apply_manager.op_apply_start(op);
+ trace.event("do_transactions");
int r = do_transactions(tls, op);
if (r >= 0) {
+ trace.event("journal started");
_op_journal_transactions(tbl, orig_len, op, ondisk, osd_op);
} else {
delete ondisk;
@@ -2204,6 +2230,7 @@ int FileStore::queue_transactions(Sequencer *posr, vector<Transaction>& tls,
apply_finishers[osr->id % m_apply_finisher_num]->queue(onreadable, r);
submit_manager.op_submit_finish(op);
+ trace.event("op_apply_finish");
apply_manager.op_apply_finish(op);
utime_t end = ceph_clock_now();
@@ -2215,6 +2242,8 @@ void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk)
{
dout(5) << "_journaled_ahead " << o << " seq " << o->op << " " << *osr << " " << o->tls << dendl;
+ o->trace.event("writeahead journal finished");
+
// this should queue in order because the journal does its completions in order.
queue_op(osr, o);
@@ -3257,17 +3286,14 @@ more:
i++;
last = extent++;
}
- const bool is_last = last->fe_flags & FIEMAP_EXTENT_LAST;
+ uint64_t xoffset = last->fe_logical + last->fe_length - offset;
+ offset = last->fe_logical + last->fe_length;
+ len -= xoffset;
+ const bool is_last = (last->fe_flags & FIEMAP_EXTENT_LAST) || (len == 0);
+ free(fiemap);
if (!is_last) {
- uint64_t xoffset = last->fe_logical + last->fe_length - offset;
- offset = last->fe_logical + last->fe_length;
- len -= xoffset;
- free(fiemap); /* fix clang warn: use-after-free */
goto more;
}
- else {
- free(fiemap);
- }
return r;
}
diff --git a/src/os/filestore/FileStore.h b/src/os/filestore/FileStore.h
index bbae4bc3349..f80b807bd78 100644
--- a/src/os/filestore/FileStore.h
+++ b/src/os/filestore/FileStore.h
@@ -34,6 +34,7 @@ using namespace std;
#include "common/Timer.h"
#include "common/WorkQueue.h"
#include "common/perf_counters.h"
+#include "common/zipkin_trace.h"
#include "common/Mutex.h"
#include "HashIndex.h"
@@ -216,6 +217,7 @@ private:
Context *onreadable, *onreadable_sync;
uint64_t ops, bytes;
TrackedOpRef osd_op;
+ ZTracer::Trace trace;
};
class OpSequencer : public Sequencer_impl {
Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
@@ -290,6 +292,7 @@ private:
void queue(Op *o) {
Mutex::Locker l(qlock);
q.push_back(o);
+ o->trace.keyval("queue depth", q.size());
}
Op *peek_queue() {
Mutex::Locker l(qlock);
@@ -419,6 +422,8 @@ private:
PerfCounters *logger;
+ ZTracer::Endpoint trace_endpoint;
+
public:
int lfn_find(const ghobject_t& oid, const Index& index,
IndexedPath *path = NULL);
diff --git a/src/os/filestore/Journal.h b/src/os/filestore/Journal.h
index 7bc09ebfb1a..9c1206cd257 100644
--- a/src/os/filestore/Journal.h
+++ b/src/os/filestore/Journal.h
@@ -23,6 +23,7 @@
#include "common/Finisher.h"
#include "common/TrackedOp.h"
#include "os/ObjectStore.h"
+#include "common/zipkin_trace.h"
class PerfCounters;
diff --git a/src/os/filestore/chain_xattr.cc b/src/os/filestore/chain_xattr.cc
index 0461c1953db..97c547e1426 100644
--- a/src/os/filestore/chain_xattr.cc
+++ b/src/os/filestore/chain_xattr.cc
@@ -2,29 +2,17 @@
// vim: ts=8 sw=2 smarttab
#include "chain_xattr.h"
-
-#include "include/int_types.h"
-
-#include <unistd.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <sys/file.h>
-#include <errno.h>
-#include <dirent.h>
-#include <sys/ioctl.h>
-#include <string.h>
-#include <stdio.h>
-#include "include/assert.h"
+#include <errno.h> // for ERANGE, ENODATA, ENOMEM
+#include <stdio.h> // for size_t, snprintf
+#include <stdlib.h> // for free, malloc
+#include <string.h> // for strcpy, strlen
+#include "include/assert.h" // for assert
+#include "include/buffer.h"
#if defined(__linux__)
#include <linux/fs.h>
#endif
-#include "common/xattr.h"
-#include "include/compat.h"
-
/*
* chaining xattrs
*
diff --git a/src/os/filestore/chain_xattr.h b/src/os/filestore/chain_xattr.h
index 9764c347751..8f78643f276 100644
--- a/src/os/filestore/chain_xattr.h
+++ b/src/os/filestore/chain_xattr.h
@@ -4,14 +4,11 @@
#ifndef __CEPH_OSD_CHAIN_XATTR_H
#define __CEPH_OSD_CHAIN_XATTR_H
-#include "include/compat.h"
+#include <errno.h>
+#include <stdio.h>
#include "common/xattr.h"
#include "include/assert.h"
-#include "include/buffer.h"
-#include <string.h>
-#include <stdio.h>
-
-#include <errno.h>
+#include "include/buffer_fwd.h"
#if defined(__linux__)
#include <linux/limits.h>
diff --git a/src/os/fs/FS.cc b/src/os/fs/FS.cc
index b9798872b1c..d15a6bf82f2 100644
--- a/src/os/fs/FS.cc
+++ b/src/os/fs/FS.cc
@@ -185,47 +185,3 @@ int FS::zero(int fd, uint64_t offset, uint64_t length)
// ---------------
-#if defined(HAVE_LIBAIO)
-int FS::aio_queue_t::submit(aio_t &aio, int *retries)
-{
- // 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds
- int attempts = 16;
- int delay = 125;
- iocb *piocb = &aio.iocb;
- while (true) {
- int r = io_submit(ctx, 1, &piocb);
- if (r < 0) {
- if (r == -EAGAIN && attempts-- > 0) {
- usleep(delay);
- delay *= 2;
- (*retries)++;
- continue;
- }
- return r;
- }
- assert(r == 1);
- break;
- }
- return 0;
-}
-
-int FS::aio_queue_t::get_next_completed(int timeout_ms, aio_t **paio, int max)
-{
- io_event event[max];
- struct timespec t = {
- timeout_ms / 1000,
- (timeout_ms % 1000) * 1000 * 1000
- };
-
- int r = 0;
- do {
- r = io_getevents(ctx, 1, max, event, &t);
- } while (r == -EINTR);
-
- for (int i=0; i<r; ++i) {
- paio[i] = (aio_t *)event[i].obj;
- paio[i]->rval = event[i].res;
- }
- return r;
-}
-#endif
diff --git a/src/os/fs/FS.h b/src/os/fs/FS.h
index 0dc8dcb3764..aafa64e5350 100644
--- a/src/os/fs/FS.h
+++ b/src/os/fs/FS.h
@@ -18,17 +18,11 @@
#include <errno.h>
#include <time.h>
-#include "acconfig.h"
-#ifdef HAVE_LIBAIO
-# include <libaio.h>
-#endif
-
#include <string>
#include "include/types.h"
#include "common/Mutex.h"
#include "common/Cond.h"
-#include <boost/intrusive/list.hpp>
class FS {
public:
@@ -52,75 +46,6 @@ public:
virtual int zero(int fd, uint64_t offset, uint64_t length);
// -- aio --
-#if defined(HAVE_LIBAIO)
- struct aio_t {
- struct iocb iocb; // must be first element; see shenanigans in aio_queue_t
- void *priv;
- int fd;
- vector<iovec> iov;
- uint64_t offset, length;
- int rval;
- bufferlist bl; ///< write payload (so that it remains stable for duration)
-
- boost::intrusive::list_member_hook<> queue_item;
-
- aio_t(void *p, int f) : priv(p), fd(f), offset(0), length(0), rval(-1000) {
- }
-
- void pwritev(uint64_t _offset, uint64_t len) {
- offset = _offset;
- length = len;
- io_prep_pwritev(&iocb, fd, &iov[0], iov.size(), offset);
- }
- void pread(uint64_t _offset, uint64_t len) {
- offset = _offset;
- length = len;
- bufferptr p = buffer::create_page_aligned(length);
- io_prep_pread(&iocb, fd, p.c_str(), length, offset);
- bl.append(std::move(p));
- }
-
- int get_return_value() {
- return rval;
- }
- };
-
- typedef boost::intrusive::list<
- aio_t,
- boost::intrusive::member_hook<
- aio_t,
- boost::intrusive::list_member_hook<>,
- &aio_t::queue_item> > aio_list_t;
-
- struct aio_queue_t {
- int max_iodepth;
- io_context_t ctx;
-
-
- explicit aio_queue_t(unsigned max_iodepth)
- : max_iodepth(max_iodepth),
- ctx(0) {
- }
- ~aio_queue_t() {
- assert(ctx == 0);
- }
-
- int init() {
- assert(ctx == 0);
- return io_setup(max_iodepth, &ctx);
- }
- void shutdown() {
- if (ctx) {
- int r = io_destroy(ctx);
- assert(r == 0);
- ctx = 0;
- }
- }
-
- int submit(aio_t &aio, int *retries);
- int get_next_completed(int timeout_ms, aio_t **paio, int max);
- };
-#endif
};
#endif
diff --git a/src/os/fs/aio.cc b/src/os/fs/aio.cc
new file mode 100644
index 00000000000..a5edf626665
--- /dev/null
+++ b/src/os/fs/aio.cc
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "aio.h"
+
+#if defined(HAVE_LIBAIO)
+
+int aio_queue_t::submit(aio_t &aio, int *retries)
+{
+ // delays double from 125us; 16 attempts sum to ~(2^16) * 125us ≈ 8 seconds of sleep
+ int attempts = 16;
+ int delay = 125;
+ iocb *piocb = &aio.iocb;
+ while (true) {
+ int r = io_submit(ctx, 1, &piocb);
+ if (r < 0) {
+ if (r == -EAGAIN && attempts-- > 0) {
+ usleep(delay);
+ delay *= 2;
+ (*retries)++;
+ continue;
+ }
+ return r;
+ }
+ assert(r == 1);
+ break;
+ }
+ return 0;
+}
+
+int aio_queue_t::get_next_completed(int timeout_ms, aio_t **paio, int max)
+{
+ io_event event[max];
+ struct timespec t = {
+ timeout_ms / 1000,
+ (timeout_ms % 1000) * 1000 * 1000
+ };
+
+ int r = 0;
+ do {
+ r = io_getevents(ctx, 1, max, event, &t);
+ } while (r == -EINTR);
+
+ for (int i=0; i<r; ++i) {
+ paio[i] = (aio_t *)event[i].obj;
+ paio[i]->rval = event[i].res;
+ }
+ return r;
+}
+
+#endif
diff --git a/src/os/fs/aio.h b/src/os/fs/aio.h
new file mode 100644
index 00000000000..c4757158cc9
--- /dev/null
+++ b/src/os/fs/aio.h
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "acconfig.h"
+#ifdef HAVE_LIBAIO
+# include <libaio.h>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/container/small_vector.hpp>
+
+#include "include/buffer.h"
+
+struct aio_t {
+ struct iocb iocb; // must be first element; see shenanigans in aio_queue_t
+ void *priv;
+ int fd;
+ boost::container::small_vector<iovec,4> iov;
+ uint64_t offset, length;
+ int rval;
+ bufferlist bl; ///< write payload (so that it remains stable for duration)
+
+ boost::intrusive::list_member_hook<> queue_item;
+
+ aio_t(void *p, int f) : priv(p), fd(f), offset(0), length(0), rval(-1000) {
+ }
+
+ void pwritev(uint64_t _offset, uint64_t len) {
+ offset = _offset;
+ length = len;
+ io_prep_pwritev(&iocb, fd, &iov[0], iov.size(), offset);
+ }
+ void pread(uint64_t _offset, uint64_t len) {
+ offset = _offset;
+ length = len;
+ bufferptr p = buffer::create_page_aligned(length);
+ io_prep_pread(&iocb, fd, p.c_str(), length, offset);
+ bl.append(std::move(p));
+ }
+
+ int get_return_value() {
+ return rval;
+ }
+};
+
+typedef boost::intrusive::list<
+ aio_t,
+ boost::intrusive::member_hook<
+ aio_t,
+ boost::intrusive::list_member_hook<>,
+ &aio_t::queue_item> > aio_list_t;
+
+struct aio_queue_t {
+ int max_iodepth;
+ io_context_t ctx;
+
+ explicit aio_queue_t(unsigned max_iodepth)
+ : max_iodepth(max_iodepth),
+ ctx(0) {
+ }
+ ~aio_queue_t() {
+ assert(ctx == 0);
+ }
+
+ int init() {
+ assert(ctx == 0);
+ int r = io_setup(max_iodepth, &ctx);
+ if (r < 0) {
+ if (ctx) {
+ io_destroy(ctx);
+ ctx = 0;
+ }
+ }
+ return r;
+ }
+ void shutdown() {
+ if (ctx) {
+ int r = io_destroy(ctx);
+ assert(r == 0);
+ ctx = 0;
+ }
+ }
+
+ int submit(aio_t &aio, int *retries);
+ int get_next_completed(int timeout_ms, aio_t **paio, int max);
+};
+
+#endif
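For readers unfamiliar with the kernel API this new header wraps: the aio_queue_t lifecycle maps one-to-one onto libaio calls. A standalone sketch using raw libaio (Linux, link with -laio; KernelDevice additionally pairs this with O_DIRECT):

    #include <libaio.h>
    #include <cassert>
    #include <cstring>
    #include <fcntl.h>
    #include <unistd.h>

    int main() {
      io_context_t ctx = 0;
      int r = io_setup(16, &ctx);                 // aio_queue_t::init()
      assert(r == 0);

      int fd = ::open("/tmp/aio-demo", O_CREAT | O_WRONLY, 0644);
      assert(fd >= 0);

      static char buf[4096];
      memset(buf, 'x', sizeof(buf));

      iocb cb;
      iocb *cbs[1] = {&cb};
      io_prep_pwrite(&cb, fd, buf, sizeof(buf), 0);
      r = io_submit(ctx, 1, cbs);                 // aio_queue_t::submit()
      assert(r == 1);

      io_event ev;
      r = io_getevents(ctx, 1, 1, &ev, nullptr);  // ...::get_next_completed()
      assert(r == 1 && (long)ev.res == (long)sizeof(buf));

      io_destroy(ctx);                            // ...::shutdown()
      ::close(fd);
    }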
diff --git a/src/os/kv.h b/src/os/kv.h
index c3d5bd13a71..64048b088e2 100644
--- a/src/os/kv.h
+++ b/src/os/kv.h
@@ -14,7 +14,7 @@ inline static void _key_encode_u32(uint32_t u, T *key) {
#ifdef CEPH_BIG_ENDIAN
bu = u;
#elif defined(CEPH_LITTLE_ENDIAN)
- bu = swab32(u);
+ bu = swab(u);
#else
# error wtf
#endif
@@ -27,7 +27,7 @@ inline static void _key_encode_u32(uint32_t u, size_t pos, T *key) {
#ifdef CEPH_BIG_ENDIAN
bu = u;
#elif defined(CEPH_LITTLE_ENDIAN)
- bu = swab32(u);
+ bu = swab(u);
#else
# error wtf
#endif
@@ -40,7 +40,7 @@ inline static const char *_key_decode_u32(const char *key, uint32_t *pu) {
#ifdef CEPH_BIG_ENDIAN
*pu = bu;
#elif defined(CEPH_LITTLE_ENDIAN)
- *pu = swab32(bu);
+ *pu = swab(bu);
#else
# error wtf
#endif
@@ -53,7 +53,7 @@ inline static void _key_encode_u64(uint64_t u, T *key) {
#ifdef CEPH_BIG_ENDIAN
bu = u;
#elif defined(CEPH_LITTLE_ENDIAN)
- bu = swab64(u);
+ bu = swab(u);
#else
# error wtf
#endif
@@ -66,7 +66,7 @@ inline static const char *_key_decode_u64(const char *key, uint64_t *pu) {
#ifdef CEPH_BIG_ENDIAN
*pu = bu;
#elif defined(CEPH_LITTLE_ENDIAN)
- *pu = swab64(bu);
+ *pu = swab(bu);
#else
# error wtf
#endif
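The swab32/swab64 calls collapse into one templated swab(), but the byteswap's purpose is unchanged: keys are stored big-endian so the byte-wise ordering used by an ordered KV store matches numeric ordering. A sketch of that property (using __builtin_bswap64 in place of Ceph's swab()):

    #include <cstdint>
    #include <iostream>
    #include <string>

    // hypothetical helper mirroring _key_encode_u64 on a little-endian host
    static void key_encode_u64(uint64_t u, std::string *key) {
      uint64_t be = __builtin_bswap64(u);
      key->append(reinterpret_cast<const char*>(&be), sizeof(be));
    }

    int main() {
      std::string a, b;
      key_encode_u64(2, &a);
      key_encode_u64(256, &b);
      // lexicographic order of the encoded keys matches numeric order
      std::cout << (a < b ? "ordered" : "BROKEN") << "\n";
    }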
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index 569bb89529e..53a8814b2c9 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -412,10 +412,12 @@ void ECBackend::handle_recovery_read_complete(
op.xattrs.swap(*attrs);
if (!op.obc) {
- // attrs only reference the origin bufferlist (decode from ECSubReadReply message)
- // whose size is much greater than attrs in recovery. If obc cache it (get_obc maybe
- // cache the attr), this causes the whole origin bufferlist would not be free until
- // obc is evicted from obc cache. So rebuild the bufferlist before cache it.
+ // attrs only reference the origin bufferlist (decoded from the
+ // ECSubReadReply message), whose size is much greater than the
+ // attrs needed in recovery. If the obc caches them (get_obc may
+ // cache the attrs), the whole origin bufferlist cannot be freed
+ // until the obc is evicted from the obc cache. So rebuild the
+ // bufferlist before caching it.
for (map<string, bufferlist>::iterator it = op.xattrs.begin();
it != op.xattrs.end();
++it) {
@@ -480,6 +482,7 @@ void ECBackend::dispatch_recovery_messages(RecoveryMessages &m, int priority)
MOSDPGPush *msg = new MOSDPGPush();
msg->set_priority(priority);
msg->map_epoch = get_parent()->get_epoch();
+ msg->min_epoch = get_parent()->get_last_peering_reset_epoch();
msg->from = get_parent()->whoami_shard();
msg->pgid = spg_t(get_parent()->get_info().pgid.pgid, i->first.shard);
msg->pushes.swap(i->second);
@@ -496,6 +499,7 @@ void ECBackend::dispatch_recovery_messages(RecoveryMessages &m, int priority)
MOSDPGPushReply *msg = new MOSDPGPushReply();
msg->set_priority(priority);
msg->map_epoch = get_parent()->get_epoch();
+ msg->min_epoch = get_parent()->get_last_peering_reset_epoch();
msg->from = get_parent()->whoami_shard();
msg->pgid = spg_t(get_parent()->get_info().pgid.pgid, i->first.shard);
msg->replies.swap(i->second);
@@ -703,6 +707,17 @@ void ECBackend::recover_object(
h->ops.back().recovery_info.size = obc->obs.oi.size;
h->ops.back().recovery_info.oi = obc->obs.oi;
}
+ if (hoid.is_snap()) {
+ if (obc) {
+ assert(obc->ssc);
+ h->ops.back().recovery_info.ss = obc->ssc->snapset;
+ } else if (head) {
+ assert(head->ssc);
+ h->ops.back().recovery_info.ss = head->ssc->snapset;
+ } else {
+ assert(0 == "neither obc nor head set for a snap object");
+ }
+ }
h->ops.back().recovery_progress.omap_complete = true;
for (set<pg_shard_t>::const_iterator i =
get_parent()->get_actingbackfill_shards().begin();
@@ -735,13 +750,13 @@ bool ECBackend::handle_message(
// not conflict with ECSubWrite's operator<<.
MOSDECSubOpWrite *op = static_cast<MOSDECSubOpWrite*>(
_op->get_nonconst_req());
- handle_sub_write(op->op.from, _op, op->op);
+ handle_sub_write(op->op.from, _op, op->op, _op->pg_trace);
return true;
}
case MSG_OSD_EC_WRITE_REPLY: {
const MOSDECSubOpWriteReply *op = static_cast<const MOSDECSubOpWriteReply*>(
_op->get_req());
- handle_sub_write_reply(op->op.from, op->op);
+ handle_sub_write_reply(op->op.from, op->op, _op->pg_trace);
return true;
}
case MSG_OSD_EC_READ: {
@@ -749,7 +764,9 @@ bool ECBackend::handle_message(
MOSDECSubOpReadReply *reply = new MOSDECSubOpReadReply;
reply->pgid = get_parent()->primary_spg_t();
reply->map_epoch = get_parent()->get_epoch();
- handle_sub_read(op->op.from, op->op, &(reply->op));
+ reply->min_epoch = get_parent()->get_interval_start_epoch();
+ handle_sub_read(op->op.from, op->op, &(reply->op), _op->pg_trace);
+ reply->trace = _op->pg_trace;
get_parent()->send_message_osd_cluster(
op->op.from.osd, reply, get_parent()->get_epoch());
return true;
@@ -760,7 +777,7 @@ bool ECBackend::handle_message(
MOSDECSubOpReadReply *op = static_cast<MOSDECSubOpReadReply*>(
_op->get_nonconst_req());
RecoveryMessages rm;
- handle_sub_read_reply(op->op.from, op->op, &rm);
+ handle_sub_read_reply(op->op.from, op->op, &rm, _op->pg_trace);
dispatch_recovery_messages(rm, priority);
return true;
}
@@ -799,22 +816,25 @@ struct SubWriteCommitted : public Context {
ceph_tid_t tid;
eversion_t version;
eversion_t last_complete;
+ const ZTracer::Trace trace;
SubWriteCommitted(
ECBackend *pg,
OpRequestRef msg,
ceph_tid_t tid,
eversion_t version,
- eversion_t last_complete)
+ eversion_t last_complete,
+ const ZTracer::Trace &trace)
: pg(pg), msg(msg), tid(tid),
- version(version), last_complete(last_complete) {}
+ version(version), last_complete(last_complete), trace(trace) {}
void finish(int) override {
if (msg)
msg->mark_event("sub_op_committed");
- pg->sub_write_committed(tid, version, last_complete);
+ pg->sub_write_committed(tid, version, last_complete, trace);
}
};
void ECBackend::sub_write_committed(
- ceph_tid_t tid, eversion_t version, eversion_t last_complete) {
+ ceph_tid_t tid, eversion_t version, eversion_t last_complete,
+ const ZTracer::Trace &trace) {
if (get_parent()->pgb_is_primary()) {
ECSubWriteReply reply;
reply.tid = tid;
@@ -823,17 +843,20 @@ void ECBackend::sub_write_committed(
reply.from = get_parent()->whoami_shard();
handle_sub_write_reply(
get_parent()->whoami_shard(),
- reply);
+ reply, trace);
} else {
get_parent()->update_last_complete_ondisk(last_complete);
MOSDECSubOpWriteReply *r = new MOSDECSubOpWriteReply;
r->pgid = get_parent()->primary_spg_t();
r->map_epoch = get_parent()->get_epoch();
+ r->min_epoch = get_parent()->get_interval_start_epoch();
r->op.tid = tid;
r->op.last_complete = last_complete;
r->op.committed = true;
r->op.from = get_parent()->whoami_shard();
r->set_priority(CEPH_MSG_PRIO_HIGH);
+ r->trace = trace;
+ r->trace.event("sending sub op commit");
get_parent()->send_message_osd_cluster(
get_parent()->primary_shard().osd, r, get_parent()->get_epoch());
}
@@ -844,20 +867,23 @@ struct SubWriteApplied : public Context {
OpRequestRef msg;
ceph_tid_t tid;
eversion_t version;
+ const ZTracer::Trace trace;
SubWriteApplied(
ECBackend *pg,
OpRequestRef msg,
ceph_tid_t tid,
- eversion_t version)
- : pg(pg), msg(msg), tid(tid), version(version) {}
+ eversion_t version,
+ const ZTracer::Trace &trace)
+ : pg(pg), msg(msg), tid(tid), version(version), trace(trace) {}
void finish(int) override {
if (msg)
msg->mark_event("sub_op_applied");
- pg->sub_write_applied(tid, version);
+ pg->sub_write_applied(tid, version, trace);
}
};
void ECBackend::sub_write_applied(
- ceph_tid_t tid, eversion_t version) {
+ ceph_tid_t tid, eversion_t version,
+ const ZTracer::Trace &trace) {
parent->op_applied(version);
if (get_parent()->pgb_is_primary()) {
ECSubWriteReply reply;
@@ -866,15 +892,18 @@ void ECBackend::sub_write_applied(
reply.applied = true;
handle_sub_write_reply(
get_parent()->whoami_shard(),
- reply);
+ reply, trace);
} else {
MOSDECSubOpWriteReply *r = new MOSDECSubOpWriteReply;
r->pgid = get_parent()->primary_spg_t();
r->map_epoch = get_parent()->get_epoch();
+ r->min_epoch = get_parent()->get_interval_start_epoch();
r->op.from = get_parent()->whoami_shard();
r->op.tid = tid;
r->op.applied = true;
r->set_priority(CEPH_MSG_PRIO_HIGH);
+ r->trace = trace;
+ r->trace.event("sending sub op apply");
get_parent()->send_message_osd_cluster(
get_parent()->primary_shard().osd, r, get_parent()->get_epoch());
}
@@ -884,10 +913,12 @@ void ECBackend::handle_sub_write(
pg_shard_t from,
OpRequestRef msg,
ECSubWrite &op,
+ const ZTracer::Trace &trace,
Context *on_local_applied_sync)
{
if (msg)
msg->mark_started();
+ trace.event("handle_sub_write");
assert(!get_parent()->get_log().get_missing().is_missing(op.soid));
if (!get_parent()->pgb_is_primary())
get_parent()->update_stats(op.stats);
@@ -932,10 +963,10 @@ void ECBackend::handle_sub_write(
new SubWriteCommitted(
this, msg, op.tid,
op.at_version,
- get_parent()->get_info().last_complete)));
+ get_parent()->get_info().last_complete, trace)));
localt.register_on_applied(
get_parent()->bless_context(
- new SubWriteApplied(this, msg, op.tid, op.at_version)));
+ new SubWriteApplied(this, msg, op.tid, op.at_version, trace)));
vector<ObjectStore::Transaction> tls;
tls.reserve(2);
tls.push_back(std::move(op.t));
@@ -946,8 +977,10 @@ void ECBackend::handle_sub_write(
void ECBackend::handle_sub_read(
pg_shard_t from,
const ECSubRead &op,
- ECSubReadReply *reply)
+ ECSubReadReply *reply,
+ const ZTracer::Trace &trace)
{
+ trace.event("handle sub read");
shard_id_t shard = get_parent()->whoami_shard().shard;
for(auto i = op.to_read.begin();
i != op.to_read.end();
@@ -1041,11 +1074,13 @@ error:
void ECBackend::handle_sub_write_reply(
pg_shard_t from,
- const ECSubWriteReply &op)
+ const ECSubWriteReply &op,
+ const ZTracer::Trace &trace)
{
map<ceph_tid_t, Op>::iterator i = tid_to_op_map.find(op.tid);
assert(i != tid_to_op_map.end());
if (op.committed) {
+ trace.event("sub write committed");
assert(i->second.pending_commit.count(from));
i->second.pending_commit.erase(from);
if (from != get_parent()->whoami_shard()) {
@@ -1053,6 +1088,7 @@ void ECBackend::handle_sub_write_reply(
}
}
if (op.applied) {
+ trace.event("sub write applied");
assert(i->second.pending_apply.count(from));
i->second.pending_apply.erase(from);
}
@@ -1061,11 +1097,13 @@ void ECBackend::handle_sub_write_reply(
dout(10) << __func__ << " Calling on_all_applied on " << i->second << dendl;
i->second.on_all_applied->complete(0);
i->second.on_all_applied = 0;
+ i->second.trace.event("ec write all applied");
}
if (i->second.pending_commit.empty() && i->second.on_all_commit) {
dout(10) << __func__ << " Calling on_all_commit on " << i->second << dendl;
i->second.on_all_commit->complete(0);
i->second.on_all_commit = 0;
+ i->second.trace.event("ec write all committed");
}
check_ops();
}
@@ -1073,8 +1111,10 @@ void ECBackend::handle_sub_write_reply(
void ECBackend::handle_sub_read_reply(
pg_shard_t from,
ECSubReadReply &op,
- RecoveryMessages *m)
+ RecoveryMessages *m,
+ const ZTracer::Trace &trace)
{
+ trace.event("ec sub read reply");
dout(10) << __func__ << ": reply " << op << dendl;
map<ceph_tid_t, ReadOp>::iterator iter = tid_to_read_map.find(op.tid);
if (iter == tid_to_read_map.end()) {
@@ -1204,6 +1244,7 @@ void ECBackend::handle_sub_read_reply(
}
if (rop.in_progress.empty() || is_complete == rop.complete.size()) {
dout(20) << __func__ << " Complete: " << rop << dendl;
+ rop.trace.event("ec read complete");
complete_read_op(rop, m);
} else {
dout(10) << __func__ << " readop not complete: " << rop << dendl;
@@ -1423,6 +1464,8 @@ void ECBackend::submit_transaction(
op->tid = tid;
op->reqid = reqid;
op->client_op = client_op;
+ if (client_op)
+ op->trace = client_op->pg_trace;
dout(10) << __func__ << ": op " << *op << " starting" << dendl;
start_rmw(op, std::move(t));
@@ -1579,8 +1622,11 @@ void ECBackend::start_read_op(
_op,
std::move(to_read))).first->second;
dout(10) << __func__ << ": starting " << op << dendl;
- do_read_op(
- op);
+ if (_op) {
+ op.trace = _op->pg_trace;
+ op.trace.event("start ec read");
+ }
+ do_read_op(op);
}
void ECBackend::do_read_op(ReadOp &op)
@@ -1636,9 +1682,15 @@ void ECBackend::do_read_op(ReadOp &op)
get_parent()->whoami_spg_t().pgid,
i->first.shard);
msg->map_epoch = get_parent()->get_epoch();
+ msg->min_epoch = get_parent()->get_interval_start_epoch();
msg->op = i->second;
msg->op.from = get_parent()->whoami_shard();
msg->op.tid = tid;
+ if (op.trace) {
+ // initialize a child span for this shard
+ msg->trace.init("ec sub read", nullptr, &op.trace);
+ msg->trace.keyval("shard", i->first.shard.id);
+ }
get_parent()->send_message_osd_cluster(
i->first.osd,
msg,
@@ -1838,6 +1890,8 @@ bool ECBackend::try_reads_to_commit()
trans[i->shard];
}
+ op->trace.event("start ec write");
+
map<hobject_t,extent_map> written;
if (op->plan.t) {
ECTransaction::generate_transactions(
@@ -1919,6 +1973,14 @@ bool ECBackend::try_reads_to_commit()
op->temp_added,
op->temp_cleared,
!should_send);
+
+ ZTracer::Trace trace;
+ if (op->trace) {
+ // initialize a child span for this shard
+ trace.init("ec sub write", nullptr, &op->trace);
+ trace.keyval("shard", i->shard.id);
+ }
+
if (*i == get_parent()->whoami_shard()) {
should_write_local = true;
local_write_op.claim(sop);
@@ -1926,6 +1988,8 @@ bool ECBackend::try_reads_to_commit()
MOSDECSubOpWrite *r = new MOSDECSubOpWrite(sop);
r->pgid = spg_t(get_parent()->primary_spg_t().pgid, i->shard);
r->map_epoch = get_parent()->get_epoch();
+ r->min_epoch = get_parent()->get_interval_start_epoch();
+ r->trace = trace;
get_parent()->send_message_osd_cluster(
i->osd, r, get_parent()->get_epoch());
}
@@ -1935,6 +1999,7 @@ bool ECBackend::try_reads_to_commit()
get_parent()->whoami_shard(),
op->client_op,
local_write_op,
+ op->trace,
op->on_local_applied_sync);
op->on_local_applied_sync = 0;
}
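
[editor's sketch] The ECBackend hunks above thread a ZTracer span through every erasure-coded sub-op: try_reads_to_commit() and do_read_op() open a child span per shard before the MOSDECSubOpWrite/MOSDECSubOpRead goes out, and the receiving side passes the message's pg_trace into handle_sub_write()/handle_sub_read() so commit and apply events land on the same span. A minimal sketch of that child-span pattern, using only the ZTracer::Trace calls that appear in the hunks (init, keyval, event); `parent` and `shard` are stand-ins for op->trace and i->first.shard.id:

#include "common/zipkin_trace.h"

// Sketch only: `parent` stands in for the op-wide span (op->trace) and
// `shard` for the destination shard id used in the hunks above.
ZTracer::Trace make_shard_span(ZTracer::Trace &parent, int shard)
{
  ZTracer::Trace trace;
  if (parent) {                        // spans are no-ops unless tracing is on
    trace.init("ec sub write", nullptr, &parent);  // child of the op span
    trace.keyval("shard", shard);      // tag the span with the target shard
    trace.event("sending sub op");     // timestamped event on the child span
  }
  return trace;                        // then copied onto the outgoing message
}
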
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h
index 96913644fe5..f659675d9ce 100644
--- a/src/osd/ECBackend.h
+++ b/src/osd/ECBackend.h
@@ -58,28 +58,37 @@ public:
friend struct SubWriteApplied;
friend struct SubWriteCommitted;
void sub_write_applied(
- ceph_tid_t tid, eversion_t version);
+ ceph_tid_t tid,
+ eversion_t version,
+ const ZTracer::Trace &trace);
void sub_write_committed(
- ceph_tid_t tid, eversion_t version, eversion_t last_complete);
+ ceph_tid_t tid,
+ eversion_t version,
+ eversion_t last_complete,
+ const ZTracer::Trace &trace);
void handle_sub_write(
pg_shard_t from,
OpRequestRef msg,
ECSubWrite &op,
+ const ZTracer::Trace &trace,
Context *on_local_applied_sync = 0
);
void handle_sub_read(
pg_shard_t from,
const ECSubRead &op,
- ECSubReadReply *reply
+ ECSubReadReply *reply,
+ const ZTracer::Trace &trace
);
void handle_sub_write_reply(
pg_shard_t from,
- const ECSubWriteReply &op
+ const ECSubWriteReply &op,
+ const ZTracer::Trace &trace
);
void handle_sub_read_reply(
pg_shard_t from,
ECSubReadReply &op,
- RecoveryMessages *m
+ RecoveryMessages *m,
+ const ZTracer::Trace &trace
);
/// @see ReadOp below
@@ -370,6 +379,8 @@ public:
// of the available shards.
bool for_recovery;
+ ZTracer::Trace trace;
+
map<hobject_t, read_request_t> to_read;
map<hobject_t, read_result_t> complete;
@@ -447,6 +458,7 @@ public:
vector<pg_log_entry_t> log_entries;
ceph_tid_t tid;
osd_reqid_t reqid;
+ ZTracer::Trace trace;
eversion_t roll_forward_to; /// Soon to be generated internally
diff --git a/src/osd/ECTransaction.cc b/src/osd/ECTransaction.cc
index 31197786642..91fbf5a58d9 100644
--- a/src/osd/ECTransaction.cc
+++ b/src/osd/ECTransaction.cc
@@ -166,10 +166,9 @@ void ECTransaction::generate_transactions(
if (entry &&
entry->is_modify() &&
op.updated_snaps) {
- vector<snapid_t> snaps(
- op.updated_snaps->second.begin(),
- op.updated_snaps->second.end());
- ::encode(snaps, entry->snaps);
+ bufferlist bl(op.updated_snaps->second.size() * 8 + 8);
+ ::encode(op.updated_snaps->second, bl);
+ entry->snaps.swap(bl);
}
ldpp_dout(dpp, 20) << "generate_transactions: "
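
[editor's sketch] The ECTransaction.cc hunk drops the intermediate vector<snapid_t>: the snap set is encoded straight into a bufferlist preallocated at size()*8+8 bytes (one 64-bit snapid per element plus the length header) and swapped into entry->snaps rather than copied. A standalone sketch of the same encode-and-swap pattern, with a plain uint64_t set standing in for the snapid set:

#include <cstdint>
#include <set>
#include "include/buffer.h"
#include "include/encoding.h"

// Sketch: encode a set of 64-bit ids into a preallocated bufferlist and
// hand the result off by swap rather than copy, as in the hunk above.
void encode_ids(const std::set<uint64_t> &ids, bufferlist *out)
{
  bufferlist bl(ids.size() * 8 + 8);  // 8 bytes per id + length word
  ::encode(ids, bl);                  // standard container encoder
  out->swap(bl);                      // transfer ownership, no copy
}
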
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index b196f1fba9e..72afa48a97e 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -229,7 +229,6 @@ OSDService::OSDService(OSD *osd) :
peering_wq(osd->peering_wq),
recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
&osd->disk_tp),
- op_gen_wq("op_gen_wq", cct->_conf->osd_recovery_thread_timeout, &osd->osd_tp),
class_handler(osd->class_handler),
pg_epoch_lock("OSDService::pg_epoch_lock"),
publish_lock("OSDService::publish_lock"),
@@ -514,8 +513,8 @@ void OSDService::init()
objecter_finisher.start();
objecter->set_client_incarnation(0);
- // exclude objecter from daemonperf output
- objecter->get_logger()->set_suppress_nicks(true);
+ // deprioritize objecter in daemonperf output
+ objecter->get_logger()->set_prio_adjust(-3);
watch_timer.init();
agent_timer.init();
@@ -1814,6 +1813,7 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
clog(log_client.create_channel()),
whoami(id),
dev_path(dev), journal_path(jdev),
+ trace_endpoint("0.0.0.0", 0, "osd"),
asok_hook(NULL),
osd_compat(get_osd_compat_set()),
osd_tp(cct, "OSD::osd_tp", "tp_osd", cct->_conf->osd_op_threads, "osd_op_threads"),
@@ -1879,6 +1879,11 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
cct->_conf->osd_op_history_duration);
op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
cct->_conf->osd_op_history_slow_op_threshold);
+#ifdef WITH_BLKIN
+ std::stringstream ss;
+ ss << "osd." << whoami;
+ trace_endpoint.copy_name(ss.str());
+#endif
}
OSD::~OSD()
@@ -2111,6 +2116,20 @@ bool OSD::asok_command(string admin_command, cmdmap_t& cmdmap, string format,
store->generate_db_histogram(f);
} else if (admin_command == "flush_store_cache") {
store->flush_cache();
+ } else if (admin_command == "dump_pgstate_history") {
+ f->open_object_section("pgstate_history");
+ RWLock::RLocker l2(pg_map_lock);
+ for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
+ it != pg_map.end();
+ ++it) {
+
+ PG *pg = it->second;
+ f->dump_stream("pg") << pg->get_pgid();
+ pg->lock();
+ pg->pgstate_history.dump(f);
+ pg->unlock();
+ }
+ f->close_section();
} else {
assert(0 == "broken asok registration");
}
@@ -2621,6 +2640,10 @@ void OSD::final_init()
asok_hook,
"Flush bluestore internal cache");
assert(r == 0);
+ r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
+ asok_hook,
+ "show recent state history");
+ assert(r == 0);
test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
// Note: pools are CephString instead of CephPoolname because
@@ -2741,145 +2764,227 @@ void OSD::create_logger()
};
- osd_plb.add_u64(l_osd_op_wip, "op_wip",
- "Replication operations currently being processed (primary)"); // rep ops currently being processed (primary)
- osd_plb.add_u64_counter(l_osd_op, "op",
- "Client operations", "ops"); // client ops
- osd_plb.add_u64_counter(l_osd_op_inb, "op_in_bytes",
- "Client operations total write size", "wr"); // client op in bytes (writes)
- osd_plb.add_u64_counter(l_osd_op_outb, "op_out_bytes",
- "Client operations total read size", "rd"); // client op out bytes (reads)
- osd_plb.add_time_avg(l_osd_op_lat, "op_latency",
- "Latency of client operations (including queue time)", "lat"); // client op latency
- osd_plb.add_time_avg(l_osd_op_process_lat, "op_process_latency",
- "Latency of client operations (excluding queue time)"); // client op process latency
- osd_plb.add_time_avg(l_osd_op_prepare_lat, "op_prepare_latency",
- "Latency of client operations (excluding queue time and wait for finished)"); // client op prepare latency
-
- osd_plb.add_u64_counter(l_osd_op_r, "op_r",
- "Client read operations"); // client reads
- osd_plb.add_u64_counter(l_osd_op_r_outb, "op_r_out_bytes",
- "Client data read"); // client read out bytes
- osd_plb.add_time_avg(l_osd_op_r_lat, "op_r_latency",
- "Latency of read operation (including queue time)"); // client read latency
- osd_plb.add_histogram(l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
- op_hist_x_axis_config, op_hist_y_axis_config,
- "Histogram of operation latency (including queue time) + data read");
- osd_plb.add_time_avg(l_osd_op_r_process_lat, "op_r_process_latency",
- "Latency of read operation (excluding queue time)"); // client read process latency
- osd_plb.add_time_avg(l_osd_op_r_prepare_lat, "op_r_prepare_latency",
- "Latency of read operations (excluding queue time and wait for finished)"); // client read prepare latency
- osd_plb.add_u64_counter(l_osd_op_w, "op_w",
- "Client write operations"); // client writes
- osd_plb.add_u64_counter(l_osd_op_w_inb, "op_w_in_bytes",
- "Client data written"); // client write in bytes
- osd_plb.add_time_avg(l_osd_op_w_lat, "op_w_latency",
- "Latency of write operation (including queue time)"); // client write latency
- osd_plb.add_histogram(l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
- op_hist_x_axis_config, op_hist_y_axis_config,
- "Histogram of operation latency (including queue time) + data written");
- osd_plb.add_time_avg(l_osd_op_w_process_lat, "op_w_process_latency",
- "Latency of write operation (excluding queue time)"); // client write process latency
- osd_plb.add_time_avg(l_osd_op_w_prepare_lat, "op_w_prepare_latency",
- "Latency of write operations (excluding queue time and wait for finished)"); // client write prepare latency
- osd_plb.add_u64_counter(l_osd_op_rw, "op_rw",
- "Client read-modify-write operations"); // client rmw
- osd_plb.add_u64_counter(l_osd_op_rw_inb, "op_rw_in_bytes",
- "Client read-modify-write operations write in"); // client rmw in bytes
- osd_plb.add_u64_counter(l_osd_op_rw_outb,"op_rw_out_bytes",
- "Client read-modify-write operations read out "); // client rmw out bytes
- osd_plb.add_time_avg(l_osd_op_rw_lat, "op_rw_latency",
- "Latency of read-modify-write operation (including queue time)"); // client rmw latency
- osd_plb.add_histogram(l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
- op_hist_x_axis_config, op_hist_y_axis_config,
- "Histogram of rw operation latency (including queue time) + data written");
- osd_plb.add_histogram(l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
- op_hist_x_axis_config, op_hist_y_axis_config,
- "Histogram of rw operation latency (including queue time) + data read");
- osd_plb.add_time_avg(l_osd_op_rw_process_lat, "op_rw_process_latency",
- "Latency of read-modify-write operation (excluding queue time)"); // client rmw process latency
- osd_plb.add_time_avg(l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
- "Latency of read-modify-write operations (excluding queue time and wait for finished)"); // client rmw prepare latency
-
- osd_plb.add_u64_counter(l_osd_sop, "subop", "Suboperations"); // subops
- osd_plb.add_u64_counter(l_osd_sop_inb, "subop_in_bytes", "Suboperations total size"); // subop in bytes
- osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency"); // subop latency
-
- osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes"); // replicated (client) writes
- osd_plb.add_u64_counter(l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size"); // replicated write in bytes
- osd_plb.add_time_avg(l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency"); // replicated write latency
- osd_plb.add_u64_counter(l_osd_sop_pull, "subop_pull", "Suboperations pull requests"); // pull request
- osd_plb.add_time_avg(l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
- osd_plb.add_u64_counter(l_osd_sop_push, "subop_push", "Suboperations push messages"); // push (write)
- osd_plb.add_u64_counter(l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size");
- osd_plb.add_time_avg(l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
-
- osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent"); // pull requests sent
- osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent"); // push messages
- osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size"); // pushed bytes
-
- osd_plb.add_u64_counter(l_osd_rop, "recovery_ops",
- "Started recovery operations", "recop"); // recovery ops (started)
+ osd_plb.add_u64(
+ l_osd_op_wip, "op_wip",
+ "Replication operations currently being processed (primary)");
+ osd_plb.add_u64_counter(
+ l_osd_op, "op",
+ "Client operations",
+ "ops", PerfCountersBuilder::PRIO_CRITICAL);
+ osd_plb.add_u64_counter(
+ l_osd_op_inb, "op_in_bytes",
+ "Client operations total write size",
+ "wr", PerfCountersBuilder::PRIO_INTERESTING);
+ osd_plb.add_u64_counter(
+ l_osd_op_outb, "op_out_bytes",
+ "Client operations total read size",
+ "rd", PerfCountersBuilder::PRIO_INTERESTING);
+ osd_plb.add_time_avg(
+ l_osd_op_lat, "op_latency",
+ "Latency of client operations (including queue time)",
+ "l", 9);
+ osd_plb.add_time_avg(
+ l_osd_op_process_lat, "op_process_latency",
+ "Latency of client operations (excluding queue time)");
+ osd_plb.add_time_avg(
+ l_osd_op_prepare_lat, "op_prepare_latency",
+ "Latency of client operations (excluding queue time and wait for finished)");
+
+ osd_plb.add_u64_counter(
+ l_osd_op_r, "op_r", "Client read operations");
+ osd_plb.add_u64_counter(
+ l_osd_op_r_outb, "op_r_out_bytes", "Client data read");
+ osd_plb.add_time_avg(
+ l_osd_op_r_lat, "op_r_latency",
+ "Latency of read operation (including queue time)");
+ osd_plb.add_u64_counter_histogram(
+ l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of operation latency (including queue time) + data read");
+ osd_plb.add_time_avg(
+ l_osd_op_r_process_lat, "op_r_process_latency",
+ "Latency of read operation (excluding queue time)");
+ osd_plb.add_time_avg(
+ l_osd_op_r_prepare_lat, "op_r_prepare_latency",
+ "Latency of read operations (excluding queue time and wait for finished)");
+ osd_plb.add_u64_counter(
+ l_osd_op_w, "op_w", "Client write operations");
+ osd_plb.add_u64_counter(
+ l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
+ osd_plb.add_time_avg(
+ l_osd_op_w_lat, "op_w_latency",
+ "Latency of write operation (including queue time)");
+ osd_plb.add_u64_counter_histogram(
+ l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of operation latency (including queue time) + data written");
+ osd_plb.add_time_avg(
+ l_osd_op_w_process_lat, "op_w_process_latency",
+ "Latency of write operation (excluding queue time)");
+ osd_plb.add_time_avg(
+ l_osd_op_w_prepare_lat, "op_w_prepare_latency",
+ "Latency of write operations (excluding queue time and wait for finished)");
+ osd_plb.add_u64_counter(
+ l_osd_op_rw, "op_rw",
+ "Client read-modify-write operations");
+ osd_plb.add_u64_counter(
+ l_osd_op_rw_inb, "op_rw_in_bytes",
+ "Client read-modify-write operations write in");
+ osd_plb.add_u64_counter(
+ l_osd_op_rw_outb,"op_rw_out_bytes",
+ "Client read-modify-write operations read out ");
+ osd_plb.add_time_avg(
+ l_osd_op_rw_lat, "op_rw_latency",
+ "Latency of read-modify-write operation (including queue time)");
+ osd_plb.add_u64_counter_histogram(
+ l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of rw operation latency (including queue time) + data written");
+ osd_plb.add_u64_counter_histogram(
+ l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of rw operation latency (including queue time) + data read");
+ osd_plb.add_time_avg(
+ l_osd_op_rw_process_lat, "op_rw_process_latency",
+ "Latency of read-modify-write operation (excluding queue time)");
+ osd_plb.add_time_avg(
+ l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
+ "Latency of read-modify-write operations (excluding queue time and wait for finished)");
+
+ osd_plb.add_u64_counter(
+ l_osd_sop, "subop", "Suboperations");
+ osd_plb.add_u64_counter(
+ l_osd_sop_inb, "subop_in_bytes", "Suboperations total size");
+ osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
+
+ osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
+ osd_plb.add_u64_counter(
+ l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size");
+ osd_plb.add_time_avg(
+ l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
+ osd_plb.add_u64_counter(
+ l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
+ osd_plb.add_time_avg(
+ l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
+ osd_plb.add_u64_counter(
+ l_osd_sop_push, "subop_push", "Suboperations push messages");
+ osd_plb.add_u64_counter(
+ l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size");
+ osd_plb.add_time_avg(
+ l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
+
+ osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
+ osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
+ osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size");
+
+ osd_plb.add_u64_counter(
+ l_osd_rop, "recovery_ops",
+ "Started recovery operations",
+ "rop", PerfCountersBuilder::PRIO_INTERESTING);
osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
- osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size"); // total ceph::buffer bytes
- osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes"); // total ceph::buffer bytes in history
- osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num"); // total ceph::buffer num in history
- osd_plb.add_u64(l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache"); // total ceph::buffer buffer_cached_crc
- osd_plb.add_u64(l_osd_cached_crc_adjusted, "cached_crc_adjusted", "Total number getting crc from crc_cache with adjusting"); // total ceph::buffer buffer_cached_crc_adjusted
-
- osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups"); // num pgs
- osd_plb.add_u64(l_osd_pg_primary, "numpg_primary", "Placement groups for which this osd is primary"); // num primary pgs
- osd_plb.add_u64(l_osd_pg_replica, "numpg_replica", "Placement groups for which this osd is replica"); // num replica pgs
- osd_plb.add_u64(l_osd_pg_stray, "numpg_stray", "Placement groups ready to be deleted from this osd"); // num stray pgs
- osd_plb.add_u64(l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to"); // heartbeat peers we send to
- osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages"); // osdmap messages
- osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs"); // osdmap epochs
- osd_plb.add_u64_counter(l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates"); // dup osdmap epochs
- osd_plb.add_u64_counter(l_osd_waiting_for_map, "messages_delayed_for_map", "Operations waiting for OSD map"); // dup osdmap epochs
- osd_plb.add_u64_counter(l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
- osd_plb.add_u64_counter(l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
- osd_plb.add_u64_counter(l_osd_map_cache_miss_low, "osd_map_cache_miss_low", "osdmap cache miss below cache lower bound");
- osd_plb.add_u64_avg(l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg", "osdmap cache miss, avg distance below cache lower bound");
+ osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size");
+ osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes");
+ osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");
+ osd_plb.add_u64(
+ l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
+ osd_plb.add_u64(
+ l_osd_cached_crc_adjusted, "cached_crc_adjusted",
+ "Total number getting crc from crc_cache with adjusting");
+ osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
+ "Total number of crc cache misses");
+
+ osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
+ "pgs", PerfCountersBuilder::PRIO_USEFUL);
+ osd_plb.add_u64(
+ l_osd_pg_primary, "numpg_primary",
+ "Placement groups for which this osd is primary");
+ osd_plb.add_u64(
+ l_osd_pg_replica, "numpg_replica",
+ "Placement groups for which this osd is replica");
+ osd_plb.add_u64(
+ l_osd_pg_stray, "numpg_stray",
+ "Placement groups ready to be deleted from this osd");
+ osd_plb.add_u64(
+ l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
+ osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
+ osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
+ osd_plb.add_u64_counter(
+ l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
+ osd_plb.add_u64_counter(
+ l_osd_waiting_for_map, "messages_delayed_for_map",
+ "Operations waiting for OSD map");
+ osd_plb.add_u64_counter(
+ l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
+ osd_plb.add_u64_counter(
+ l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
+ osd_plb.add_u64_counter(
+ l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
+ "osdmap cache miss below cache lower bound");
+ osd_plb.add_u64_avg(
+ l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
+ "osdmap cache miss, avg distance below cache lower bound");
osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes", "OSD size");
osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used", "Used space");
osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
- osd_plb.add_u64_counter(l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
+ osd_plb.add_u64_counter(
+ l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
- osd_plb.add_u64_counter(l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
- osd_plb.add_u64_counter(l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
- osd_plb.add_u64_counter(l_osd_tier_try_flush_fail, "tier_try_flush_fail", "Failed tier flush attempts");
- osd_plb.add_u64_counter(l_osd_tier_evict, "tier_evict", "Tier evictions");
- osd_plb.add_u64_counter(l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
- osd_plb.add_u64_counter(l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
- osd_plb.add_u64_counter(l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
- osd_plb.add_u64_counter(l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
- osd_plb.add_u64_counter(l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
- osd_plb.add_u64_counter(l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
-
- osd_plb.add_u64_counter(l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
- osd_plb.add_u64_counter(l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
- osd_plb.add_u64_counter(l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
- osd_plb.add_u64_counter(l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
-
- osd_plb.add_u64_counter(l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
- osd_plb.add_u64_counter(l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
+ osd_plb.add_u64_counter(
+ l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
+ osd_plb.add_u64_counter(
+ l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
+ osd_plb.add_u64_counter(
+ l_osd_tier_try_flush_fail, "tier_try_flush_fail",
+ "Failed tier flush attempts");
+ osd_plb.add_u64_counter(
+ l_osd_tier_evict, "tier_evict", "Tier evictions");
+ osd_plb.add_u64_counter(
+ l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
+ osd_plb.add_u64_counter(
+ l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
+ osd_plb.add_u64_counter(
+ l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
+ osd_plb.add_u64_counter(
+ l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
+ osd_plb.add_u64_counter(
+ l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
+ osd_plb.add_u64_counter(
+ l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
+
+ osd_plb.add_u64_counter(
+ l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
+ osd_plb.add_u64_counter(
+ l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
+ osd_plb.add_u64_counter(
+ l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
+ osd_plb.add_u64_counter(
+ l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
+
+ osd_plb.add_u64_counter(
+ l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
+ osd_plb.add_u64_counter(
+ l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
- osd_plb.add_time_avg(l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
- osd_plb.add_time_avg(l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
- osd_plb.add_time_avg(l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
-
- osd_plb.add_u64_counter(l_osd_pg_info, "osd_pg_info",
- "PG updated its info (using any method)");
- osd_plb.add_u64_counter(l_osd_pg_fastinfo, "osd_pg_fastinfo",
- "PG updated its info using fastinfo attr");
- osd_plb.add_u64_counter(l_osd_pg_biginfo, "osd_pg_biginfo",
- "PG updated its biginfo attr");
+ osd_plb.add_time_avg(
+ l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
+ osd_plb.add_time_avg(
+ l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
+ osd_plb.add_time_avg(
+ l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
+
+ osd_plb.add_u64_counter(
+ l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
+ osd_plb.add_u64_counter(
+ l_osd_pg_fastinfo, "osd_pg_fastinfo",
+ "PG updated its info using fastinfo attr");
+ osd_plb.add_u64_counter(
+ l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
logger = osd_plb.create_perf_counters();
cct->get_perfcounters_collection()->add(logger);
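
[editor's sketch] The rewritten counter declarations above are what carry the new priority scheme: a counter may take a nickname plus a PerfCountersBuilder priority (PRIO_CRITICAL for the headline op counter, PRIO_INTERESTING for throughput and recovery ops, PRIO_USEFUL for PG counts), which daemonperf-style consumers use to pick defaults now that the objecter's counters are merely deprioritized (set_prio_adjust(-3) above) rather than suppressed. A minimal sketch of declaring and registering counters this way; the enum, subsystem name, and counters are hypothetical:

#include "common/ceph_context.h"
#include "common/perf_counters.h"

enum { l_demo_first, l_demo_ops, l_demo_bytes, l_demo_last };  // hypothetical

PerfCounters *build_demo_logger(CephContext *cct)
{
  PerfCountersBuilder plb(cct, "demo", l_demo_first, l_demo_last);
  plb.add_u64_counter(l_demo_ops, "ops", "Demo operations",
                      "ops", PerfCountersBuilder::PRIO_CRITICAL);
  plb.add_u64_counter(l_demo_bytes, "bytes", "Demo bytes handled",
                      "byt", PerfCountersBuilder::PRIO_INTERESTING);
  PerfCounters *logger = plb.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);  // as done above
  return logger;
}
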
@@ -2947,6 +3052,9 @@ int OSD::shutdown()
cct->_conf->set_val("debug_ms", "100");
cct->_conf->apply_changes(NULL);
+ // stop MgrClient earlier as it's more like an internal consumer of OSD
+ mgrc.shutdown();
+
service.start_shutdown();
// stop sending work to pgs. this just prevents any new work in _process
@@ -2995,6 +3103,7 @@ int OSD::shutdown()
cct->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
cct->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
cct->get_admin_socket()->unregister_command("flush_store_cache");
+ cct->get_admin_socket()->unregister_command("dump_pgstate_history");
delete asok_hook;
asok_hook = NULL;
@@ -3108,7 +3217,6 @@ int OSD::shutdown()
store = 0;
dout(10) << "Store synced" << dendl;
- mgrc.shutdown();
monc->shutdown();
osd_lock.Unlock();
@@ -3499,7 +3607,7 @@ PG *OSD::_create_lock_pg(
vector<int>& up, int up_primary,
vector<int>& acting, int acting_primary,
pg_history_t history,
- const pg_interval_map_t& pi,
+ const PastIntervals& pi,
ObjectStore::Transaction& t)
{
assert(osd_lock.is_locked());
@@ -3720,23 +3828,36 @@ void OSD::build_past_intervals_parallel()
++i) {
PG *pg = i->second;
- epoch_t start, end;
- if (!pg->_calc_past_interval_range(&start, &end, superblock.oldest_map)) {
- if (pg->info.history.same_interval_since == 0)
- pg->info.history.same_interval_since = end;
- continue;
+ auto rpib = pg->get_required_past_interval_bounds(
+ pg->info,
+ superblock.oldest_map);
+ if (rpib.first >= rpib.second && pg->past_intervals.empty()) {
+ if (pg->info.history.same_interval_since == 0) {
+ pg->info.history.same_interval_since = rpib.second;
+ }
+ continue;
+ } else {
+ auto apib = pg->past_intervals.get_bounds();
+ if (apib.second >= rpib.second &&
+ apib.first <= rpib.first) {
+ if (pg->info.history.same_interval_since == 0) {
+ pg->info.history.same_interval_since = rpib.second;
+ }
+ continue;
+ }
}
- dout(10) << pg->info.pgid << " needs " << start << "-" << end << dendl;
+ dout(10) << pg->info.pgid << " needs " << rpib.first << "-"
+ << rpib.second << dendl;
pistate& p = pis[pg];
- p.start = start;
- p.end = end;
+ p.start = rpib.first;
+ p.end = rpib.second;
p.same_interval_since = 0;
- if (start < cur_epoch)
- cur_epoch = start;
- if (end > end_epoch)
- end_epoch = end;
+ if (rpib.first < cur_epoch)
+ cur_epoch = rpib.first;
+ if (rpib.second > end_epoch)
+ end_epoch = rpib.second;
}
}
if (pis.empty()) {
@@ -3785,7 +3906,7 @@ void OSD::build_past_intervals_parallel()
boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
pg->get_is_recoverable_predicate());
std::stringstream debug;
- bool new_interval = pg_interval_t::check_new_interval(
+ bool new_interval = PastIntervals::check_new_interval(
p.primary,
primary,
p.old_acting, acting,
@@ -3859,7 +3980,7 @@ void OSD::build_past_intervals_parallel()
int OSD::handle_pg_peering_evt(
spg_t pgid,
const pg_history_t& orig_history,
- const pg_interval_map_t& pi,
+ const PastIntervals& pi,
epoch_t epoch,
PG::CephPeeringEvtRef evt)
{
@@ -3948,7 +4069,7 @@ int OSD::handle_pg_peering_evt(
vector<int> old_acting = old_pg_state->acting;
int old_primary = old_pg_state->primary.osd;
pg_history_t old_history = old_pg_state->info.history;
- pg_interval_map_t old_past_intervals = old_pg_state->past_intervals;
+ PastIntervals old_past_intervals = old_pg_state->past_intervals;
old_pg_state->unlock();
pg = _create_lock_pg(
old_osd_map,
@@ -3984,7 +4105,7 @@ int OSD::handle_pg_peering_evt(
vector<int> old_acting = old_pg_state->acting;
int old_primary = old_pg_state->primary.osd;
pg_history_t old_history = old_pg_state->info.history;
- pg_interval_map_t old_past_intervals = old_pg_state->past_intervals;
+ PastIntervals old_past_intervals = old_pg_state->past_intervals;
old_pg_state->unlock();
PG *parent = _create_lock_pg(
old_osd_map,
@@ -4034,6 +4155,79 @@ int OSD::handle_pg_peering_evt(
}
+void OSD::build_initial_pg_history(
+ spg_t pgid,
+ epoch_t created,
+ utime_t created_stamp,
+ pg_history_t *h,
+ PastIntervals *pi)
+{
+ dout(10) << __func__ << " " << pgid << " created " << created << dendl;
+ h->epoch_created = created;
+ h->epoch_pool_created = created;
+ h->same_interval_since = created;
+ h->same_up_since = created;
+ h->same_primary_since = created;
+ h->last_scrub_stamp = created_stamp;
+ h->last_deep_scrub_stamp = created_stamp;
+ h->last_clean_scrub_stamp = created_stamp;
+
+ OSDMapRef lastmap = service.get_map(created);
+ int up_primary, acting_primary;
+ vector<int> up, acting;
+ lastmap->pg_to_up_acting_osds(
+ pgid.pgid, &up, &up_primary, &acting, &acting_primary);
+
+ ostringstream debug;
+ for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
+ OSDMapRef osdmap = service.get_map(e);
+ int new_up_primary, new_acting_primary;
+ vector<int> new_up, new_acting;
+ osdmap->pg_to_up_acting_osds(
+ pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
+
+ // this is a bit imprecise, but sufficient?
+ struct min_size_predicate_t : public IsPGRecoverablePredicate {
+ const pg_pool_t *pi;
+ bool operator()(const set<pg_shard_t> &have) const {
+ return have.size() >= pi->min_size;
+ }
+ min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
+ } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
+
+ bool new_interval = PastIntervals::check_new_interval(
+ acting_primary,
+ new_acting_primary,
+ acting, new_acting,
+ up_primary,
+ new_up_primary,
+ up, new_up,
+ h->same_interval_since,
+ h->last_epoch_clean,
+ osdmap,
+ lastmap,
+ pgid.pgid,
+ &min_size_predicate,
+ pi,
+ &debug);
+ if (new_interval) {
+ h->same_interval_since = e;
+ }
+ if (up != new_up) {
+ h->same_up_since = e;
+ }
+ if (acting_primary != new_acting_primary) {
+ h->same_primary_since = e;
+ }
+ lastmap = osdmap;
+ }
+ dout(20) << __func__ << " " << debug.str() << dendl;
+ dout(10) << __func__ << " " << *h << " " << *pi
+ << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
+ pi->get_bounds()) << ")"
+ << dendl;
+}
+
/**
* Fill in the passed history so you know same_interval_since, same_up_since,
* and same_primary_since.
@@ -4113,7 +4307,7 @@ bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
break;
}
- // base case: these floors should be the creation epoch if we didn't
+ // base case: these floors should be the pg creation epoch if we didn't
// find any changes.
if (e == h.epoch_created) {
if (!h.same_interval_since)
@@ -4678,6 +4872,7 @@ void OSD::tick_without_osd_lock()
logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
logger->set(l_osd_cached_crc, buffer::get_cached_crc());
logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
+ logger->set(l_osd_missed_crc, buffer::get_missed_crc());
// osd_lock is not being held, which means the OSD state
// might change when doing the monitor report
@@ -6361,7 +6556,7 @@ void OSD::dispatch_session_waiting(Session *session, OSDMapRef osdmap)
assert(ms_can_fast_dispatch(op->get_req()));
const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
op->get_req());
- if (m->get_map_epoch() > osdmap->get_epoch()) {
+ if (m->get_min_epoch() > osdmap->get_epoch()) {
break;
}
session->waiting_on_map.erase(i++);
@@ -6403,8 +6598,13 @@ void OSD::ms_fast_dispatch(Message *m)
reqid.name._num, reqid.tid, reqid.inc);
}
- // note sender epoch
+ if (m->trace)
+ op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
+
+ // note sender epoch, min req'd epoch
op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
+ op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
+ assert(op->min_epoch <= op->sent_epoch); // sanity check!
service.maybe_inject_dispatch_delay();
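
[editor's sketch] Together, the two dispatch hunks above relax the map gate: a queued op is released once the local map reaches the message's min_epoch (the first epoch of the interval it was sent in) rather than the full epoch it was sent at, and ms_fast_dispatch() records both and asserts their ordering. A condensed sketch of the gate:

#include <cassert>
#include "include/types.h"   // epoch_t

// Sketch of the relaxed gate above: `have` is the OSD's current map epoch.
bool can_dispatch(epoch_t have, epoch_t min_epoch, epoch_t sent_epoch)
{
  assert(min_epoch <= sent_epoch);  // same invariant as ms_fast_dispatch
  return min_epoch <= have;         // previously required sent_epoch <= have
}
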
@@ -6636,6 +6836,8 @@ void OSD::_dispatch(Message *m)
case MSG_OSD_RECOVERY_RESERVE:
{
OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
+ if (m->trace)
+ op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
// no map? starting up?
if (!osdmap) {
dout(7) << "no OSDMap, not booted" << dendl;
@@ -6833,11 +7035,16 @@ void OSD::sched_scrub()
break;
}
+ if ((scrub.deadline >= now) && !(time_permit && load_is_low)) {
+ dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
+ << (!time_permit ? "time not permitting" : "high load") << dendl;
+ continue;
+ }
+
PG *pg = _lookup_lock_pg(scrub.pgid);
if (!pg)
continue;
- if (pg->get_pgbackend()->scrub_supported() && pg->is_active() &&
- (scrub.deadline < now || (time_permit && load_is_low))) {
+ if (pg->get_pgbackend()->scrub_supported() && pg->is_active()) {
dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
<< (pg->scrubber.must_scrub ? ", explicitly requested" :
(load_is_low ? ", load_is_low" : " deadline < now"))
@@ -7924,16 +8131,9 @@ void OSD::handle_pg_create(OpRequestRef op)
osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
int role = osdmap->calc_pg_role(whoami, acting, acting.size());
- if (up_primary != whoami) {
- dout(10) << "mkpg " << on << " not primary (role="
- << role << "), skipping" << dendl;
- continue;
- }
- if (up != acting) {
- dout(10) << "mkpg " << on << " up " << up
- << " != acting " << acting << ", ignoring" << dendl;
- // we'll get a query soon anyway, since we know the pg
- // must exist. we can ignore this.
+ if (acting_primary != whoami) {
+ dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
+ << "), my role=" << role << ", skipping" << dendl;
continue;
}
@@ -7941,20 +8141,11 @@ void OSD::handle_pg_create(OpRequestRef op)
bool mapped = osdmap->get_primary_shard(on, &pgid);
assert(mapped);
- pg_interval_map_t pi;
+ PastIntervals pi(
+ osdmap->get_pools().at(pgid.pool()).ec_pool(),
+ *osdmap);
pg_history_t history;
- history.epoch_created = created;
- history.last_scrub_stamp = ci->second;
- history.last_deep_scrub_stamp = ci->second;
- history.last_clean_scrub_stamp = ci->second;
-
- // project history from created epoch (handle_pg_peering_evt does
- // it from msg send epoch)
- bool valid_history = project_pg_history(
- pgid, history, created, up, up_primary, acting, acting_primary);
- // the pg creation message must have come from a mon and therefore
- // cannot be on the other side of a map gap
- assert(valid_history);
+ build_initial_pg_history(pgid, created, ci->second, &history, &pi);
// The mon won't resend unless the primary changed, so
// we ignore same_interval_since. We'll pass this history
@@ -7999,10 +8190,10 @@ PG::RecoveryCtx OSD::create_context()
C_Contexts *on_safe = new C_Contexts(cct);
map<int, map<spg_t,pg_query_t> > *query_map =
new map<int, map<spg_t, pg_query_t> >;
- map<int,vector<pair<pg_notify_t, pg_interval_map_t> > > *notify_list =
- new map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >;
- map<int,vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map =
- new map<int,vector<pair<pg_notify_t, pg_interval_map_t> > >;
+ map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
+ new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
+ map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
+ new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
PG::RecoveryCtx rctx(query_map, info_map, notify_list,
on_applied, on_safe, t);
return rctx;
@@ -8084,11 +8275,11 @@ void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
*/
void OSD::do_notifies(
- map<int,vector<pair<pg_notify_t,pg_interval_map_t> > >& notify_list,
+ map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
OSDMapRef curmap)
{
for (map<int,
- vector<pair<pg_notify_t,pg_interval_map_t> > >::iterator it =
+ vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
notify_list.begin();
it != notify_list.end();
++it) {
@@ -8143,11 +8334,11 @@ void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
void OSD::do_infos(map<int,
- vector<pair<pg_notify_t, pg_interval_map_t> > >& info_map,
+ vector<pair<pg_notify_t, PastIntervals> > >& info_map,
OSDMapRef curmap)
{
for (map<int,
- vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator p =
+ vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
info_map.begin();
p != info_map.end();
++p) {
@@ -8155,7 +8346,7 @@ void OSD::do_infos(map<int,
dout(20) << __func__ << " skipping down osd." << p->first << dendl;
continue;
}
- for (vector<pair<pg_notify_t,pg_interval_map_t> >::iterator i = p->second.begin();
+ for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
i != p->second.end();
++i) {
dout(20) << __func__ << " sending info " << i->first.info
@@ -8322,10 +8513,8 @@ void OSD::handle_pg_trim(OpRequestRef op)
dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
m->trim_to;
- if (pg->calc_min_last_complete_ondisk()) {
- dout(10) << *pg << " min lcod now " << pg->min_last_complete_ondisk << dendl;
- pg->trim_peers();
- }
+ // trim log when the pg is recovered
+ pg->calc_min_last_complete_ondisk();
} else {
// primary is instructing us to trim
ObjectStore::Transaction t;
@@ -8457,7 +8646,7 @@ void OSD::handle_pg_query(OpRequestRef op)
op->mark_started();
- map< int, vector<pair<pg_notify_t, pg_interval_map_t> > > notify_list;
+ map< int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
for (auto it = m->pg_list.begin();
it != m->pg_list.end();
@@ -8541,7 +8730,9 @@ void OSD::handle_pg_query(OpRequestRef op)
it->second.epoch_sent,
osdmap->get_epoch(),
empty),
- pg_interval_map_t()));
+ PastIntervals(
+ osdmap->get_pools().at(pgid.pool()).ec_pool(),
+ *osdmap)));
}
}
do_notifies(notify_list, osdmap);
@@ -8822,6 +9013,9 @@ void OSD::enqueue_op(spg_t pg, OpRequestRef& op, epoch_t epoch)
<< " latency " << latency
<< " epoch " << epoch
<< " " << *(op->get_req()) << dendl;
+ op->osd_trace.event("enqueue op");
+ op->osd_trace.keyval("priority", op->get_req()->get_priority());
+ op->osd_trace.keyval("cost", op->get_req()->get_cost());
op->mark_queued_for_pg();
op_shardedwq.queue(make_pair(pg, PGQueueable(op, epoch)));
}
@@ -8858,6 +9052,7 @@ void OSD::dequeue_op(
return;
op->mark_reached_pg();
+ op->osd_trace.event("dequeue_op");
pg->do_request(op, handle);
@@ -9226,7 +9421,7 @@ int OSD::init_op_flags(OpRequestRef& op)
derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
if (r == -ENOENT)
r = -EOPNOTSUPP;
- else if (r != -EPERM) // propgate permission errors
+ else if (r != -EPERM) // propagate permission errors
r = -EIO;
return r;
}
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 5dff199449c..cad51ed9264 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -25,6 +25,7 @@
#include "common/WorkQueue.h"
#include "common/AsyncReserver.h"
#include "common/ceph_context.h"
+#include "common/zipkin_trace.h"
#include "mgr/MgrClient.h"
@@ -113,6 +114,7 @@ enum {
l_osd_history_alloc_num,
l_osd_cached_crc,
l_osd_cached_crc_adjusted,
+ l_osd_missed_crc,
l_osd_pg,
l_osd_pg_primary,
@@ -471,7 +473,6 @@ public:
MonClient *&monc;
ThreadPool::BatchWorkQueue<PG> &peering_wq;
GenContextWQ recovery_gen_wq;
- GenContextWQ op_gen_wq;
ClassHandler *&class_handler;
void enqueue_back(spg_t pgid, PGQueueable qi);
@@ -1321,6 +1322,7 @@ protected:
int whoami;
std::string dev_path, journal_path;
+ ZTracer::Endpoint trace_endpoint;
void create_logger();
void create_recoverystate_perf();
void tick();
@@ -2050,7 +2052,7 @@ protected:
vector<int>& up, int up_primary,
vector<int>& acting, int acting_primary,
pg_history_t history,
- const pg_interval_map_t& pi,
+ const PastIntervals& pi,
ObjectStore::Transaction& t);
PG* _make_pg(OSDMapRef createmap, spg_t pgid);
@@ -2060,13 +2062,21 @@ protected:
int handle_pg_peering_evt(
spg_t pgid,
const pg_history_t& orig_history,
- const pg_interval_map_t& pi,
+ const PastIntervals& pi,
epoch_t epoch,
PG::CephPeeringEvtRef evt);
void load_pgs();
void build_past_intervals_parallel();
+ /// build initial pg history and intervals on create
+ void build_initial_pg_history(
+ spg_t pgid,
+ epoch_t created,
+ utime_t created_stamp,
+ pg_history_t *h,
+ PastIntervals *pi);
+
/// project pg history from epoch 'from' to now
bool project_pg_history(
spg_t pgid, pg_history_t& h, epoch_t from,
@@ -2202,13 +2212,13 @@ protected:
void dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
ThreadPool::TPHandle *handle = NULL);
void do_notifies(map<int,
- vector<pair<pg_notify_t, pg_interval_map_t> > >&
+ vector<pair<pg_notify_t, PastIntervals> > >&
notify_list,
OSDMapRef map);
void do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
OSDMapRef map);
void do_infos(map<int,
- vector<pair<pg_notify_t, pg_interval_map_t> > >& info_map,
+ vector<pair<pg_notify_t, PastIntervals> > >& info_map,
OSDMapRef map);
bool require_mon_peer(const Message *m);
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 96bfa8a29e4..fbfcd5a5b40 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -30,6 +30,10 @@
#define dout_subsys ceph_subsys_osd
+MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
+MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
+
+
// ----------------------------------
// osd_info_t
@@ -289,6 +293,38 @@ bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_ty
}
}
+bool OSDMap::subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds,
+ set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const
+{
+ if (id >= 0) {
+ bool is_down_ret = is_down(id);
+ if (is_down_ret) {
+ down_in_osds->insert(id);
+ } else {
+ up_in_osds->insert(id);
+ }
+ return is_down_ret;
+ }
+
+ if (subtree_type_down &&
+ (*subtree_type_down)[subtree_type].count(id)) {
+ return true;
+ }
+
+ list<int> children;
+ crush->get_children(id, &children);
+ for (const auto &child : children) {
+ if (!subtree_type_is_down(cct, child, crush->get_bucket_type(child), down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
+ subtree_up->insert(id);
+ return false;
+ }
+ }
+ if (subtree_type_down) {
+ (*subtree_type_down)[subtree_type].insert(id);
+ }
+ return true;
+}
+
void OSDMap::Incremental::encode_client_old(bufferlist& bl) const
{
__u16 v = 5;
@@ -441,7 +477,7 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const
}
{
- uint8_t target_v = 4;
+ uint8_t target_v = 5;
if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
target_v = 2;
}
@@ -462,6 +498,7 @@ void OSDMap::Incremental::encode(bufferlist& bl, uint64_t features) const
::encode(new_nearfull_ratio, bl);
::encode(new_full_ratio, bl);
::encode(new_backfillfull_ratio, bl);
+ ::encode(new_require_min_compat_client, bl);
}
ENCODE_FINISH(bl); // osd-only data
}
@@ -646,7 +683,7 @@ void OSDMap::Incremental::decode(bufferlist::iterator& bl)
}
{
- DECODE_START(4, bl); // extended, osd-only data
+ DECODE_START(5, bl); // extended, osd-only data
::decode(new_hb_back_up, bl);
::decode(new_up_thru, bl);
::decode(new_last_clean_interval, bl);
@@ -674,6 +711,8 @@ void OSDMap::Incremental::decode(bufferlist::iterator& bl)
} else {
new_backfillfull_ratio = -1;
}
+ if (struct_v >= 5)
+ ::decode(new_require_min_compat_client, bl);
DECODE_FINISH(bl); // osd-only data
}
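
[editor's sketch] The encode/decode hunks above follow the standard recipe for adding a field to a versioned Ceph encoding: bump the encoder's target version (4 to 5 here), append the new field last, and gate the decode on struct_v so older encodings still parse. A generic sketch of that pattern; the struct and field are hypothetical:

#include <string>
#include "include/buffer.h"
#include "include/encoding.h"

// Hypothetical struct following the same versioning pattern.
struct demo_t {
  std::string new_field;   // appended in v5

  void encode(bufferlist &bl) const {
    ENCODE_START(5, 1, bl);      // latest v5, decodable back to v1
    // ... fields present since earlier versions encode first, in order ...
    ::encode(new_field, bl);     // the v5 addition goes last
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::iterator &bl) {
    DECODE_START(5, bl);
    // ... decode the older fields ...
    if (struct_v >= 5)           // absent from older encodings
      ::decode(new_field, bl);
    DECODE_FINISH(bl);
  }
};
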
@@ -718,6 +757,7 @@ void OSDMap::Incremental::dump(Formatter *f) const
f->dump_float("new_full_ratio", new_full_ratio);
f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
+ f->dump_string("new_require_min_compat_client", new_require_min_compat_client);
if (fullmap.length()) {
f->open_object_section("full_map");
@@ -1158,11 +1198,13 @@ uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
features |= CEPH_FEATURE_CRUSH_V4;
if (crush->has_nondefault_tunables5())
features |= CEPH_FEATURE_CRUSH_TUNABLES5;
+ if (crush->has_incompat_choose_args())
+ features |= CEPH_FEATURE_CRUSH_CHOOSE_ARGS;
mask |= CEPH_FEATURES_CRUSH;
if (!pg_upmap.empty() || !pg_upmap_items.empty())
- features |= CEPH_FEATUREMASK_OSDMAP_REMAP;
- mask |= CEPH_FEATUREMASK_OSDMAP_REMAP;
+ features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
+ mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
for (auto &pool: pools) {
if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
@@ -1190,8 +1232,8 @@ uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
}
if (entity_type == CEPH_ENTITY_TYPE_OSD) {
for (auto &erasure_code_profile : erasure_code_profiles) {
- const map<string,string> &profile = erasure_code_profile.second;
- const auto &plugin = profile.find("plugin");
+ auto& profile = erasure_code_profile.second;
+ const auto& plugin = profile.find("plugin");
if (plugin != profile.end()) {
if (plugin->second == "isa" || plugin->second == "lrc")
features |= CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2;
@@ -1234,6 +1276,36 @@ uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
return features;
}
+pair<string,string> OSDMap::get_min_compat_client() const
+{
+ uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
+
+ if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
+ HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
+ return make_pair("luminous", "12.2.0");
+ }
+ if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
+ return make_pair("jewel", "10.2.0");
+ }
+ if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
+ return make_pair("hammer", "0.94");
+ }
+ if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
+ HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
+ HAVE_FEATURE(f, OSD_ERASURE_CODES) || // v0.73-498-gbfc86a8
+ HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
+ return make_pair("firefly", "0.80");
+ }
+ if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
+ HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
+ return make_pair("dumpling", "0.67");
+ }
+ if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
+ return make_pair("argonaut", "0.48argonaut-207");
+ }
+ return make_pair("argonaut", "0.48");
+}
+
void OSDMap::_calc_up_osd_features()
{
bool first = true;
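
[editor's sketch] get_min_compat_client() above checks the client feature bits the map currently requires, newest first, and maps them to the oldest release that understands them; later hunks surface the result in dump() and print(). A sketch of consuming it, mirroring the print() usage added by this patch:

#include <iostream>
#include "osd/OSDMap.h"

// Sketch, mirroring OSDMap::print() below: report the oldest client
// release (name and version) that can still use this map.
void report_min_compat(const OSDMap &osdmap)
{
  auto mv = osdmap.get_min_compat_client();  // e.g. ("jewel", "10.2.0")
  std::cout << "min_compat_client " << mv.first << " " << mv.second << "\n";
}
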
@@ -1355,7 +1427,7 @@ void OSDMap::clean_temps(CephContext *cct,
vector<int> raw_up;
int primary;
tmpmap.pg_to_raw_up(pg.first, &raw_up, &primary);
- if (raw_up == pg.second) {
+ if (vectors_equal(raw_up, pg.second)) {
ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
<< pg.second << " that matches raw_up mapping" << dendl;
if (osdmap.pg_temp->count(pg.first))
@@ -1586,6 +1658,9 @@ int OSDMap::apply_incremental(const Incremental &inc)
if (inc.new_full_ratio >= 0) {
full_ratio = inc.new_full_ratio;
}
+ if (inc.new_require_min_compat_client.length()) {
+ require_min_compat_client = inc.new_require_min_compat_client;
+ }
// do new crush map last (after up/down stuff)
if (inc.crush.length()) {
@@ -1663,7 +1738,7 @@ void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
if (removed)
osds.resize(osds.size() - removed);
} else {
- for (auto osd : osds) {
+ for (auto& osd : osds) {
if (!exists(osd))
osd = CRUSH_ITEM_NONE;
}
@@ -1714,7 +1789,7 @@ void OSDMap::_apply_remap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) co
return;
}
}
- *raw = p->second;
+ *raw = vector<int>(p->second.begin(), p->second.end());
return;
}
@@ -2104,7 +2179,7 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const
{
uint8_t v = 4;
- if (!HAVE_FEATURE(features, OSDMAP_REMAP)) {
+ if (!HAVE_FEATURE(features, OSDMAP_PG_UPMAP)) {
v = 3;
}
ENCODE_START(v, 1, bl); // client-usable data
@@ -2151,7 +2226,7 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const
}
{
- uint8_t target_v = 3;
+ uint8_t target_v = 4;
if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
target_v = 1;
}
@@ -2176,6 +2251,7 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const
::encode(nearfull_ratio, bl);
::encode(full_ratio, bl);
::encode(backfillfull_ratio, bl);
+ ::encode(require_min_compat_client, bl);
}
ENCODE_FINISH(bl); // osd-only data
}
@@ -2364,7 +2440,7 @@ void OSDMap::decode(bufferlist::iterator& bl)
::decode(*pg_temp, bl);
::decode(*primary_temp, bl);
if (struct_v >= 2) {
- osd_primary_affinity.reset(new vector<__u32>);
+ osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
::decode(*osd_primary_affinity, bl);
if (osd_primary_affinity->empty())
osd_primary_affinity.reset();
@@ -2393,7 +2469,7 @@ void OSDMap::decode(bufferlist::iterator& bl)
}
{
- DECODE_START(3, bl); // extended, osd-only data
+ DECODE_START(4, bl); // extended, osd-only data
::decode(osd_addrs->hb_back_addr, bl);
::decode(osd_info, bl);
::decode(blacklist, bl);
@@ -2415,6 +2491,8 @@ void OSDMap::decode(bufferlist::iterator& bl)
} else {
backfillfull_ratio = 0;
}
+ if (struct_v >= 4)
+ ::decode(require_min_compat_client, bl);
DECODE_FINISH(bl); // osd-only data
}
@@ -2461,8 +2539,9 @@ void OSDMap::post_decode()
_calc_up_osd_features();
}
-void OSDMap::dump_erasure_code_profiles(const map<string,map<string,string> > &profiles,
- Formatter *f)
+void OSDMap::dump_erasure_code_profiles(
+ const mempool::osdmap::map<string,map<string,string>>& profiles,
+ Formatter *f)
{
f->open_object_section("erasure_code_profiles");
for (const auto &profile : profiles) {
@@ -2488,6 +2567,10 @@ void OSDMap::dump(Formatter *f) const
f->dump_string("cluster_snapshot", get_cluster_snapshot());
f->dump_int("pool_max", get_pool_max());
f->dump_int("max_osd", get_max_osd());
+ f->dump_string("require_min_compat_client", require_min_compat_client);
+ auto mv = get_min_compat_client();
+ f->dump_string("min_compat_client", mv.first);
+ f->dump_string("min_compat_client_version", mv.second);
f->open_array_section("pools");
for (const auto &pool : pools) {
@@ -2701,6 +2784,11 @@ void OSDMap::print(ostream& out) const
out << "full_ratio " << full_ratio << "\n";
out << "backfillfull_ratio " << backfillfull_ratio << "\n";
out << "nearfull_ratio " << nearfull_ratio << "\n";
+ if (require_min_compat_client.length()) {
+ out << "require_min_compat_client " << require_min_compat_client << "\n";
+ }
+ auto mv = get_min_compat_client();
+ out << "min_compat_client " << mv.first << " " << mv.second << "\n";
if (get_cluster_snapshot().length())
out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
out << "\n";
@@ -3321,7 +3409,7 @@ int OSDMap::clean_pg_upmaps(
vector<int> raw;
int primary;
pg_to_raw_osds(p.first, &raw, &primary);
- if (raw == p.second) {
+ if (vectors_equal(raw, p.second)) {
ldout(cct, 10) << " removing redundant pg_upmap " << p.first << " "
<< p.second << dendl;
pending_inc->old_pg_upmap.insert(p.first);
@@ -3332,7 +3420,7 @@ int OSDMap::clean_pg_upmaps(
vector<int> raw;
int primary;
pg_to_raw_osds(p.first, &raw, &primary);
- vector<pair<int,int>> newmap;
+ mempool::osdmap::vector<pair<int,int>> newmap;
for (auto& q : p.second) {
if (std::find(raw.begin(), raw.end(), q.first) != raw.end()) {
newmap.push_back(q);
@@ -3400,17 +3488,27 @@ bool OSDMap::try_pg_upmap(
int OSDMap::calc_pg_upmaps(
CephContext *cct,
- float max_deviation,
+ float max_deviation_ratio,
int max,
- const set<int64_t>& only_pools,
+ const set<int64_t>& only_pools_orig,
OSDMap::Incremental *pending_inc)
{
+ set<int64_t> only_pools;
+ if (only_pools_orig.empty()) {
+ for (auto& i : pools) {
+ only_pools.insert(i.first);
+ }
+ } else {
+ only_pools = only_pools_orig;
+ }
OSDMap tmp;
tmp.deepish_copy_from(*this);
int num_changed = 0;
while (true) {
map<int,set<pg_t>> pgs_by_osd;
int total_pgs = 0;
+ float osd_weight_total = 0;
+ map<int,float> osd_weight;
for (auto& i : pools) {
if (!only_pools.empty() && !only_pools.count(i.first))
continue;
@@ -3424,18 +3522,29 @@ int OSDMap::calc_pg_upmaps(
}
}
total_pgs += i.second.get_size() * i.second.get_pg_num();
+
+ map<int,float> pmap;
+ int ruleno = tmp.crush->find_rule(i.second.get_crush_ruleset(),
+ i.second.get_type(),
+ i.second.get_size());
+ tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
+ ldout(cct, 30) << __func__ << " pool " << i.first << " ruleno " << ruleno << dendl;
+ for (auto p : pmap) {
+ osd_weight[p.first] += p.second;
+ osd_weight_total += p.second;
+ }
}
- float osd_weight_total = 0;
- map<int,float> osd_weight;
- for (auto& i : pgs_by_osd) {
- float w = crush->get_item_weightf(i.first);
- osd_weight[i.first] = w;
- osd_weight_total += w;
- ldout(cct, 20) << " osd." << i.first << " weight " << w
- << " pgs " << i.second.size() << dendl;
+ for (auto& i : osd_weight) {
+ int pgs = 0;
+ auto p = pgs_by_osd.find(i.first);
+ if (p != pgs_by_osd.end())
+ pgs = p->second.size();
+ else
+ pgs_by_osd.emplace(i.first, set<pg_t>());
+ ldout(cct, 20) << " osd." << i.first << " weight " << i.second
+ << " pgs " << pgs << dendl;
}
- // NOTE: we assume we touch all osds with CRUSH!
float pgs_per_weight = total_pgs / osd_weight_total;
ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
@@ -3454,7 +3563,7 @@ int OSDMap::calc_pg_upmaps(
<< dendl;
osd_deviation[i.first] = deviation;
deviation_osd.insert(make_pair(deviation, i.first));
- if (deviation > 0)
+ if (deviation >= 1.0)
overfull.insert(i.first);
}
@@ -3476,14 +3585,14 @@ int OSDMap::calc_pg_upmaps(
bool restart = false;
for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
int osd = p->second;
+ float deviation = p->first;
float target = osd_weight[osd] * pgs_per_weight;
- float deviation = deviation_osd.rbegin()->first;
- if (deviation/target < max_deviation) {
+ if (deviation/target < max_deviation_ratio) {
ldout(cct, 10) << " osd." << osd
<< " target " << target
<< " deviation " << deviation
- << " -> " << deviation/target
- << " < max " << max_deviation << dendl;
+ << " -> ratio " << deviation/target
+ << " < max ratio " << max_deviation_ratio << dendl;
break;
}
int num_to_move = deviation;
@@ -3530,7 +3639,7 @@ int OSDMap::calc_pg_upmaps(
continue;
}
assert(orig != out);
- vector<pair<int,int>>& rmi = tmp.pg_upmap_items[pg];
+ auto& rmi = tmp.pg_upmap_items[pg];
for (unsigned i = 0; i < out.size(); ++i) {
if (orig[i] != out[i]) {
rmi.push_back(make_pair(orig[i], out[i]));
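
The calc_pg_upmaps hunks above change where per-OSD weights come from: instead of weighting only OSDs that already appear in pgs_by_osd, weights are now accumulated from each pool's CRUSH rule via get_rule_weight_osd_map(), so an OSD the rule can map but that currently holds zero PGs still participates (which is why the old "we assume we touch all osds with CRUSH" note is dropped). The stop condition also becomes a per-OSD deviation ratio rather than the single largest deviation, and an OSD only counts as overfull once it is over by at least one whole PG. A minimal sketch of that arithmetic, with invented names and none of the upmap bookkeeping:

  #include <map>

  // Illustrative sketch only -- invented names, not the Ceph implementation.
  // Flag OSDs that are overfull by at least one whole PG *and* by more than
  // max_deviation_ratio of their target share, mirroring the checks above.
  std::map<int, float> find_overfull(
      const std::map<int, float>& osd_weight,  // osd -> weight from CRUSH rules
      const std::map<int, int>& pgs_by_osd,    // osd -> mapped PG count
      int total_pgs,
      float osd_weight_total,
      float max_deviation_ratio)
  {
    std::map<int, float> overfull;
    const float pgs_per_weight = total_pgs / osd_weight_total;
    for (const auto& i : osd_weight) {
      const float target = i.second * pgs_per_weight;
      if (target <= 0)
        continue;  // rule gives this OSD no share; nothing to balance
      auto p = pgs_by_osd.find(i.first);
      const float pgs = (p == pgs_by_osd.end()) ? 0.0f : p->second;
      const float deviation = pgs - target;
      if (deviation >= 1.0f && deviation / target >= max_deviation_ratio)
        overfull[i.first] = deviation / target;
    }
    return overfull;
  }
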
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index 72163d82f8d..74f0b833a2e 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -40,6 +40,17 @@ using namespace std;
class CephContext;
class CrushWrapper;
+// FIXME: C++11 does not have std::equal for two differently-typed containers.
+// Use this until we move to C++14.
+template<typename A, typename B>
+bool vectors_equal(A a, B b)
+{
+ return
+ a.size() == b.size() &&
+ (a.empty() ||
+ memcmp((char*)&a[0], (char*)&b[0], sizeof(a[0]) * a.size()) == 0);
+}
+
/*
* we track up to two intervals during which the osd was alive and
@@ -106,10 +117,13 @@ ostream& operator<<(ostream& out, const osd_xinfo_t& xi);
/** OSDMap
*/
class OSDMap {
-
public:
+ MEMPOOL_CLASS_HELPERS();
+
class Incremental {
public:
+ MEMPOOL_CLASS_HELPERS();
+
/// feature bits we were encoded with. the subsequent OSDMap
/// encoding should match.
uint64_t encode_features;
@@ -125,32 +139,32 @@ public:
// incremental
int32_t new_max_osd;
- map<int64_t,pg_pool_t> new_pools;
- map<int64_t,string> new_pool_names;
- set<int64_t> old_pools;
- map<string,map<string,string> > new_erasure_code_profiles;
- vector<string> old_erasure_code_profiles;
- map<int32_t,entity_addr_t> new_up_client;
- map<int32_t,entity_addr_t> new_up_cluster;
- map<int32_t,uint8_t> new_state; // XORed onto previous state.
- map<int32_t,uint32_t> new_weight;
- map<pg_t,vector<int32_t> > new_pg_temp; // [] to remove
- map<pg_t, int32_t> new_primary_temp; // [-1] to remove
- map<int32_t,uint32_t> new_primary_affinity;
- map<int32_t,epoch_t> new_up_thru;
- map<int32_t,pair<epoch_t,epoch_t> > new_last_clean_interval;
- map<int32_t,epoch_t> new_lost;
- map<int32_t,uuid_d> new_uuid;
- map<int32_t,osd_xinfo_t> new_xinfo;
-
- map<entity_addr_t,utime_t> new_blacklist;
- vector<entity_addr_t> old_blacklist;
- map<int32_t, entity_addr_t> new_hb_back_up;
- map<int32_t, entity_addr_t> new_hb_front_up;
-
- map<pg_t,vector<int32_t>> new_pg_upmap;
- map<pg_t,vector<pair<int32_t,int32_t>>> new_pg_upmap_items;
- set<pg_t> old_pg_upmap, old_pg_upmap_items;
+ mempool::osdmap::map<int64_t,pg_pool_t> new_pools;
+ mempool::osdmap::map<int64_t,string> new_pool_names;
+ mempool::osdmap::set<int64_t> old_pools;
+ mempool::osdmap::map<string,map<string,string> > new_erasure_code_profiles;
+ mempool::osdmap::vector<string> old_erasure_code_profiles;
+ mempool::osdmap::map<int32_t,entity_addr_t> new_up_client;
+ mempool::osdmap::map<int32_t,entity_addr_t> new_up_cluster;
+ mempool::osdmap::map<int32_t,uint8_t> new_state; // XORed onto previous state.
+ mempool::osdmap::map<int32_t,uint32_t> new_weight;
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > new_pg_temp; // [] to remove
+ mempool::osdmap::map<pg_t, int32_t> new_primary_temp; // [-1] to remove
+ mempool::osdmap::map<int32_t,uint32_t> new_primary_affinity;
+ mempool::osdmap::map<int32_t,epoch_t> new_up_thru;
+ mempool::osdmap::map<int32_t,pair<epoch_t,epoch_t> > new_last_clean_interval;
+ mempool::osdmap::map<int32_t,epoch_t> new_lost;
+ mempool::osdmap::map<int32_t,uuid_d> new_uuid;
+ mempool::osdmap::map<int32_t,osd_xinfo_t> new_xinfo;
+
+ mempool::osdmap::map<entity_addr_t,utime_t> new_blacklist;
+ mempool::osdmap::vector<entity_addr_t> old_blacklist;
+ mempool::osdmap::map<int32_t, entity_addr_t> new_hb_back_up;
+ mempool::osdmap::map<int32_t, entity_addr_t> new_hb_front_up;
+
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> new_pg_upmap;
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<pair<int32_t,int32_t>>> new_pg_upmap_items;
+ mempool::osdmap::set<pg_t> old_pg_upmap, old_pg_upmap_items;
string cluster_snapshot;
@@ -158,6 +172,8 @@ public:
float new_backfillfull_ratio = -1;
float new_full_ratio = -1;
+ string new_require_min_compat_client;
+
mutable bool have_crc; ///< crc values are defined
uint32_t full_crc; ///< crc of the resulting OSDMap
mutable uint32_t inc_crc; ///< crc of this incremental
@@ -194,12 +210,11 @@ public:
return &new_pools[pool];
}
bool has_erasure_code_profile(const string &name) const {
- map<string,map<string,string> >::const_iterator i =
- new_erasure_code_profiles.find(name);
+ auto i = new_erasure_code_profiles.find(name);
return i != new_erasure_code_profiles.end();
}
void set_erasure_code_profile(const string &name,
- const map<string,string> &profile) {
+ const map<string,string>& profile) {
new_erasure_code_profiles[name] = profile;
}
@@ -223,33 +238,33 @@ private:
vector<uint8_t> osd_state;
struct addrs_s {
- vector<ceph::shared_ptr<entity_addr_t> > client_addr;
- vector<ceph::shared_ptr<entity_addr_t> > cluster_addr;
- vector<ceph::shared_ptr<entity_addr_t> > hb_back_addr;
- vector<ceph::shared_ptr<entity_addr_t> > hb_front_addr;
+ mempool::osdmap::vector<ceph::shared_ptr<entity_addr_t> > client_addr;
+ mempool::osdmap::vector<ceph::shared_ptr<entity_addr_t> > cluster_addr;
+ mempool::osdmap::vector<ceph::shared_ptr<entity_addr_t> > hb_back_addr;
+ mempool::osdmap::vector<ceph::shared_ptr<entity_addr_t> > hb_front_addr;
entity_addr_t blank;
};
ceph::shared_ptr<addrs_s> osd_addrs;
- vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out"
- vector<osd_info_t> osd_info;
- ceph::shared_ptr< map<pg_t,vector<int32_t> > > pg_temp; // temp pg mapping (e.g. while we rebuild)
- ceph::shared_ptr< map<pg_t,int32_t > > primary_temp; // temp primary mapping (e.g. while we rebuild)
- ceph::shared_ptr< vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline
+ mempool::osdmap::vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out"
+ mempool::osdmap::vector<osd_info_t> osd_info;
+ ceph::shared_ptr< mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > > pg_temp; // temp pg mapping (e.g. while we rebuild)
+ ceph::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp; // temp primary mapping (e.g. while we rebuild)
+ ceph::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline
// remap (post-CRUSH, pre-up)
- map<pg_t,vector<int32_t>> pg_upmap; ///< remap pg
- map<pg_t,vector<pair<int32_t,int32_t>>> pg_upmap_items; ///< remap osds in up set
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> pg_upmap; ///< remap pg
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<pair<int32_t,int32_t>>> pg_upmap_items; ///< remap osds in up set
- map<int64_t,pg_pool_t> pools;
- map<int64_t,string> pool_name;
- map<string,map<string,string> > erasure_code_profiles;
- map<string,int64_t> name_pool;
+ mempool::osdmap::map<int64_t,pg_pool_t> pools;
+ mempool::osdmap::map<int64_t,string> pool_name;
+ mempool::osdmap::map<string,map<string,string> > erasure_code_profiles;
+ mempool::osdmap::map<string,int64_t> name_pool;
- ceph::shared_ptr< vector<uuid_d> > osd_uuid;
- vector<osd_xinfo_t> osd_xinfo;
+ ceph::shared_ptr< mempool::osdmap::vector<uuid_d> > osd_uuid;
+ mempool::osdmap::vector<osd_xinfo_t> osd_xinfo;
- ceph::unordered_map<entity_addr_t,utime_t> blacklist;
+ mempool::osdmap::unordered_map<entity_addr_t,utime_t> blacklist;
epoch_t cluster_snapshot_epoch;
string cluster_snapshot;
@@ -257,6 +272,9 @@ private:
float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0;
+ /// min compat client we want to support
+ string require_min_compat_client;
+
mutable uint64_t cached_up_osd_features;
mutable bool crc_defined;
@@ -279,9 +297,9 @@ private:
num_osd(0), num_up_osd(0), num_in_osd(0),
max_osd(0),
osd_addrs(std::make_shared<addrs_s>()),
- pg_temp(std::make_shared<map<pg_t,vector<int32_t>>>()),
- primary_temp(std::make_shared<map<pg_t,int32_t>>()),
- osd_uuid(std::make_shared<vector<uuid_d>>()),
+ pg_temp(std::make_shared<mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>>>()),
+ primary_temp(std::make_shared<mempool::osdmap::map<pg_t,int32_t>>()),
+ osd_uuid(std::make_shared<mempool::osdmap::vector<uuid_d>>()),
cluster_snapshot_epoch(0),
new_blacklist_entries(false),
cached_up_osd_features(0),
@@ -298,12 +316,12 @@ public:
void deepish_copy_from(const OSDMap& o) {
*this = o;
- primary_temp.reset(new map<pg_t,int32_t>(*o.primary_temp));
- pg_temp.reset(new map<pg_t,vector<int32_t> >(*o.pg_temp));
- osd_uuid.reset(new vector<uuid_d>(*o.osd_uuid));
+ primary_temp.reset(new mempool::osdmap::map<pg_t,int32_t>(*o.primary_temp));
+ pg_temp.reset(new mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> >(*o.pg_temp));
+ osd_uuid.reset(new mempool::osdmap::vector<uuid_d>(*o.osd_uuid));
if (o.osd_primary_affinity)
- osd_primary_affinity.reset(new vector<__u32>(*o.osd_primary_affinity));
+ osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>(*o.osd_primary_affinity));
// NOTE: this still references shared entity_addr_t's.
osd_addrs.reset(new addrs_s(*o.osd_addrs));
@@ -344,8 +362,11 @@ public:
return nearfull_ratio;
}
void count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const;
- void get_full_osd_util(const ceph::unordered_map<int32_t,osd_stat_t> &osd_stat,
- map<int, float> *full, map<int, float> *backfill, map<int, float> *nearfull) const;
+ void get_full_osd_util(
+ const ceph::unordered_map<int32_t,osd_stat_t> &osd_stat,
+ map<int, float> *full,
+ map<int, float> *backfill,
+ map<int, float> *nearfull) const;
/***** cluster state *****/
/* osds */
@@ -409,8 +430,9 @@ public:
void set_primary_affinity(int o, int w) {
assert(o < max_osd);
if (!osd_primary_affinity)
- osd_primary_affinity.reset(new vector<__u32>(max_osd,
- CEPH_OSD_DEFAULT_PRIMARY_AFFINITY));
+ osd_primary_affinity.reset(
+ new mempool::osdmap::vector<__u32>(
+ max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY));
(*osd_primary_affinity)[o] = w;
}
unsigned get_primary_affinity(int o) const {
@@ -424,27 +446,26 @@ public:
}
bool has_erasure_code_profile(const string &name) const {
- map<string,map<string,string> >::const_iterator i =
- erasure_code_profiles.find(name);
+ auto i = erasure_code_profiles.find(name);
return i != erasure_code_profiles.end();
}
int get_erasure_code_profile_default(CephContext *cct,
map<string,string> &profile_map,
ostream *ss);
void set_erasure_code_profile(const string &name,
- const map<string,string> &profile) {
+ const map<string,string>& profile) {
erasure_code_profiles[name] = profile;
}
- const map<string,string> &get_erasure_code_profile(const string &name) const {
- map<string,map<string,string> >::const_iterator i =
- erasure_code_profiles.find(name);
+ const map<string,string> &get_erasure_code_profile(
+ const string &name) const {
static map<string,string> empty;
+ auto i = erasure_code_profiles.find(name);
if (i == erasure_code_profiles.end())
return empty;
else
return i->second;
}
- const map<string,map<string,string> > &get_erasure_code_profiles() const {
+ const mempool::osdmap::map<string,map<string,string> > &get_erasure_code_profiles() const {
return erasure_code_profiles;
}
@@ -479,6 +500,9 @@ public:
bool subtree_is_down(int id, set<int> *down_cache) const;
bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;
+ bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds,
+ set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const;
+
int identify_osd(const entity_addr_t& addr) const;
int identify_osd(const uuid_d& u) const;
int identify_osd_on_all_channels(const entity_addr_t& addr) const;
@@ -591,6 +615,12 @@ public:
uint64_t get_features(int entity_type, uint64_t *mask) const;
/**
+ * get oldest *client* version (firefly, hammer, etc.) that can connect given
+ * the feature bits required (according to get_features()).
+ */
+ pair<string,string> get_min_compat_client() const;
+
+ /**
* get intersection of features supported by up osds
*/
uint64_t get_up_osd_features() const;
@@ -731,12 +761,12 @@ public:
pg_to_up_acting_osds(pg, &up, &up_primary, &acting, &acting_primary);
}
bool pg_is_ec(pg_t pg) const {
- map<int64_t, pg_pool_t>::const_iterator i = pools.find(pg.pool());
+ auto i = pools.find(pg.pool());
assert(i != pools.end());
return i->second.ec_pool();
}
bool get_primary_shard(const pg_t& pgid, spg_t *out) const {
- map<int64_t, pg_pool_t>::const_iterator i = get_pools().find(pgid.pool());
+ auto i = get_pools().find(pgid.pool());
if (i == get_pools().end()) {
return false;
}
@@ -757,7 +787,7 @@ public:
}
int64_t lookup_pg_pool_name(const string& name) const {
- map<string,int64_t>::const_iterator p = name_pool.find(name);
+ auto p = name_pool.find(name);
if (p == name_pool.end())
return -ENOENT;
return p->second;
@@ -766,14 +796,14 @@ public:
int64_t get_pool_max() const {
return pool_max;
}
- const map<int64_t,pg_pool_t>& get_pools() const {
+ const mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() const {
return pools;
}
- map<int64_t,pg_pool_t>& get_pools() {
+ mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() {
return pools;
}
const string& get_pool_name(int64_t p) const {
- map<int64_t, string>::const_iterator i = pool_name.find(p);
+ auto i = pool_name.find(p);
assert(i != pool_name.end());
return i->second;
}
@@ -781,25 +811,25 @@ public:
return pools.count(p);
}
const pg_pool_t* get_pg_pool(int64_t p) const {
- map<int64_t, pg_pool_t>::const_iterator i = pools.find(p);
+ auto i = pools.find(p);
if (i != pools.end())
return &i->second;
return NULL;
}
unsigned get_pg_size(pg_t pg) const {
- map<int64_t,pg_pool_t>::const_iterator p = pools.find(pg.pool());
+ auto p = pools.find(pg.pool());
assert(p != pools.end());
return p->second.get_size();
}
int get_pg_type(pg_t pg) const {
- map<int64_t,pg_pool_t>::const_iterator p = pools.find(pg.pool());
+ auto p = pools.find(pg.pool());
assert(p != pools.end());
return p->second.get_type();
}
pg_t raw_pg_to_pg(pg_t pg) const {
- map<int64_t,pg_pool_t>::const_iterator p = pools.find(pg.pool());
+ auto p = pools.find(pg.pool());
assert(p != pools.end());
return p->second.raw_pg_to_pg(pg);
}
@@ -936,8 +966,9 @@ public:
string get_flag_string() const;
static string get_flag_string(unsigned flags);
- static void dump_erasure_code_profiles(const map<string,map<string,string> > &profiles,
- Formatter *f);
+ static void dump_erasure_code_profiles(
+ const mempool::osdmap::map<string,map<string,string> > &profiles,
+ Formatter *f);
void dump(Formatter *f) const;
static void generate_test_instances(list<OSDMap*>& o);
bool check_new_blacklist_entries() const { return new_blacklist_entries; }
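
Nearly every container in this header moves to a mempool::osdmap::* equivalent, and MEMPOOL_CLASS_HELPERS() gives OSDMap and OSDMap::Incremental pool-accounted operator new/delete, so the memory held by cached OSDMaps can be attributed and monitored. As a rough illustration of the underlying idea only (the real machinery is in Ceph's include/mempool.h and differs in detail), a mempool-style container can be built from an allocator that charges a shared counter:

  #include <atomic>
  #include <cstddef>
  #include <map>
  #include <new>
  #include <utility>

  // Hypothetical illustration of the mempool idea: an allocator that charges
  // every (de)allocation to a shared byte counter, plus an alias so
  // declarations read like the std:: originals.
  static std::atomic<long> osdmap_bytes(0);

  template<typename T>
  struct osdmap_alloc {
    typedef T value_type;
    osdmap_alloc() {}
    template<typename U> osdmap_alloc(const osdmap_alloc<U>&) {}
    T* allocate(std::size_t n) {
      osdmap_bytes += n * sizeof(T);
      return static_cast<T*>(::operator new(n * sizeof(T)));
    }
    void deallocate(T* p, std::size_t n) {
      osdmap_bytes -= n * sizeof(T);
      ::operator delete(p);
    }
  };
  template<typename T, typename U>
  bool operator==(const osdmap_alloc<T>&, const osdmap_alloc<U>&) { return true; }
  template<typename T, typename U>
  bool operator!=(const osdmap_alloc<T>&, const osdmap_alloc<U>&) { return false; }

  template<typename K, typename V>
  using osdmap_map =
    std::map<K, V, std::less<K>, osdmap_alloc<std::pair<const K, V>>>;
  // e.g. osdmap_map<int64_t, pg_pool_t> pools;  // bytes show up in osdmap_bytes
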
diff --git a/src/osd/OSDMapMapping.h b/src/osd/OSDMapMapping.h
index c325693785e..81996641ec0 100644
--- a/src/osd/OSDMapMapping.h
+++ b/src/osd/OSDMapMapping.h
@@ -285,6 +285,7 @@ public:
int *acting_primary) const {
auto p = pools.find(pgid.pool());
assert(p != pools.end());
+ assert(pgid.ps() < p->second.pg_num);
p->second.get(pgid.ps(), up, up_primary, acting, acting_primary);
}
diff --git a/src/osd/OpRequest.h b/src/osd/OpRequest.h
index a32e03b9e93..56663ecbd51 100644
--- a/src/osd/OpRequest.h
+++ b/src/osd/OpRequest.h
@@ -110,6 +110,7 @@ public:
bool check_send_map = true; ///< true until we check if sender needs a map
epoch_t sent_epoch = 0; ///< client's map epoch
+ epoch_t min_epoch = 0; ///< min epoch needed to handle this msg
bool hitset_inserted;
const Message *get_req() const { return request; }
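
The new min_epoch field is what lets the long per-message-type switch in PG::op_must_wait_for_map() (deleted further down in this diff) collapse into the single comparison seen in PG::requeue_map_waiters(): the sender stamps the minimum map epoch needed to handle the message, and the receiver just compares. Schematically (a local typedef stands in for Ceph's epoch_t):

  #include <cstdint>

  typedef uint32_t epoch_t;  // stand-in for Ceph's epoch_t typedef

  // An op must keep waiting while the PG's map is older than the epoch the
  // sender said it needs; compare the deleted switch in PG.cc below.
  inline bool must_wait_for_map(epoch_t cur_epoch, epoch_t op_min_epoch) {
    return cur_epoch < op_min_epoch;
  }
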
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index f8c14131f6d..fe248331e7c 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -92,6 +92,66 @@ static ostream& _prefix(std::ostream *_dout, T *t)
MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt, pg_peering_evt, osd);
+void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
+{
+ // Ignore trimming state machine for now
+ if (::strstr(state, "Trimming") != NULL) {
+ return;
+ } else if (pi != nullptr) {
+ pi->enter_state(entime, state);
+ } else {
+ // Store current state since we can't reliably take the PG lock here
+ if (tmppi == nullptr) {
+ tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
+ }
+
+ thispg = pg;
+ tmppi->enter_state(entime, state);
+ }
+}
+
+void PGStateHistory::exit(const char* state) {
+ // Ignore trimming state machine for now
+ // Do nothing if PG is being destroyed!
+ if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
+ return;
+ } else {
+ bool ilocked = false;
+ if (!thispg->is_locked()) {
+ thispg->lock();
+ ilocked = true;
+ }
+ if (pi == nullptr) {
+ buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
+ pi = buffer.back().get();
+ pi->setepoch(thispg->get_osdmap()->get_epoch());
+ }
+
+ pi->exit_state(ceph_clock_now());
+ if (::strcmp(state, "Reset") == 0) {
+ this->reset();
+ }
+ if (ilocked) {
+ thispg->unlock();
+ }
+ }
+}
+
+void PGStateHistory::dump(Formatter* f) const {
+ f->open_array_section("history");
+ for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
+ f->open_object_section("states");
+ f->dump_stream("epoch") << (*pi)->this_epoch;
+ for (auto she : (*pi)->state_history) {
+ f->dump_string("state", std::get<2>(she));
+ f->dump_stream("enter") << std::get<0>(she);
+ f->dump_stream("exit") << std::get<1>(she);
+ }
+ f->close_section();
+ }
+ f->close_section();
+}
+
void PG::get(const char* tag)
{
ref++;
@@ -229,12 +289,17 @@ PG::PG(OSDService *o, OSDMapRef curmap,
#ifdef PG_DEBUG_REFS
_ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
#endif
- deleting(false), dirty_info(false), dirty_big_info(false),
+ deleting(false),
+ trace_endpoint("0.0.0.0", 0, "PG"),
+ dirty_info(false), dirty_big_info(false),
info(p),
info_struct_v(0),
coll(p), pg_log(cct),
pgmeta_oid(p.make_pgmeta_oid()),
missing_loc(this),
+ past_intervals(
+ curmap->get_pools().at(p.pgid.pool()).ec_pool(),
+ *curmap),
stat_queue_item(this),
scrub_queued(false),
recovery_queued(false),
@@ -261,16 +326,26 @@ PG::PG(OSDService *o, OSDMapRef curmap,
peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
- last_epoch(0)
+ last_epoch(0),
+ scrub_sleep_lock("PG::scrub_sleep_lock"),
+ scrub_sleep_timer(o->cct, scrub_sleep_lock, false /* relax locking */)
{
#ifdef PG_DEBUG_REFS
osd->add_pgid(p, this);
#endif
+#ifdef WITH_BLKIN
+ std::stringstream ss;
+ ss << "PG " << info.pgid;
+ trace_endpoint.copy_name(ss.str());
+#endif
osr->shard_hint = p;
}
PG::~PG()
{
+ pgstate_history.set_pg_in_destructor();
+ Mutex::Locker l(scrub_sleep_lock);
+ scrub_sleep_timer.shutdown();
#ifdef PG_DEBUG_REFS
osd->remove_pgid(info.pgid, this);
#endif
@@ -333,8 +408,11 @@ void PG::proc_master_log(
info.last_epoch_started = oinfo.last_epoch_started;
dirty_info = true;
}
- if (info.history.merge(oinfo.history))
+ if (oinfo.last_interval_started > info.last_interval_started) {
+ info.last_interval_started = oinfo.last_interval_started;
dirty_info = true;
+ }
+ update_history(oinfo.history);
assert(cct->_conf->osd_find_best_info_ignore_history_les ||
info.last_epoch_started >= info.history.last_epoch_started);
@@ -385,11 +463,8 @@ bool PG::proc_replica_info(
assert(is_primary());
peer_info[from] = oinfo;
might_have_unfound.insert(from);
-
- unreg_next_scrub();
- if (info.history.merge(oinfo.history))
- dirty_info = true;
- reg_next_scrub();
+
+ update_history(oinfo.history);
// stray?
if (!is_up(from) && !is_acting(from)) {
@@ -476,7 +551,7 @@ bool PG::search_for_missing(
pg_shard_t from,
RecoveryCtx *ctx)
{
- unsigned num_unfound_before = missing_loc.num_unfound();
+ uint64_t num_unfound_before = missing_loc.num_unfound();
bool found_missing = missing_loc.add_source_info(
from, oinfo, omissing, ctx->handle);
if (found_missing && num_unfound_before != missing_loc.num_unfound())
@@ -601,11 +676,12 @@ bool PG::MissingLoc::add_source_info(
void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
{
auto &missing = pg_log.get_missing();
- assert(have_unfound());
+ uint64_t unfound = get_num_unfound();
+ assert(unfound > 0);
dout(10) << __func__ << " "
<< missing.num_missing() << " missing, "
- << get_num_unfound() << " unfound"
+ << unfound << " unfound"
<< dendl;
std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
@@ -713,135 +789,55 @@ bool PG::needs_backfill() const
return false;
}
-bool PG::_calc_past_interval_range(epoch_t *start, epoch_t *end, epoch_t oldest_map)
-{
- if (info.history.same_interval_since) {
- *end = info.history.same_interval_since;
- } else {
- // PG must be imported, so let's calculate the whole range.
- *end = osdmap_ref->get_epoch();
- }
-
- // Do we already have the intervals we want?
- map<epoch_t,pg_interval_t>::const_iterator pif = past_intervals.begin();
- if (pif != past_intervals.end()) {
- if (pif->first <= info.history.last_epoch_clean) {
- dout(10) << __func__ << ": already have past intervals back to "
- << info.history.last_epoch_clean << dendl;
- return false;
- }
- *end = pif->first;
- }
-
- *start = MAX(MAX(info.history.epoch_created,
- info.history.last_epoch_clean),
- oldest_map);
- if (*start >= *end) {
- dout(10) << __func__ << " start epoch " << *start << " >= end epoch " << *end
- << ", nothing to do" << dendl;
- return false;
- }
-
- return true;
-}
-
-void PG::generate_past_intervals()
+void PG::check_past_interval_bounds() const
{
- epoch_t cur_epoch, end_epoch;
- if (!_calc_past_interval_range(&cur_epoch, &end_epoch,
- osd->get_superblock().oldest_map)) {
- if (info.history.same_interval_since == 0) {
- info.history.same_interval_since = end_epoch;
- dirty_info = true;
+ auto rpib = get_required_past_interval_bounds(
+ info,
+ osd->get_superblock().oldest_map);
+ if (rpib.first >= rpib.second) {
+ if (!past_intervals.empty()) {
+ osd->clog->error() << info.pgid << " required past_interval bounds are"
+ << " empty [" << rpib << ") but past_intervals is not: "
+ << past_intervals;
+ derr << info.pgid << " required past_interval bounds are"
+ << " empty [" << rpib << ") but past_intervals is not: "
+ << past_intervals << dendl;
+ assert(past_intervals.empty());
}
- return;
- }
-
- OSDMapRef last_map, cur_map;
- int primary = -1;
- int up_primary = -1;
- vector<int> acting, up, old_acting, old_up;
-
- cur_map = osd->get_map(cur_epoch);
- cur_map->pg_to_up_acting_osds(
- get_pgid().pgid, &up, &up_primary, &acting, &primary);
- epoch_t same_interval_since = cur_epoch;
- dout(10) << __func__ << " over epochs " << cur_epoch << "-"
- << end_epoch << dendl;
- ++cur_epoch;
- for (; cur_epoch <= end_epoch; ++cur_epoch) {
- int old_primary = primary;
- int old_up_primary = up_primary;
- last_map.swap(cur_map);
- old_up.swap(up);
- old_acting.swap(acting);
-
- cur_map = osd->get_map(cur_epoch);
- pg_t pgid = get_pgid().pgid;
- if (last_map->get_pools().count(pgid.pool()))
- pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
- cur_map->pg_to_up_acting_osds(pgid, &up, &up_primary, &acting, &primary);
-
- boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
- get_is_recoverable_predicate());
- std::stringstream debug;
- bool new_interval = pg_interval_t::check_new_interval(
- old_primary,
- primary,
- old_acting,
- acting,
- old_up_primary,
- up_primary,
- old_up,
- up,
- same_interval_since,
- info.history.last_epoch_clean,
- cur_map,
- last_map,
- pgid,
- recoverable.get(),
- &past_intervals,
- &debug);
- if (new_interval) {
- dout(10) << debug.str() << dendl;
- same_interval_since = cur_epoch;
+ } else {
+ if (past_intervals.empty()) {
+ osd->clog->error() << info.pgid << " required past_interval bounds are"
+ << " not empty [" << rpib << ") but past_intervals "
+ << past_intervals << " is empty";
+ derr << info.pgid << " required past_interval bounds are"
+ << " not empty [" << rpib << ") but past_intervals "
+ << past_intervals << " is empty" << dendl;
+ assert(!past_intervals.empty());
+ }
+
+ auto apib = past_intervals.get_bounds();
+ if (apib.first > rpib.first) {
+ osd->clog->error() << info.pgid << " past_intervals [" << apib
+ << ") start interval does not contain the required"
+ << " bound [" << rpib << ") start";
+ derr << info.pgid << " past_intervals [" << apib
+ << ") start interval does not contain the required"
+ << " bound [" << rpib << ") start" << dendl;
+ assert(0 == "past_interval start interval mismatch");
+ }
+ if (apib.second != rpib.second) {
+ osd->clog->error() << info.pgid << " past_interval bound [" << apib
+ << ") end does not match required [" << rpib
+ << ") end";
+ derr << info.pgid << " past_interval bound [" << apib
+ << ") end does not match required [" << rpib
+ << ") end" << dendl;
+ assert(0 == "past_interval end mismatch");
}
}
-
- // PG import needs recalculated same_interval_since
- if (info.history.same_interval_since == 0) {
- assert(same_interval_since);
- dout(10) << __func__ << " fix same_interval_since " << same_interval_since << " pg " << *this << dendl;
- dout(10) << __func__ << " past_intervals " << past_intervals << dendl;
- // Fix it
- info.history.same_interval_since = same_interval_since;
- }
-
- // record our work.
- dirty_info = true;
- dirty_big_info = true;
}
-/*
- * Trim past_intervals.
- *
- * This gets rid of all the past_intervals that happened before last_epoch_clean.
- */
-void PG::trim_past_intervals()
-{
- std::map<epoch_t,pg_interval_t>::iterator pif = past_intervals.begin();
- std::map<epoch_t,pg_interval_t>::iterator end = past_intervals.end();
- while (pif != end) {
- if (pif->second.last >= info.history.last_epoch_clean)
- return;
- dout(10) << __func__ << ": trimming " << pif->second << dendl;
- past_intervals.erase(pif++);
- dirty_big_info = true;
- }
-}
-
-
bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
{
epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
@@ -907,7 +903,7 @@ bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
return true;
}
-void PG::build_prior(std::unique_ptr<PriorSet> &prior_set)
+PastIntervals::PriorSet PG::build_prior()
{
if (1) {
// sanity check
@@ -917,18 +913,33 @@ void PG::build_prior(std::unique_ptr<PriorSet> &prior_set)
assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
}
}
- prior_set.reset(
- new PriorSet(
- cct,
- pool.info.ec_pool(),
- get_pgbackend()->get_is_recoverable_predicate(),
- *get_osdmap(),
- past_intervals,
- up,
- acting,
- info,
- this));
- PriorSet &prior(*prior_set.get());
+
+ const OSDMap &osdmap = *get_osdmap();
+ PastIntervals::PriorSet prior = past_intervals.get_prior_set(
+ pool.info.ec_pool(),
+ info.history.last_epoch_started,
+ get_pgbackend()->get_is_recoverable_predicate(),
+ [&](epoch_t start, int osd, epoch_t *lost_at) {
+ const osd_info_t *pinfo = 0;
+ if (osdmap.exists(osd)) {
+ pinfo = &osdmap.get_info(osd);
+ if (lost_at)
+ *lost_at = pinfo->lost_at;
+ }
+
+ if (osdmap.is_up(osd)) {
+ return PastIntervals::UP;
+ } else if (!pinfo) {
+ return PastIntervals::DNE;
+ } else if (pinfo->lost_at > start) {
+ return PastIntervals::LOST;
+ } else {
+ return PastIntervals::DOWN;
+ }
+ },
+ up,
+ acting,
+ this);
if (prior.pg_down) {
state_set(PG_STATE_DOWN);
@@ -945,7 +956,8 @@ void PG::build_prior(std::unique_ptr<PriorSet> &prior_set)
<< ", all is well" << dendl;
need_up_thru = false;
}
- set_probe_targets(prior_set->probe);
+ set_probe_targets(prior.probe);
+ return prior;
}
void PG::clear_primary_state()
@@ -1110,7 +1122,6 @@ void PG::calc_ec_acting(
const vector<int> &up,
pg_shard_t up_primary,
const map<pg_shard_t, pg_info_t> &all_info,
- bool compat_mode,
bool restrict_to_up_acting,
vector<int> *_want,
set<pg_shard_t> *backfill,
@@ -1198,7 +1209,6 @@ void PG::calc_replicated_acting(
const vector<int> &up,
pg_shard_t up_primary,
const map<pg_shard_t, pg_info_t> &all_info,
- bool compat_mode,
bool restrict_to_up_acting,
vector<int> *want,
set<pg_shard_t> *backfill,
@@ -1252,16 +1262,8 @@ void PG::calc_replicated_acting(
* as far backwards as necessary to pick up any peers which can
* be log recovered by auth_log_shard's log */
ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
- if (compat_mode) {
- if (backfill->empty()) {
- backfill->insert(up_cand);
- want->push_back(*i);
- acting_backfill->insert(up_cand);
- }
- } else {
- backfill->insert(up_cand);
- acting_backfill->insert(up_cand);
- }
+ backfill->insert(up_cand);
+ acting_backfill->insert(up_cand);
} else {
want->push_back(*i);
acting_backfill->insert(up_cand);
@@ -1381,27 +1383,6 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
assert(!auth_log_shard->second.is_incomplete());
auth_log_shard_id = auth_log_shard->first;
- // Determine if compatibility needed
- bool compat_mode = !cct->_conf->osd_debug_override_acting_compat;
- if (compat_mode) {
- bool all_support = true;
- OSDMapRef osdmap = get_osdmap();
-
- for (map<pg_shard_t, pg_info_t>::iterator it = all_info.begin();
- it != all_info.end();
- ++it) {
- pg_shard_t peer = it->first;
-
- const osd_xinfo_t& xi = osdmap->get_xinfo(peer.osd);
- if (!(xi.features & CEPH_FEATURE_OSD_ERASURE_CODES)) {
- all_support = false;
- break;
- }
- }
- if (all_support)
- compat_mode = false;
- }
-
set<pg_shard_t> want_backfill, want_acting_backfill;
vector<int> want;
pg_shard_t want_primary;
@@ -1415,7 +1396,6 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
up,
up_primary,
all_info,
- compat_mode,
restrict_to_up_acting,
&want,
&want_backfill,
@@ -1431,7 +1411,6 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
up,
up_primary,
all_info,
- compat_mode,
restrict_to_up_acting,
&want,
&want_backfill,
@@ -1480,7 +1459,7 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
if (want_acting == up) {
// There can't be any pending backfill if
// want is the same as crush map up OSDs.
- assert(compat_mode || want_backfill.empty());
+ assert(want_backfill.empty());
vector<int> empty;
osd->queue_want_pg_temp(info.pgid.pgid, empty);
} else
@@ -1522,32 +1501,11 @@ void PG::build_might_have_unfound()
dout(10) << __func__ << dendl;
- // Make sure that we have past intervals.
- generate_past_intervals();
-
- // We need to decide who might have unfound objects that we need
- std::map<epoch_t,pg_interval_t>::const_reverse_iterator p = past_intervals.rbegin();
- std::map<epoch_t,pg_interval_t>::const_reverse_iterator end = past_intervals.rend();
- for (; p != end; ++p) {
- const pg_interval_t &interval(p->second);
- // We already have all the objects that exist at last_epoch_clean,
- // so there's no need to look at earlier intervals.
- if (interval.last < info.history.last_epoch_clean)
- break;
-
- // If nothing changed, we don't care about this interval.
- if (!interval.maybe_went_rw)
- continue;
+ check_past_interval_bounds();
- int i = 0;
- std::vector<int>::const_iterator a = interval.acting.begin();
- std::vector<int>::const_iterator a_end = interval.acting.end();
- for (; a != a_end; ++a, ++i) {
- pg_shard_t shard(*a, pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD);
- if (*a != CRUSH_ITEM_NONE && shard != pg_whoami)
- might_have_unfound.insert(shard);
- }
- }
+ might_have_unfound = past_intervals.get_might_have_unfound(
+ pg_whoami,
+ pool.info.ec_pool());
// include any (stray) peers
for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
@@ -1576,7 +1534,7 @@ void PG::activate(ObjectStore::Transaction& t,
map<int,
vector<
pair<pg_notify_t,
- pg_interval_map_t> > > *activator_map,
+ PastIntervals> > > *activator_map,
RecoveryCtx *ctx)
{
assert(!is_peered());
@@ -1594,13 +1552,16 @@ void PG::activate(ObjectStore::Transaction& t,
assert(cct->_conf->osd_find_best_info_ignore_history_les ||
info.last_epoch_started <= activation_epoch);
info.last_epoch_started = activation_epoch;
+ info.last_interval_started = info.history.same_interval_since;
}
} else if (is_acting(pg_whoami)) {
/* update last_epoch_started on acting replica to whatever the primary sent
* unless it's smaller (could happen if we are going peered rather than
* active, see doc/dev/osd_internals/last_epoch_started.rst) */
- if (info.last_epoch_started < activation_epoch)
+ if (info.last_epoch_started < activation_epoch) {
info.last_epoch_started = activation_epoch;
+ info.last_interval_started = info.history.same_interval_since;
+ }
}
auto &missing = pg_log.get_missing();
@@ -1730,6 +1691,7 @@ void PG::activate(ObjectStore::Transaction& t,
pi.last_complete = info.last_update;
pi.set_last_backfill(hobject_t());
pi.last_epoch_started = info.last_epoch_started;
+ pi.last_interval_started = info.last_interval_started;
pi.history = info.history;
pi.hit_set = info.hit_set;
pi.stats.stats.clear();
@@ -1919,6 +1881,7 @@ void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
peer_activated.insert(pg_whoami);
dout(10) << "_activate_committed " << epoch
<< " peer_activated now " << peer_activated
+ << " last_interval_started " << info.history.last_interval_started
<< " last_epoch_started " << info.history.last_epoch_started
<< " same_interval_since " << info.history.same_interval_since << dendl;
assert(!actingbackfill.empty());
@@ -1934,13 +1897,14 @@ void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
info);
i.info.history.last_epoch_started = activation_epoch;
+ i.info.history.last_interval_started = i.info.history.same_interval_since;
if (acting.size() >= pool.info.min_size) {
state_set(PG_STATE_ACTIVE);
} else {
state_set(PG_STATE_PEERED);
}
- m->pg_list.push_back(make_pair(i, pg_interval_map_t()));
+ m->pg_list.push_back(make_pair(i, PastIntervals()));
osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
// waiters
@@ -2005,7 +1969,7 @@ void PG::queue_recovery(bool front)
bool PG::queue_scrub()
{
- assert(_lock.is_locked());
+ assert(is_locked());
if (is_scrubbing()) {
return false;
}
@@ -2052,8 +2016,10 @@ void PG::mark_clean()
// NOTE: this is actually a bit premature: we haven't purged the
// strays yet.
info.history.last_epoch_clean = get_osdmap()->get_epoch();
+ info.history.last_interval_clean = info.history.same_interval_since;
- trim_past_intervals();
+ past_intervals.clear();
+ dirty_big_info = true;
if (is_active()) {
/* The check is needed because if we are below min_size we're not
@@ -2241,6 +2207,7 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
info.stats.stats_invalid = true;
child->info.stats.stats_invalid = true;
child->info.last_epoch_started = info.last_epoch_started;
+ child->info.last_interval_started = info.last_interval_started;
child->snap_trimq = snap_trimq;
@@ -2701,6 +2668,12 @@ void PG::publish_stats_to_osd()
utime_t now = ceph_clock_now();
if (info.stats.state != state) {
info.stats.last_change = now;
+ // Optimistic estimate: if we just noticed a PG going inactive,
+ // assume it was active until now.
+ if (!(state & PG_STATE_ACTIVE) &&
+ (info.stats.state & PG_STATE_ACTIVE))
+ info.stats.last_active = now;
+
if ((state & PG_STATE_ACTIVE) &&
!(info.stats.state & PG_STATE_ACTIVE))
info.stats.last_became_active = now;
@@ -2787,13 +2760,13 @@ void PG::init(
const vector<int>& newup, int new_up_primary,
const vector<int>& newacting, int new_acting_primary,
const pg_history_t& history,
- const pg_interval_map_t& pi,
+ const PastIntervals& pi,
bool backfill,
ObjectStore::Transaction *t)
{
dout(10) << "init role " << role << " up " << newup << " acting " << newacting
<< " history " << history
- << " " << pi.size() << " past_intervals"
+ << " past_intervals " << pi
<< dendl;
set_role(role);
@@ -2826,6 +2799,8 @@ void PG::init(
dirty_info = true;
dirty_big_info = true;
write_if_dirty(*t);
+
+ scrub_sleep_timer.init();
}
#pragma GCC diagnostic ignored "-Wpragmas"
@@ -2834,16 +2809,11 @@ void PG::init(
void PG::upgrade(ObjectStore *store)
{
- assert(info_struct_v <= 9);
+ assert(info_struct_v <= 10);
ObjectStore::Transaction t;
assert(info_struct_v >= 7);
- // 8 -> 9
- if (info_struct_v <= 8) {
- // no special action needed.
- }
-
// 7 -> 8
if (info_struct_v <= 7) {
pg_log.mark_log_for_rewrite();
@@ -2851,8 +2821,25 @@ void PG::upgrade(ObjectStore *store)
ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
t.remove(coll_t::meta(), log_oid);
t.remove(coll_t::meta(), biginfo_oid);
-
t.touch(coll, pgmeta_oid);
+ }
+
+ // 8 -> 9
+ if (info_struct_v <= 8) {
+ // no special action needed.
+ }
+
+ // 9 -> 10
+ if (info_struct_v <= 9) {
+ // previous versions weren't (as) aggressively clearing past_intervals
+ if (info.history.last_epoch_clean >= info.history.same_interval_since) {
+ dout(20) << __func__ << " clearing past_intervals" << dendl;
+ past_intervals.clear();
+ }
+ }
+
+ // update infover_key
+ if (info_struct_v < cur_struct_v) {
map<string,bufferlist> v;
__u8 ver = cur_struct_v;
::encode(ver, v[infover_key]);
@@ -2886,7 +2873,7 @@ int PG::_prepare_write_info(CephContext* cct,
map<string,bufferlist> *km,
epoch_t epoch,
pg_info_t &info, pg_info_t &last_written_info,
- map<epoch_t,pg_interval_t> &past_intervals,
+ PastIntervals &past_intervals,
bool dirty_big_info,
bool dirty_epoch,
bool try_fast_info,
@@ -3075,12 +3062,13 @@ void PG::write_if_dirty(ObjectStore::Transaction& t)
t.omap_setkeys(coll, pgmeta_oid, km);
}
-void PG::trim_peers()
+void PG::trim_log()
{
assert(is_primary());
calc_trim_to();
- dout(10) << "trim_peers " << pg_trim_to << dendl;
+ dout(10) << __func__ << " to " << pg_trim_to << dendl;
if (pg_trim_to != eversion_t()) {
+ // inform peers to trim log
assert(!actingbackfill.empty());
for (set<pg_shard_t>::iterator i = actingbackfill.begin();
i != actingbackfill.end();
@@ -3094,6 +3082,10 @@ void PG::trim_peers()
pg_trim_to),
get_osdmap()->get_epoch());
}
+
+ // trim primary as well
+ pg_log.trim(pg_trim_to, info);
+ dirty_info = true;
}
}
@@ -3136,6 +3128,9 @@ void PG::append_log(
if (info.last_epoch_started != info.history.last_epoch_started) {
info.history.last_epoch_started = info.last_epoch_started;
}
+ if (info.last_interval_started != info.history.last_interval_started) {
+ info.history.last_interval_started = info.last_interval_started;
+ }
dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
PGLogEntryHandler handler{this, &t};
@@ -3210,7 +3205,7 @@ std::string PG::get_corrupt_pg_log_name() const
int PG::read_info(
ObjectStore *store, spg_t pgid, const coll_t &coll, bufferlist &bl,
- pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals,
+ pg_info_t &info, PastIntervals &past_intervals,
__u8 &struct_v)
{
// try for v8 or later
@@ -3234,7 +3229,11 @@ int PG::read_info(
::decode(info, p);
p = values[biginfo_key].begin();
- ::decode(past_intervals, p);
+ if (struct_v >= 10) {
+ ::decode(past_intervals, p);
+ } else {
+ past_intervals.decode_classic(p);
+ }
::decode(info.purged_snaps, p);
p = values[fastinfo_key].begin();
@@ -3452,7 +3451,7 @@ void PG::requeue_map_waiters()
epoch_t epoch = get_osdmap()->get_epoch();
auto p = waiting_for_map.begin();
while (p != waiting_for_map.end()) {
- if (op_must_wait_for_map(epoch, p->second.front())) {
+ if (epoch < p->second.front()->min_epoch) {
dout(20) << __func__ << " " << p->first << " front op "
<< p->second.front() << " must still wait, doing nothing"
<< dendl;
@@ -3496,7 +3495,7 @@ void PG::requeue_map_waiters()
bool PG::sched_scrub()
{
bool nodeep_scrub = false;
- assert(_lock.is_locked());
+ assert(is_locked());
if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
return false;
}
@@ -3720,6 +3719,7 @@ void PG::_request_scrub_map(
MOSDRepScrub *repscrubop = new MOSDRepScrub(
spg_t(info.pgid.pgid, replica.shard), version,
get_osdmap()->get_epoch(),
+ get_last_peering_reset(),
start, end, deep, seed);
// default priority, we want the rep scrub processed prior to any recovery
// or client io messages (we are holding a lock!)
@@ -3932,47 +3932,67 @@ void PG::_scan_rollback_obs(
void PG::_scan_snaps(ScrubMap &smap)
{
- for (map<hobject_t, ScrubMap::object>::iterator i = smap.objects.begin();
- i != smap.objects.end();
+ hobject_t head;
+ SnapSet snapset;
+ for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
+ i != smap.objects.rend();
++i) {
const hobject_t &hoid = i->first;
ScrubMap::object &o = i->second;
- if (hoid.snap < CEPH_MAXSNAP) {
- // fake nlinks for old primaries
+ if (hoid.is_head() || hoid.is_snapdir()) {
+ // parse the SnapSet
bufferlist bl;
- if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
- o.nlinks = 0;
+ if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
continue;
}
- bl.push_back(o.attrs[OI_ATTR]);
- object_info_t oi;
+ bl.push_back(o.attrs[SS_ATTR]);
+ auto p = bl.begin();
try {
- oi.decode(bl);
+ ::decode(snapset, p);
} catch(...) {
- o.nlinks = 0;
continue;
}
- if (oi.snaps.empty()) {
- // Just head
- o.nlinks = 1;
- } else if (oi.snaps.size() == 1) {
- // Just head + only snap
- o.nlinks = 2;
+ head = hoid.get_head();
+ continue;
+ }
+ if (hoid.snap < CEPH_MAXSNAP) {
+ // check and if necessary fix snap_mapper
+ if (hoid.get_head() != head) {
+ derr << __func__ << " no head for " << hoid << " (have " << head << ")"
+ << dendl;
+ continue;
+ }
+ set<snapid_t> obj_snaps;
+ if (!snapset.is_legacy()) {
+ auto p = snapset.clone_snaps.find(hoid.snap);
+ if (p == snapset.clone_snaps.end()) {
+ derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
+ << dendl;
+ continue;
+ }
+ obj_snaps.insert(p->second.begin(), p->second.end());
} else {
- // Just head + 1st and last snaps
- o.nlinks = 3;
+ bufferlist bl;
+ if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
+ continue;
+ }
+ bl.push_back(o.attrs[OI_ATTR]);
+ object_info_t oi;
+ try {
+ oi.decode(bl);
+ } catch(...) {
+ continue;
+ }
+ obj_snaps.insert(oi.legacy_snaps.begin(), oi.legacy_snaps.end());
}
-
- // check and if necessary fix snap_mapper
- set<snapid_t> oi_snaps(oi.snaps.begin(), oi.snaps.end());
set<snapid_t> cur_snaps;
int r = snap_mapper.get_snaps(hoid, &cur_snaps);
if (r != 0 && r != -ENOENT) {
derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
ceph_abort();
}
- if (r == -ENOENT || cur_snaps != oi_snaps) {
+ if (r == -ENOENT || cur_snaps != obj_snaps) {
ObjectStore::Transaction t;
OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
if (r == 0) {
@@ -3987,7 +4007,7 @@ void PG::_scan_snaps(ScrubMap &smap)
<< info.pgid
<< " oid " << hoid << " snaps in mapper: "
<< cur_snaps << ", oi: "
- << oi_snaps
+ << obj_snaps
<< "...repaired";
} else {
osd->clog->error() << "osd." << osd->whoami
@@ -3995,18 +4015,16 @@ void PG::_scan_snaps(ScrubMap &smap)
<< info.pgid
<< " oid " << hoid << " snaps missing in mapper"
<< ", should be: "
- << oi_snaps
+ << obj_snaps
<< "...repaired";
}
- snap_mapper.add_oid(hoid, oi_snaps, &_t);
+ snap_mapper.add_oid(hoid, obj_snaps, &_t);
r = osd->store->apply_transaction(osr.get(), std::move(t));
if (r != 0) {
derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
<< dendl;
}
}
- } else {
- o.nlinks = 1;
}
}
}
@@ -4189,22 +4207,34 @@ void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
{
if (cct->_conf->osd_scrub_sleep > 0 &&
(scrubber.state == PG::Scrubber::NEW_CHUNK ||
- scrubber.state == PG::Scrubber::INACTIVE)) {
+ scrubber.state == PG::Scrubber::INACTIVE) && scrubber.needs_sleep) {
+ ceph_assert(!scrubber.sleeping);
dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
- unlock();
- utime_t t;
- t.set_from_double(cct->_conf->osd_scrub_sleep);
- handle.suspend_tp_timeout();
- t.sleep();
- handle.reset_tp_timeout();
- lock();
- dout(20) << __func__ << " slept for " << t << dendl;
+ // Do an async sleep so we don't block the op queue
+ auto scrub_requeue_callback = new FunctionContext([this](int r) {
+ lock();
+ scrubber.sleeping = false;
+ scrubber.needs_sleep = false;
+ dout(20) << __func__ << " slept for "
+ << ceph_clock_now() - scrubber.sleep_start
+ << ", re-queuing scrub" << dendl;
+ scrub_queued = false;
+ requeue_scrub();
+ scrubber.sleep_start = utime_t();
+ unlock();
+ });
+ Mutex::Locker l(scrub_sleep_lock);
+ scrub_sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep, scrub_requeue_callback);
+ scrubber.sleeping = true;
+ scrubber.sleep_start = ceph_clock_now();
+ return;
}
if (pg_has_reset_since(queued)) {
return;
}
assert(scrub_queued);
scrub_queued = false;
+ scrubber.needs_sleep = true;
if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
dout(10) << "scrub -- not primary or active or not clean" << dendl;
@@ -4320,7 +4350,8 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
int ret;
while (!done) {
- dout(20) << "scrub state " << Scrubber::state_string(scrubber.state) << dendl;
+ dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
+ << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
switch (scrubber.state) {
case PG::Scrubber::INACTIVE:
@@ -4566,11 +4597,13 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
ceph_abort();
}
}
+ dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
+ << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
}
void PG::scrub_clear_state()
{
- assert(_lock.is_locked());
+ assert(is_locked());
state_clear(PG_STATE_SCRUBBING);
state_clear(PG_STATE_REPAIR);
state_clear(PG_STATE_DEEP_SCRUB);
@@ -4866,6 +4899,7 @@ void PG::share_pg_info()
pg_shard_t peer = *i;
if (peer_info.count(peer)) {
peer_info[peer].last_epoch_started = info.last_epoch_started;
+ peer_info[peer].last_interval_started = info.last_interval_started;
peer_info[peer].history.merge(info.history);
}
MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
@@ -4876,7 +4910,7 @@ void PG::share_pg_info()
get_osdmap()->get_epoch(),
get_osdmap()->get_epoch(),
info),
- pg_interval_map_t()));
+ PastIntervals()));
osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
}
}
@@ -4955,11 +4989,18 @@ void PG::merge_new_log_entries(
}
}
-void PG::update_history_from_master(pg_history_t new_history)
+void PG::update_history(const pg_history_t& new_history)
{
unreg_next_scrub();
- if (info.history.merge(new_history))
+ if (info.history.merge(new_history)) {
+ dout(20) << __func__ << " advanced history from " << new_history << dendl;
dirty_info = true;
+ if (info.history.last_epoch_clean >= info.history.same_interval_since) {
+ dout(20) << __func__ << " clearing past_intervals" << dendl;
+ past_intervals.clear();
+ dirty_big_info = true;
+ }
+ }
reg_next_scrub();
}
@@ -5045,7 +5086,7 @@ bool PG::should_restart_peering(
OSDMapRef lastmap,
OSDMapRef osdmap)
{
- if (pg_interval_t::is_new_interval(
+ if (PastIntervals::is_new_interval(
primary.osd,
newactingprimary,
acting,
@@ -5195,7 +5236,7 @@ void PG::start_peering_interval(
assert(info.history.same_interval_since != 0);
boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
get_is_recoverable_predicate());
- bool new_interval = pg_interval_t::check_new_interval(
+ bool new_interval = PastIntervals::check_new_interval(
old_acting_primary.osd,
new_acting_primary,
oldacting, newacting,
@@ -5213,7 +5254,15 @@ void PG::start_peering_interval(
dout(10) << __func__ << ": check_new_interval output: "
<< debug.str() << dendl;
if (new_interval) {
- dout(10) << " noting past " << past_intervals.rbegin()->second << dendl;
+ if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
+ info.history.last_epoch_clean < osdmap->get_epoch()) {
+ dout(10) << " map gap, clearing past_intervals and faking" << dendl;
+ // our information is incomplete and useless; if osdmaps were trimmed,
+ // someone else was clean after everything we know about.
+ past_intervals.clear();
+ } else {
+ dout(10) << " noting past " << past_intervals << dendl;
+ }
dirty_info = true;
dirty_big_info = true;
info.history.same_interval_since = osdmap->get_epoch();
@@ -5236,7 +5285,7 @@ void PG::start_peering_interval(
on_new_interval();
- dout(10) << " up " << oldup << " -> " << up
+ dout(1) << __func__ << " up " << oldup << " -> " << up
<< ", acting " << oldacting << " -> " << acting
<< ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
<< ", up_primary " << old_up_primary << " -> " << new_up_primary
@@ -5345,10 +5394,7 @@ void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
{
assert(!is_primary());
- unreg_next_scrub();
- if (info.history.merge(oinfo.history))
- dirty_info = true;
- reg_next_scrub();
+ update_history(oinfo.history);
if (last_complete_ondisk.epoch >= info.history.last_epoch_started) {
// DEBUG: verify that the snaps are empty in snap_mapper
@@ -5397,8 +5443,8 @@ ostream& operator<<(ostream& out, const PG& pg)
out << " lpr=" << pg.get_last_peering_reset();
if (!pg.past_intervals.empty()) {
- out << " pi=" << pg.past_intervals.begin()->first << "-" << pg.past_intervals.rbegin()->second.last
- << "/" << pg.past_intervals.size();
+ out << " pi=[" << pg.past_intervals.get_bounds()
+ << ")/" << pg.past_intervals.size();
}
if (pg.is_peered()) {
@@ -5452,7 +5498,7 @@ ostream& operator<<(ostream& out, const PG& pg)
if (pg.pg_log.get_missing().num_missing()) {
out << " m=" << pg.pg_log.get_missing().num_missing();
if (pg.is_primary()) {
- int unfound = pg.get_num_unfound();
+ uint64_t unfound = pg.get_num_unfound();
if (unfound)
out << " u=" << unfound;
}
@@ -5609,116 +5655,6 @@ bool PG::can_discard_request(OpRequestRef& op)
return true;
}
-bool PG::op_must_wait_for_map(epoch_t cur_epoch, OpRequestRef& op)
-{
- switch (op->get_req()->get_type()) {
- case CEPH_MSG_OSD_OP:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDOp*>(op->get_req())->get_map_epoch());
-
- case CEPH_MSG_OSD_BACKOFF:
- return false; // we don't care about maps
-
- case MSG_OSD_SUBOP:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDSubOp*>(op->get_req())->map_epoch);
-
- case MSG_OSD_REPOP:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDRepOp*>(op->get_req())->map_epoch);
-
- case MSG_OSD_SUBOPREPLY:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDSubOpReply*>(op->get_req())->map_epoch);
-
- case MSG_OSD_REPOPREPLY:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDRepOpReply*>(op->get_req())->map_epoch);
-
- case MSG_OSD_PG_SCAN:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDPGScan*>(op->get_req())->map_epoch);
-
- case MSG_OSD_PG_BACKFILL:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDPGBackfill*>(op->get_req())->map_epoch);
-
- case MSG_OSD_PG_BACKFILL_REMOVE:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDPGBackfillRemove*>(op->get_req())->map_epoch);
-
- case MSG_OSD_PG_PUSH:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDPGPush*>(op->get_req())->map_epoch);
-
- case MSG_OSD_PG_PULL:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDPGPull*>(op->get_req())->map_epoch);
-
- case MSG_OSD_PG_PUSH_REPLY:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDPGPushReply*>(op->get_req())->map_epoch);
-
- case MSG_OSD_EC_WRITE:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDECSubOpWrite*>(op->get_req())->map_epoch);
-
- case MSG_OSD_EC_WRITE_REPLY:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDECSubOpWriteReply*>(op->get_req())->map_epoch);
-
- case MSG_OSD_EC_READ:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDECSubOpRead*>(op->get_req())->map_epoch);
-
- case MSG_OSD_EC_READ_REPLY:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDECSubOpReadReply*>(op->get_req())->map_epoch);
-
- case MSG_OSD_REP_SCRUB:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDRepScrub*>(op->get_req())->map_epoch);
-
- case MSG_OSD_SCRUB_RESERVE:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDScrubReserve*>(op->get_req())->map_epoch);
-
- case MSG_OSD_REP_SCRUBMAP:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDRepScrubMap*>(op->get_req())->map_epoch);
-
- case MSG_OSD_PG_UPDATE_LOG_MISSING:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDPGUpdateLogMissing*>(op->get_req())->map_epoch);
-
- case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
- return !have_same_or_newer_map(
- cur_epoch,
- static_cast<const MOSDPGUpdateLogMissingReply*>(op->get_req())->map_epoch);
- }
- ceph_abort();
- return false;
-}
-
void PG::take_waiters()
{
dout(10) << "take_waiters" << dendl;
@@ -5792,6 +5728,7 @@ void PG::handle_advance_map(
<< dendl;
update_osdmap_ref(osdmap);
pool.update(osdmap);
+ past_intervals.update_type_from_map(pool.info.ec_pool(), *osdmap);
if (cct->_conf->osd_debug_verify_cached_snaps) {
interval_set<snapid_t> actual_removed_snaps;
const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
@@ -5884,15 +5821,6 @@ void PG::update_store_on_load()
}
}
-std::ostream& operator<<(std::ostream& oss,
- const struct PG::PriorSet &prior)
-{
- oss << "PriorSet[probe=" << prior.probe << " "
- << "down=" << prior.down << " "
- << "blocked_by=" << prior.blocked_by << "]";
- return oss;
-}
-
/*------------ Recovery State Machine----------------*/
#undef dout_prefix
#define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
@@ -5901,7 +5829,7 @@ std::ostream& operator<<(std::ostream& oss,
/*------Crashed-------*/
PG::RecoveryState::Crashed::Crashed(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Crashed")
+ NamedState(context< RecoveryMachine >().pg, "Crashed")
{
context< RecoveryMachine >().log_enter(state_name);
assert(0 == "we got a bad state machine event");
@@ -5911,7 +5839,7 @@ PG::RecoveryState::Crashed::Crashed(my_context ctx)
/*------Initial-------*/
PG::RecoveryState::Initial::Initial(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Initial")
+ NamedState(context< RecoveryMachine >().pg, "Initial")
{
context< RecoveryMachine >().log_enter(state_name);
}
@@ -5965,7 +5893,7 @@ void PG::RecoveryState::Initial::exit()
/*------Started-------*/
PG::RecoveryState::Started::Started(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started")
+ NamedState(context< RecoveryMachine >().pg, "Started")
{
context< RecoveryMachine >().log_enter(state_name);
}
@@ -6030,7 +5958,7 @@ void PG::RecoveryState::Started::exit()
/*--------Reset---------*/
PG::RecoveryState::Reset::Reset(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Reset")
+ NamedState(context< RecoveryMachine >().pg, "Reset")
{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -6061,10 +5989,6 @@ boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
PG *pg = context< RecoveryMachine >().pg;
ldout(pg->cct, 10) << "Reset advmap" << dendl;
- // make sure we have past_intervals filled in. hopefully this will happen
- // _before_ we are active.
- pg->generate_past_intervals();
-
pg->check_full_transition(advmap.lastmap, advmap.osdmap);
if (pg->should_restart_peering(
@@ -6083,6 +6007,7 @@ boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
context< RecoveryMachine >().get_cur_transaction());
}
pg->remove_down_peer_info(advmap.osdmap);
+ pg->check_past_interval_bounds();
return discard_event();
}
@@ -6126,7 +6051,7 @@ void PG::RecoveryState::Reset::exit()
/*-------Start---------*/
PG::RecoveryState::Start::Start(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Start")
+ NamedState(context< RecoveryMachine >().pg, "Start")
{
context< RecoveryMachine >().log_enter(state_name);
@@ -6151,7 +6076,7 @@ void PG::RecoveryState::Start::exit()
/*---------Primary--------*/
PG::RecoveryState::Primary::Primary(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary")
{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -6209,7 +6134,7 @@ void PG::RecoveryState::Primary::exit()
/*---------Peering--------*/
PG::RecoveryState::Peering::Peering(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering"),
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
history_les_bound(false)
{
context< RecoveryMachine >().log_enter(state_name);
@@ -6225,7 +6150,7 @@ boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap
{
PG *pg = context< RecoveryMachine >().pg;
ldout(pg->cct, 10) << "Peering advmap" << dendl;
- if (prior_set.get()->affected_by_map(advmap.osdmap, pg)) {
+ if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
post_event(advmap);
return transit< Reset >();
@@ -6245,35 +6170,29 @@ boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
q.f->dump_stream("enter_time") << enter_time;
q.f->open_array_section("past_intervals");
- for (map<epoch_t,pg_interval_t>::iterator p = pg->past_intervals.begin();
- p != pg->past_intervals.end();
- ++p) {
- q.f->open_object_section("past_interval");
- p->second.dump(q.f);
- q.f->close_section();
- }
+ pg->past_intervals.dump(q.f);
q.f->close_section();
q.f->open_array_section("probing_osds");
- for (set<pg_shard_t>::iterator p = prior_set->probe.begin();
- p != prior_set->probe.end();
+ for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
+ p != prior_set.probe.end();
++p)
q.f->dump_stream("osd") << *p;
q.f->close_section();
- if (prior_set->pg_down)
+ if (prior_set.pg_down)
q.f->dump_string("blocked", "peering is blocked due to down osds");
q.f->open_array_section("down_osds_we_would_probe");
- for (set<int>::iterator p = prior_set->down.begin();
- p != prior_set->down.end();
+ for (set<int>::iterator p = prior_set.down.begin();
+ p != prior_set.down.end();
++p)
q.f->dump_int("osd", *p);
q.f->close_section();
q.f->open_array_section("peering_blocked_by");
- for (map<int,epoch_t>::iterator p = prior_set->blocked_by.begin();
- p != prior_set->blocked_by.end();
+ for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
+ p != prior_set.blocked_by.end();
++p) {
q.f->open_object_section("osd");
q.f->dump_int("osd", p->first);
@@ -6311,7 +6230,7 @@ void PG::RecoveryState::Peering::exit()
/*------Backfilling-------*/
PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/Backfilling")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -6368,7 +6287,7 @@ void PG::RecoveryState::Backfilling::exit()
PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitRemoteBackfillReserved"),
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
{
context< RecoveryMachine >().log_enter(state_name);
@@ -6450,7 +6369,7 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationReje
/*--WaitLocalBackfillReserved--*/
PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitLocalBackfillReserved")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -6475,7 +6394,7 @@ void PG::RecoveryState::WaitLocalBackfillReserved::exit()
/*----NotBackfilling------*/
PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/NotBackfilling")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -6505,7 +6424,7 @@ void PG::RecoveryState::NotBackfilling::exit()
/*----NotRecovering------*/
PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/NotRecovering")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -6523,7 +6442,7 @@ void PG::RecoveryState::NotRecovering::exit()
/*---RepNotRecovering----*/
PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/ReplicaActive/RepNotRecovering")
+ NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
{
context< RecoveryMachine >().log_enter(state_name);
}
@@ -6539,7 +6458,7 @@ void PG::RecoveryState::RepNotRecovering::exit()
/*---RepWaitRecoveryReserved--*/
PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/ReplicaActive/RepWaitRecoveryReserved")
+ NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -6577,7 +6496,7 @@ void PG::RecoveryState::RepWaitRecoveryReserved::exit()
/*-RepWaitBackfillReserved*/
PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/ReplicaActive/RepWaitBackfillReserved")
+ NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
{
context< RecoveryMachine >().log_enter(state_name);
}
@@ -6659,7 +6578,7 @@ PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteReservationRejecte
/*---RepRecovering-------*/
PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/ReplicaActive/RepRecovering")
+ NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
{
context< RecoveryMachine >().log_enter(state_name);
}
@@ -6684,7 +6603,7 @@ void PG::RecoveryState::RepRecovering::exit()
/*------Activating--------*/
PG::RecoveryState::Activating::Activating(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/Activating")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
{
context< RecoveryMachine >().log_enter(state_name);
}
@@ -6699,7 +6618,7 @@ void PG::RecoveryState::Activating::exit()
PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitLocalRecoveryReserved")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -6741,7 +6660,7 @@ void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
{
context< RecoveryMachine >().log_enter(state_name);
@@ -6781,7 +6700,7 @@ void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
PG::RecoveryState::Recovering::Recovering(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/Recovering")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
{
context< RecoveryMachine >().log_enter(state_name);
@@ -6846,7 +6765,7 @@ void PG::RecoveryState::Recovering::exit()
PG::RecoveryState::Recovered::Recovered(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/Recovered")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
{
pg_shard_t auth_log_shard;
@@ -6866,6 +6785,9 @@ PG::RecoveryState::Recovered::Recovered(my_context ctx)
pg->publish_stats_to_osd();
}
+ // trim pglog on recovered
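+    // (editor's note, hedged) once all replicas have recovered, the full
+    // log is presumably no longer needed to roll peers forward, so it is
+    // safe to trim it down here rather than waiting for the next write.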
+ pg->trim_log();
+
// adjust acting set? (e.g. because backfill completed...)
bool history_les_bound = false;
if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
@@ -6886,7 +6808,7 @@ void PG::RecoveryState::Recovered::exit()
PG::RecoveryState::Clean::Clean(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/Clean")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
{
context< RecoveryMachine >().log_enter(state_name);
@@ -6931,7 +6853,7 @@ set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
/*---------Active---------*/
PG::RecoveryState::Active::Active(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active"),
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
remote_shards_to_reserve_recovery(
unique_osd_shard_set(
context< RecoveryMachine >().pg->pg_whoami,
@@ -7039,7 +6961,7 @@ boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
if (pg->cct->_conf->osd_check_for_log_corruption)
pg->check_log_for_corruption(pg->osd->store);
- int unfound = pg->missing_loc.num_unfound();
+ uint64_t unfound = pg->missing_loc.num_unfound();
if (unfound > 0 &&
pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
@@ -7204,6 +7126,7 @@ boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActi
// info.last_epoch_started is set during activate()
pg->info.history.last_epoch_started = pg->info.last_epoch_started;
+ pg->info.history.last_interval_started = pg->info.last_interval_started;
pg->dirty_info = true;
pg->share_pg_info();
@@ -7245,7 +7168,7 @@ void PG::RecoveryState::Active::exit()
/*------ReplicaActive-----*/
PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/ReplicaActive")
+ NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
{
context< RecoveryMachine >().log_enter(state_name);
@@ -7310,7 +7233,7 @@ boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MQuery&
{
PG *pg = context< RecoveryMachine >().pg;
if (query.query.type == pg_query_t::MISSING) {
- pg->update_history_from_master(query.query.history);
+ pg->update_history(query.query.history);
pg->fulfill_log(query.from, query.query, query.query_epoch);
} // else: from prior to activation, safe to ignore
return discard_event();
@@ -7337,7 +7260,7 @@ void PG::RecoveryState::ReplicaActive::exit()
/*-------Stray---*/
PG::RecoveryState::Stray::Stray(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Stray")
+ NamedState(context< RecoveryMachine >().pg, "Started/Stray")
{
context< RecoveryMachine >().log_enter(state_name);
@@ -7405,7 +7328,7 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
PG *pg = context< RecoveryMachine >().pg;
if (query.query.type == pg_query_t::INFO) {
pair<pg_shard_t, pg_info_t> notify_info;
- pg->update_history_from_master(query.query.history);
+ pg->update_history(query.query.history);
pg->fulfill_info(query.from, query.query, notify_info);
context< RecoveryMachine >().send_notify(
notify_info.first,
@@ -7449,22 +7372,21 @@ void PG::RecoveryState::Stray::exit()
/*--------GetInfo---------*/
PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/GetInfo")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
- pg->generate_past_intervals();
- unique_ptr<PriorSet> &prior_set = context< Peering >().prior_set;
+ pg->check_past_interval_bounds();
+ PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
assert(pg->blocked_by.empty());
- if (!prior_set.get())
- pg->build_prior(prior_set);
+ prior_set = pg->build_prior();
pg->reset_min_peer_features();
get_infos();
- if (prior_set->pg_down) {
+ if (prior_set.pg_down) {
post_event(IsDown());
} else if (peer_info_requested.empty()) {
post_event(GotInfo());
@@ -7474,11 +7396,11 @@ PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
void PG::RecoveryState::GetInfo::get_infos()
{
PG *pg = context< RecoveryMachine >().pg;
- unique_ptr<PriorSet> &prior_set = context< Peering >().prior_set;
+ PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
pg->blocked_by.clear();
- for (set<pg_shard_t>::const_iterator it = prior_set->probe.begin();
- it != prior_set->probe.end();
+ for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
+ it != prior_set.probe.end();
++it) {
pg_shard_t peer = *it;
if (peer == pg->pg_whoami) {
@@ -7522,17 +7444,17 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
if (pg->proc_replica_info(
infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
// we got something new ...
- unique_ptr<PriorSet> &prior_set = context< Peering >().prior_set;
+ PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
if (old_start < pg->info.history.last_epoch_started) {
ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
- pg->build_prior(prior_set);
+ prior_set = pg->build_prior();
// filter out any osds that got dropped from the probe set from
// peer_info_requested. this is less expensive than restarting
// peering (which would re-probe everyone).
set<pg_shard_t>::iterator p = peer_info_requested.begin();
while (p != peer_info_requested.end()) {
- if (prior_set->probe.count(*p) == 0) {
+ if (prior_set.probe.count(*p) == 0) {
ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
peer_info_requested.erase(p++);
} else {
@@ -7546,62 +7468,7 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in
pg->apply_peer_features(infoevt.features);
// are we done getting everything?
- if (peer_info_requested.empty() && !prior_set->pg_down) {
- /*
- * make sure we have at least one !incomplete() osd from the
- * last rw interval. the incomplete (backfilling) replicas
- * get a copy of the log, but they don't get all the object
- * updates, so they are insufficient to recover changes during
- * that interval.
- */
- if (pg->info.history.last_epoch_started) {
- for (map<epoch_t,pg_interval_t>::reverse_iterator p = pg->past_intervals.rbegin();
- p != pg->past_intervals.rend();
- ++p) {
- if (p->first < pg->info.history.last_epoch_started)
- break;
- if (!p->second.maybe_went_rw)
- continue;
- pg_interval_t& interval = p->second;
- ldout(pg->cct, 10) << " last maybe_went_rw interval was " << interval << dendl;
- OSDMapRef osdmap = pg->get_osdmap();
-
- /*
- * this mirrors the PriorSet calculation: we wait if we
- * don't have an up (AND !incomplete) node AND there are
- * nodes down that might be usable.
- */
- bool any_up_complete_now = false;
- bool any_down_now = false;
- for (unsigned i=0; i<interval.acting.size(); i++) {
- int o = interval.acting[i];
- if (o == CRUSH_ITEM_NONE)
- continue;
- pg_shard_t so(o, pg->pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD);
- if (!osdmap->exists(o) || osdmap->get_info(o).lost_at > interval.first)
- continue; // dne or lost
- if (osdmap->is_up(o)) {
- pg_info_t *pinfo;
- if (so == pg->pg_whoami) {
- pinfo = &pg->info;
- } else {
- assert(pg->peer_info.count(so));
- pinfo = &pg->peer_info[so];
- }
- if (!pinfo->is_incomplete())
- any_up_complete_now = true;
- } else {
- any_down_now = true;
- }
- }
- if (!any_up_complete_now && any_down_now) {
- ldout(pg->cct, 10) << " no osds up+complete from interval " << interval << dendl;
- post_event(IsDown());
- return discard_event();
- }
- break;
- }
- }
+ if (peer_info_requested.empty() && !prior_set.pg_down) {
ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
@@ -7651,7 +7518,7 @@ void PG::RecoveryState::GetInfo::exit()
PG::RecoveryState::GetLog::GetLog(my_context ctx)
: my_base(ctx),
NamedState(
- context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/GetLog"),
+ context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
msg(0)
{
context< RecoveryMachine >().log_enter(state_name);
@@ -7785,7 +7652,7 @@ void PG::RecoveryState::GetLog::exit()
/*------WaitActingChange--------*/
PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/WaitActingChange")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitActingChange")
{
context< RecoveryMachine >().log_enter(state_name);
}
@@ -7848,7 +7715,7 @@ void PG::RecoveryState::WaitActingChange::exit()
/*------Down--------*/
PG::RecoveryState::Down::Down(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/Down")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -7856,9 +7723,9 @@ PG::RecoveryState::Down::Down(my_context ctx)
pg->state_clear(PG_STATE_PEERING);
pg->state_set(PG_STATE_DOWN);
- unique_ptr<PriorSet> &prior_set = context< Peering >().prior_set;
+ auto &prior_set = context< Peering >().prior_set;
assert(pg->blocked_by.empty());
- pg->blocked_by.insert(prior_set->down.begin(), prior_set->down.end());
+ pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
pg->publish_stats_to_osd();
}
@@ -7889,7 +7756,7 @@ boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
/*------Incomplete--------*/
PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/Incomplete")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
@@ -7897,9 +7764,9 @@ PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
pg->state_clear(PG_STATE_PEERING);
pg->state_set(PG_STATE_INCOMPLETE);
- unique_ptr<PriorSet> &prior_set = context< Peering >().prior_set;
+ PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
assert(pg->blocked_by.empty());
- pg->blocked_by.insert(prior_set->down.begin(), prior_set->down.end());
+ pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
pg->publish_stats_to_osd();
}
@@ -7956,12 +7823,13 @@ void PG::RecoveryState::Incomplete::exit()
/*------GetMissing--------*/
PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/GetMissing")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
{
context< RecoveryMachine >().log_enter(state_name);
PG *pg = context< RecoveryMachine >().pg;
assert(!pg->actingbackfill.empty());
+ eversion_t since;
for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
i != pg->actingbackfill.end();
++i) {
@@ -7995,7 +7863,7 @@ PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
// We pull the log from the peer's last_epoch_started to ensure we
// get enough log to detect divergent updates.
- eversion_t since(pi.last_epoch_started, 0);
+ since.epoch = pi.last_epoch_started;
assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
if (pi.log_tail <= since) {
ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
@@ -8095,7 +7963,7 @@ void PG::RecoveryState::GetMissing::exit()
/*------WaitUpThru--------*/
PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
: my_base(ctx),
- NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Peering/WaitUpThru")
+ NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
{
context< RecoveryMachine >().log_enter(state_name);
}
@@ -8163,169 +8031,6 @@ void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_
#undef dout_prefix
#define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
-PG::PriorSet::PriorSet(CephContext* cct,
- bool ec_pool,
- IsPGRecoverablePredicate *c,
- const OSDMap &osdmap,
- const map<epoch_t, pg_interval_t> &past_intervals,
- const vector<int> &up,
- const vector<int> &acting,
- const pg_info_t &info,
- const PG *debug_pg)
- : cct(cct), ec_pool(ec_pool), pg_down(false), pcontdec(c)
-{
- // Include current acting and up nodes... not because they may
- // contain old data (this interval hasn't gone active, obviously),
- // but because we want their pg_info to inform choose_acting(), and
- // so that we know what they do/do not have explicitly before
- // sending them any new info/logs/whatever.
- for (unsigned i = 0; i < acting.size(); i++) {
- if (acting[i] != CRUSH_ITEM_NONE)
- probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
- }
- // It may be possible to exclude the up nodes, but let's keep them in
- // there for now.
- for (unsigned i = 0; i < up.size(); i++) {
- if (up[i] != CRUSH_ITEM_NONE)
- probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
- }
-
- for (map<epoch_t,pg_interval_t>::const_reverse_iterator p = past_intervals.rbegin();
- p != past_intervals.rend();
- ++p) {
- const pg_interval_t &interval = p->second;
- dout(10) << "build_prior " << interval << dendl;
-
- if (interval.last < info.history.last_epoch_started)
- break; // we don't care
-
- if (interval.acting.empty())
- continue;
-
- if (!interval.maybe_went_rw)
- continue;
-
- // look at candidate osds during this interval. each falls into
- // one of three categories: up, down (but potentially
- // interesting), or lost (down, but we won't wait for it).
- set<pg_shard_t> up_now;
- bool any_down_now = false; // any candidates down now (that might have useful data)
-
- // consider ACTING osds
- for (unsigned i = 0; i < interval.acting.size(); i++) {
- int o = interval.acting[i];
- if (o == CRUSH_ITEM_NONE)
- continue;
- pg_shard_t so(o, ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD);
-
- const osd_info_t *pinfo = 0;
- if (osdmap.exists(o))
- pinfo = &osdmap.get_info(o);
-
- if (osdmap.is_up(o)) {
- // include past acting osds if they are up.
- probe.insert(so);
- up_now.insert(so);
- } else if (!pinfo) {
- dout(10) << "build_prior prior osd." << o << " no longer exists" << dendl;
- down.insert(o);
- } else if (pinfo->lost_at > interval.first) {
- dout(10) << "build_prior prior osd." << o << " is down, but lost_at " << pinfo->lost_at << dendl;
- up_now.insert(so);
- down.insert(o);
- } else {
- dout(10) << "build_prior prior osd." << o << " is down" << dendl;
- down.insert(o);
- any_down_now = true;
- }
- }
-
- // if not enough osds survived this interval, and we may have gone rw,
- // then we need to wait for one of those osds to recover to
- // ensure that we haven't lost any information.
- if (!(*pcontdec)(up_now) && any_down_now) {
- // fixme: how do we identify a "clean" shutdown anyway?
- dout(10) << "build_prior possibly went active+rw, insufficient up;"
- << " including down osds" << dendl;
- for (vector<int>::const_iterator i = interval.acting.begin();
- i != interval.acting.end();
- ++i) {
- if (osdmap.exists(*i) && // if it doesn't exist, we already consider it lost.
- osdmap.is_down(*i)) {
- pg_down = true;
-
- // make note of when any down osd in the cur set was lost, so that
- // we can notice changes in prior_set_affected.
- blocked_by[*i] = osdmap.get_info(*i).lost_at;
- }
- }
- }
- }
-
- dout(10) << "build_prior final: probe " << probe
- << " down " << down
- << " blocked_by " << blocked_by
- << (pg_down ? " pg_down":"")
- << dendl;
-}
-
-// true if the given map affects the prior set
-bool PG::PriorSet::affected_by_map(const OSDMapRef osdmap, const PG *debug_pg) const
-{
- for (set<pg_shard_t>::iterator p = probe.begin();
- p != probe.end();
- ++p) {
- int o = p->osd;
-
- // did someone in the prior set go down?
- if (osdmap->is_down(o) && down.count(o) == 0) {
- dout(10) << "affected_by_map osd." << o << " now down" << dendl;
- return true;
- }
-
- // did a down osd in cur get (re)marked as lost?
- map<int, epoch_t>::const_iterator r = blocked_by.find(o);
- if (r != blocked_by.end()) {
- if (!osdmap->exists(o)) {
- dout(10) << "affected_by_map osd." << o << " no longer exists" << dendl;
- return true;
- }
- if (osdmap->get_info(o).lost_at != r->second) {
- dout(10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
- return true;
- }
- }
- }
-
- // did someone in the prior down set go up?
- for (set<int>::const_iterator p = down.begin();
- p != down.end();
- ++p) {
- int o = *p;
-
- if (osdmap->is_up(o)) {
- dout(10) << "affected_by_map osd." << o << " now up" << dendl;
- return true;
- }
-
- // did someone in the prior set get lost or destroyed?
- if (!osdmap->exists(o)) {
- dout(10) << "affected_by_map osd." << o << " no longer exists" << dendl;
- return true;
- }
- // did a down osd in down get (re)marked as lost?
- map<int, epoch_t>::const_iterator r = blocked_by.find(o);
- if (r != blocked_by.end()) {
- if (osdmap->get_info(o).lost_at != r->second) {
- dout(10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
- return true;
- }
- }
- }
-
- return false;
-}
-
void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
assert(!rctx);
assert(!orig_ctx);
diff --git a/src/osd/PG.h b/src/osd/PG.h
index ea96417d5f8..0f9282a44a4 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -23,6 +23,7 @@
#include <boost/statechart/transition.hpp>
#include <boost/statechart/event_base.hpp>
#include <boost/scoped_ptr.hpp>
+#include <boost/circular_buffer.hpp>
#include "include/memory.h"
#include "include/mempool.h"
@@ -35,6 +36,7 @@
#include "include/xlist.h"
#include "SnapMapper.h"
#include "Session.h"
+#include "common/Timer.h"
#include "PGLog.h"
#include "OSDMap.h"
@@ -45,13 +47,14 @@
#include <atomic>
#include <list>
#include <memory>
+#include <stack>
#include <string>
+#include <tuple>
using namespace std;
// #include "include/unordered_map.h"
// #include "include/unordered_set.h"
-
//#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs
class OSD;
@@ -74,6 +77,59 @@ namespace Scrub {
void intrusive_ptr_add_ref(PG *pg);
void intrusive_ptr_release(PG *pg);
+using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
+using embedded_state = std::pair<utime_t, const char*>;
+
+struct PGStateInstance {
+ // Time spent in pg states
+
+ void setepoch(const epoch_t current_epoch) {
+ this_epoch = current_epoch;
+ }
+
+ void enter_state(const utime_t entime, const char* state) {
+ embedded_states.push(std::make_pair(entime, state));
+ }
+
+ void exit_state(const utime_t extime) {
+ embedded_state this_state = embedded_states.top();
+ state_history.push_back(state_history_entry{
+ this_state.first, extime, this_state.second});
+ embedded_states.pop();
+ }
+
+ epoch_t this_epoch;
+ utime_t enter_time;
+ std::vector<state_history_entry> state_history;
+ std::stack<embedded_state> embedded_states;
+};
+
+class PGStateHistory {
+ // Member access protected with the PG lock
+public:
+ PGStateHistory() : buffer(10) {}
+
+ void enter(PG* pg, const utime_t entime, const char* state);
+
+ void exit(const char* state);
+
+ void reset() {
+ pi = nullptr;
+ }
+
+ void set_pg_in_destructor() { pg_in_destructor = true; }
+
+ void dump(Formatter* f) const;
+
+private:
+ bool pg_in_destructor = false;
+ PG* thispg = nullptr;
+ std::unique_ptr<PGStateInstance> tmppi;
+ PGStateInstance* pi = nullptr;
+ boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;
+
+};
+
#ifdef PG_DEBUG_REFS
#include "common/tracked_int_ptr.hpp"
uint64_t get_with_id(PG *pg);
@@ -194,7 +250,7 @@ struct PGPool {
*
*/
-class PG : protected DoutPrefixProvider {
+class PG : public DoutPrefixProvider {
protected:
OSDService *osd;
CephContext *cct;
@@ -255,6 +311,7 @@ protected:
public:
bool deleting; // true while in removing or OSD is shutting down
+ ZTracer::Endpoint trace_endpoint;
void lock_suspend_timeout(ThreadPool::TPHandle &handle);
void lock(bool no_lockdep = false) const;
@@ -287,7 +344,8 @@ public:
pg_info_t info; ///< current pg info
pg_info_t last_written_info; ///< last written info
__u8 info_struct_v;
- static const __u8 cur_struct_v = 9;
+ static const __u8 cur_struct_v = 10;
+ // v10 is the new past_intervals encoding
// v9 was fastinfo_key addition
// v8 was the move to a per-pg pgmeta object
// v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
@@ -491,7 +549,7 @@ public:
}
} missing_loc;
- map<epoch_t,pg_interval_t> past_intervals;
+ PastIntervals past_intervals;
interval_set<snapid_t> snap_trimq;
@@ -551,45 +609,19 @@ public:
set<int> blocked_by; ///< osds we are blocked by (for pg stats)
// [primary only] content recovery state
- protected:
- struct PriorSet {
- CephContext* cct;
- const bool ec_pool;
- set<pg_shard_t> probe; /// current+prior OSDs we need to probe.
- set<int> down; /// down osds that would normally be in @a probe and might be interesting.
- map<int, epoch_t> blocked_by; /// current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
-
- bool pg_down; /// some down osds are included in @a cur; the DOWN pg state bit should be set.
- boost::scoped_ptr<IsPGRecoverablePredicate> pcontdec;
- PriorSet(CephContext* cct,
- bool ec_pool,
- IsPGRecoverablePredicate *c,
- const OSDMap &osdmap,
- const map<epoch_t, pg_interval_t> &past_intervals,
- const vector<int> &up,
- const vector<int> &acting,
- const pg_info_t &info,
- const PG *debug_pg = nullptr);
-
- bool affected_by_map(const OSDMapRef osdmap, const PG *debug_pg=0) const;
- };
-
- friend std::ostream& operator<<(std::ostream& oss,
- const struct PriorSet &prior);
-
public:
struct BufferedRecoveryMessages {
map<int, map<spg_t, pg_query_t> > query_map;
- map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > info_map;
- map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > notify_list;
+ map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
+ map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
};
struct RecoveryCtx {
utime_t start_time;
map<int, map<spg_t, pg_query_t> > *query_map;
- map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map;
- map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *notify_list;
+ map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
+ map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
set<PGRef> created_pgs;
C_Contexts *on_applied;
C_Contexts *on_safe;
@@ -597,9 +629,9 @@ public:
ThreadPool::TPHandle* handle;
RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
map<int,
- vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map,
+ vector<pair<pg_notify_t, PastIntervals> > > *info_map,
map<int,
- vector<pair<pg_notify_t, pg_interval_map_t> > > *notify_list,
+ vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
C_Contexts *on_applied,
C_Contexts *on_safe,
ObjectStore::Transaction *transaction)
@@ -633,20 +665,20 @@ public:
omap[j->first] = j->second;
}
}
- for (map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator i
+ for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
= m.info_map.begin();
i != m.info_map.end();
++i) {
- vector<pair<pg_notify_t, pg_interval_map_t> > &ovec =
+ vector<pair<pg_notify_t, PastIntervals> > &ovec =
(*info_map)[i->first];
ovec.reserve(ovec.size() + i->second.size());
ovec.insert(ovec.end(), i->second.begin(), i->second.end());
}
- for (map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator i
+ for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
= m.notify_list.begin();
i != m.notify_list.end();
++i) {
- vector<pair<pg_notify_t, pg_interval_map_t> > &ovec =
+ vector<pair<pg_notify_t, PastIntervals> > &ovec =
(*notify_list)[i->first];
ovec.reserve(ovec.size() + i->second.size());
ovec.insert(ovec.end(), i->second.begin(), i->second.end());
@@ -654,14 +686,19 @@ public:
}
};
+
+ PGStateHistory pgstate_history;
+
struct NamedState {
const char *state_name;
utime_t enter_time;
+ PG* pg;
const char *get_state_name() { return state_name; }
- NamedState(CephContext *cct_, const char *state_name_)
- : state_name(state_name_),
- enter_time(ceph_clock_now()) {}
- virtual ~NamedState() {}
+ NamedState(PG *pg_, const char *state_name_)
+ : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
+ pg->pgstate_history.enter(pg, enter_time, state_name);
+ }
+ virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
};
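+  // (editor's sketch, hedged) every recovery state machine state derives
+  // from NamedState, so its constructor/destructor bracket the state's
+  // lifetime and feed pgstate_history automatically:
+  //
+  //   { NamedState s(pg, "Started/Primary/Peering"); // records enter
+  //     ...                                          // state body runs
+  //   }                                              // dtor records exit
+  //
+  // PGStateHistory's circular buffer (sized 10 above) then keeps only
+  // the most recently completed state instances for dumping.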
@@ -933,10 +970,21 @@ public:
void mark_clean(); ///< mark an active pg clean
- bool _calc_past_interval_range(epoch_t *start, epoch_t *end, epoch_t oldest_map);
- void generate_past_intervals();
- void trim_past_intervals();
- void build_prior(std::unique_ptr<PriorSet> &prior_set);
+ /// return [start,end) bounds for required past_intervals
+ static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
+ const pg_info_t &info,
+ epoch_t oldest_map) {
+ epoch_t start = MAX(
+ info.history.last_epoch_clean ? info.history.last_epoch_clean :
+ info.history.epoch_pool_created,
+ oldest_map);
+ epoch_t end = MAX(
+ info.history.same_interval_since,
+ info.history.epoch_pool_created);
+ return make_pair(start, end);
+ }
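+  // (editor's worked example, illustrative values only) with
+  // last_epoch_clean=500, epoch_pool_created=100, oldest_map=450 and
+  // same_interval_since=520 this yields
+  //   start = MAX(500, 450) = 500
+  //   end   = MAX(520, 100) = 520
+  // i.e. past_intervals must cover [500, 520).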
+ void check_past_interval_bounds() const;
+ PastIntervals::PriorSet build_prior();
void remove_down_peer_info(const OSDMapRef osdmap);
@@ -1032,7 +1080,6 @@ public:
const vector<int> &up,
pg_shard_t up_primary,
const map<pg_shard_t, pg_info_t> &all_info,
- bool compat_mode,
bool restrict_to_up_acting,
vector<int> *want,
set<pg_shard_t> *backfill,
@@ -1047,7 +1094,6 @@ public:
const vector<int> &up,
pg_shard_t up_primary,
const map<pg_shard_t, pg_info_t> &all_info,
- bool compat_mode,
bool restrict_to_up_acting,
vector<int> *want,
set<pg_shard_t> *backfill,
@@ -1064,7 +1110,7 @@ public:
list<Context*>& tfin,
map<int, map<spg_t,pg_query_t> >& query_map,
map<int,
- vector<pair<pg_notify_t, pg_interval_map_t> > > *activator_map,
+ vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
RecoveryCtx *ctx);
void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
void all_activated_and_committed();
@@ -1074,7 +1120,7 @@ public:
bool have_unfound() const {
return missing_loc.have_unfound();
}
- int get_num_unfound() const {
+ uint64_t get_num_unfound() const {
return missing_loc.num_unfound();
}
@@ -1156,6 +1202,11 @@ public:
OpRequestRef active_rep_scrub;
utime_t scrub_reg_stamp; // stamp we registered for
+ // For async sleep
+ bool sleeping = false;
+ bool needs_sleep = true;
+ utime_t sleep_start;
+
// flags to indicate explicitly requested scrubs (by admin)
bool must_scrub, must_deep_scrub, must_repair;
@@ -1270,6 +1321,9 @@ public:
authoritative.clear();
num_digest_updates_pending = 0;
cleaned_meta_map = ScrubMap();
+ sleeping = false;
+ needs_sleep = true;
+ sleep_start = utime_t();
}
void create_results(const hobject_t& obj);
@@ -1573,7 +1627,7 @@ public:
return state->rctx->query_map;
}
- map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *get_info_map() {
+ map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
assert(state->rctx);
assert(state->rctx->info_map);
return state->rctx->info_map;
@@ -1594,7 +1648,7 @@ public:
RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
void send_notify(pg_shard_t to,
- const pg_notify_t &info, const pg_interval_map_t &pi) {
+ const pg_notify_t &info, const PastIntervals &pi) {
assert(state->rctx);
assert(state->rctx->notify_list);
(*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
@@ -1744,7 +1798,7 @@ public:
struct Active;
struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
- std::unique_ptr< PriorSet > prior_set;
+ PastIntervals::PriorSet prior_set;
bool history_les_bound; //< need osd_find_best_info_ignore_history_les
explicit Peering(my_context ctx);
@@ -2143,6 +2197,9 @@ public:
epoch_t last_epoch;
+ Mutex scrub_sleep_lock;
+ SafeTimer scrub_sleep_timer;
+
public:
const spg_t& get_pgid() const { return pg_id; }
@@ -2242,7 +2299,7 @@ public:
const vector<int>& acting,
int acting_primary,
const pg_history_t& history,
- const pg_interval_map_t& pim,
+ const PastIntervals& pim,
bool backfill,
ObjectStore::Transaction *t);
@@ -2266,7 +2323,7 @@ public:
epoch_t epoch,
pg_info_t &info,
pg_info_t &last_written_info,
- map<epoch_t,pg_interval_t> &past_intervals,
+ PastIntervals &past_intervals,
bool dirty_big_info,
bool dirty_epoch,
bool try_fast_info,
@@ -2298,12 +2355,12 @@ public:
ObjectStore::Transaction &t,
bool transaction_applied = true);
bool check_log_for_corruption(ObjectStore *store);
- void trim_peers();
+ void trim_log();
std::string get_corrupt_pg_log_name() const;
static int read_info(
ObjectStore *store, spg_t pgid, const coll_t &coll,
- bufferlist &bl, pg_info_t &info, map<epoch_t,pg_interval_t> &past_intervals,
+ bufferlist &bl, pg_info_t &info, PastIntervals &past_intervals,
__u8 &);
void read_state(ObjectStore *store, bufferlist &bl);
static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
@@ -2357,7 +2414,7 @@ public:
return deleting || e < get_last_peering_reset();
}
- void update_history_from_master(pg_history_t new_history);
+ void update_history(const pg_history_t& history);
void fulfill_info(pg_shard_t from, const pg_query_t &query,
pair<pg_shard_t, pg_info_t> &notify_info);
void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
@@ -2381,8 +2438,6 @@ public:
template<typename T, int MSGTYPE>
bool can_discard_replica_op(OpRequestRef& op);
- static bool op_must_wait_for_map(epoch_t cur_epoch, OpRequestRef& op);
-
bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
bool old_peering_evt(CephPeeringEvtRef evt) {
return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index 66bb890af01..b50f0d8c78c 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -132,6 +132,8 @@ typedef ceph::shared_ptr<const OSDMap> OSDMapRef;
OpRequestRef op = OpRequestRef()
) = 0;
virtual epoch_t get_epoch() const = 0;
+ virtual epoch_t get_interval_start_epoch() const = 0;
+ virtual epoch_t get_last_peering_reset_epoch() const = 0;
virtual const set<pg_shard_t> &get_actingbackfill_shards() const = 0;
virtual const set<pg_shard_t> &get_acting_shards() const = 0;
diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h
index e4d0fa304ab..d4dfc450a42 100644
--- a/src/osd/PGLog.h
+++ b/src/osd/PGLog.h
@@ -440,6 +440,9 @@ public:
assert(get_can_rollback_to() == head);
}
+ // make sure our buffers don't pin bigger buffers
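+  // (editor's note, hedged) a log entry's mod_desc can hold small
+  // bufferlist fragments that still reference the much larger message
+  // buffer they were decoded from; trimming presumably copies them out
+  // so the big buffer can be freed promptly.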
+ e.mod_desc.trim_bl();
+
// add to log
log.push_back(e);
diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
index 427571551f0..8c4f08c7090 100644
--- a/src/osd/PrimaryLogPG.cc
+++ b/src/osd/PrimaryLogPG.cc
@@ -333,18 +333,27 @@ void PrimaryLogPG::on_local_recover(
ObjectRecoveryInfo recovery_info(_recovery_info);
clear_object_snap_mapping(t, hoid);
- if (recovery_info.soid.snap < CEPH_NOSNAP) {
- assert(recovery_info.oi.snaps.size());
+ if (recovery_info.soid.is_snap()) {
OSDriver::OSTransaction _t(osdriver.get_transaction(t));
- set<snapid_t> snaps(
- recovery_info.oi.snaps.begin(),
- recovery_info.oi.snaps.end());
+ set<snapid_t> snaps;
+ dout(20) << " snapset " << recovery_info.ss
+ << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
+ if (recovery_info.ss.is_legacy() ||
+ recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
+ assert(recovery_info.oi.legacy_snaps.size());
+ snaps.insert(recovery_info.oi.legacy_snaps.begin(),
+ recovery_info.oi.legacy_snaps.end());
+ } else {
+ auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
+ assert(p != recovery_info.ss.clone_snaps.end()); // hmm, should we warn?
+ snaps.insert(p->second.begin(), p->second.end());
+ }
+ dout(20) << " snaps " << snaps << dendl;
snap_mapper.add_oid(
recovery_info.soid,
snaps,
&_t);
}
-
if (pg_log.get_missing().is_missing(recovery_info.soid) &&
pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
assert(is_primary());
@@ -428,7 +437,6 @@ void PrimaryLogPG::on_local_recover(
// update pg
dirty_info = true;
write_if_dirty(*t);
-
}
void PrimaryLogPG::on_global_recover(
@@ -677,6 +685,48 @@ void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef o
op->mark_delayed("waiting for blocked object");
}
+void PrimaryLogPG::maybe_force_recovery()
+{
+  // no force if not in degraded/recovery/backfill states
+ if (!is_degraded() &&
+ !state_test(PG_STATE_RECOVERING |
+ PG_STATE_RECOVERY_WAIT |
+ PG_STATE_BACKFILL |
+ PG_STATE_BACKFILL_WAIT |
+ PG_STATE_BACKFILL_TOOFULL))
+ return;
+
+ if (pg_log.get_log().approx_size() <
+ cct->_conf->osd_max_pg_log_entries *
+ cct->_conf->osd_force_recovery_pg_log_entries_factor)
+ return;
+
+ // find the oldest missing object
+ version_t min_version = 0;
+ hobject_t soid;
+ if (!pg_log.get_missing().get_items().empty()) {
+ min_version = pg_log.get_missing().get_rmissing().begin()->first;
+ soid = pg_log.get_missing().get_rmissing().begin()->second;
+ }
+ assert(!actingbackfill.empty());
+ for (set<pg_shard_t>::iterator it = actingbackfill.begin();
+ it != actingbackfill.end();
+ ++it) {
+ if (*it == get_primary()) continue;
+ pg_shard_t peer = *it;
+ if (peer_missing.count(peer) &&
+ !peer_missing[peer].get_items().empty() &&
+ min_version > peer_missing[peer].get_rmissing().begin()->first) {
+ min_version = peer_missing[peer].get_rmissing().begin()->first;
+ soid = peer_missing[peer].get_rmissing().begin()->second;
+ }
+ }
+
+ // recover it
+ if (soid != hobject_t())
+ maybe_kick_recovery(soid);
+}
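+// (editor's worked example; the config defaults below are assumptions,
+// not taken from this diff) with osd_max_pg_log_entries = 10000 and
+// osd_force_recovery_pg_log_entries_factor = 1.3, recovery of the
+// oldest missing object is forced once the pg log exceeds ~13000
+// entries, bounding log growth under a steady client write load.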
+
class PGLSPlainFilter : public PGLSFilter {
string val;
public:
@@ -942,7 +992,7 @@ int PrimaryLogPG::do_command(
return -EROFS;
}
- int unfound = missing_loc.num_unfound();
+ uint64_t unfound = missing_loc.num_unfound();
if (!unfound) {
ss << "pg has no unfound objects";
return 0; // make command idempotent
@@ -1554,6 +1604,10 @@ void PrimaryLogPG::do_request(
OpRequestRef& op,
ThreadPool::TPHandle &handle)
{
+ if (op->osd_trace) {
+ op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
+ op->pg_trace.event("do request");
+ }
// make sure we have a new enough map
auto p = waiting_for_map.find(op->get_source());
if (p != waiting_for_map.end()) {
@@ -1564,9 +1618,9 @@ void PrimaryLogPG::do_request(
op->mark_delayed("waiting_for_map not empty");
return;
}
- if (op_must_wait_for_map(get_osdmap()->get_epoch(), op)) {
- dout(20) << __func__ << " queue on waiting_for_map "
- << op->get_source() << dendl;
+ if (!have_same_or_newer_map(op->min_epoch)) {
+ dout(20) << __func__ << " min " << op->min_epoch
+ << ", queue on waiting_for_map " << op->get_source() << dendl;
waiting_for_map[op->get_source()].push_back(op);
op->mark_delayed("op must wait for map");
return;
@@ -2238,6 +2292,9 @@ void PrimaryLogPG::do_op(OpRequestRef& op)
} else if (op->may_write() || op->may_cache()) {
osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
}
+
+  // force recovery of the oldest missing object if there are too many log entries
+ maybe_force_recovery();
}
void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
@@ -2999,6 +3056,7 @@ void PrimaryLogPG::execute_ctx(OpContext *ctx)
// client specified snapc
ctx->snapc.seq = m->get_snap_seq();
ctx->snapc.snaps = m->get_snaps();
+ filter_snapc(ctx->snapc.snaps);
}
if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
ctx->snapc.seq < obc->ssc->snapset.seq) {
@@ -3429,11 +3487,7 @@ void PrimaryLogPG::do_backfill(OpRequestRef op)
assert(cct->_conf->osd_kill_backfill_at != 2);
info.set_last_backfill(m->last_backfill);
- if (m->compat_stat_sum) {
- info.stats.stats = m->stats.stats; // Previously, we only sent sum
- } else {
- info.stats = m->stats;
- }
+ info.stats = m->stats;
ObjectStore::Transaction t;
dirty_info = true;
@@ -3470,7 +3524,8 @@ void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
assert(r == 0);
}
-PrimaryLogPG::OpContextUPtr PrimaryLogPG::trim_object(bool first, const hobject_t &coid)
+PrimaryLogPG::OpContextUPtr PrimaryLogPG::trim_object(
+ bool first, const hobject_t &coid)
{
// load clone info
bufferlist bl;
@@ -3488,15 +3543,30 @@ PrimaryLogPG::OpContextUPtr PrimaryLogPG::trim_object(bool first, const hobject_
ObjectContextRef snapset_obc = get_object_context(snapoid, false);
assert(snapset_obc);
+ SnapSet& snapset = obc->ssc->snapset;
+
+ bool legacy = snapset.is_legacy() ||
+ !get_osdmap()->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS);
+
object_info_t &coi = obc->obs.oi;
- set<snapid_t> old_snaps(coi.snaps.begin(), coi.snaps.end());
+ set<snapid_t> old_snaps;
+ if (legacy) {
+ old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end());
+ } else {
+ auto p = snapset.clone_snaps.find(coid.snap);
+ if (p == snapset.clone_snaps.end()) {
+ osd->clog->error() << __func__ << " No clone_snaps in snapset " << snapset
+ << " for " << coid << "\n";
+ return NULL;
+ }
+ old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
+ snapset.clone_snaps[coid.snap].end());
+ }
if (old_snaps.empty()) {
osd->clog->error() << __func__ << " No object info snaps for " << coid;
return NULL;
}
- SnapSet& snapset = obc->ssc->snapset;
-
dout(10) << coid << " old_snaps " << old_snaps
<< " old snapset " << snapset << dendl;
if (snapset.seq == 0) {
@@ -3591,6 +3661,7 @@ PrimaryLogPG::OpContextUPtr PrimaryLogPG::trim_object(bool first, const hobject_
snapset.clones.erase(p);
snapset.clone_overlap.erase(last);
snapset.clone_size.erase(last);
+ snapset.clone_snaps.erase(last);
ctx->log.push_back(
pg_log_entry_t(
@@ -3611,9 +3682,15 @@ PrimaryLogPG::OpContextUPtr PrimaryLogPG::trim_object(bool first, const hobject_
ctx->at_version.version++;
} else {
// save adjusted snaps for this object
- dout(10) << coid << " snaps " << old_snaps
- << " -> " << new_snaps << dendl;
- coi.snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
+ dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
+ if (legacy) {
+ coi.legacy_snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
+ } else {
+ snapset.clone_snaps[coid.snap] = vector<snapid_t>(new_snaps.rbegin(),
+ new_snaps.rend());
+ // we still do a 'modify' event on this object just to trigger a
+ // snapmapper.update ... :(
+ }
coi.prior_version = coi.version;
coi.version = ctx->at_version;
@@ -3632,18 +3709,26 @@ PrimaryLogPG::OpContextUPtr PrimaryLogPG::trim_object(bool first, const hobject_
ctx->mtime,
0)
);
+ ctx->at_version.version++;
t->update_snaps(
coid,
old_snaps,
new_snaps);
- ctx->at_version.version++;
}
// save head snapset
- dout(10) << coid << " new snapset " << snapset << dendl;
-
- if (snapset.clones.empty() && !snapset.head_exists) {
+ dout(10) << coid << " new snapset " << snapset << " on "
+ << snapset_obc->obs.oi << dendl;
+ if (snapset.clones.empty() &&
+ (!snapset.head_exists ||
+ (snapset_obc->obs.oi.is_whiteout() &&
+ !(snapset_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
+ !snapset_obc->obs.oi.is_cache_pinned()))) {
+ // NOTE: this arguably constitutes minor interference with the
+ // tiering agent if this is a cache tier since a snap trim event
+ // is effectively evicting a whiteout we might otherwise want to
+ // keep around.
dout(10) << coid << " removing " << snapoid << dendl;
ctx->log.push_back(
pg_log_entry_t(
@@ -3656,7 +3741,24 @@ PrimaryLogPG::OpContextUPtr PrimaryLogPG::trim_object(bool first, const hobject_
ctx->mtime,
0)
);
-
+ if (snapoid.is_head()) {
+ derr << "removing snap head" << dendl;
+ object_info_t& oi = ctx->snapset_obc->obs.oi;
+ ctx->delta_stats.num_objects--;
+ if (oi.is_dirty()) {
+ ctx->delta_stats.num_objects_dirty--;
+ oi.clear_flag(object_info_t::FLAG_DIRTY);
+ }
+ if (oi.is_omap())
+ ctx->delta_stats.num_objects_omap--;
+ if (oi.is_whiteout()) {
+ dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
+ ctx->delta_stats.num_whiteouts--;
+ oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ }
+ if (oi.is_cache_pinned())
+ ctx->delta_stats.num_objects_pinned--;
+ }
ctx->snapset_obc->obs.exists = false;
t->remove(snapoid);
@@ -3782,6 +3884,37 @@ int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
}
}
+int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
+{
+ ceph_osd_op& op = osd_op.op;
+ vector<OSDOp> read_ops(1);
+ OSDOp& read_op = read_ops[0];
+ int result = 0;
+
+ read_op.op.op = CEPH_OSD_OP_SYNC_READ;
+ read_op.op.extent.offset = op.extent.offset;
+ read_op.op.extent.length = op.extent.length;
+ read_op.op.extent.truncate_seq = op.extent.truncate_seq;
+ read_op.op.extent.truncate_size = op.extent.truncate_size;
+
+ result = do_osd_ops(ctx, read_ops);
+ if (result < 0) {
+ derr << "do_extent_cmp do_osd_ops failed " << result << dendl;
+ return result;
+ }
+
+ if (read_op.outdata.length() != osd_op.indata.length())
+ return -EINVAL;
+
+ for (uint64_t p = 0; p < osd_op.indata.length(); p++) {
+ if (read_op.outdata[p] != osd_op.indata[p]) {
+ return (-MAX_ERRNO - p);
+ }
+ }
+
+ return result;
+}
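+// (editor's sketch) decoding a CMPEXT result from the function above:
+// 0 means the extents match, -EINVAL a length mismatch, and any
+// r <= -MAX_ERRNO encodes the offset of the first differing byte:
+//
+//   int r = do_extent_cmp(ctx, osd_op);
+//   if (r <= -MAX_ERRNO) {
+//     uint64_t mismatch_off = -MAX_ERRNO - r;  // first differing byte
+//   }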
+
int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
{
ceph_osd_op& op = osd_op.op;
@@ -4195,6 +4328,7 @@ void PrimaryLogPG::maybe_create_new_object(
if (!obs.exists) {
ctx->delta_stats.num_objects++;
obs.exists = true;
+ assert(!obs.oi.is_whiteout());
obs.oi.new_object();
if (!ignore_transaction)
ctx->op_t->create(obs.oi.soid);
@@ -4476,6 +4610,12 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
// --- READS ---
+ case CEPH_OSD_OP_CMPEXT:
+ ++ctx->num_read;
+ tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
+ result = do_extent_cmp(ctx, osd_op);
+ break;
+
case CEPH_OSD_OP_SYNC_READ:
if (pool.info.require_rollback()) {
result = -EOPNOTSUPP;
@@ -4943,7 +5083,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
if (result < 0)
break;
}
- result = _delete_oid(ctx, true);
+ result = _delete_oid(ctx, true, false);
if (result >= 0) {
// mark that this is a cache eviction to avoid triggering normal
// make_writeable() clone or snapdir object creation in finish_ctx()
@@ -5116,7 +5256,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
if (ssc->snapset.head_exists)
clonecount++;
resp.clones.reserve(clonecount);
- for (vector<snapid_t>::const_iterator clone_iter = ssc->snapset.clones.begin();
+ for (auto clone_iter = ssc->snapset.clones.begin();
clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
clone_info ci;
ci.cloneid = *clone_iter;
@@ -5124,42 +5264,58 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
hobject_t clone_oid = soid;
clone_oid.snap = *clone_iter;
- /* No need to take a lock here. We are only inspecting state cached on
- * in the ObjectContext, so we aren't performing an actual read unless
- * the clone obc is not already loaded (in which case, it cannot have
- * an in progress write). We also do not risk exposing uncommitted
- * state since we do have a read lock on the head object or snapdir,
- * which we would have to write lock in order to make user visible
- * modifications to the snapshot state (snap trim related mutations
- * are not user visible).
- */
- if (is_missing_object(clone_oid)) {
- dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
- wait_for_unreadable_object(clone_oid, ctx->op);
- result = -EAGAIN;
- break;
- }
-
- ObjectContextRef clone_obc = get_object_context(clone_oid, false);
- if (!clone_obc) {
- if (maybe_handle_cache(
- ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
- // promoting the clone
- result = -EAGAIN;
- } else {
+ if (!ssc->snapset.is_legacy()) {
+ auto p = ssc->snapset.clone_snaps.find(*clone_iter);
+ if (p == ssc->snapset.clone_snaps.end()) {
osd->clog->error() << "osd." << osd->whoami
- << ": missing clone " << clone_oid
- << " for oid "
- << soid;
- // should not happen
- result = -ENOENT;
+ << ": inconsistent clone_snaps found for oid "
+ << soid << " clone " << *clone_iter
+ << " snapset " << ssc->snapset;
+ result = -EINVAL;
+ break;
+ }
+ for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
+ ci.snaps.push_back(*q);
+ }
+ } else {
+      /* No need to take a lock here. We are only inspecting state cached
+ * in the ObjectContext, so we aren't performing an actual read unless
+ * the clone obc is not already loaded (in which case, it cannot have
+ * an in progress write). We also do not risk exposing uncommitted
+ * state since we do have a read lock on the head object or snapdir,
+ * which we would have to write lock in order to make user visible
+ * modifications to the snapshot state (snap trim related mutations
+ * are not user visible).
+ */
+ if (is_missing_object(clone_oid)) {
+ dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
+ wait_for_unreadable_object(clone_oid, ctx->op);
+ result = -EAGAIN;
+ break;
+ }
+
+ ObjectContextRef clone_obc = get_object_context(clone_oid, false);
+ if (!clone_obc) {
+ if (maybe_handle_cache(
+ ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
+ // promoting the clone
+ result = -EAGAIN;
+ } else {
+ osd->clog->error() << "osd." << osd->whoami
+ << ": missing clone " << clone_oid
+ << " for oid "
+ << soid;
+ // should not happen
+ result = -ENOENT;
+ }
+ break;
+ }
+ for (vector<snapid_t>::reverse_iterator p =
+ clone_obc->obs.oi.legacy_snaps.rbegin();
+ p != clone_obc->obs.oi.legacy_snaps.rend();
+ ++p) {
+ ci.snaps.push_back(*p);
}
- break;
- }
- for (vector<snapid_t>::reverse_iterator p = clone_obc->obs.oi.snaps.rbegin();
- p != clone_obc->obs.oi.snaps.rend();
- ++p) {
- ci.snaps.push_back(*p);
}
dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
@@ -5167,7 +5323,8 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
coi = ssc->snapset.clone_overlap.find(ci.cloneid);
if (coi == ssc->snapset.clone_overlap.end()) {
- osd->clog->error() << "osd." << osd->whoami << ": inconsistent clone_overlap found for oid "
+ osd->clog->error() << "osd." << osd->whoami
+ << ": inconsistent clone_overlap found for oid "
<< soid << " clone " << *clone_iter;
result = -EINVAL;
break;
@@ -5176,14 +5333,16 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
ci.overlap.reserve(o.num_intervals());
for (interval_set<uint64_t>::const_iterator r = o.begin();
r != o.end(); ++r) {
- ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(), r.get_len()));
+ ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
+ r.get_len()));
}
map<snapid_t, uint64_t>::const_iterator si;
si = ssc->snapset.clone_size.find(ci.cloneid);
if (si == ssc->snapset.clone_size.end()) {
- osd->clog->error() << "osd." << osd->whoami << ": inconsistent clone_size found for oid "
- << soid << " clone " << *clone_iter;
+ osd->clog->error() << "osd." << osd->whoami
+ << ": inconsistent clone_size found for oid "
+ << soid << " clone " << *clone_iter;
result = -EINVAL;
break;
}
@@ -5376,7 +5535,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
else
obs.oi.clear_data_digest();
write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
- op.extent.offset, op.extent.length, true);
+ op.extent.offset, op.extent.length);
}
break;
@@ -5409,7 +5568,7 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
- 0, op.extent.length, true, op.extent.length != oi.size ? true : false);
+ 0, op.extent.length, true);
}
break;
@@ -5536,7 +5695,9 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_DELETE:
++ctx->num_write;
tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
- result = _delete_oid(ctx, ctx->ignore_cache);
+ {
+ result = _delete_oid(ctx, false, ctx->ignore_cache);
+ }
break;
case CEPH_OSD_OP_WATCH:
@@ -6296,7 +6457,10 @@ int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
return 0;
}
-inline int PrimaryLogPG::_delete_oid(OpContext *ctx, bool no_whiteout)
+inline int PrimaryLogPG::_delete_oid(
+ OpContext *ctx,
+ bool no_whiteout, // no whiteouts, no matter what.
+ bool try_no_whiteout) // try not to whiteout
{
SnapSet& snapset = ctx->new_snapset;
ObjectState& obs = ctx->new_obs;
@@ -6304,7 +6468,38 @@ inline int PrimaryLogPG::_delete_oid(OpContext *ctx, bool no_whiteout)
const hobject_t& soid = oi.soid;
PGTransaction* t = ctx->op_t.get();
- if (!obs.exists || (obs.oi.is_whiteout() && !no_whiteout))
+ // cache: set whiteout on delete?
+ bool whiteout = false;
+ if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
+ && !no_whiteout
+ && !try_no_whiteout) {
+ whiteout = true;
+ }
+ bool legacy;
+ if (get_osdmap()->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
+ legacy = false;
+ // in luminous or later, we can't delete the head if there are
+ // clones. we trust the caller passing no_whiteout has already
+ // verified they don't exist.
+ if (!snapset.clones.empty() ||
+ (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
+ if (no_whiteout) {
+ dout(20) << __func__ << " has or will have clones but no_whiteout=1"
+ << dendl;
+ } else {
+ dout(20) << __func__ << " has or will have clones; will whiteout"
+ << dendl;
+ whiteout = true;
+ }
+ }
+ } else {
+ legacy = true;
+ }
+ dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
+ << " no_whiteout=" << (int)no_whiteout
+ << " try_no_whiteout=" << (int)try_no_whiteout
+ << dendl;
+ if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
return -ENOENT;
t->remove(soid);
@@ -6336,8 +6531,7 @@ inline int PrimaryLogPG::_delete_oid(OpContext *ctx, bool no_whiteout)
}
oi.watchers.clear();
- // cache: cache: set whiteout on delete?
- if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE && !no_whiteout) {
+ if (whiteout) {
dout(20) << __func__ << " setting whiteout on " << soid << dendl;
oi.set_flag(object_info_t::FLAG_WHITEOUT);
ctx->delta_stats.num_whiteouts++;
@@ -6346,18 +6540,21 @@ inline int PrimaryLogPG::_delete_oid(OpContext *ctx, bool no_whiteout)
return 0;
}
+ // delete the head
ctx->delta_stats.num_objects--;
if (soid.is_snap())
ctx->delta_stats.num_object_clones--;
if (oi.is_whiteout()) {
dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
ctx->delta_stats.num_whiteouts--;
+ oi.clear_flag(object_info_t::FLAG_WHITEOUT);
}
if (oi.is_cache_pinned()) {
ctx->delta_stats.num_objects_pinned--;
}
- if (soid.is_head())
+ if ((legacy || snapset.is_legacy()) && soid.is_head()) {
snapset.head_exists = false;
+ }
obs.exists = false;
return 0;
}
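
The rewritten _delete_oid above boils down to one question: does this delete become a whiteout? A condensed sketch of that decision, with DeleteInputs standing in (hypothetically) for the pool, caller, and snapset state the real code consults:

    #include <cassert>

    // Toy inputs, simplified from pool.info, the caller's flags, and the
    // snapset/snap-context checks above.
    struct DeleteInputs {
      bool cache_pool;        // pool.info.cache_mode != CACHEMODE_NONE
      bool no_whiteout;       // never whiteout, no matter what
      bool try_no_whiteout;   // prefer not to whiteout
      bool require_luminous;  // CEPH_OSDMAP_REQUIRE_LUMINOUS is set
      bool has_or_will_have_clones;
    };

    bool should_whiteout(const DeleteInputs& in) {
      bool whiteout =
        in.cache_pool && !in.no_whiteout && !in.try_no_whiteout;
      if (in.require_luminous && in.has_or_will_have_clones &&
          !in.no_whiteout) {
        // Post-luminous the head cannot simply vanish while clones remain,
        // so a plain delete is downgraded to a whiteout.
        whiteout = true;
      }
      return whiteout;
    }

    int main() {
      // Cache eviction passes no_whiteout=true and really deletes.
      assert(!should_whiteout({true, true, false, true, true}));
      // A plain DELETE on a luminous pool with clones becomes a whiteout.
      assert(should_whiteout({false, false, false, true, true}));
    }
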
@@ -6423,7 +6620,7 @@ int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
// Cannot delete an object with watchers
ret = -EBUSY;
} else {
- _delete_oid(ctx, false);
+ _delete_oid(ctx, false, false);
ret = 0;
}
} else if (ret) {
@@ -6521,7 +6718,7 @@ void PrimaryLogPG::make_writeable(OpContext *ctx)
// clone?
assert(soid.snap == CEPH_NOSNAP);
- dout(20) << "make_writeable " << soid << " snapset=" << ctx->snapset
+ dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
<< " snapc=" << snapc << dendl;
bool was_dirty = ctx->obc->obs.oi.is_dirty();
@@ -6564,12 +6761,10 @@ void PrimaryLogPG::make_writeable(OpContext *ctx)
if (ctx->new_snapset.seq > snapc.seq) {
snapc.seq = ctx->new_snapset.seq;
snapc.snaps = ctx->new_snapset.snaps;
+ filter_snapc(snapc.snaps);
dout(10) << " using newer snapc " << snapc << dendl;
}
- if (ctx->obs->exists)
- filter_snapc(snapc.snaps);
-
if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
snapc.snaps.size() && // there are snaps
!ctx->cache_evict &&
@@ -6610,7 +6805,12 @@ void PrimaryLogPG::make_writeable(OpContext *ctx)
snap_oi->version = ctx->at_version;
snap_oi->prior_version = ctx->obs->oi.version;
snap_oi->copy_user_bits(ctx->obs->oi);
- snap_oi->snaps = snaps;
+
+ bool legacy = ctx->new_snapset.is_legacy() ||
+ !get_osdmap()->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS);
+ if (legacy) {
+ snap_oi->legacy_snaps = snaps;
+ }
_make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
@@ -6626,6 +6826,9 @@ void PrimaryLogPG::make_writeable(OpContext *ctx)
ctx->delta_stats.num_object_clones++;
ctx->new_snapset.clones.push_back(coid.snap);
ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
+ if (!legacy) {
+ ctx->new_snapset.clone_snaps[coid.snap] = snaps;
+ }
// clone_overlap should contain an entry for each clone
// (an empty interval_set if there is no overlap)
@@ -6636,7 +6839,8 @@ void PrimaryLogPG::make_writeable(OpContext *ctx)
// log clone
dout(10) << " cloning v " << ctx->obs->oi.version
<< " to " << coid << " v " << ctx->at_version
- << " snaps=" << snaps << dendl;
+ << " snaps=" << snaps
+ << " snapset=" << ctx->new_snapset << dendl;
ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
ctx->obs->oi.version,
ctx->obs->oi.user_version,
@@ -6664,28 +6868,37 @@ void PrimaryLogPG::make_writeable(OpContext *ctx)
// update snapset with latest snap context
ctx->new_snapset.seq = snapc.seq;
ctx->new_snapset.snaps = snapc.snaps;
- ctx->new_snapset.head_exists = ctx->new_obs.exists;
- dout(20) << "make_writeable " << soid << " done, snapset=" << ctx->new_snapset << dendl;
+ if (!get_osdmap()->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
+ // pessimistic assumption that this is a net-new legacy SnapSet
+ ctx->delta_stats.num_legacy_snapsets++;
+ ctx->new_snapset.head_exists = ctx->new_obs.exists;
+ } else if (ctx->new_snapset.is_legacy()) {
+ ctx->new_snapset.head_exists = ctx->new_obs.exists;
+ }
+ dout(20) << "make_writeable " << soid
+ << " done, snapset=" << ctx->new_snapset << dendl;
}
void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
interval_set<uint64_t>& modified, uint64_t offset,
- uint64_t length, bool count_bytes, bool force_changesize)
+ uint64_t length, bool write_full)
{
interval_set<uint64_t> ch;
- if (length)
+ if (write_full) {
+ if (oi.size)
+ ch.insert(0, oi.size);
+ } else if (length)
ch.insert(offset, length);
modified.union_of(ch);
- if (force_changesize || offset + length > oi.size) {
+ if (write_full || offset + length > oi.size) {
uint64_t new_size = offset + length;
delta_stats.num_bytes -= oi.size;
delta_stats.num_bytes += new_size;
oi.size = new_size;
}
delta_stats.num_wr++;
- if (count_bytes)
- delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
+ delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
}
void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
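
write_update_size_and_usage now treats a full-object write as dirtying the entire previous extent and unconditionally resets the size to offset + length (which may shrink the object), replacing the old count_bytes/force_changesize flags. A toy sketch of the new accounting, using a plain extent list where the real code uses interval_set:

    #include <cassert>
    #include <cstdint>
    #include <utility>
    #include <vector>

    using Extents = std::vector<std::pair<uint64_t, uint64_t>>; // {off, len}

    void update_size_and_usage(uint64_t& obj_size, Extents& modified,
                               uint64_t offset, uint64_t length,
                               bool write_full) {
      if (write_full) {
        if (obj_size)
          modified.push_back({0, obj_size});  // whole old object is dirtied
      } else if (length) {
        modified.push_back({offset, length});
      }
      if (write_full || offset + length > obj_size)
        obj_size = offset + length;           // write_full can also shrink
    }

    int main() {
      uint64_t size = 4096;
      Extents mod;
      update_size_and_usage(size, mod, 0, 1024, /*write_full=*/true);
      assert(size == 1024 && mod.front().second == 4096);
    }
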
@@ -6906,32 +7119,35 @@ void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
::encode(ctx->new_snapset, bss);
- assert(ctx->new_obs.exists == ctx->new_snapset.head_exists);
+ assert(ctx->new_obs.exists == ctx->new_snapset.head_exists ||
+ !ctx->new_snapset.is_legacy());
if (ctx->new_obs.exists) {
if (!ctx->obs->exists) {
if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
hobject_t snapoid = soid.get_snapdir();
+ dout(10) << " removing unneeded snapdir " << snapoid << dendl;
ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid,
ctx->at_version,
ctx->snapset_obc->obs.oi.version,
0, osd_reqid_t(), ctx->mtime, 0));
ctx->op_t->remove(snapoid);
- dout(10) << " removing old " << snapoid << dendl;
ctx->at_version.version++;
ctx->snapset_obc->obs.exists = false;
}
}
- } else if (ctx->new_snapset.clones.size() &&
+ } else if (!ctx->new_snapset.clones.empty() &&
!ctx->cache_evict &&
+ !ctx->new_snapset.head_exists &&
(!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
// save snapset on _snap
hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.get_hash(),
info.pgid.pool(), soid.get_namespace());
dout(10) << " final snapset " << ctx->new_snapset
<< " in " << snapoid << dendl;
+ assert(!get_osdmap()->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS));
ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid,
ctx->at_version,
eversion_t(),
@@ -7018,6 +7234,9 @@ void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
}
+ bool legacy_snapset = ctx->new_snapset.is_legacy() ||
+ !get_osdmap()->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS);
+
// append to log
ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
ctx->obs->oi.version,
@@ -7028,9 +7247,16 @@ void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
case pg_log_entry_t::MODIFY:
case pg_log_entry_t::PROMOTE:
case pg_log_entry_t::CLEAN:
- dout(20) << __func__ << " encoding snaps " << ctx->new_obs.oi.snaps
- << dendl;
- ::encode(ctx->new_obs.oi.snaps, ctx->log.back().snaps);
+ if (legacy_snapset) {
+ dout(20) << __func__ << " encoding legacy_snaps "
+ << ctx->new_obs.oi.legacy_snaps
+ << dendl;
+ ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps);
+ } else {
+ dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
+ << dendl;
+ ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
+ }
break;
default:
break;
@@ -7074,7 +7300,12 @@ void PrimaryLogPG::apply_stats(
if (is_primary() && scrubber.active) {
if (soid < scrubber.start) {
+ dout(20) << __func__ << " " << soid << " < [" << scrubber.start
+ << "," << scrubber.end << ")" << dendl;
scrub_cstat.add(delta_stats);
+ } else {
+ dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
+ << "," << scrubber.end << ")" << dendl;
}
}
}
@@ -7196,10 +7427,16 @@ int PrimaryLogPG::fill_in_copy_get(
// size, mtime
reply_obj.size = oi.size;
reply_obj.mtime = oi.mtime;
+ assert(obc->ssc);
if (soid.snap < CEPH_NOSNAP) {
- reply_obj.snaps = oi.snaps;
+ if (obc->ssc->snapset.is_legacy()) {
+ reply_obj.snaps = oi.legacy_snaps;
+ } else {
+ auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
+ assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
+ reply_obj.snaps = p->second;
+ }
} else {
- assert(obc->ssc);
reply_obj.snap_seq = obc->ssc->snapset.seq;
}
if (oi.is_data_digest()) {
@@ -7842,15 +8079,22 @@ void PrimaryLogPG::finish_promote(int r, CopyResults *results,
tctx->at_version = get_next_version();
filter_snapc(tctx->new_snapset.snaps);
vector<snapid_t> new_clones;
+ map<snapid_t, vector<snapid_t>> new_clone_snaps;
for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
i != tctx->new_snapset.clones.end();
++i) {
- if (*i != soid.snap)
+ if (*i != soid.snap) {
new_clones.push_back(*i);
+ auto p = tctx->new_snapset.clone_snaps.find(*i);
+ if (p != tctx->new_snapset.clone_snaps.end()) {
+ new_clone_snaps[*i] = p->second;
+ }
+ }
}
tctx->new_snapset.clones.swap(new_clones);
tctx->new_snapset.clone_overlap.erase(soid.snap);
tctx->new_snapset.clone_size.erase(soid.snap);
+ tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
// take RWWRITE lock for duration of our local write. ignore starvation.
if (!tctx->lock_manager.take_write_lock(
@@ -7902,6 +8146,9 @@ void PrimaryLogPG::finish_promote(int r, CopyResults *results,
tctx->extra_reqids = results->reqids;
+ bool legacy_snapset = tctx->new_snapset.is_legacy() ||
+ !get_osdmap()->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS);
+
if (whiteout) {
// create a whiteout
tctx->op_t->create(soid);
@@ -7931,8 +8178,13 @@ void PrimaryLogPG::finish_promote(int r, CopyResults *results,
tctx->new_obs.oi.truncate_size = results->truncate_size;
if (soid.snap != CEPH_NOSNAP) {
- tctx->new_obs.oi.snaps = results->snaps;
- assert(!tctx->new_obs.oi.snaps.empty());
+ if (legacy_snapset) {
+ tctx->new_obs.oi.legacy_snaps = results->snaps;
+ assert(!tctx->new_obs.oi.legacy_snaps.empty());
+ } else {
+ // it's already in the snapset
+ assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
+ }
assert(obc->ssc->snapset.clone_size.count(soid.snap));
assert(obc->ssc->snapset.clone_size[soid.snap] ==
results->object_size);
@@ -7946,7 +8198,9 @@ void PrimaryLogPG::finish_promote(int r, CopyResults *results,
if (results->mirror_snapset) {
assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
- tctx->new_snapset.from_snap_set(results->snapset);
+ tctx->new_snapset.from_snap_set(
+ results->snapset,
+ !get_osdmap()->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS));
}
tctx->new_snapset.head_exists = true;
dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
@@ -8171,7 +8425,14 @@ int PrimaryLogPG::start_flush(
snapc.seq = snapset.seq;
snapc.snaps = snapset.snaps;
} else {
- snapid_t min_included_snap = oi.snaps.back();
+ snapid_t min_included_snap;
+ if (snapset.is_legacy()) {
+ min_included_snap = oi.legacy_snaps.back();
+ } else {
+ auto p = snapset.clone_snaps.find(soid.snap);
+ assert(p != snapset.clone_snaps.end());
+ min_included_snap = p->second.back();
+ }
snapc = snapset.get_ssc_as_of(min_included_snap - 1);
}
@@ -8834,6 +9095,7 @@ void PrimaryLogPG::submit_log_entries(
spg_t(info.pgid.pgid, i->shard),
pg_whoami.shard,
get_osdmap()->get_epoch(),
+ last_peering_reset,
repop->rep_tid);
osd->send_message_osd_cluster(
peer.osd, m, get_osdmap()->get_epoch());
@@ -9195,6 +9457,7 @@ ObjectContextRef PrimaryLogPG::get_object_context(
dout(10) << __func__ << ": " << obc << " " << soid
<< " " << obc->rwstate
<< " oi: " << obc->obs.oi
+ << " exists: " << (int)obc->obs.exists
<< " ssc: " << obc->ssc
<< " snapset: " << obc->ssc->snapset << dendl;
return obc;
@@ -9427,10 +9690,20 @@ int PrimaryLogPG::find_object_context(const hobject_t& oid,
ssc = 0;
// clone
- dout(20) << "find_object_context " << soid << " snaps " << obc->obs.oi.snaps
+ dout(20) << "find_object_context " << soid
+ << " snapset " << obc->ssc->snapset
+ << " legacy_snaps " << obc->obs.oi.legacy_snaps
<< dendl;
- snapid_t first = obc->obs.oi.snaps[obc->obs.oi.snaps.size()-1];
- snapid_t last = obc->obs.oi.snaps[0];
+ snapid_t first, last;
+ if (obc->ssc->snapset.is_legacy()) {
+ first = obc->obs.oi.legacy_snaps.back();
+ last = obc->obs.oi.legacy_snaps.front();
+ } else {
+ auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
+ assert(p != obc->ssc->snapset.clone_snaps.end());
+ first = p->second.back();
+ last = p->second.front();
+ }
if (first <= oid.snap) {
dout(20) << "find_object_context " << soid << " [" << first << "," << last
<< "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
@@ -9544,10 +9817,10 @@ SnapSetContext *PrimaryLogPG::get_snapset_context(
r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
if (r < 0) {
// try _snapset
- if (!(oid.is_snapdir() && !oid_existed))
- r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
- if (r < 0 && !can_create)
- return NULL;
+ if (!(oid.is_snapdir() && !oid_existed))
+ r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
+ if (r < 0 && !can_create)
+ return NULL;
}
} else {
assert(attrs->count(SS_ATTR));
@@ -9721,9 +9994,7 @@ void PrimaryLogPG::_committed_pushed_object(
last_complete_ondisk),
get_osdmap()->get_epoch());
} else {
- // we are the primary. tell replicas to trim?
- if (calc_min_last_complete_ondisk())
- trim_peers();
+ calc_min_last_complete_ondisk();
}
}
@@ -9870,6 +10141,7 @@ void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
spg_t(info.pgid.pgid, primary_shard().shard),
pg_whoami.shard,
msg->get_epoch(),
+ msg->min_epoch,
msg->get_tid());
reply->set_priority(CEPH_MSG_PRIO_HIGH);
msg->get_connection()->send_message(reply);
@@ -9954,7 +10226,7 @@ void PrimaryLogPG::mark_all_unfound_lost(
ObcLockManager manager;
eversion_t v = get_next_version();
v.epoch = get_osdmap()->get_epoch();
- unsigned num_unfound = missing_loc.num_unfound();
+ uint64_t num_unfound = missing_loc.num_unfound();
while (m != mend) {
const hobject_t &oid(m->first);
if (!missing_loc.is_unfound(oid)) {
@@ -10148,7 +10420,8 @@ void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
write_if_dirty(*t);
- on_shutdown();
+ if (!deleting)
+ on_shutdown();
}
void PrimaryLogPG::on_shutdown()
@@ -10524,8 +10797,8 @@ bool PrimaryLogPG::start_recovery_ops(
const pg_missing_t &missing = pg_log.get_missing();
- int num_missing = missing.num_missing();
- int num_unfound = get_num_unfound();
+ unsigned int num_missing = missing.num_missing();
+ uint64_t num_unfound = get_num_unfound();
if (num_missing == 0) {
info.last_complete = info.last_update;
@@ -11096,7 +11369,7 @@ uint64_t PrimaryLogPG::recover_backfill(
dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
epoch_t e = get_osdmap()->get_epoch();
MOSDPGScan *m = new MOSDPGScan(
- MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, e,
+ MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
spg_t(info.pgid.pgid, bt.shard),
pbi.end, hobject_t());
osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
@@ -11400,7 +11673,7 @@ uint64_t PrimaryLogPG::recover_backfill(
m = new MOSDPGBackfill(
MOSDPGBackfill::OP_BACKFILL_FINISH,
e,
- e,
+ last_peering_reset,
spg_t(info.pgid.pgid, bt.shard));
// Use default priority here, must match sub_op priority
/* pinfo.stats might be wrong if we did log-based recovery on the
@@ -11412,7 +11685,7 @@ uint64_t PrimaryLogPG::recover_backfill(
m = new MOSDPGBackfill(
MOSDPGBackfill::OP_BACKFILL_PROGRESS,
e,
- e,
+ last_peering_reset,
spg_t(info.pgid.pgid, bt.shard));
// Use default priority here, must match sub_op priority
}
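
The epoch changes in this and the surrounding hunks share one idea: instead of stamping messages with the current map epoch twice, the second slot now carries last_peering_reset as a lower bound. A toy sketch of why that helps the receiver — can_handle_now is an illustrative name, loosely modeled on the map-epoch gating, not the real check:

    #include <cassert>
    #include <cstdint>

    using epoch_t = uint32_t;

    // The sender's map_epoch may be far ahead; min_epoch (its last peering
    // reset) marks the start of the interval the message belongs to. The
    // receiver can act once its own map reaches min_epoch rather than
    // stalling until it catches up to map_epoch.
    bool can_handle_now(epoch_t our_map, epoch_t min_epoch) {
      return our_map >= min_epoch;
    }

    int main() {
      // Sender is at epoch 120 but last reset peering at epoch 100.
      assert(can_handle_now(105, 100));   // no need to wait for map 120
      assert(!can_handle_now(95, 100));   // still behind the interval
    }
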
@@ -12379,7 +12652,7 @@ bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
ctx->at_version = get_next_version();
assert(ctx->new_obs.exists);
- int r = _delete_oid(ctx.get(), true);
+ int r = _delete_oid(ctx.get(), true, false);
if (obc->obs.oi.is_omap())
ctx->delta_stats.num_objects_omap--;
ctx->delta_stats.num_evict++;
@@ -12848,6 +13121,9 @@ void PrimaryLogPG::scrub_snapshot_metadata(
const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none
+ /// snapsets to repair
+ map<hobject_t,SnapSet> snapset_to_repair;
+
// traverse in reverse order.
boost::optional<hobject_t> head;
boost::optional<SnapSet> snapset; // If initialized so will head (above)
@@ -12929,6 +13205,10 @@ void PrimaryLogPG::scrub_snapshot_metadata(
if (oi->is_cache_pinned())
++stat.num_objects_pinned;
}
+ } else {
+ // pessimistic assumption that this object might contain a
+ // legacy SnapSet
+ stat.num_legacy_snapsets++;
}
// Check for any problems while processing clones
@@ -13011,7 +13291,7 @@ void PrimaryLogPG::scrub_snapshot_metadata(
bl.push_back(p->second.attrs[SS_ATTR]);
bufferlist::iterator blp = bl.begin();
try {
- snapset = SnapSet(); // Initialize optional<> before decoding into it
+ snapset = SnapSet(); // Initialize optional<> before decoding into it
::decode(snapset.get(), blp);
} catch (buffer::error& e) {
snapset = boost::none;
@@ -13048,6 +13328,23 @@ void PrimaryLogPG::scrub_snapshot_metadata(
++scrubber.shallow_errors;
head_error.set_head_mismatch();
}
+
+ if (get_osdmap()->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
+ if (soid.is_snapdir()) {
+ dout(10) << " will move snapset to head from " << soid << dendl;
+ snapset_to_repair[soid.get_head()] = *snapset;
+ } else if (snapset->is_legacy()) {
+ dout(10) << " will convert legacy snapset on " << soid << " " << *snapset
+ << dendl;
+ snapset_to_repair[soid.get_head()] = *snapset;
+ }
+ } else {
+ stat.num_legacy_snapsets++;
+ }
+ } else {
+ // pessimistic assumption that this object might contain a
+ // legacy SnapSet
+ stat.num_legacy_snapsets++;
}
} else {
assert(soid.is_snap());
@@ -13105,6 +13402,21 @@ void PrimaryLogPG::scrub_snapshot_metadata(
}
}
+ // migrate legacy_snaps to snapset?
+ auto p = snapset_to_repair.find(soid.get_head());
+ if (p != snapset_to_repair.end()) {
+ if (!oi || oi->legacy_snaps.empty()) {
+ osd->clog->error() << mode << " " << info.pgid << " " << soid
+ << " has no oi or legacy_snaps; cannot convert "
+ << *snapset;
+ ++scrubber.shallow_errors;
+ } else {
+ dout(20) << __func__ << " copying legacy_snaps " << oi->legacy_snaps
+ << " to snapset " << p->second << dendl;
+ p->second.clone_snaps[soid.snap] = oi->legacy_snaps;
+ }
+ }
+
// what's next?
++curclone;
if (soid_error.errors)
@@ -13169,6 +13481,90 @@ void PrimaryLogPG::scrub_snapshot_metadata(
simple_opc_submit(std::move(ctx));
++scrubber.num_digest_updates_pending;
}
+ for (auto& p : snapset_to_repair) {
+ // cache pools may not have the clones, which means we won't know
+ // what snaps they have. fake out the clone_snaps entries anyway (with
+ // blank snap lists).
+ p.second.head_exists = true;
+ if (pool.info.allow_incomplete_clones()) {
+ for (auto s : p.second.clones) {
+ if (p.second.clone_snaps.count(s) == 0) {
+ dout(10) << __func__ << " " << p.first << " faking clone_snaps for "
+ << s << dendl;
+ p.second.clone_snaps[s];
+ }
+ }
+ }
+ if (p.second.clones.size() != p.second.clone_snaps.size() ||
+ p.second.is_legacy()) {
+ // this happens if we encounter other errors above, like a missing
+ // or extra clone.
+ dout(10) << __func__ << " not writing snapset to " << p.first
+ << " snapset " << p.second << " clones " << p.second.clones
+ << "; didn't convert fully" << dendl;
+ scrub_cstat.sum.num_legacy_snapsets++;
+ continue;
+ }
+ dout(10) << __func__ << " writing snapset to " << p.first
+ << " " << p.second << dendl;
+ ObjectContextRef obc = get_object_context(p.first, true);
+ if (!obc) {
+ osd->clog->error() << info.pgid << " " << mode
+ << " cannot get object context for "
+ << p.first;
+ continue;
+ } else if (obc->obs.oi.soid != p.first) {
+ osd->clog->error() << info.pgid << " " << mode
+ << " object " << p.first
+ << " has a valid oi attr with a mismatched name, "
+ << " obc->obs.oi.soid: " << obc->obs.oi.soid;
+ continue;
+ }
+ ObjectContextRef snapset_obc;
+ if (!obc->obs.exists) {
+ snapset_obc = get_object_context(p.first.get_snapdir(), false);
+ if (!snapset_obc) {
+ osd->clog->error() << info.pgid << " " << mode
+ << " cannot get object context for "
+ << p.first.get_snapdir();
+ continue;
+ }
+ }
+ OpContextUPtr ctx = simple_opc_create(obc);
+ PGTransaction *t = ctx->op_t.get();
+ ctx->snapset_obc = snapset_obc;
+ ctx->at_version = get_next_version();
+ ctx->mtime = utime_t(); // do not update mtime
+ ctx->new_snapset = p.second;
+ if (!ctx->new_obs.exists) {
+ dout(20) << __func__ << " making " << p.first << " a whiteout" << dendl;
+ ctx->new_obs.exists = true;
+ ctx->new_snapset.head_exists = true;
+ ctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
+ ++ctx->delta_stats.num_whiteouts;
+ ++ctx->delta_stats.num_objects;
+ t->create(p.first);
+ if (p.first < scrubber.start) {
+ dout(20) << __func__ << " kludging around update outside of scrub range"
+ << dendl;
+ } else {
+ scrub_cstat.add(ctx->delta_stats);
+ }
+ }
+ dout(20) << __func__ << " final snapset " << ctx->new_snapset << dendl;
+ assert(!ctx->new_snapset.is_legacy());
+ finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
+ ctx->register_on_success(
+ [this]() {
+ dout(20) << "updating snapset" << dendl;
+ if (--scrubber.num_digest_updates_pending == 0) {
+ requeue_scrub();
+ }
+ });
+
+ simple_opc_submit(std::move(ctx));
+ ++scrubber.num_digest_updates_pending;
+ }
dout(10) << __func__ << " (" << mode << ") finish" << dendl;
}
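
The repair loop above only persists a converted SnapSet when every clone has a clone_snaps entry; on cache pools that may legitimately lack the clones, missing entries are first faked as empty lists. A toy sketch of that completeness check (ToySnapSet and ready_to_convert are illustrative names):

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <vector>

    using snapid = uint64_t;

    struct ToySnapSet {  // deliberately simplified
      std::vector<snapid> clones;
      std::map<snapid, std::vector<snapid>> clone_snaps;
    };

    // True if the snapset is safe to rewrite in the new format.
    bool ready_to_convert(ToySnapSet& ss, bool allow_incomplete_clones) {
      if (allow_incomplete_clones) {
        for (snapid s : ss.clones)
          ss.clone_snaps[s];  // default-construct an empty snap list
      }
      return ss.clones.size() == ss.clone_snaps.size();
    }

    int main() {
      ToySnapSet ss;
      ss.clones = {2, 4};
      ss.clone_snaps[2] = {2};
      assert(!ready_to_convert(ss, false));  // clone 4 unaccounted: skip
      assert(ready_to_convert(ss, true));    // cache pool: fake empty entry
    }
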
@@ -13240,6 +13636,14 @@ void PrimaryLogPG::_scrub_finish()
publish_stats_to_osd();
share_pg_info();
}
+ } else if (scrub_cstat.sum.num_legacy_snapsets !=
+ info.stats.stats.sum.num_legacy_snapsets) {
+ osd->clog->info() << info.pgid << " " << mode << " updated num_legacy_snapsets"
+ << " from " << info.stats.stats.sum.num_legacy_snapsets
+ << " -> " << scrub_cstat.sum.num_legacy_snapsets << "\n";
+ info.stats.stats.sum.num_legacy_snapsets = scrub_cstat.sum.num_legacy_snapsets;
+ publish_stats_to_osd();
+ share_pg_info();
}
}
@@ -13270,7 +13674,7 @@ void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_t
/* NotTrimming */
PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
: my_base(ctx),
- NamedState(context< SnapTrimmer >().pg->cct, "NotTrimming")
+ NamedState(context< SnapTrimmer >().pg, "NotTrimming")
{
context< SnapTrimmer >().log_enter(state_name);
}
@@ -13324,7 +13728,7 @@ boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimRes
/* AwaitAsyncWork */
PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
: my_base(ctx),
- NamedState(context< SnapTrimmer >().pg->cct, "Trimming/AwaitAsyncWork")
+ NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
{
auto *pg = context< SnapTrimmer >().pg;
context< SnapTrimmer >().log_enter(state_name);
diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
index 8f56f060edf..c7263d6b520 100644
--- a/src/osd/PrimaryLogPG.h
+++ b/src/osd/PrimaryLogPG.h
@@ -289,6 +289,12 @@ public:
epoch_t get_epoch() const override {
return get_osdmap()->get_epoch();
}
+ epoch_t get_interval_start_epoch() const override {
+ return info.history.same_interval_since;
+ }
+ epoch_t get_last_peering_reset_epoch() const override {
+ return get_last_peering_reset();
+ }
const set<pg_shard_t> &get_actingbackfill_shards() const override {
return actingbackfill;
}
@@ -1093,8 +1099,7 @@ protected:
void write_update_size_and_usage(object_stat_sum_t& stats, object_info_t& oi,
interval_set<uint64_t>& modified, uint64_t offset,
- uint64_t length, bool count_bytes,
- bool force_changesize=false);
+ uint64_t length, bool write_full=false);
void add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& st);
@@ -1311,6 +1316,7 @@ protected:
friend class C_ChecksumRead;
+ int do_extent_cmp(OpContext *ctx, OSDOp& osd_op);
int do_writesame(OpContext *ctx, OSDOp& osd_op);
bool pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata);
@@ -1482,7 +1488,7 @@ private:
explicit Trimming(my_context ctx)
: my_base(ctx),
- NamedState(context< SnapTrimmer >().pg->cct, "Trimming") {
+ NamedState(context< SnapTrimmer >().pg, "Trimming") {
context< SnapTrimmer >().log_enter(state_name);
assert(context< SnapTrimmer >().can_trim());
assert(in_flight.empty());
@@ -1507,7 +1513,7 @@ private:
Context *wakeup = nullptr;
explicit WaitTrimTimer(my_context ctx)
: my_base(ctx),
- NamedState(context< SnapTrimmer >().pg->cct, "Trimming/WaitTrimTimer") {
+ NamedState(context< SnapTrimmer >().pg, "Trimming/WaitTrimTimer") {
context< SnapTrimmer >().log_enter(state_name);
assert(context<Trimming>().in_flight.empty());
struct OnTimer : Context {
@@ -1557,7 +1563,7 @@ private:
> reactions;
explicit WaitRWLock(my_context ctx)
: my_base(ctx),
- NamedState(context< SnapTrimmer >().pg->cct, "Trimming/WaitRWLock") {
+ NamedState(context< SnapTrimmer >().pg, "Trimming/WaitRWLock") {
context< SnapTrimmer >().log_enter(state_name);
assert(context<Trimming>().in_flight.empty());
}
@@ -1580,7 +1586,7 @@ private:
> reactions;
explicit WaitRepops(my_context ctx)
: my_base(ctx),
- NamedState(context< SnapTrimmer >().pg->cct, "Trimming/WaitRepops") {
+ NamedState(context< SnapTrimmer >().pg, "Trimming/WaitRepops") {
context< SnapTrimmer >().log_enter(state_name);
assert(!context<Trimming>().in_flight.empty());
}
@@ -1634,7 +1640,7 @@ private:
explicit WaitReservation(my_context ctx)
: my_base(ctx),
- NamedState(context< SnapTrimmer >().pg->cct, "Trimming/WaitReservation") {
+ NamedState(context< SnapTrimmer >().pg, "Trimming/WaitReservation") {
context< SnapTrimmer >().log_enter(state_name);
assert(context<Trimming>().in_flight.empty());
auto *pg = context< SnapTrimmer >().pg;
@@ -1666,7 +1672,7 @@ private:
> reactions;
explicit WaitScrub(my_context ctx)
: my_base(ctx),
- NamedState(context< SnapTrimmer >().pg->cct, "Trimming/WaitScrub") {
+ NamedState(context< SnapTrimmer >().pg, "Trimming/WaitScrub") {
context< SnapTrimmer >().log_enter(state_name);
}
void exit() {
@@ -1696,7 +1702,7 @@ private:
// return true if we're creating a local object, false for a
// whiteout or no change.
void maybe_create_new_object(OpContext *ctx, bool ignore_transaction=false);
- int _delete_oid(OpContext *ctx, bool no_whiteout);
+ int _delete_oid(OpContext *ctx, bool no_whiteout, bool try_no_whiteout);
int _rollback_to(OpContext *ctx, ceph_osd_op& op);
public:
bool is_missing_object(const hobject_t& oid) const;
@@ -1721,6 +1727,8 @@ public:
void wait_for_blocked_object(const hobject_t& soid, OpRequestRef op);
void kick_object_context_blocked(ObjectContextRef obc);
+ void maybe_force_recovery();
+
void mark_all_unfound_lost(
int what,
ConnectionRef con,
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index 4561a11719e..d51506df2ef 100644
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -358,10 +358,9 @@ void generate_transaction(
le.mark_unrollbackable();
auto oiter = pgt->op_map.find(le.soid);
if (oiter != pgt->op_map.end() && oiter->second.updated_snaps) {
- vector<snapid_t> snaps(
- oiter->second.updated_snaps->second.begin(),
- oiter->second.updated_snaps->second.end());
- ::encode(snaps, le.snaps);
+ bufferlist bl(oiter->second.updated_snaps->second.size() * 8 + 8);
+ ::encode(oiter->second.updated_snaps->second, bl);
+ le.snaps.swap(bl);
}
}
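
The generate_transaction change drops the intermediate vector and encodes the updated_snaps set directly (pre-sizing the bufferlist as a bonus). The equivalence holds because a length-prefixed container encoding yields the same bytes whether the sorted elements come from a set or a vector — demonstrated here with a toy serializer, not Ceph's ::encode:

    #include <cassert>
    #include <cstdint>
    #include <set>
    #include <vector>

    // Toy length-prefixed encoder: u32 count, then raw elements.
    template <typename Container>
    std::vector<uint8_t> encode(const Container& c) {
      std::vector<uint8_t> out;
      uint32_t n = c.size();
      out.insert(out.end(), reinterpret_cast<uint8_t*>(&n),
                 reinterpret_cast<uint8_t*>(&n) + sizeof(n));
      for (uint64_t v : c)
        out.insert(out.end(), reinterpret_cast<uint8_t*>(&v),
                   reinterpret_cast<uint8_t*>(&v) + sizeof(v));
      return out;
    }

    int main() {
      std::set<uint64_t> s = {3, 5, 8};
      std::vector<uint64_t> v(s.begin(), s.end());
      assert(encode(s) == encode(v));  // the copy bought nothing
    }
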
@@ -584,8 +583,10 @@ void ReplicatedBackend::op_applied(
FUNCTRACE();
OID_EVENT_TRACE_WITH_MSG((op && op->op) ? op->op->get_req() : NULL, "OP_APPLIED_BEGIN", true);
dout(10) << __func__ << ": " << op->tid << dendl;
- if (op->op)
+ if (op->op) {
op->op->mark_event("op_applied");
+ op->op->pg_trace.event("op applied");
+ }
op->waiting_for_applied.erase(get_parent()->whoami_shard());
parent->op_applied(op->v);
@@ -606,8 +607,10 @@ void ReplicatedBackend::op_commit(
FUNCTRACE();
OID_EVENT_TRACE_WITH_MSG((op && op->op) ? op->op->get_req() : NULL, "OP_COMMIT_BEGIN", true);
dout(10) << __func__ << ": " << op->tid << dendl;
- if (op->op)
+ if (op->op) {
op->op->mark_event("op_commit");
+ op->op->pg_trace.event("op commit");
+ }
op->waiting_for_commit.erase(get_parent()->whoami_shard());
@@ -661,6 +664,7 @@ void ReplicatedBackend::do_repop_reply(OpRequestRef op)
ostringstream ss;
ss << "sub_op_commit_rec from " << from;
ip_op.op->mark_event_string(ss.str());
+ ip_op.op->pg_trace.event("sub_op_commit_rec");
}
} else {
assert(ip_op.waiting_for_applied.count(from));
@@ -668,6 +672,7 @@ void ReplicatedBackend::do_repop_reply(OpRequestRef op)
ostringstream ss;
ss << "sub_op_applied_rec from " << from;
ip_op.op->mark_event_string(ss.str());
+ ip_op.op->pg_trace.event("sub_op_applied_rec");
}
}
ip_op.waiting_for_applied.erase(from);
@@ -824,6 +829,7 @@ void ReplicatedBackend::_do_push(OpRequestRef op)
reply->set_priority(m->get_priority());
reply->pgid = get_info().pgid;
reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
reply->replies.swap(replies);
reply->compute_cost(cct);
@@ -902,6 +908,7 @@ void ReplicatedBackend::_do_pull_response(OpRequestRef op)
reply->set_priority(m->get_priority());
reply->pgid = get_info().pgid;
reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
reply->set_pulls(&replies);
reply->compute_cost(cct);
@@ -972,6 +979,7 @@ Message * ReplicatedBackend::generate_subop(
spg_t(get_info().pgid.pgid, peer.shard),
soid, acks_wanted,
get_osdmap()->get_epoch(),
+ parent->get_last_peering_reset_epoch(),
tid, at_version);
// ship resulting transaction, log entries, and pg_stats
@@ -1018,6 +1026,8 @@ void ReplicatedBackend::issue_op(
InProgressOp *op,
ObjectStore::Transaction &op_t)
{
+ if (op->op)
+ op->op->pg_trace.event("issue replication ops");
if (parent->get_actingbackfill_shards().size() > 1) {
ostringstream ss;
@@ -1050,7 +1060,8 @@ void ReplicatedBackend::issue_op(
op_t,
peer,
pinfo);
-
+ if (op->op)
+ wr->trace.init("replicated op", nullptr, &op->op->pg_trace);
get_parent()->send_message_osd_cluster(
peer.osd, wr, get_osdmap()->get_epoch());
}
@@ -1148,24 +1159,21 @@ void ReplicatedBackend::repop_applied(RepModifyRef rm)
{
rm->op->mark_event("sub_op_applied");
rm->applied = true;
+ rm->op->pg_trace.event("sup_op_applied");
dout(10) << __func__ << " on " << rm << " op "
<< *rm->op->get_req() << dendl;
const Message *m = rm->op->get_req();
-
- Message *ack = NULL;
- eversion_t version;
-
const MOSDRepOp *req = static_cast<const MOSDRepOp*>(m);
- version = req->version;
- if (!rm->committed)
- ack = new MOSDRepOpReply(
- static_cast<const MOSDRepOp*>(m), parent->whoami_shard(),
- 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
+ eversion_t version = req->version;
// send ack to acker only if we haven't sent a commit already
- if (ack) {
+ if (!rm->committed) {
+ Message *ack = new MOSDRepOpReply(
+ req, parent->whoami_shard(),
+ 0, get_osdmap()->get_epoch(), req->min_epoch, CEPH_OSD_FLAG_ACK);
ack->set_priority(CEPH_MSG_PRIO_HIGH); // this better match commit priority!
+ ack->trace = rm->op->pg_trace;
get_parent()->send_message_osd_cluster(
rm->ackerosd, ack, get_osdmap()->get_epoch());
}
@@ -1176,24 +1184,26 @@ void ReplicatedBackend::repop_applied(RepModifyRef rm)
void ReplicatedBackend::repop_commit(RepModifyRef rm)
{
rm->op->mark_commit_sent();
+ rm->op->pg_trace.event("sup_op_commit");
rm->committed = true;
// send commit.
- const Message *m = rm->op->get_req();
+ const MOSDRepOp *m = static_cast<const MOSDRepOp*>(rm->op->get_req());
+ assert(m->get_type() == MSG_OSD_REPOP);
dout(10) << __func__ << " on op " << *m
<< ", sending commit to osd." << rm->ackerosd
<< dendl;
- assert(m->get_type() == MSG_OSD_REPOP);
assert(get_osdmap()->is_up(rm->ackerosd));
get_parent()->update_last_complete_ondisk(rm->last_complete);
MOSDRepOpReply *reply = new MOSDRepOpReply(
- static_cast<const MOSDRepOp*>(m),
+ m,
get_parent()->whoami_shard(),
- 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ONDISK);
+ 0, get_osdmap()->get_epoch(), m->get_min_epoch(), CEPH_OSD_FLAG_ONDISK);
reply->set_last_complete_ondisk(rm->last_complete);
reply->set_priority(CEPH_MSG_PRIO_HIGH); // this better match ack priority!
+ reply->trace = rm->op->pg_trace;
get_parent()->send_message_osd_cluster(
rm->ackerosd, reply, get_osdmap()->get_epoch());
@@ -1416,6 +1426,7 @@ void ReplicatedBackend::prepare_pull(
SnapSetContext *ssc = headctx->ssc;
assert(ssc);
dout(10) << " snapset " << ssc->snapset << dendl;
+ recovery_info.ss = ssc->snapset;
calc_clone_subsets(
ssc->snapset, soid, get_parent()->get_local_missing(),
get_info().last_backfill,
@@ -1498,6 +1509,7 @@ void ReplicatedBackend::prep_push_to_replica(
SnapSetContext *ssc = obc->ssc;
assert(ssc);
dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
+ pop->recovery_info.ss = ssc->snapset;
map<pg_shard_t, pg_missing_t>::const_iterator pm =
get_parent()->get_shard_missing().find(peer);
assert(pm != get_parent()->get_shard_missing().end());
@@ -1569,11 +1581,8 @@ void ReplicatedBackend::prep_push(
pi.recovery_info.clone_subset = clone_subsets;
pi.recovery_info.soid = soid;
pi.recovery_info.oi = obc->obs.oi;
+ pi.recovery_info.ss = pop->recovery_info.ss;
pi.recovery_info.version = version;
- pi.recovery_progress.first = true;
- pi.recovery_progress.data_recovered_to = 0;
- pi.recovery_progress.data_complete = 0;
- pi.recovery_progress.omap_complete = 0;
pi.lock_manager = std::move(lock_manager);
ObjectRecoveryProgress new_progress;
@@ -1851,6 +1860,7 @@ void ReplicatedBackend::send_pushes(int prio, map<pg_shard_t, vector<PushOp> > &
msg->from = get_parent()->whoami_shard();
msg->pgid = get_parent()->primary_spg_t();
msg->map_epoch = get_osdmap()->get_epoch();
+ msg->min_epoch = get_parent()->get_last_peering_reset_epoch();
msg->set_priority(prio);
for (;
(j != i->second.end() &&
@@ -1886,6 +1896,7 @@ void ReplicatedBackend::send_pulls(int prio, map<pg_shard_t, vector<PullOp> > &p
msg->set_priority(prio);
msg->pgid = get_parent()->primary_spg_t();
msg->map_epoch = get_osdmap()->get_epoch();
+ msg->min_epoch = get_parent()->get_last_peering_reset_epoch();
msg->set_pulls(&i->second);
msg->compute_cost(cct);
get_parent()->send_message_osd_cluster(msg, con);
diff --git a/src/osd/SnapMapper.cc b/src/osd/SnapMapper.cc
index 8f758f05d4b..d4c393120e2 100644
--- a/src/osd/SnapMapper.cc
+++ b/src/osd/SnapMapper.cc
@@ -162,6 +162,7 @@ void SnapMapper::clear_snaps(
const hobject_t &oid,
MapCacher::Transaction<std::string, bufferlist> *t)
{
+ dout(20) << __func__ << " " << oid << dendl;
assert(check(oid));
set<string> to_remove;
to_remove.insert(to_object_key(oid));
@@ -178,6 +179,7 @@ void SnapMapper::set_snaps(
bufferlist bl;
::encode(in, bl);
to_set[to_object_key(oid)] = bl;
+ dout(20) << __func__ << " " << oid << " " << in.snaps << dendl;
backend.set_keys(to_set, t);
}
@@ -297,6 +299,7 @@ int SnapMapper::_remove_oid(
const hobject_t &oid,
MapCacher::Transaction<std::string, bufferlist> *t)
{
+ dout(20) << __func__ << " " << oid << dendl;
object_snaps out;
int r = get_snaps(oid, &out);
if (r < 0)
diff --git a/src/osd/osd_internal_types.h b/src/osd/osd_internal_types.h
index d754109ad58..924a8be0c37 100644
--- a/src/osd/osd_internal_types.h
+++ b/src/osd/osd_internal_types.h
@@ -14,6 +14,17 @@
* replicas ack.
*/
+struct SnapSetContext {
+ hobject_t oid;
+ SnapSet snapset;
+ int ref;
+ bool registered : 1;
+ bool exists : 1;
+
+ explicit SnapSetContext(const hobject_t& o) :
+ oid(o), ref(0), registered(false), exists(true) { }
+};
+
struct ObjectContext;
struct ObjectState {
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 01dccd6d510..fbebf058d2b 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -1883,11 +1883,12 @@ void object_stat_sum_t::dump(Formatter *f) const
f->dump_int("num_evict_mode_some", num_evict_mode_some);
f->dump_int("num_evict_mode_full", num_evict_mode_full);
f->dump_int("num_objects_pinned", num_objects_pinned);
+ f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
}
void object_stat_sum_t::encode(bufferlist& bl) const
{
- ENCODE_START(15, 3, bl);
+ ENCODE_START(16, 14, bl);
#if defined(CEPH_LITTLE_ENDIAN)
bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
#else
@@ -1925,6 +1926,7 @@ void object_stat_sum_t::encode(bufferlist& bl) const
::encode(num_evict_mode_full, bl);
::encode(num_objects_pinned, bl);
::encode(num_objects_missing, bl);
+ ::encode(num_legacy_snapsets, bl);
#endif
ENCODE_FINISH(bl);
}
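
Bumping ENCODE_START(16, 14) does two things: the new counter is appended at version 16, and the compat floor rises from 3 to 14, so decoders that predate the v14 layout refuse the blob instead of misparsing it. A toy sketch of the version/compat contract — Blob and friends are illustrative, not the real macros, which also emit a length and more framing:

    #include <cassert>
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    struct Blob {
      uint8_t version, compat;
      std::vector<uint8_t> payload;
    };

    Blob encode_v16(uint8_t new_counter) {
      return Blob{16, 14, {new_counter}};
    }

    uint8_t decode(const Blob& b, uint8_t decoder_understands) {
      if (decoder_understands < b.compat)
        throw std::runtime_error("struct too new to decode");
      // Fields added after decoder_understands are simply not read.
      return b.payload.at(0);
    }

    int main() {
      Blob b = encode_v16(7);
      assert(decode(b, 16) == 7);
      bool rejected = false;
      try { decode(b, 13); } catch (const std::runtime_error&) {
        rejected = true;
      }
      assert(rejected);  // an old decoder rejects rather than misparses
    }
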
@@ -1932,110 +1934,52 @@ void object_stat_sum_t::encode(bufferlist& bl) const
void object_stat_sum_t::decode(bufferlist::iterator& bl)
{
bool decode_finish = false;
- DECODE_START_LEGACY_COMPAT_LEN(14, 3, 3, bl);
+ DECODE_START(16, bl);
#if defined(CEPH_LITTLE_ENDIAN)
- if (struct_v >= 15) {
+ if (struct_v >= 16) {
bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
decode_finish = true;
}
#endif
if (!decode_finish) {
::decode(num_bytes, bl);
- if (struct_v < 3) {
- uint64_t num_kb;
- ::decode(num_kb, bl);
- }
::decode(num_objects, bl);
::decode(num_object_clones, bl);
::decode(num_object_copies, bl);
::decode(num_objects_missing_on_primary, bl);
::decode(num_objects_degraded, bl);
- if (struct_v >= 2)
- ::decode(num_objects_unfound, bl);
+ ::decode(num_objects_unfound, bl);
::decode(num_rd, bl);
::decode(num_rd_kb, bl);
::decode(num_wr, bl);
::decode(num_wr_kb, bl);
- if (struct_v >= 4)
- ::decode(num_scrub_errors, bl);
- else
- num_scrub_errors = 0;
- if (struct_v >= 5) {
- ::decode(num_objects_recovered, bl);
- ::decode(num_bytes_recovered, bl);
- ::decode(num_keys_recovered, bl);
- } else {
- num_objects_recovered = 0;
- num_bytes_recovered = 0;
- num_keys_recovered = 0;
- }
- if (struct_v >= 6) {
- ::decode(num_shallow_scrub_errors, bl);
- ::decode(num_deep_scrub_errors, bl);
- } else {
- num_shallow_scrub_errors = 0;
- num_deep_scrub_errors = 0;
- }
- if (struct_v >= 7) {
- ::decode(num_objects_dirty, bl);
- ::decode(num_whiteouts, bl);
- } else {
- num_objects_dirty = 0;
- num_whiteouts = 0;
- }
- if (struct_v >= 8) {
- ::decode(num_objects_omap, bl);
- } else {
- num_objects_omap = 0;
- }
- if (struct_v >= 9) {
- ::decode(num_objects_hit_set_archive, bl);
- } else {
- num_objects_hit_set_archive = 0;
- }
- if (struct_v >= 10) {
- ::decode(num_objects_misplaced, bl);
- } else {
- num_objects_misplaced = 0;
- }
- if (struct_v >= 11) {
- ::decode(num_bytes_hit_set_archive, bl);
- } else {
- num_bytes_hit_set_archive = 0;
- }
- if (struct_v >= 12) {
- ::decode(num_flush, bl);
- ::decode(num_flush_kb, bl);
- ::decode(num_evict, bl);
- ::decode(num_evict_kb, bl);
- ::decode(num_promote, bl);
+ ::decode(num_scrub_errors, bl);
+ ::decode(num_objects_recovered, bl);
+ ::decode(num_bytes_recovered, bl);
+ ::decode(num_keys_recovered, bl);
+ ::decode(num_shallow_scrub_errors, bl);
+ ::decode(num_deep_scrub_errors, bl);
+ ::decode(num_objects_dirty, bl);
+ ::decode(num_whiteouts, bl);
+ ::decode(num_objects_omap, bl);
+ ::decode(num_objects_hit_set_archive, bl);
+ ::decode(num_objects_misplaced, bl);
+ ::decode(num_bytes_hit_set_archive, bl);
+ ::decode(num_flush, bl);
+ ::decode(num_flush_kb, bl);
+ ::decode(num_evict, bl);
+ ::decode(num_evict_kb, bl);
+ ::decode(num_promote, bl);
+ ::decode(num_flush_mode_high, bl);
+ ::decode(num_flush_mode_low, bl);
+ ::decode(num_evict_mode_some, bl);
+ ::decode(num_evict_mode_full, bl);
+ ::decode(num_objects_pinned, bl);
+ ::decode(num_objects_missing, bl);
+ if (struct_v >= 16) {
+ ::decode(num_legacy_snapsets, bl);
} else {
- num_flush = 0;
- num_flush_kb = 0;
- num_evict = 0;
- num_evict_kb = 0;
- num_promote = 0;
- }
- if (struct_v >= 13) {
- ::decode(num_flush_mode_high, bl);
- ::decode(num_flush_mode_low, bl);
- ::decode(num_evict_mode_some, bl);
- ::decode(num_evict_mode_full, bl);
- } else {
- num_flush_mode_high = 0;
- num_flush_mode_low = 0;
- num_evict_mode_some = 0;
- num_evict_mode_full = 0;
- }
- if (struct_v >= 14) {
- ::decode(num_objects_pinned, bl);
- } else {
- num_objects_pinned = 0;
- }
- if (struct_v >= 15) {
- ::decode(num_objects_missing, bl);
- } else {
- num_objects_missing = 0;
+ num_legacy_snapsets = num_object_clones; // upper bound
}
}
DECODE_FINISH(bl);
@@ -2115,6 +2059,7 @@ void object_stat_sum_t::add(const object_stat_sum_t& o)
num_evict_mode_some += o.num_evict_mode_some;
num_evict_mode_full += o.num_evict_mode_full;
num_objects_pinned += o.num_objects_pinned;
+ num_legacy_snapsets += o.num_legacy_snapsets;
}
void object_stat_sum_t::sub(const object_stat_sum_t& o)
@@ -2153,6 +2098,7 @@ void object_stat_sum_t::sub(const object_stat_sum_t& o)
num_evict_mode_some -= o.num_evict_mode_some;
num_evict_mode_full -= o.num_evict_mode_full;
num_objects_pinned -= o.num_objects_pinned;
+ num_legacy_snapsets -= o.num_legacy_snapsets;
}
bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
@@ -2191,7 +2137,8 @@ bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
l.num_flush_mode_low == r.num_flush_mode_low &&
l.num_evict_mode_some == r.num_evict_mode_some &&
l.num_evict_mode_full == r.num_evict_mode_full &&
- l.num_objects_pinned == r.num_objects_pinned;
+ l.num_objects_pinned == r.num_objects_pinned &&
+ l.num_legacy_snapsets == r.num_legacy_snapsets;
}
// -- object_stat_collection_t --
@@ -2323,7 +2270,7 @@ void pg_stat_t::dump_brief(Formatter *f) const
void pg_stat_t::encode(bufferlist &bl) const
{
- ENCODE_START(22, 8, bl);
+ ENCODE_START(22, 22, bl);
::encode(version, bl);
::encode(reported_seq, bl);
::encode(reported_epoch, bl);
@@ -2370,7 +2317,7 @@ void pg_stat_t::encode(bufferlist &bl) const
void pg_stat_t::decode(bufferlist::iterator &bl)
{
bool tmp;
- DECODE_START_LEGACY_COMPAT_LEN(22, 8, 8, bl);
+ DECODE_START(22, bl);
::decode(version, bl);
::decode(reported_seq, bl);
::decode(reported_epoch, bl);
@@ -2378,145 +2325,45 @@ void pg_stat_t::decode(bufferlist::iterator &bl)
::decode(log_start, bl);
::decode(ondisk_log_start, bl);
::decode(created, bl);
- if (struct_v >= 7)
- ::decode(last_epoch_clean, bl);
- else
- last_epoch_clean = 0;
- if (struct_v < 6) {
- old_pg_t opgid;
- ::decode(opgid, bl);
- parent = opgid;
- } else {
- ::decode(parent, bl);
- }
+ ::decode(last_epoch_clean, bl);
+ ::decode(parent, bl);
::decode(parent_split_bits, bl);
::decode(last_scrub, bl);
::decode(last_scrub_stamp, bl);
- if (struct_v <= 4) {
- ::decode(stats.sum.num_bytes, bl);
- uint64_t num_kb;
- ::decode(num_kb, bl);
- ::decode(stats.sum.num_objects, bl);
- ::decode(stats.sum.num_object_clones, bl);
- ::decode(stats.sum.num_object_copies, bl);
- ::decode(stats.sum.num_objects_missing_on_primary, bl);
- ::decode(stats.sum.num_objects_degraded, bl);
- ::decode(log_size, bl);
- ::decode(ondisk_log_size, bl);
- if (struct_v >= 2) {
- ::decode(stats.sum.num_rd, bl);
- ::decode(stats.sum.num_rd_kb, bl);
- ::decode(stats.sum.num_wr, bl);
- ::decode(stats.sum.num_wr_kb, bl);
- }
- if (struct_v >= 3) {
- ::decode(up, bl);
- }
- if (struct_v == 4) {
- ::decode(stats.sum.num_objects_unfound, bl); // sigh.
- }
- ::decode(acting, bl);
- } else {
- ::decode(stats, bl);
- ::decode(log_size, bl);
- ::decode(ondisk_log_size, bl);
- ::decode(up, bl);
- ::decode(acting, bl);
- if (struct_v >= 9) {
- ::decode(last_fresh, bl);
- ::decode(last_change, bl);
- ::decode(last_active, bl);
- ::decode(last_clean, bl);
- ::decode(last_unstale, bl);
- ::decode(mapping_epoch, bl);
- if (struct_v >= 10) {
- ::decode(last_deep_scrub, bl);
- ::decode(last_deep_scrub_stamp, bl);
- }
- }
- }
- if (struct_v < 11) {
- stats_invalid = false;
- } else {
- ::decode(tmp, bl);
- stats_invalid = tmp;
- }
- if (struct_v >= 12) {
- ::decode(last_clean_scrub_stamp, bl);
- } else {
- last_clean_scrub_stamp = utime_t();
- }
- if (struct_v >= 13) {
- ::decode(last_became_active, bl);
- } else {
- last_became_active = last_active;
- }
- if (struct_v >= 14) {
- ::decode(tmp, bl);
- dirty_stats_invalid = tmp;
- } else {
- // if we are decoding an old encoding of this object, then the
- // encoder may not have supported num_objects_dirty accounting.
- dirty_stats_invalid = true;
- }
- if (struct_v >= 15) {
- ::decode(up_primary, bl);
- ::decode(acting_primary, bl);
- } else {
- up_primary = up.size() ? up[0] : -1;
- acting_primary = acting.size() ? acting[0] : -1;
- }
- if (struct_v >= 16) {
- ::decode(tmp, bl);
- omap_stats_invalid = tmp;
- } else {
- // if we are decoding an old encoding of this object, then the
- // encoder may not have supported num_objects_omap accounting.
- omap_stats_invalid = true;
- }
- if (struct_v >= 17) {
- ::decode(tmp, bl);
- hitset_stats_invalid = tmp;
- } else {
- // if we are decoding an old encoding of this object, then the
- // encoder may not have supported num_objects_hit_set_archive accounting.
- hitset_stats_invalid = true;
- }
- if (struct_v >= 18) {
- ::decode(blocked_by, bl);
- } else {
- blocked_by.clear();
- }
- if (struct_v >= 19) {
- ::decode(last_undegraded, bl);
- ::decode(last_fullsized, bl);
- } else {
- last_undegraded = utime_t();
- last_fullsized = utime_t();
- }
- if (struct_v >= 20) {
- ::decode(tmp, bl);
- hitset_bytes_stats_invalid = tmp;
- } else {
- // if we are decoding an old encoding of this object, then the
- // encoder may not have supported num_bytes_hit_set_archive accounting.
- hitset_bytes_stats_invalid = true;
- }
- if (struct_v >= 21) {
- ::decode(last_peered, bl);
- ::decode(last_became_peered, bl);
- } else {
- last_peered = last_active;
- last_became_peered = last_became_active;
- }
- if (struct_v >= 22) {
- ::decode(tmp, bl);
- pin_stats_invalid = tmp;
- } else {
- // if we are decoding an old encoding of this object, then the
- // encoder may not have supported num_objects_pinned accounting.
- pin_stats_invalid = true;
- }
+ ::decode(stats, bl);
+ ::decode(log_size, bl);
+ ::decode(ondisk_log_size, bl);
+ ::decode(up, bl);
+ ::decode(acting, bl);
+ ::decode(last_fresh, bl);
+ ::decode(last_change, bl);
+ ::decode(last_active, bl);
+ ::decode(last_clean, bl);
+ ::decode(last_unstale, bl);
+ ::decode(mapping_epoch, bl);
+ ::decode(last_deep_scrub, bl);
+ ::decode(last_deep_scrub_stamp, bl);
+ ::decode(tmp, bl);
+ stats_invalid = tmp;
+ ::decode(last_clean_scrub_stamp, bl);
+ ::decode(last_became_active, bl);
+ ::decode(tmp, bl);
+ dirty_stats_invalid = tmp;
+ ::decode(up_primary, bl);
+ ::decode(acting_primary, bl);
+ ::decode(tmp, bl);
+ omap_stats_invalid = tmp;
+ ::decode(tmp, bl);
+ hitset_stats_invalid = tmp;
+ ::decode(blocked_by, bl);
+ ::decode(last_undegraded, bl);
+ ::decode(last_fullsized, bl);
+ ::decode(tmp, bl);
+ hitset_bytes_stats_invalid = tmp;
+ ::decode(last_peered, bl);
+ ::decode(last_became_peered, bl);
+ ::decode(tmp, bl);
+ pin_stats_invalid = tmp;
DECODE_FINISH(bl);
}
@@ -2702,7 +2549,7 @@ void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o)
void pg_history_t::encode(bufferlist &bl) const
{
- ENCODE_START(7, 4, bl);
+ ENCODE_START(9, 4, bl);
::encode(epoch_created, bl);
::encode(last_epoch_started, bl);
::encode(last_epoch_clean, bl);
@@ -2716,12 +2563,15 @@ void pg_history_t::encode(bufferlist &bl) const
::encode(last_deep_scrub_stamp, bl);
::encode(last_clean_scrub_stamp, bl);
::encode(last_epoch_marked_full, bl);
+ ::encode(last_interval_started, bl);
+ ::encode(last_interval_clean, bl);
+ ::encode(epoch_pool_created, bl);
ENCODE_FINISH(bl);
}
void pg_history_t::decode(bufferlist::iterator &bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(7, 4, 4, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(9, 4, 4, bl);
::decode(epoch_created, bl);
::decode(last_epoch_started, bl);
if (struct_v >= 3)
@@ -2746,14 +2596,37 @@ void pg_history_t::decode(bufferlist::iterator &bl)
if (struct_v >= 7) {
::decode(last_epoch_marked_full, bl);
}
+ if (struct_v >= 8) {
+ ::decode(last_interval_started, bl);
+ ::decode(last_interval_clean, bl);
+ } else {
+ if (last_epoch_started >= same_interval_since) {
+ last_interval_started = same_interval_since;
+ } else {
+ last_interval_started = last_epoch_started; // best guess
+ }
+ if (last_epoch_clean >= same_interval_since) {
+ last_interval_clean = same_interval_since;
+ } else {
+ last_interval_clean = last_epoch_clean; // best guess
+ }
+ }
+ if (struct_v >= 9) {
+ ::decode(epoch_pool_created, bl);
+ } else {
+ epoch_pool_created = epoch_created;
+ }
DECODE_FINISH(bl);
}
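
When decoding a pre-v8 pg_history_t, the fallback above back-fills the new interval fields with a best guess: if the PG went active at or after the start of the current interval, the interval start itself is the answer; otherwise the older epoch is the safest estimate. That clamping is just a min, sketched here with a hypothetical helper:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    using epoch_t = uint32_t;

    // Mirrors the struct_v < 8 fallback branch above (illustrative only).
    epoch_t guess_interval_started(epoch_t last_epoch_started,
                                   epoch_t same_interval_since) {
      return std::min(last_epoch_started, same_interval_since);
    }

    int main() {
      assert(guess_interval_started(40, 30) == 30); // started this interval
      assert(guess_interval_started(20, 30) == 20); // best guess: older
    }
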
void pg_history_t::dump(Formatter *f) const
{
f->dump_int("epoch_created", epoch_created);
+ f->dump_int("epoch_pool_created", epoch_pool_created);
f->dump_int("last_epoch_started", last_epoch_started);
+ f->dump_int("last_interval_started", last_interval_started);
f->dump_int("last_epoch_clean", last_epoch_clean);
+ f->dump_int("last_interval_clean", last_interval_clean);
f->dump_int("last_epoch_split", last_epoch_split);
f->dump_int("last_epoch_marked_full", last_epoch_marked_full);
f->dump_int("same_up_since", same_up_since);
@@ -2771,8 +2644,11 @@ void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
o.push_back(new pg_history_t);
o.push_back(new pg_history_t);
o.back()->epoch_created = 1;
+ o.back()->epoch_pool_created = 1;
o.back()->last_epoch_started = 2;
+ o.back()->last_interval_started = 2;
o.back()->last_epoch_clean = 3;
+ o.back()->last_interval_clean = 2;
o.back()->last_epoch_split = 4;
o.back()->same_up_since = 5;
o.back()->same_interval_since = 6;
@@ -2790,7 +2666,7 @@ void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
void pg_info_t::encode(bufferlist &bl) const
{
- ENCODE_START(31, 26, bl);
+ ENCODE_START(32, 26, bl);
::encode(pgid.pgid, bl);
::encode(last_update, bl);
::encode(last_complete, bl);
@@ -2809,59 +2685,34 @@ void pg_info_t::encode(bufferlist &bl) const
::encode(pgid.shard, bl);
::encode(last_backfill, bl);
::encode(last_backfill_bitwise, bl);
+ ::encode(last_interval_started, bl);
ENCODE_FINISH(bl);
}
void pg_info_t::decode(bufferlist::iterator &bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(31, 26, 26, bl);
- if (struct_v < 23) {
- old_pg_t opgid;
- ::decode(opgid, bl);
- pgid.pgid = opgid;
- } else {
- ::decode(pgid.pgid, bl);
- }
+ DECODE_START(32, bl);
+ ::decode(pgid.pgid, bl);
::decode(last_update, bl);
::decode(last_complete, bl);
::decode(log_tail, bl);
- if (struct_v < 25) {
- bool log_backlog;
- ::decode(log_backlog, bl);
- }
- hobject_t old_last_backfill;
- if (struct_v >= 24) {
+ {
+ hobject_t old_last_backfill;
::decode(old_last_backfill, bl);
}
::decode(stats, bl);
history.decode(bl);
- if (struct_v >= 22)
- ::decode(purged_snaps, bl);
- else {
- set<snapid_t> snap_trimq;
- ::decode(snap_trimq, bl);
- }
- if (struct_v < 27) {
- last_epoch_started = history.last_epoch_started;
- } else {
- ::decode(last_epoch_started, bl);
- }
- if (struct_v >= 28)
- ::decode(last_user_version, bl);
- else
- last_user_version = last_update.version;
- if (struct_v >= 29)
- ::decode(hit_set, bl);
- if (struct_v >= 30)
- ::decode(pgid.shard, bl);
- else
- pgid.shard = shard_id_t::NO_SHARD;
- if (struct_v >= 31) {
- ::decode(last_backfill, bl);
- ::decode(last_backfill_bitwise, bl);
+ ::decode(purged_snaps, bl);
+ ::decode(last_epoch_started, bl);
+ ::decode(last_user_version, bl);
+ ::decode(hit_set, bl);
+ ::decode(pgid.shard, bl);
+ ::decode(last_backfill, bl);
+ ::decode(last_backfill_bitwise, bl);
+ if (struct_v >= 32) {
+ ::decode(last_interval_started, bl);
} else {
- last_backfill = old_last_backfill;
- last_backfill_bitwise = false;
+ last_interval_started = last_epoch_started;
}
DECODE_FINISH(bl);
}
@@ -2933,7 +2784,7 @@ void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
// -- pg_notify_t --
void pg_notify_t::encode(bufferlist &bl) const
{
- ENCODE_START(2, 1, bl);
+ ENCODE_START(2, 2, bl);
::encode(query_epoch, bl);
::encode(epoch_sent, bl);
::encode(info, bl);
@@ -2948,13 +2799,8 @@ void pg_notify_t::decode(bufferlist::iterator &bl)
::decode(query_epoch, bl);
::decode(epoch_sent, bl);
::decode(info, bl);
- if (struct_v >= 2) {
- ::decode(to, bl);
- ::decode(from, bl);
- } else {
- to = shard_id_t::NO_SHARD;
- from = shard_id_t::NO_SHARD;
- }
+ ::decode(to, bl);
+ ::decode(from, bl);
DECODE_FINISH(bl);
}
@@ -2979,9 +2825,9 @@ void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
{
- lhs << "(query_epoch:" << notify.query_epoch
- << ", epoch_sent:" << notify.epoch_sent
- << ", info:" << notify.info;
+ lhs << "(query:" << notify.query_epoch
+ << " sent:" << notify.epoch_sent
+ << " " << notify.info;
if (notify.from != shard_id_t::NO_SHARD ||
notify.to != shard_id_t::NO_SHARD)
lhs << " " << (unsigned)notify.from
@@ -2991,7 +2837,7 @@ ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
// -- pg_interval_t --
-void pg_interval_t::encode(bufferlist& bl) const
+void PastIntervals::pg_interval_t::encode(bufferlist& bl) const
{
ENCODE_START(4, 2, bl);
::encode(first, bl);
@@ -3004,7 +2850,7 @@ void pg_interval_t::encode(bufferlist& bl) const
ENCODE_FINISH(bl);
}
-void pg_interval_t::decode(bufferlist::iterator& bl)
+void PastIntervals::pg_interval_t::decode(bufferlist::iterator& bl)
{
DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
::decode(first, bl);
@@ -3027,7 +2873,7 @@ void pg_interval_t::decode(bufferlist::iterator& bl)
DECODE_FINISH(bl);
}
-void pg_interval_t::dump(Formatter *f) const
+void PastIntervals::pg_interval_t::dump(Formatter *f) const
{
f->dump_unsigned("first", first);
f->dump_unsigned("last", last);
@@ -3044,7 +2890,7 @@ void pg_interval_t::dump(Formatter *f) const
f->dump_int("up_primary", up_primary);
}
-void pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
+void PastIntervals::pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
{
o.push_back(new pg_interval_t);
o.push_back(new pg_interval_t);
@@ -3056,7 +2902,471 @@ void pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
o.back()->maybe_went_rw = true;
}
-bool pg_interval_t::is_new_interval(
+WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t)
+
+class pi_simple_rep : public PastIntervals::interval_rep {
+ map<epoch_t, PastIntervals::pg_interval_t> interval_map;
+
+ pi_simple_rep(
+ bool ec_pool,
+ std::list<PastIntervals::pg_interval_t> &&intervals) {
+ for (auto &&i: intervals)
+ add_interval(ec_pool, i);
+ }
+
+public:
+ pi_simple_rep() = default;
+ pi_simple_rep(const pi_simple_rep &) = default;
+ pi_simple_rep(pi_simple_rep &&) = default;
+ pi_simple_rep &operator=(pi_simple_rep &&) = default;
+ pi_simple_rep &operator=(const pi_simple_rep &) = default;
+
+ size_t size() const override { return interval_map.size(); }
+ bool empty() const override { return interval_map.empty(); }
+ void clear() override { interval_map.clear(); }
+ pair<epoch_t, epoch_t> get_bounds() const override {
+ auto iter = interval_map.begin();
+ if (iter != interval_map.end()) {
+ auto riter = interval_map.rbegin();
+ return make_pair(
+ iter->second.first,
+ riter->second.last + 1);
+ } else {
+ return make_pair(0, 0);
+ }
+ }
+ set<pg_shard_t> get_all_participants(
+ bool ec_pool) const override {
+ set<pg_shard_t> all_participants;
+
+ // We need to decide who might have unfound objects that we need to recover
+ auto p = interval_map.rbegin();
+ auto end = interval_map.rend();
+ for (; p != end; ++p) {
+ const PastIntervals::pg_interval_t &interval(p->second);
+ // If nothing changed, we don't care about this interval.
+ if (!interval.maybe_went_rw)
+ continue;
+
+ int i = 0;
+ std::vector<int>::const_iterator a = interval.acting.begin();
+ std::vector<int>::const_iterator a_end = interval.acting.end();
+ for (; a != a_end; ++a, ++i) {
+ pg_shard_t shard(*a, ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD);
+ if (*a != CRUSH_ITEM_NONE)
+ all_participants.insert(shard);
+ }
+ }
+ return all_participants;
+ }
+ void add_interval(
+ bool ec_pool,
+ const PastIntervals::pg_interval_t &interval) override {
+ interval_map[interval.first] = interval;
+ }
+ unique_ptr<PastIntervals::interval_rep> clone() const override {
+ return unique_ptr<PastIntervals::interval_rep>(new pi_simple_rep(*this));
+ }
+ ostream &print(ostream &out) const override {
+ return out << interval_map;
+ }
+ void encode(bufferlist &bl) const override {
+ ::encode(interval_map, bl);
+ }
+ void decode(bufferlist::iterator &bl) override {
+ ::decode(interval_map, bl);
+ }
+ void dump(Formatter *f) const override {
+ f->open_array_section("PastIntervals::compat_rep");
+ for (auto &&i: interval_map) {
+ f->open_object_section("pg_interval_t");
+ f->dump_int("epoch", i.first);
+ f->open_object_section("interval");
+ i.second.dump(f);
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ }
+ bool is_classic() const override {
+ return true;
+ }
+ static void generate_test_instances(list<pi_simple_rep*> &o) {
+ using ival = PastIntervals::pg_interval_t;
+ using ivallst = std::list<ival>;
+ o.push_back(
+ new pi_simple_rep(
+ true, ivallst
+ { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
+ , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
+ , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
+ , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
+ }));
+ o.push_back(
+ new pi_simple_rep(
+ false, ivallst
+ { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
+ , ival{{ 1, 2}, { 1, 2}, 20, 30, true, 1, 1}
+ , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
+ , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
+ }));
+ o.push_back(
+ new pi_simple_rep(
+ true, ivallst
+ { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
+ , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
+ , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
+ , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
+ }));
+ }
+ void iterate_mayberw_back_to(
+ bool ec_pool,
+ epoch_t les,
+ std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
+ for (auto i = interval_map.rbegin(); i != interval_map.rend(); ++i) {
+ if (!i->second.maybe_went_rw)
+ continue;
+ if (i->second.last < les)
+ break;
+ set<pg_shard_t> actingset;
+ for (unsigned j = 0; j < i->second.acting.size(); ++j) {
+ if (i->second.acting[j] == CRUSH_ITEM_NONE)
+ continue;
+ actingset.insert(
+ pg_shard_t(
+ i->second.acting[j],
+ ec_pool ? shard_id_t(j) : shard_id_t::NO_SHARD));
+ }
+ f(i->second.first, actingset);
+ }
+ }
+
+ bool has_full_intervals() const override { return true; }
+ void iterate_all_intervals(
+ std::function<void(const PastIntervals::pg_interval_t &)> &&f
+ ) const override {
+ for (auto &&i: interval_map) {
+ f(i.second);
+ }
+ }
+ virtual ~pi_simple_rep() override {}
+};
+
+/**
+ * pi_compact_rep
+ *
+ * PastIntervals only needs to be able to answer two questions:
+ * 1) Where should the primary look for unfound objects?
+ * 2) List a set of subsets of the OSDs such that contacting at least
+ * one from each subset guarantees we speak to at least one witness
+ * of any completed write.
+ *
+ * Crucially, 2) does not require keeping *all* past intervals. Certainly,
+ * we don't need to keep any where maybe_went_rw would be false. We also
+ * needn't keep two intervals where the actingset in one is a subset
+ * of the other (only need to keep the smaller of the two sets). In order
+ * to accurately trim the set of intervals as last_epoch_started changes
+ * without rebuilding the set from scratch, we'll retain the larger set
+ * if it is in an older interval. (A concrete sketch of the subset
+ * test follows compact_interval_t below.)
+ */
+struct compact_interval_t {
+ epoch_t first;
+ epoch_t last;
+ set<pg_shard_t> acting;
+ bool supersedes(const compact_interval_t &other) {
+ for (auto &&i: acting) {
+ if (!other.acting.count(i))
+ return false;
+ }
+ return true;
+ }
+ void dump(Formatter *f) const {
+ f->open_object_section("compact_interval_t");
+ f->dump_stream("first") << first;
+ f->dump_stream("last") << last;
+ f->dump_stream("acting") << acting;
+ f->close_section();
+ }
+ void encode(bufferlist &bl) const {
+ ENCODE_START(1, 1, bl);
+ ::encode(first, bl);
+ ::encode(last, bl);
+ ::encode(acting, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::iterator &bl) {
+ DECODE_START(1, bl);
+ ::decode(first, bl);
+ ::decode(last, bl);
+ ::decode(acting, bl);
+ DECODE_FINISH(bl);
+ }
+ static void generate_test_instances(list<compact_interval_t*> & o) {
+ /* Not going to be used, we'll generate pi_compact_rep directly */
+ }
+};
+ostream &operator<<(ostream &o, const compact_interval_t &rhs)
+{
+ return o << "([" << rhs.first << "," << rhs.last
+ << "] acting " << rhs.acting << ")";
+}
+WRITE_CLASS_ENCODER(compact_interval_t)
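To make the superseding rule from the pi_compact_rep comment above concrete, here is a self-contained sketch (plain STL, with values borrowed from the test instances) of the subset test that lets a newer interval displace an older one:

#include <algorithm>
#include <cassert>
#include <set>

// a.supersedes(b) in compact_interval_t terms: true iff a's acting set
// is a subset of b's, so probing one shard of a implies probing one of b
static bool supersedes(const std::set<int> &a, const std::set<int> &b) {
  return std::includes(b.begin(), b.end(), a.begin(), a.end());
}

int main() {
  std::set<int> older{0, 1, 2};       // acting during epochs [10,20]
  std::set<int> newer{1, 2};          // acting during epochs [21,30]
  assert(supersedes(newer, older));   // newer displaces older
  assert(!supersedes(older, newer));  // the reverse does not hold
}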
+
+class pi_compact_rep : public PastIntervals::interval_rep {
+ epoch_t first = 0;
+ epoch_t last = 0; // inclusive
+ set<pg_shard_t> all_participants;
+ list<compact_interval_t> intervals;
+ pi_compact_rep(
+ bool ec_pool,
+ std::list<PastIntervals::pg_interval_t> &&intervals) {
+ for (auto &&i: intervals)
+ add_interval(ec_pool, i);
+ }
+public:
+ pi_compact_rep() = default;
+ pi_compact_rep(const pi_compact_rep &) = default;
+ pi_compact_rep(pi_compact_rep &&) = default;
+ pi_compact_rep &operator=(const pi_compact_rep &) = default;
+ pi_compact_rep &operator=(pi_compact_rep &&) = default;
+
+ size_t size() const override { return intervals.size(); }
+ bool empty() const override {
+ return first > last || (first == 0 && last == 0);
+ }
+ void clear() override {
+ *this = pi_compact_rep();
+ }
+ pair<epoch_t, epoch_t> get_bounds() const override {
+ return make_pair(first, last + 1);
+ }
+ set<pg_shard_t> get_all_participants(
+ bool ec_pool) const override {
+ return all_participants;
+ }
+ void add_interval(
+ bool ec_pool, const PastIntervals::pg_interval_t &interval) override {
+ if (first == 0)
+ first = interval.first;
+ assert(interval.last > last);
+ last = interval.last;
+ set<pg_shard_t> acting;
+ for (unsigned i = 0; i < interval.acting.size(); ++i) {
+ if (interval.acting[i] == CRUSH_ITEM_NONE)
+ continue;
+ acting.insert(
+ pg_shard_t(
+ interval.acting[i],
+ ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ all_participants.insert(acting.begin(), acting.end());
+ if (!interval.maybe_went_rw)
+ return;
+ intervals.push_back(
+ compact_interval_t{interval.first, interval.last, acting});
+ auto plast = intervals.end();
+ --plast;
+ for (auto cur = intervals.begin(); cur != plast; ) {
+ if (plast->supersedes(*cur)) {
+ intervals.erase(cur++);
+ } else {
+ ++cur;
+ }
+ }
+ }
+ unique_ptr<PastIntervals::interval_rep> clone() const override {
+ return unique_ptr<PastIntervals::interval_rep>(new pi_compact_rep(*this));
+ }
+ ostream &print(ostream &out) const override {
+ return out << "([" << first << "," << last
+ << "] intervals=" << intervals << ")";
+ }
+ void encode(bufferlist &bl) const override {
+ ENCODE_START(1, 1, bl);
+ ::encode(first, bl);
+ ::encode(last, bl);
+ ::encode(all_participants, bl);
+ ::encode(intervals, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::iterator &bl) override {
+ DECODE_START(1, bl);
+ ::decode(first, bl);
+ ::decode(last, bl);
+ ::decode(all_participants, bl);
+ ::decode(intervals, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(Formatter *f) const override {
+ f->open_object_section("PastIntervals::compact_rep");
+ f->dump_stream("first") << first;
+ f->dump_stream("last") << last;
+ f->open_array_section("all_participants");
+ for (auto& i : all_participants) {
+ f->dump_object("pg_shard", i);
+ }
+ f->close_section();
+ f->open_array_section("intervals");
+ for (auto &&i: intervals) {
+ i.dump(f);
+ }
+ f->close_section();
+ f->close_section();
+ }
+ bool is_classic() const override {
+ return false;
+ }
+ static void generate_test_instances(list<pi_compact_rep*> &o) {
+ using ival = PastIntervals::pg_interval_t;
+ using ivallst = std::list<ival>;
+ o.push_back(
+ new pi_compact_rep(
+ true, ivallst
+ { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
+ , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
+ , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
+ , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
+ }));
+ o.push_back(
+ new pi_compact_rep(
+ false, ivallst
+ { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
+ , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
+ , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
+ , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
+ }));
+ o.push_back(
+ new pi_compact_rep(
+ true, ivallst
+ { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
+ , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
+ , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
+ , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
+ }));
+ }
+ void iterate_mayberw_back_to(
+ bool ec_pool,
+ epoch_t les,
+ std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
+ for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) {
+ if (i->last < les)
+ break;
+ f(i->first, i->acting);
+ }
+ }
+ virtual ~pi_compact_rep() override {}
+};
+WRITE_CLASS_ENCODER(pi_compact_rep)
+
+PastIntervals::PastIntervals(const PastIntervals &rhs)
+ : past_intervals(rhs.past_intervals ?
+ rhs.past_intervals->clone() :
+ nullptr) {}
+
+PastIntervals &PastIntervals::operator=(const PastIntervals &rhs)
+{
+ PastIntervals other(rhs);
+ swap(other);
+ return *this;
+}
+
+ostream& operator<<(ostream& out, const PastIntervals &i)
+{
+ if (i.past_intervals) {
+ return i.past_intervals->print(out);
+ } else {
+ return out << "(empty)";
+ }
+}
+
+ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i)
+{
+ return out << "PriorSet("
+ << "ec_pool: " << i.ec_pool
+ << ", probe: " << i.probe
+ << ", down: " << i.down
+ << ", blocked_by: " << i.blocked_by
+ << ", pg_down: " << i.pg_down
+ << ")";
+}
+
+void PastIntervals::decode(bufferlist::iterator &bl)
+{
+ DECODE_START(1, bl);
+ __u8 type = 0;
+ ::decode(type, bl);
+ switch (type) {
+ case 0:
+ break;
+ case 1:
+ past_intervals.reset(new pi_simple_rep);
+ past_intervals->decode(bl);
+ break;
+ case 2:
+ past_intervals.reset(new pi_compact_rep);
+ past_intervals->decode(bl);
+ break;
+ }
+ DECODE_FINISH(bl);
+}
+
+void PastIntervals::decode_classic(bufferlist::iterator &bl)
+{
+ past_intervals.reset(new pi_simple_rep);
+ past_intervals->decode(bl);
+}
+
+void PastIntervals::generate_test_instances(list<PastIntervals*> &o)
+{
+ {
+ list<pi_simple_rep *> simple;
+ pi_simple_rep::generate_test_instances(simple);
+ for (auto &&i: simple) {
+ // takes ownership of contents
+ o.push_back(new PastIntervals(i));
+ }
+ }
+ {
+ list<pi_compact_rep *> compact;
+ pi_compact_rep::generate_test_instances(compact);
+ for (auto &&i: compact) {
+ // takes ownership of contents
+ o.push_back(new PastIntervals(i));
+ }
+ }
+}
+
+void PastIntervals::update_type(bool ec_pool, bool compact)
+{
+ if (!compact) {
+ if (!past_intervals) {
+ past_intervals.reset(new pi_simple_rep);
+ } else {
+ // we never convert from compact back to classic
+ assert(is_classic());
+ }
+ } else {
+ if (!past_intervals) {
+ past_intervals.reset(new pi_compact_rep);
+ } else if (is_classic()) {
+ auto old = std::move(past_intervals);
+ past_intervals.reset(new pi_compact_rep);
+ assert(old->has_full_intervals());
+ old->iterate_all_intervals([&](const pg_interval_t &i) {
+ past_intervals->add_interval(ec_pool, i);
+ });
+ }
+ }
+}
+
+void PastIntervals::update_type_from_map(bool ec_pool, const OSDMap &osdmap)
+{
+ update_type(ec_pool, osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS));
+}
+
+bool PastIntervals::is_new_interval(
int old_acting_primary,
int new_acting_primary,
const vector<int> &old_acting,
@@ -3084,7 +3394,7 @@ bool pg_interval_t::is_new_interval(
old_sort_bitwise != new_sort_bitwise;
}
-bool pg_interval_t::is_new_interval(
+bool PastIntervals::is_new_interval(
int old_acting_primary,
int new_acting_primary,
const vector<int> &old_acting,
@@ -3116,7 +3426,7 @@ bool pg_interval_t::is_new_interval(
pgid);
}
-bool pg_interval_t::check_new_interval(
+bool PastIntervals::check_new_interval(
int old_acting_primary,
int new_acting_primary,
const vector<int> &old_acting,
@@ -3131,7 +3441,7 @@ bool pg_interval_t::check_new_interval(
OSDMapRef lastmap,
pg_t pgid,
IsPGRecoverablePredicate *could_have_gone_active,
- map<epoch_t, pg_interval_t> *past_intervals,
+ PastIntervals *past_intervals,
std::ostream *out)
{
/*
@@ -3179,6 +3489,8 @@ bool pg_interval_t::check_new_interval(
// NOTE: a change in the up set primary triggers an interval
// change, even though the interval members in the pg_interval_t
// do not change.
+ assert(past_intervals);
+ assert(past_intervals->past_intervals);
if (is_new_interval(
old_acting_primary,
new_acting_primary,
@@ -3191,7 +3503,7 @@ bool pg_interval_t::check_new_interval(
osdmap,
lastmap,
pgid)) {
- pg_interval_t& i = (*past_intervals)[same_interval_since];
+ pg_interval_t i;
i.first = same_interval_since;
i.last = osdmap->get_epoch() - 1;
assert(i.first <= i.last);
@@ -3260,13 +3572,74 @@ bool pg_interval_t::check_new_interval(
if (out)
*out << __func__ << " " << i << " : acting set is too small" << std::endl;
}
+ past_intervals->past_intervals->add_interval(old_pg_pool.ec_pool(), i);
return true;
} else {
return false;
}
}
-ostream& operator<<(ostream& out, const pg_interval_t& i)
+
+// true if the given map affects the prior set
+bool PastIntervals::PriorSet::affected_by_map(
+ const OSDMap &osdmap,
+ const DoutPrefixProvider *dpp) const
+{
+ for (set<pg_shard_t>::iterator p = probe.begin();
+ p != probe.end();
+ ++p) {
+ int o = p->osd;
+
+ // did someone in the prior set go down?
+ if (osdmap.is_down(o) && down.count(o) == 0) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl;
+ return true;
+ }
+
+ // did a down osd in the probe set get (re)marked as lost?
+ map<int, epoch_t>::const_iterator r = blocked_by.find(o);
+ if (r != blocked_by.end()) {
+ if (!osdmap.exists(o)) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
+ return true;
+ }
+ if (osdmap.get_info(o).lost_at != r->second) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
+ return true;
+ }
+ }
+ }
+
+ // did someone in the prior down set go up?
+ for (set<int>::const_iterator p = down.begin();
+ p != down.end();
+ ++p) {
+ int o = *p;
+
+ if (osdmap.is_up(o)) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl;
+ return true;
+ }
+
+ // did someone in the prior set get lost or destroyed?
+ if (!osdmap.exists(o)) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
+ return true;
+ }
+ // did an osd in the down set get (re)marked as lost?
+ map<int, epoch_t>::const_iterator r = blocked_by.find(o);
+ if (r != blocked_by.end()) {
+ if (osdmap.get_info(o).lost_at != r->second) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i)
{
out << "interval(" << i.first << "-" << i.last
<< " up " << i.up << "(" << i.up_primary << ")"
@@ -3282,7 +3655,7 @@ ostream& operator<<(ostream& out, const pg_interval_t& i)
// -- pg_query_t --
void pg_query_t::encode(bufferlist &bl, uint64_t features) const {
- ENCODE_START(3, 2, bl);
+ ENCODE_START(3, 3, bl);
::encode(type, bl);
::encode(since, bl);
history.encode(bl);
@@ -3293,27 +3666,14 @@ void pg_query_t::encode(bufferlist &bl, uint64_t features) const {
}
void pg_query_t::decode(bufferlist::iterator &bl) {
- bufferlist::iterator bl2 = bl;
- try {
- DECODE_START(3, bl);
- ::decode(type, bl);
- ::decode(since, bl);
- history.decode(bl);
- ::decode(epoch_sent, bl);
- if (struct_v >= 3) {
- ::decode(to, bl);
- ::decode(from, bl);
- } else {
- to = shard_id_t::NO_SHARD;
- from = shard_id_t::NO_SHARD;
- }
- DECODE_FINISH(bl);
- } catch (...) {
- bl = bl2;
- ::decode(type, bl);
- ::decode(since, bl);
- history.decode(bl);
- }
+ DECODE_START(3, bl);
+ ::decode(type, bl);
+ ::decode(since, bl);
+ history.decode(bl);
+ ::decode(epoch_sent, bl);
+ ::decode(to, bl);
+ ::decode(from, bl);
+ DECODE_FINISH(bl);
}
void pg_query_t::dump(Formatter *f) const
@@ -4340,25 +4700,31 @@ void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
void SnapSet::encode(bufferlist& bl) const
{
- ENCODE_START(2, 2, bl);
+ ENCODE_START(3, 2, bl);
::encode(seq, bl);
::encode(head_exists, bl);
::encode(snaps, bl);
::encode(clones, bl);
::encode(clone_overlap, bl);
::encode(clone_size, bl);
+ ::encode(clone_snaps, bl);
ENCODE_FINISH(bl);
}
void SnapSet::decode(bufferlist::iterator& bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
::decode(seq, bl);
::decode(head_exists, bl);
::decode(snaps, bl);
::decode(clones, bl);
::decode(clone_overlap, bl);
::decode(clone_size, bl);
+ if (struct_v >= 3) {
+ ::decode(clone_snaps, bl);
+ } else {
+ clone_snaps.clear();
+ }
DECODE_FINISH(bl);
}
@@ -4375,6 +4741,14 @@ void SnapSet::dump(Formatter *f) const
f->dump_unsigned("snap", *p);
f->dump_unsigned("size", clone_size.find(*p)->second);
f->dump_stream("overlap") << clone_overlap.find(*p)->second;
+ auto q = clone_snaps.find(*p);
+ if (q != clone_snaps.end()) {
+ f->open_array_section("snaps");
+ for (auto s : q->second) {
+ f->dump_unsigned("snap", s);
+ }
+ f->close_section();
+ }
f->close_section();
}
f->close_section();
@@ -4396,16 +4770,26 @@ void SnapSet::generate_test_instances(list<SnapSet*>& o)
o.back()->clones.push_back(12);
o.back()->clone_size[12] = 12345;
o.back()->clone_overlap[12];
+ o.back()->clone_snaps[12] = {12, 10, 8};
}
ostream& operator<<(ostream& out, const SnapSet& cs)
{
- return out << cs.seq << "=" << cs.snaps << ":"
- << cs.clones
- << (cs.head_exists ? "+head":"");
+ if (cs.is_legacy()) {
+ out << cs.seq << "=" << cs.snaps << ":"
+ << cs.clones
+ << (cs.head_exists ? "+head":"");
+ if (!cs.clone_snaps.empty()) {
+ out << "+stray_clone_snaps=" << cs.clone_snaps;
+ }
+ return out;
+ } else {
+ return out << cs.seq << "=" << cs.snaps << ":"
+ << cs.clone_snaps;
+ }
}
-void SnapSet::from_snap_set(const librados::snap_set_t& ss)
+void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy)
{
// NOTE: our reconstruction of snaps (and the snapc) is not strictly
// correct: it will not include snaps that still logically exist
@@ -4431,6 +4815,13 @@ void SnapSet::from_snap_set(const librados::snap_set_t& ss)
for (vector<pair<uint64_t, uint64_t> >::const_iterator q =
p->overlap.begin(); q != p->overlap.end(); ++q)
clone_overlap[p->cloneid].insert(q->first, q->second);
+ if (!legacy) {
+ // p->snaps is ascending; clone_snaps is descending
+ vector<snapid_t>& v = clone_snaps[p->cloneid];
+ for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) {
+ v.push_back(*q);
+ }
+ }
}
}
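The ordering flip above (ascending from librados, descending in SnapSet) is subtle enough to deserve a standalone check; a minimal sketch with hypothetical snap ids:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // librados::snap_set_t reports a clone's snaps in ascending order ...
  std::vector<uint64_t> from_librados{8, 10, 12};
  // ... while SnapSet::clone_snaps stores them descending (newest first)
  std::vector<uint64_t> clone_snaps(from_librados.rbegin(),
                                    from_librados.rend());
  assert((clone_snaps == std::vector<uint64_t>{12, 10, 8}));
}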
@@ -4588,7 +4979,7 @@ void object_info_t::encode(bufferlist& bl, uint64_t features) const
if (soid.snap == CEPH_NOSNAP)
::encode(osd_reqid_t(), bl); // used to be wrlock_by
else
- ::encode(snaps, bl);
+ ::encode(legacy_snaps, bl);
::encode(truncate_seq, bl);
::encode(truncate_size, bl);
::encode(is_lost(), bl);
@@ -4630,7 +5021,7 @@ void object_info_t::decode(bufferlist::iterator& bl)
osd_reqid_t wrlock_by;
::decode(wrlock_by, bl);
} else {
- ::decode(snaps, bl);
+ ::decode(legacy_snaps, bl);
}
::decode(truncate_seq, bl);
::decode(truncate_size, bl);
@@ -4712,9 +5103,10 @@ void object_info_t::dump(Formatter *f) const
f->dump_stream("local_mtime") << local_mtime;
f->dump_unsigned("lost", (int)is_lost());
f->dump_unsigned("flags", (int)flags);
- f->open_array_section("snaps");
- for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p)
- f->dump_unsigned("snap", *p);
+ f->open_array_section("legacy_snaps");
+ for (auto s : legacy_snaps) {
+ f->dump_unsigned("snap", s);
+ }
f->close_section();
f->dump_unsigned("truncate_seq", truncate_seq);
f->dump_unsigned("truncate_size", truncate_size);
@@ -4747,8 +5139,8 @@ ostream& operator<<(ostream& out, const object_info_t& oi)
{
out << oi.soid << "(" << oi.version
<< " " << oi.last_reqid;
- if (oi.soid.snap != CEPH_NOSNAP)
- out << " " << oi.snaps;
+ if (oi.soid.snap != CEPH_NOSNAP && !oi.legacy_snaps.empty())
+ out << " " << oi.legacy_snaps;
if (oi.flags)
out << " " << oi.get_flag_string();
out << " s " << oi.size;
@@ -4912,6 +5304,7 @@ ostream &ObjectRecoveryInfo::print(ostream &out) const
<< ", size: " << size
<< ", copy_subset: " << copy_subset
<< ", clone_subset: " << clone_subset
+ << ", snapset: " << ss
<< ")";
}
@@ -5229,14 +5622,14 @@ void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
void ScrubMap::object::encode(bufferlist& bl) const
{
bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
- ENCODE_START(8, 2, bl);
+ ENCODE_START(8, 7, bl);
::encode(size, bl);
::encode(negative, bl);
::encode(attrs, bl);
::encode(digest, bl);
::encode(digest_present, bl);
- ::encode(nlinks, bl);
- ::encode(snapcolls, bl);
+ ::encode((uint32_t)0, bl); // obsolete nlinks
+ ::encode((uint32_t)0, bl); // snapcolls
::encode(omap_digest, bl);
::encode(omap_digest_present, bl);
::encode(compat_read_error, bl);
@@ -5249,37 +5642,27 @@ void ScrubMap::object::encode(bufferlist& bl) const
void ScrubMap::object::decode(bufferlist::iterator& bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(8, 2, 2, bl);
+ DECODE_START(8, bl);
::decode(size, bl);
bool tmp, compat_read_error = false;
::decode(tmp, bl);
negative = tmp;
::decode(attrs, bl);
- if (struct_v >= 3) {
- ::decode(digest, bl);
- ::decode(tmp, bl);
- digest_present = tmp;
- }
- if (struct_v >= 4) {
+ ::decode(digest, bl);
+ ::decode(tmp, bl);
+ digest_present = tmp;
+ {
+ uint32_t nlinks;
::decode(nlinks, bl);
+ set<snapid_t> snapcolls;
::decode(snapcolls, bl);
- } else {
- /* Indicates that encoder was not aware of this field since stat must
- * return nlink >= 1 */
- nlinks = 0;
- }
- if (struct_v >= 5) {
- ::decode(omap_digest, bl);
- ::decode(tmp, bl);
- omap_digest_present = tmp;
- }
- if (struct_v >= 6) {
- ::decode(compat_read_error, bl);
- }
- if (struct_v >= 7) {
- ::decode(tmp, bl);
- stat_error = tmp;
}
+ ::decode(omap_digest, bl);
+ ::decode(tmp, bl);
+ omap_digest_present = tmp;
+ ::decode(compat_read_error, bl);
+ ::decode(tmp, bl);
+ stat_error = tmp;
if (struct_v >= 8) {
::decode(tmp, bl);
read_error = tmp;
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 514bdabe28b..1cea9a6ba36 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -139,6 +139,12 @@ struct pg_shard_t {
}
void encode(bufferlist &bl) const;
void decode(bufferlist::iterator &bl);
+ void dump(Formatter *f) const {
+ f->dump_unsigned("osd", osd);
+ if (shard != shard_id_t::NO_SHARD) {
+ f->dump_unsigned("shard", shard);
+ }
+ }
};
WRITE_CLASS_ENCODER(pg_shard_t)
WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
@@ -1611,6 +1617,7 @@ struct object_stat_sum_t {
int32_t num_evict_mode_full; // 1 when in evict full mode, otherwise 0
int64_t num_objects_pinned;
int64_t num_objects_missing;
+ int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
object_stat_sum_t()
: num_bytes(0),
@@ -1638,7 +1645,8 @@ struct object_stat_sum_t {
num_flush_mode_high(0), num_flush_mode_low(0),
num_evict_mode_some(0), num_evict_mode_full(0),
num_objects_pinned(0),
- num_objects_missing(0)
+ num_objects_missing(0),
+ num_legacy_snapsets(0)
{}
void floor(int64_t f) {
@@ -1677,6 +1685,7 @@ struct object_stat_sum_t {
FLOOR(num_evict_mode_some);
FLOOR(num_evict_mode_full);
FLOOR(num_objects_pinned);
+ FLOOR(num_legacy_snapsets);
#undef FLOOR
}
@@ -1687,7 +1696,14 @@ struct object_stat_sum_t {
if (i < (PARAM % out.size())) { \
out[i].PARAM++; \
} \
- } \
+ }
+#define SPLIT_PRESERVE_NONZERO(PARAM) \
+ for (unsigned i = 0; i < out.size(); ++i) { \
+ if (PARAM) \
+ out[i].PARAM = 1 + PARAM / out.size(); \
+ else \
+ out[i].PARAM = 0; \
+ }
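A worked example of the difference between the two split policies, with hypothetical values: splitting num_legacy_snapsets = 5 across 4 children with plain SPLIT yields {2, 1, 1, 1} and preserves the total, while SPLIT_PRESERVE_NONZERO yields {2, 2, 2, 2}. The sum over-counts, but since the field is only an upper bound, no child that may hold legacy SnapSets ever reads as zero:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const int64_t parent = 5;          // hypothetical num_legacy_snapsets
  std::vector<int64_t> out(4);       // stats split across 4 child PGs

  // SPLIT: exact partition, remainder goes to the first children
  for (size_t i = 0; i < out.size(); ++i) {
    out[i] = parent / (int64_t)out.size();
    if (i < (size_t)(parent % (int64_t)out.size()))
      out[i]++;
  }
  assert(out[0] + out[1] + out[2] + out[3] == parent);   // {2,1,1,1}

  // SPLIT_PRESERVE_NONZERO: keep every child's upper bound nonzero
  for (size_t i = 0; i < out.size(); ++i)
    out[i] = parent ? 1 + parent / (int64_t)out.size() : 0;
  assert(out[0] + out[1] + out[2] + out[3] >= parent);   // {2,2,2,2}
}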
SPLIT(num_bytes);
SPLIT(num_objects);
@@ -1723,7 +1739,9 @@ struct object_stat_sum_t {
SPLIT(num_evict_mode_some);
SPLIT(num_evict_mode_full);
SPLIT(num_objects_pinned);
+ SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
#undef SPLIT
+#undef SPLIT_PRESERVE_NONZERO
}
void clear() {
@@ -1778,7 +1796,8 @@ struct object_stat_sum_t {
sizeof(num_evict_mode_some) +
sizeof(num_evict_mode_full) +
sizeof(num_objects_pinned) +
- sizeof(num_objects_missing)
+ sizeof(num_objects_missing) +
+ sizeof(num_legacy_snapsets)
,
"object_stat_sum_t have padding");
}
@@ -2085,10 +2104,15 @@ WRITE_CLASS_ENCODER(pg_hit_set_history_t)
* history they need to worry about.
*/
struct pg_history_t {
- epoch_t epoch_created; // epoch in which PG was created
+ epoch_t epoch_created; // epoch in which *pg* was created (pool or pg)
+ epoch_t epoch_pool_created; // epoch in which *pool* was created
+ // (note: may be pg creation epoch for
+ // pre-luminous clusters)
epoch_t last_epoch_started; // lower bound on last epoch started (anywhere, not necessarily locally)
+ epoch_t last_interval_started; // first epoch of last_epoch_started interval
epoch_t last_epoch_clean; // lower bound on last epoch the PG was completely clean.
- epoch_t last_epoch_split; // as parent
+ epoch_t last_interval_clean; // first epoch of last_epoch_clean interval
+ epoch_t last_epoch_split; // as parent or child
epoch_t last_epoch_marked_full; // pool or cluster
/**
@@ -2111,8 +2135,11 @@ struct pg_history_t {
friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
return
l.epoch_created == r.epoch_created &&
+ l.epoch_pool_created == r.epoch_pool_created &&
l.last_epoch_started == r.last_epoch_started &&
+ l.last_interval_started == r.last_interval_started &&
l.last_epoch_clean == r.last_epoch_clean &&
+ l.last_interval_clean == r.last_interval_clean &&
l.last_epoch_split == r.last_epoch_split &&
l.last_epoch_marked_full == r.last_epoch_marked_full &&
l.same_up_since == r.same_up_since &&
@@ -2127,7 +2154,12 @@ struct pg_history_t {
pg_history_t()
: epoch_created(0),
- last_epoch_started(0), last_epoch_clean(0), last_epoch_split(0),
+ epoch_pool_created(0),
+ last_epoch_started(0),
+ last_interval_started(0),
+ last_epoch_clean(0),
+ last_interval_clean(0),
+ last_epoch_split(0),
last_epoch_marked_full(0),
same_up_since(0), same_interval_since(0), same_primary_since(0) {}
@@ -2138,14 +2170,28 @@ struct pg_history_t {
epoch_created = other.epoch_created;
modified = true;
}
+ if (epoch_pool_created < other.epoch_pool_created) {
+ // FIXME: for jewel compat only; this should either be 0 or always the
+ // same value across all pg instances.
+ epoch_pool_created = other.epoch_pool_created;
+ modified = true;
+ }
if (last_epoch_started < other.last_epoch_started) {
last_epoch_started = other.last_epoch_started;
modified = true;
}
+ if (last_interval_started < other.last_interval_started) {
+ last_interval_started = other.last_interval_started;
+ modified = true;
+ }
if (last_epoch_clean < other.last_epoch_clean) {
last_epoch_clean = other.last_epoch_clean;
modified = true;
}
+ if (last_interval_clean < other.last_interval_clean) {
+ last_interval_clean = other.last_interval_clean;
+ modified = true;
+ }
if (last_epoch_split < other.last_epoch_split) {
last_epoch_split = other.last_epoch_split;
modified = true;
@@ -2185,10 +2231,14 @@ struct pg_history_t {
WRITE_CLASS_ENCODER(pg_history_t)
inline ostream& operator<<(ostream& out, const pg_history_t& h) {
- return out << "ec=" << h.epoch_created
+ return out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
+ << " lis/c " << h.last_interval_started
+ << "/" << h.last_interval_clean
<< " les/c/f " << h.last_epoch_started << "/" << h.last_epoch_clean
<< "/" << h.last_epoch_marked_full
- << " " << h.same_up_since << "/" << h.same_interval_since << "/" << h.same_primary_since;
+ << " " << h.same_up_since
+ << "/" << h.same_interval_since
+ << "/" << h.same_primary_since;
}
@@ -2206,6 +2256,7 @@ struct pg_info_t {
eversion_t last_update; ///< last object version applied to store.
eversion_t last_complete; ///< last version pg was complete through.
epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd
+ epoch_t last_interval_started; ///< first epoch of last_epoch_started interval
version_t last_user_version; ///< last user object version applied to store
@@ -2227,6 +2278,7 @@ struct pg_info_t {
l.last_update == r.last_update &&
l.last_complete == r.last_complete &&
l.last_epoch_started == r.last_epoch_started &&
+ l.last_interval_started == r.last_interval_started &&
l.last_user_version == r.last_user_version &&
l.log_tail == r.log_tail &&
l.last_backfill == r.last_backfill &&
@@ -2238,14 +2290,18 @@ struct pg_info_t {
}
pg_info_t()
- : last_epoch_started(0), last_user_version(0),
+ : last_epoch_started(0),
+ last_interval_started(0),
+ last_user_version(0),
last_backfill(hobject_t::get_max()),
last_backfill_bitwise(false)
{ }
// cppcheck-suppress noExplicitConstructor
pg_info_t(spg_t p)
: pgid(p),
- last_epoch_started(0), last_user_version(0),
+ last_epoch_started(0),
+ last_interval_started(0),
+ last_user_version(0),
last_backfill(hobject_t::get_max()),
last_backfill_bitwise(false)
{ }
@@ -2289,7 +2345,8 @@ inline ostream& operator<<(ostream& out, const pg_info_t& pgi)
out << " lb " << pgi.last_backfill
<< (pgi.last_backfill_bitwise ? " (bitwise)" : " (NIBBLEWISE)");
//out << " c " << pgi.epoch_created;
- out << " local-les=" << pgi.last_epoch_started;
+ out << " local-lis/les=" << pgi.last_interval_started
+ << "/" << pgi.last_epoch_started;
out << " n=" << pgi.stats.stats.sum.num_objects;
out << " " << pgi.history
<< ")";
@@ -2471,28 +2528,135 @@ WRITE_CLASS_ENCODER(pg_notify_t)
ostream &operator<<(ostream &lhs, const pg_notify_t &notify);
+class OSDMap;
/**
- * pg_interval_t - information about a past interval
+ * PastIntervals -- information needed to determine the PriorSet and
+ * the might_have_unfound set
*/
-class OSDMap;
-struct pg_interval_t {
- vector<int32_t> up, acting;
- epoch_t first, last;
- bool maybe_went_rw;
- int32_t primary;
- int32_t up_primary;
+class PastIntervals {
+public:
+ struct pg_interval_t {
+ vector<int32_t> up, acting;
+ epoch_t first, last;
+ bool maybe_went_rw;
+ int32_t primary;
+ int32_t up_primary;
+
+ pg_interval_t()
+ : first(0), last(0),
+ maybe_went_rw(false),
+ primary(-1),
+ up_primary(-1)
+ {}
+
+ pg_interval_t(
+ vector<int32_t> &&up,
+ vector<int32_t> &&acting,
+ epoch_t first,
+ epoch_t last,
+ bool maybe_went_rw,
+ int32_t primary,
+ int32_t up_primary)
+ : up(up), acting(acting), first(first), last(last),
+ maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
+ {}
- pg_interval_t()
- : first(0), last(0),
- maybe_went_rw(false),
- primary(-1),
- up_primary(-1)
- {}
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<pg_interval_t*>& o);
+ };
- void encode(bufferlist& bl) const;
- void decode(bufferlist::iterator& bl);
- void dump(Formatter *f) const;
- static void generate_test_instances(list<pg_interval_t*>& o);
+ PastIntervals() = default;
+ PastIntervals(bool ec_pool, const OSDMap &osdmap) : PastIntervals() {
+ update_type_from_map(ec_pool, osdmap);
+ }
+ PastIntervals(bool ec_pool, bool compact) : PastIntervals() {
+ update_type(ec_pool, compact);
+ }
+ PastIntervals(PastIntervals &&rhs) = default;
+ PastIntervals &operator=(PastIntervals &&rhs) = default;
+
+ PastIntervals(const PastIntervals &rhs);
+ PastIntervals &operator=(const PastIntervals &rhs);
+
+ class interval_rep {
+ public:
+ virtual size_t size() const = 0;
+ virtual bool empty() const = 0;
+ virtual void clear() = 0;
+ virtual pair<epoch_t, epoch_t> get_bounds() const = 0;
+ virtual set<pg_shard_t> get_all_participants(
+ bool ec_pool) const = 0;
+ virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
+ virtual unique_ptr<interval_rep> clone() const = 0;
+ virtual ostream &print(ostream &out) const = 0;
+ virtual void encode(bufferlist &bl) const = 0;
+ virtual void decode(bufferlist::iterator &bl) = 0;
+ virtual void dump(Formatter *f) const = 0;
+ virtual bool is_classic() const = 0;
+ virtual void iterate_mayberw_back_to(
+ bool ec_pool,
+ epoch_t les,
+ std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const = 0;
+
+ virtual bool has_full_intervals() const { return false; }
+ virtual void iterate_all_intervals(
+ std::function<void(const pg_interval_t &)> &&f) const {
+ assert(!has_full_intervals());
+ assert(0 == "not valid for this implementation");
+ }
+
+ virtual ~interval_rep() {}
+ };
+ friend class pi_simple_rep;
+ friend class pi_compact_rep;
+private:
+
+ unique_ptr<interval_rep> past_intervals;
+
+ PastIntervals(interval_rep *rep) : past_intervals(rep) {}
+
+public:
+ void add_interval(bool ec_pool, const pg_interval_t &interval) {
+ assert(past_intervals);
+ return past_intervals->add_interval(ec_pool, interval);
+ }
+
+ bool is_classic() const {
+ assert(past_intervals);
+ return past_intervals->is_classic();
+ }
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(1, 1, bl);
+ if (past_intervals) {
+ __u8 type = is_classic() ? 1 : 2;
+ ::encode(type, bl);
+ past_intervals->encode(bl);
+ } else {
+ ::encode((__u8)0, bl);
+ }
+ ENCODE_FINISH(bl);
+ }
+ void encode_classic(bufferlist &bl) const {
+ if (past_intervals) {
+ assert(past_intervals->is_classic());
+ past_intervals->encode(bl);
+ } else {
+ // legacy encoding is a bare map<>; an empty map encodes as a zero count
+ ::encode((uint32_t)0, bl);
+ }
+ }
+
+ void decode(bufferlist::iterator &bl);
+ void decode_classic(bufferlist::iterator &bl);
+
+ void dump(Formatter *f) const {
+ assert(past_intervals);
+ past_intervals->dump(f);
+ }
+ static void generate_test_instances(list<PastIntervals *> & o);
/**
* Determines whether there is an interval change
@@ -2549,20 +2713,304 @@ struct pg_interval_t {
const vector<int> &new_up, ///< [in] up as of osdmap
epoch_t same_interval_since, ///< [in] as of osdmap
epoch_t last_epoch_clean, ///< [in] current
- ceph::shared_ptr<const OSDMap> osdmap, ///< [in] current map
- ceph::shared_ptr<const OSDMap> lastmap, ///< [in] last map
+ ceph::shared_ptr<const OSDMap> osdmap, ///< [in] current map
+ ceph::shared_ptr<const OSDMap> lastmap, ///< [in] last map
pg_t pgid, ///< [in] pgid for pg
IsPGRecoverablePredicate *could_have_gone_active, /// [in] predicate whether the pg can be active
- map<epoch_t, pg_interval_t> *past_intervals,///< [out] intervals
+ PastIntervals *past_intervals, ///< [out] intervals
ostream *out = 0 ///< [out] debug ostream
);
+ friend ostream& operator<<(ostream& out, const PastIntervals &i);
+
+ template <typename F>
+ void iterate_mayberw_back_to(
+ bool ec_pool,
+ epoch_t les,
+ F &&f) const {
+ assert(past_intervals);
+ past_intervals->iterate_mayberw_back_to(ec_pool, les, std::forward<F>(f));
+ }
+ void clear() {
+ assert(past_intervals);
+ past_intervals->clear();
+ }
+
+ /**
+ * Returns a rough indication of the amount of state contained
+ */
+ size_t size() const {
+ assert(past_intervals);
+ return past_intervals->size();
+ }
+
+ bool empty() const {
+ assert(past_intervals);
+ return past_intervals->empty();
+ }
+
+ void swap(PastIntervals &other) {
+ using std::swap;
+ swap(other.past_intervals, past_intervals);
+ }
+
+ /**
+ * Return all shards which have been in the acting set back to the
+ * latest epoch to which we have trimmed except for pg_whoami
+ */
+ set<pg_shard_t> get_might_have_unfound(
+ pg_shard_t pg_whoami,
+ bool ec_pool) const {
+ assert(past_intervals);
+ auto ret = past_intervals->get_all_participants(ec_pool);
+ ret.erase(pg_whoami);
+ return ret;
+ }
+
+ /**
+ * Return all shards which we might want to talk to for peering
+ */
+ set<pg_shard_t> get_all_probe(
+ bool ec_pool) const {
+ assert(past_intervals);
+ return past_intervals->get_all_participants(ec_pool);
+ }
+
+ /* Return the set of epochs [start, end) represented by the
+ * past_interval set.
+ */
+ pair<epoch_t, epoch_t> get_bounds() const {
+ assert(past_intervals);
+ return past_intervals->get_bounds();
+ }
+
+ enum osd_state_t {
+ UP,
+ DOWN,
+ DNE,
+ LOST
+ };
+ struct PriorSet {
+ bool ec_pool = false;
+ set<pg_shard_t> probe; /// current+prior OSDs we need to probe.
+ set<int> down; /// down osds that would normally be in @a probe and might be interesting.
+ map<int, epoch_t> blocked_by; /// current lost_at values for any OSDs in the probe set for which (re)marking them lost would affect the set
+
+ bool pg_down = false; /// some down osds are included in @a probe; the DOWN pg state bit should be set.
+ unique_ptr<IsPGRecoverablePredicate> pcontdec;
+
+ PriorSet() = default;
+ PriorSet(PriorSet &&) = default;
+ PriorSet &operator=(PriorSet &&) = default;
+
+ PriorSet &operator=(const PriorSet &) = delete;
+ PriorSet(const PriorSet &) = delete;
+
+ bool operator==(const PriorSet &rhs) const {
+ return (ec_pool == rhs.ec_pool) &&
+ (probe == rhs.probe) &&
+ (down == rhs.down) &&
+ (blocked_by == rhs.blocked_by) &&
+ (pg_down == rhs.pg_down);
+ }
+
+ bool affected_by_map(
+ const OSDMap &osdmap,
+ const DoutPrefixProvider *dpp) const;
+
+ // For verifying tests
+ PriorSet(
+ bool ec_pool,
+ set<pg_shard_t> probe,
+ set<int> down,
+ map<int, epoch_t> blocked_by,
+ bool pg_down,
+ IsPGRecoverablePredicate *pcontdec)
+ : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
+ pg_down(pg_down), pcontdec(pcontdec) {}
+
+ private:
+ template <typename F>
+ PriorSet(
+ const PastIntervals &past_intervals,
+ bool ec_pool,
+ epoch_t last_epoch_started,
+ IsPGRecoverablePredicate *c,
+ F f,
+ const vector<int> &up,
+ const vector<int> &acting,
+ const DoutPrefixProvider *dpp);
+
+ friend class PastIntervals;
+ };
+
+ void update_type(bool ec_pool, bool compact);
+ void update_type_from_map(bool ec_pool, const OSDMap &osdmap);
+
+ template <typename... Args>
+ PriorSet get_prior_set(Args&&... args) const {
+ return PriorSet(*this, std::forward<Args>(args)...);
+ }
};
-WRITE_CLASS_ENCODER(pg_interval_t)
+WRITE_CLASS_ENCODER(PastIntervals)
+
+ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i);
+ostream& operator<<(ostream& out, const PastIntervals &i);
+ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i);
+
+template <typename F>
+PastIntervals::PriorSet::PriorSet(
+ const PastIntervals &past_intervals,
+ bool ec_pool,
+ epoch_t last_epoch_started,
+ IsPGRecoverablePredicate *c,
+ F f,
+ const vector<int> &up,
+ const vector<int> &acting,
+ const DoutPrefixProvider *dpp)
+ : ec_pool(ec_pool), pg_down(false), pcontdec(c)
+{
+ /*
+ * We have to be careful to deal gracefully with situations like the
+ * following. Say we have a power outage or something that takes out both
+ * OSDs, but the monitor doesn't mark them down in the same epoch.
+ * The history may look like
+ *
+ * 1: A B
+ * 2: B
+ * 3: let's say B dies for good, too (say, from the power spike)
+ * 4: A
+ *
+ * which makes it look like B may have applied updates to the PG
+ * that we need in order to proceed. This sucks...
+ *
+ * To minimize the risk of this happening, we CANNOT go active if
+ * _any_ OSDs in the prior set are down until we send an MOSDAlive
+ * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
+ * Then, we have something like
+ *
+ * 1: A B
+ * 2: B up_thru[B]=0
+ * 3:
+ * 4: A
+ *
+ * -> we can ignore B, bc it couldn't have gone active (alive_thru
+ * still 0).
+ *
+ * or,
+ *
+ * 1: A B
+ * 2: B up_thru[B]=0
+ * 3: B up_thru[B]=2
+ * 4:
+ * 5: A
+ *
+ * -> we must wait for B, bc it was alive through 2, and could have
+ * written to the pg.
+ *
+ * If B is really dead, then an administrator will need to manually
+ * intervene by marking the OSD as "lost."
+ */
-ostream& operator<<(ostream& out, const pg_interval_t& i);
+ // Include current acting and up nodes... not because they may
+ // contain old data (this interval hasn't gone active, obviously),
+ // but because we want their pg_info to inform choose_acting(), and
+ // so that we know what they do/do not have explicitly before
+ // sending them any new info/logs/whatever.
+ for (unsigned i = 0; i < acting.size(); i++) {
+ if (acting[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
+ probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ // It may be possible to exclude the up nodes, but let's keep them in
+ // there for now.
+ for (unsigned i = 0; i < up.size(); i++) {
+ if (up[i] != 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */)
+ probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+
+ set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
+ ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
+ for (auto &&i: all_probe) {
+ switch (f(0, i.osd, nullptr)) {
+ case UP: {
+ probe.insert(i);
+ break;
+ }
+ case DNE:
+ case LOST:
+ case DOWN: {
+ down.insert(i.osd);
+ break;
+ }
+ }
+ }
-typedef map<epoch_t, pg_interval_t> pg_interval_map_t;
+ past_intervals.iterate_mayberw_back_to(
+ ec_pool,
+ last_epoch_started,
+ [&](epoch_t start, const set<pg_shard_t> &acting) {
+ ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
+ << ", acting: " << acting << dendl;
+
+ // look at candidate osds during this interval. each falls into
+ // one of three categories: up, down (but potentially
+ // interesting), or lost (down, but we won't wait for it).
+ set<pg_shard_t> up_now;
+ map<int, epoch_t> candidate_blocked_by;
+ // any candidates down now (that might have useful data)
+ bool any_down_now = false;
+
+ // consider ACTING osds
+ for (auto &&so: acting) {
+ epoch_t lost_at = 0;
+ switch (f(start, so.osd, &lost_at)) {
+ case UP: {
+ // include past acting osds if they are up.
+ up_now.insert(so);
+ break;
+ }
+ case DNE: {
+ ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
+ << " no longer exists" << dendl;
+ break;
+ }
+ case LOST: {
+ ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
+ << " is down, but lost_at " << lost_at << dendl;
+ up_now.insert(so);
+ break;
+ }
+ case DOWN: {
+ ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
+ << " is down" << dendl;
+ candidate_blocked_by[so.osd] = lost_at;
+ any_down_now = true;
+ break;
+ }
+ }
+ }
+
+ // if not enough osds survived this interval, and we may have gone rw,
+ // then we need to wait for one of those osds to recover to
+ // ensure that we haven't lost any information.
+ if (!(*pcontdec)(up_now) && any_down_now) {
+ // fixme: how do we identify a "clean" shutdown anyway?
+ ldpp_dout(dpp, 10) << "build_prior possibly went active+rw,"
+ << " insufficient up; including down osds" << dendl;
+ assert(!candidate_blocked_by.empty());
+ pg_down = true;
+ blocked_by.insert(
+ candidate_blocked_by.begin(),
+ candidate_blocked_by.end());
+ }
+ });
+ ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
+ << " down " << down
+ << " blocked_by " << blocked_by
+ << (pg_down ? " pg_down":"")
+ << dendl;
+}
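The constructor above leaves OSD classification to a caller-supplied functor f(start_epoch, osd, *lost_at) returning one of UP/DOWN/DNE/LOST. A rough sketch of what such a classifier might look like; this is hypothetical (the real caller builds it from the current OSDMap elsewhere in the tree) and uses only OSDMap calls already visible in affected_by_map above:

// Hypothetical PriorSet classifier built over an OSDMap reference.
auto classify = [&osdmap](epoch_t start, int osd, epoch_t *lost_at) {
  if (!osdmap.exists(osd))
    return PastIntervals::DNE;          // deleted: nothing to wait for
  if (osdmap.is_up(osd))
    return PastIntervals::UP;           // probe it
  if (lost_at)
    *lost_at = osdmap.get_info(osd).lost_at;
  if (osdmap.get_info(osd).lost_at > start)
    return PastIntervals::LOST;         // marked lost since the interval
  return PastIntervals::DOWN;           // down, possibly interesting
};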
/**
* pg_query_t - used to ask a peer for information about a pg.
@@ -2695,9 +3143,10 @@ public:
void swap(ObjectModDesc &other) {
bl.swap(other.bl);
- ::swap(other.can_local_rollback, can_local_rollback);
- ::swap(other.rollback_info_completed, rollback_info_completed);
- ::swap(other.max_required_version, max_required_version);
+ using std::swap;
+ swap(other.can_local_rollback, can_local_rollback);
+ swap(other.rollback_info_completed, rollback_info_completed);
+ swap(other.max_required_version, max_required_version);
}
void append_id(ModID id) {
uint8_t _id(id);
@@ -2789,7 +3238,7 @@ public:
* in the case that bl contains ptrs which point into a much larger
* message buffer
*/
- void trim_bl() {
+ void trim_bl() const {
if (bl.length() > 0)
bl.rebuild();
}
@@ -3030,7 +3479,8 @@ public:
while (true) {
if (p == log.begin()) {
// yikes, the whole thing is divergent!
- ::swap(divergent, log);
+ using std::swap;
+ swap(divergent, log);
break;
}
--p;
@@ -3216,8 +3666,6 @@ public:
template <typename missing_type>
pg_missing_set(const missing_type &m) {
- for (auto &&i: missing)
- tracker.changed(i.first);
missing = m.get_items();
rmissing = m.get_rmissing();
for (auto &&i: missing)
@@ -3868,6 +4316,7 @@ struct SnapSet {
vector<snapid_t> clones; // ascending
map<snapid_t, interval_set<uint64_t> > clone_overlap; // overlap w/ next newest
map<snapid_t, uint64_t> clone_size;
+ map<snapid_t, vector<snapid_t>> clone_snaps; // descending
SnapSet() : seq(0), head_exists(false) {}
explicit SnapSet(bufferlist& bl) {
@@ -3875,8 +4324,12 @@ struct SnapSet {
decode(p);
}
+ bool is_legacy() const {
+ return clone_snaps.size() < clones.size() || !head_exists;
+ }
+
/// populate SnapSet from a librados::snap_set_t
- void from_snap_set(const librados::snap_set_t& ss);
+ void from_snap_set(const librados::snap_set_t& ss, bool legacy);
/// get space accounted to clone
uint64_t get_clone_bytes(snapid_t clone) const;
@@ -4012,7 +4465,8 @@ struct object_info_t {
return get_flag_string(flags);
}
- vector<snapid_t> snaps; // [clone]
+ /// [clone] descending. pre-luminous; moved to SnapSet
+ vector<snapid_t> legacy_snaps;
uint64_t truncate_seq, truncate_size;
@@ -4115,19 +4569,6 @@ struct object_info_t {
};
WRITE_CLASS_ENCODER_FEATURES(object_info_t)
-struct SnapSetContext {
- hobject_t oid;
- SnapSet snapset;
- int ref;
- bool registered : 1;
- bool exists : 1;
-
- explicit SnapSetContext(const hobject_t& o) :
- oid(o), ref(0), registered(false), exists(true) { }
-};
-
-
-
ostream& operator<<(ostream& out, const object_info_t& oi);
@@ -4138,7 +4579,7 @@ struct ObjectRecoveryInfo {
eversion_t version;
uint64_t size;
object_info_t oi;
- SnapSet ss;
+ SnapSet ss; // only populated if soid is_snap()
interval_set<uint64_t> copy_subset;
map<hobject_t, interval_set<uint64_t>> clone_subset;
@@ -4243,11 +4684,9 @@ ostream& operator<<(ostream& out, const PushOp &op);
struct ScrubMap {
struct object {
map<string,bufferptr> attrs;
- set<snapid_t> snapcolls;
uint64_t size;
__u32 omap_digest; ///< omap crc32c
__u32 digest; ///< data crc32c
- uint32_t nlinks;
bool negative:1;
bool digest_present:1;
bool omap_digest_present:1;
@@ -4258,7 +4697,7 @@ struct ScrubMap {
object() :
// Init invalid size so it won't match if we get a stat EIO error
- size(-1), omap_digest(0), digest(0), nlinks(0),
+ size(-1), omap_digest(0), digest(0),
negative(false), digest_present(false), omap_digest_present(false),
read_error(false), stat_error(false), ec_hash_mismatch(false), ec_size_mismatch(false) {}
@@ -4278,9 +4717,10 @@ struct ScrubMap {
objects.insert(r.objects.begin(), r.objects.end());
}
void swap(ScrubMap &r) {
- ::swap(objects, r.objects);
- ::swap(valid_through, r.valid_through);
- ::swap(incr_since, r.incr_since);
+ using std::swap;
+ swap(objects, r.objects);
+ swap(valid_through, r.valid_through);
+ swap(incr_since, r.incr_since);
}
void encode(bufferlist& bl) const;
diff --git a/src/osdc/Filer.cc b/src/osdc/Filer.cc
index 17152f78b44..df414adff25 100644
--- a/src/osdc/Filer.cc
+++ b/src/osdc/Filer.cc
@@ -377,3 +377,108 @@ void Filer::_do_purge_range(PurgeRange *pr, int fin)
new C_OnFinisher(new C_PurgeRange(this, pr), finisher));
}
}
+
+// -----------------------
+struct TruncRange {
+ std::mutex lock;
+ typedef std::lock_guard<std::mutex> lock_guard;
+ typedef std::unique_lock<std::mutex> unique_lock;
+ inodeno_t ino;
+ file_layout_t layout;
+ SnapContext snapc;
+ ceph::real_time mtime;
+ int flags;
+ Context *oncommit;
+ int uncommitted;
+ uint64_t offset;
+ uint64_t length;
+ uint32_t truncate_seq;
+ TruncRange(inodeno_t i, const file_layout_t& l, const SnapContext& sc,
+ ceph::real_time t, int fl, Context *fin,
+ uint64_t off, uint64_t len, uint32_t ts)
+ : ino(i), layout(l), snapc(sc), mtime(t), flags(fl), oncommit(fin),
+ uncommitted(0), offset(off), length(len), truncate_seq(ts) {}
+};
+
+void Filer::truncate(inodeno_t ino,
+ file_layout_t *layout,
+ const SnapContext& snapc,
+ uint64_t offset,
+ uint64_t len,
+ __u32 truncate_seq,
+ ceph::real_time mtime,
+ int flags,
+ Context *oncommit)
+{
+ uint64_t period = layout->get_period();
+ uint64_t num_objs = Striper::get_num_objects(*layout, len + (offset % period));
+ if (num_objs == 1) {
+ vector<ObjectExtent> extents;
+ Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents);
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_TRIMTRUNC;
+ ops[0].op.extent.truncate_seq = truncate_seq;
+ ops[0].op.extent.truncate_size = extents[0].offset;
+ objecter->_modify(extents[0].oid, extents[0].oloc, ops, mtime, snapc,
+ flags, oncommit);
+ return;
+ }
+
+ if (len > 0 && (offset + len) % period)
+ len += period - ((offset + len) % period);
+
+ TruncRange *tr = new TruncRange(ino, *layout, snapc, mtime, flags, oncommit,
+ offset, len, truncate_seq);
+ _do_truncate_range(tr, 0);
+}
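The rounding step in the multi-object path above pads len so that offset + len lands on a stripe-period boundary, ensuring every trailing object is covered by a TRIMTRUNC. A quick numeric check with a hypothetical 4 MiB period:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t MiB = 1ull << 20;
  const uint64_t period = 4 * MiB;   // hypothetical layout period
  uint64_t offset = 6 * MiB;
  uint64_t len = 5 * MiB;            // offset + len = 11 MiB

  // same adjustment as Filer::truncate above
  if (len > 0 && (offset + len) % period)
    len += period - ((offset + len) % period);

  assert(offset + len == 12 * MiB);  // rounded up to the next period
}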
+
+struct C_TruncRange : public Context {
+ Filer *filer;
+ TruncRange *tr;
+ C_TruncRange(Filer *f, TruncRange *t) : filer(f), tr(t) {}
+ void finish(int r) override {
+ filer->_do_truncate_range(tr, 1);
+ }
+};
+
+void Filer::_do_truncate_range(TruncRange *tr, int fin)
+{
+ TruncRange::unique_lock trl(tr->lock);
+ tr->uncommitted -= fin;
+ ldout(cct, 10) << "_do_truncate_range " << tr->ino << " objects " << tr->offset
+ << "~" << tr->length << " uncommitted " << tr->uncommitted
+ << dendl;
+
+ if (tr->length == 0 && tr->uncommitted == 0) {
+ tr->oncommit->complete(0);
+ trl.unlock();
+ delete tr;
+ return;
+ }
+
+ vector<ObjectExtent> extents;
+
+ int max = cct->_conf->filer_max_truncate_ops - tr->uncommitted;
+ if (max > 0 && tr->length > 0) {
+ uint64_t len = tr->layout.get_period() * max;
+ if (len > tr->length)
+ len = tr->length;
+
+ uint64_t offset = tr->offset + tr->length - len;
+ Striper::file_to_extents(cct, tr->ino, &tr->layout, offset, len, 0, extents);
+ tr->uncommitted += extents.size();
+ tr->length -= len;
+ }
+
+ trl.unlock();
+
+ // Issue objecter ops outside tr->lock to avoid lock dependency loop
+ for (const auto& p : extents) {
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_TRIMTRUNC;
+ ops[0].op.extent.truncate_size = p.offset;
+ ops[0].op.extent.truncate_seq = tr->truncate_seq;
+ objecter->_modify(p.oid, p.oloc, ops, tr->mtime, tr->snapc, tr->flags,
+ new C_OnFinisher(new C_TruncRange(this, tr), finisher));
+ }
+}
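Note the windowing in _do_truncate_range: each pass peels at most filer_max_truncate_ops periods off the tail of the remaining range (offset + length - len), presumably so the range shrinks from the end inward while the number of in-flight ops stays bounded. A toy trace of that arithmetic, with hypothetical values:

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t period = 4ull << 20;  // hypothetical 4 MiB period
  const uint64_t max_ops = 2;          // stand-in for filer_max_truncate_ops
  uint64_t start = 0, length = 5 * period;

  int passes = 0;
  while (length > 0) {
    uint64_t len = std::min(period * max_ops, length);
    uint64_t off = start + length - len;   // tail-first window
    // Filer would issue TRIMTRUNC ops covering [off, off + len) here
    assert(off + len == start + length);
    length -= len;
    ++passes;
  }
  assert(passes == 3);   // 2 + 2 + 1 objects across three passes
}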
diff --git a/src/osdc/Filer.h b/src/osdc/Filer.h
index 8f2fd3e686d..00b6caa8f26 100644
--- a/src/osdc/Filer.h
+++ b/src/osdc/Filer.h
@@ -193,31 +193,8 @@ class Filer {
__u32 truncate_seq,
ceph::real_time mtime,
int flags,
- Context *oncommit) {
- vector<ObjectExtent> extents;
- Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents);
- if (extents.size() == 1) {
- vector<OSDOp> ops(1);
- ops[0].op.op = CEPH_OSD_OP_TRIMTRUNC;
- ops[0].op.extent.truncate_seq = truncate_seq;
- ops[0].op.extent.truncate_size = extents[0].offset;
- objecter->_modify(extents[0].oid, extents[0].oloc, ops, mtime, snapc,
- flags, oncommit);
- } else {
- C_GatherBuilder gcom(cct, oncommit);
- for (vector<ObjectExtent>::iterator p = extents.begin();
- p != extents.end();
- ++p) {
- vector<OSDOp> ops(1);
- ops[0].op.op = CEPH_OSD_OP_TRIMTRUNC;
- ops[0].op.extent.truncate_size = p->offset;
- ops[0].op.extent.truncate_seq = truncate_seq;
- objecter->_modify(p->oid, p->oloc, ops, mtime, snapc, flags,
- oncommit ? gcom.new_sub():0);
- }
- gcom.activate();
- }
- }
+ Context *oncommit);
+ void _do_truncate_range(struct TruncRange *tr, int fin);
void zero(inodeno_t ino,
const file_layout_t *layout,
diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc
index 9a0b6ee2851..a77d6b31883 100644
--- a/src/osdc/ObjectCacher.cc
+++ b/src/osdc/ObjectCacher.cc
@@ -36,19 +36,21 @@ class ObjectCacher::C_ReadFinish : public Context {
xlist<C_ReadFinish*>::item set_item;
bool trust_enoent;
ceph_tid_t tid;
+ ZTracer::Trace trace;
public:
bufferlist bl;
C_ReadFinish(ObjectCacher *c, Object *ob, ceph_tid_t t, loff_t s,
- uint64_t l) :
+ uint64_t l, const ZTracer::Trace &trace) :
oc(c), poolid(ob->oloc.pool), oid(ob->get_soid()), start(s), length(l),
set_item(this), trust_enoent(true),
- tid(t) {
+ tid(t), trace(trace) {
ob->reads.push_back(&set_item);
}
void finish(int r) override {
oc->bh_read_finish(poolid, oid, tid, start, length, bl, r, trust_enoent);
+ trace.event("finish");
// object destructor clears the list
if (set_item.is_on_list())
@@ -65,18 +67,25 @@ class ObjectCacher::C_RetryRead : public Context {
OSDRead *rd;
ObjectSet *oset;
Context *onfinish;
+ ZTracer::Trace trace;
public:
- C_RetryRead(ObjectCacher *_oc, OSDRead *r, ObjectSet *os, Context *c)
- : oc(_oc), rd(r), oset(os), onfinish(c) {}
+ C_RetryRead(ObjectCacher *_oc, OSDRead *r, ObjectSet *os, Context *c,
+ const ZTracer::Trace &trace)
+ : oc(_oc), rd(r), oset(os), onfinish(c), trace(trace) {
+ }
void finish(int r) override {
- if (r < 0) {
- if (onfinish)
- onfinish->complete(r);
+ if (r >= 0) {
+ r = oc->_readx(rd, oset, onfinish, false, &trace);
+ }
+
+ if (r == 0) {
+ // read is still in-progress
return;
}
- int ret = oc->_readx(rd, oset, onfinish, false);
- if (ret != 0 && onfinish) {
- onfinish->complete(ret);
+
+ trace.event("finish");
+ if (onfinish) {
+ onfinish->complete(r);
}
}
};
@@ -274,10 +283,9 @@ int ObjectCacher::Object::map_read(ObjectExtent &ex,
map<loff_t, BufferHead*>& errors)
{
assert(oc->lock.is_locked());
- ldout(oc->cct, 10) << "map_read " << ex.oid
- << " " << ex.offset << "~" << ex.length
- << dendl;
-
+ ldout(oc->cct, 10) << "map_read " << ex.oid << " "
+ << ex.offset << "~" << ex.length << dendl;
+
loff_t cur = ex.offset;
loff_t left = ex.length;
@@ -302,7 +310,7 @@ int ObjectCacher::Object::map_read(ObjectExtent &ex,
assert(cur == (loff_t)ex.offset + (loff_t)ex.length);
break; // no more.
}
-
+
if (p->first <= cur) {
// have it (or part of it)
BufferHead *e = p->second;
@@ -322,13 +330,13 @@ int ObjectCacher::Object::map_read(ObjectExtent &ex,
} else {
ceph_abort();
}
-
+
loff_t lenfromcur = MIN(e->end() - cur, left);
cur += lenfromcur;
left -= lenfromcur;
++p;
continue; // more?
-
+
} else if (p->first > cur) {
// gap.. miss
loff_t next = p->first;
@@ -396,7 +404,7 @@ void ObjectCacher::Object::audit_buffers()
* other dirty data to left and/or right.
*/
ObjectCacher::BufferHead *ObjectCacher::Object::map_write(ObjectExtent &ex,
- ceph_tid_t tid)
+ ceph_tid_t tid)
{
assert(oc->lock.is_locked());
BufferHead *final = 0;
@@ -612,6 +620,7 @@ ObjectCacher::ObjectCacher(CephContext *cct_, string name,
max_size(max_bytes), max_objects(max_objects),
max_dirty_age(ceph::make_timespan(max_dirty_age)),
block_writes_upfront(block_writes_upfront),
+ trace_endpoint("ObjectCacher"),
flush_set_callback(flush_callback),
flush_set_callback_arg(flush_callback_arg),
last_read_tid(0), flusher_stop(false), flusher_thread(this),finisher(cct),
@@ -724,24 +733,32 @@ void ObjectCacher::close_object(Object *ob)
delete ob;
}
-void ObjectCacher::bh_read(BufferHead *bh, int op_flags)
+void ObjectCacher::bh_read(BufferHead *bh, int op_flags,
+ const ZTracer::Trace &parent_trace)
{
assert(lock.is_locked());
ldout(cct, 7) << "bh_read on " << *bh << " outstanding reads "
<< reads_outstanding << dendl;
+ ZTracer::Trace trace;
+ if (parent_trace.valid()) {
+ trace.init("", &trace_endpoint, &parent_trace);
+ trace.copy_name("bh_read " + bh->ob->get_oid().name);
+ trace.event("start");
+ }
+
mark_rx(bh);
bh->last_read_tid = ++last_read_tid;
// finisher
C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob, bh->last_read_tid,
- bh->start(), bh->length());
+ bh->start(), bh->length(), trace);
// go
writeback_handler.read(bh->ob->get_oid(), bh->ob->get_object_number(),
bh->ob->get_oloc(), bh->start(), bh->length(),
bh->ob->get_snap(), &onfinish->bl,
bh->ob->truncate_size, bh->ob->truncate_seq,
- op_flags, onfinish);
+ op_flags, trace, onfinish);
++reads_outstanding;
}
@@ -979,11 +996,12 @@ class ObjectCacher::C_WriteCommit : public Context {
int64_t poolid;
sobject_t oid;
vector<pair<loff_t, uint64_t> > ranges;
+ ZTracer::Trace trace;
public:
- ceph_tid_t tid;
+ ceph_tid_t tid = 0;
C_WriteCommit(ObjectCacher *c, int64_t _poolid, sobject_t o, loff_t s,
- uint64_t l) :
- oc(c), poolid(_poolid), oid(o), tid(0) {
+ uint64_t l, const ZTracer::Trace &trace) :
+ oc(c), poolid(_poolid), oid(o), trace(trace) {
ranges.push_back(make_pair(s, l));
}
C_WriteCommit(ObjectCacher *c, int64_t _poolid, sobject_t o,
@@ -993,6 +1011,7 @@ public:
}
void finish(int r) override {
oc->bh_write_commit(poolid, oid, ranges, tid, r);
+ trace.event("finish");
}
};
void ObjectCacher::bh_write_scattered(list<BufferHead*>& blist)
@@ -1048,17 +1067,24 @@ void ObjectCacher::bh_write_scattered(list<BufferHead*>& blist)
perfcounter->inc(l_objectcacher_data_flushed, total_len);
}
-void ObjectCacher::bh_write(BufferHead *bh)
+void ObjectCacher::bh_write(BufferHead *bh, const ZTracer::Trace &parent_trace)
{
assert(lock.is_locked());
ldout(cct, 7) << "bh_write " << *bh << dendl;
bh->ob->get();
+ ZTracer::Trace trace;
+ if (parent_trace.valid()) {
+ trace.init("", &trace_endpoint, &parent_trace);
+ trace.copy_name("bh_write " + bh->ob->get_oid().name);
+ trace.event("start");
+ }
+
// finishers
C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->oloc.pool,
bh->ob->get_soid(), bh->start(),
- bh->length());
+ bh->length(), trace);
// go
ceph_tid_t tid = writeback_handler.write(bh->ob->get_oid(),
bh->ob->get_oloc(),
@@ -1066,7 +1092,7 @@ void ObjectCacher::bh_write(BufferHead *bh)
bh->snapc, bh->bl, bh->last_write,
bh->ob->truncate_size,
bh->ob->truncate_seq,
- bh->journal_tid, oncommit);
+ bh->journal_tid, trace, oncommit);
ldout(cct, 20) << " tid " << tid << " on " << bh->ob->get_oid() << dendl;
// set bh last_write_tid
@@ -1191,8 +1217,9 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid,
finish_contexts(cct, ls, r);
}
-void ObjectCacher::flush(loff_t amount)
+void ObjectCacher::flush(ZTracer::Trace *trace, loff_t amount)
{
+ assert(trace != nullptr);
assert(lock.is_locked());
ceph::real_time cutoff = ceph::real_clock::now();
@@ -1215,9 +1242,9 @@ void ObjectCacher::flush(loff_t amount)
bh_write_adjacencies(bh, cutoff, amount > 0 ? &left : NULL, NULL);
} else {
left -= bh->length();
- bh_write(bh);
+ bh_write(bh, *trace);
}
- }
+ }
}
@@ -1290,14 +1317,26 @@ bool ObjectCacher::is_cached(ObjectSet *oset, vector<ObjectExtent>& extents,
* must delete it)
* returns 0 if doing async read
*/
-int ObjectCacher::readx(OSDRead *rd, ObjectSet *oset, Context *onfinish)
+int ObjectCacher::readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
+ ZTracer::Trace *parent_trace)
{
- return _readx(rd, oset, onfinish, true);
+ ZTracer::Trace trace;
+ if (parent_trace != nullptr) {
+ trace.init("read", &trace_endpoint, parent_trace);
+ trace.event("start");
+ }
+
+ int r = _readx(rd, oset, onfinish, true, &trace);
+ if (r < 0) {
+ trace.event("finish");
+ }
+ return r;
}
int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
- bool external_call)
+ bool external_call, ZTracer::Trace *trace)
{
+ assert(trace != nullptr);
assert(lock.is_locked());
bool success = true;
int error = 0;
@@ -1350,7 +1389,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
if (scattered_write)
blist.push_back(bh);
else
- bh_write(bh);
+ bh_write(bh, *trace);
}
}
}
@@ -1360,7 +1399,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
ldout(cct, 10) << "readx waiting on tid " << o->last_write_tid
<< " on " << *o << dendl;
o->waitfor_commit[o->last_write_tid].push_back(
- new C_RetryRead(this,rd, oset, onfinish));
+ new C_RetryRead(this, rd, oset, onfinish, *trace));
// FIXME: perfcounter!
return 0;
}
@@ -1417,14 +1456,15 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
<< waitfor_read.size() << " blocked reads, "
<< (MAX(rx_bytes, max_size) - max_size)
<< " read bytes" << dendl;
- waitfor_read.push_back(new C_RetryRead(this, rd, oset, onfinish));
+ waitfor_read.push_back(new C_RetryRead(this, rd, oset, onfinish,
+ *trace));
}
bh_remove(o, bh_it->second);
delete bh_it->second;
} else {
bh_it->second->set_nocache(nocache);
- bh_read(bh_it->second, rd->fadvise_flags);
+ bh_read(bh_it->second, rd->fadvise_flags, *trace);
if ((success && onfinish) || last != missing.end())
last = bh_it;
}
@@ -1436,7 +1476,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
ldout(cct, 10) << "readx missed, waiting on " << *last->second
<< " off " << last->first << dendl;
last->second->waitfor_read[last->first].push_back(
- new C_RetryRead(this, rd, oset, onfinish) );
+ new C_RetryRead(this, rd, oset, onfinish, *trace) );
}
@@ -1449,7 +1489,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
ldout(cct, 10) << "readx missed, waiting on " << *bh_it->second
<< " off " << bh_it->first << dendl;
bh_it->second->waitfor_read[bh_it->first].push_back(
- new C_RetryRead(this, rd, oset, onfinish) );
+ new C_RetryRead(this, rd, oset, onfinish, *trace) );
}
bytes_not_in_cache += bh_it->second->length();
success = false;
@@ -1616,7 +1656,8 @@ void ObjectCacher::retry_waiting_reads()
waitfor_read.splice(waitfor_read.end(), ls);
}
-int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace)
+int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace,
+ ZTracer::Trace *parent_trace)
{
assert(lock.is_locked());
ceph::real_time now = ceph::real_clock::now();
@@ -1625,6 +1666,12 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace)
bool dontneed = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
bool nocache = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+ ZTracer::Trace trace;
+ if (parent_trace != nullptr) {
+ trace.init("write", &trace_endpoint, parent_trace);
+ trace.event("start");
+ }
+
for (vector<ObjectExtent>::iterator ex_it = wr->extents.begin();
ex_it != wr->extents.end();
++ex_it) {
@@ -1637,7 +1684,7 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace)
BufferHead *bh = o->map_write(*ex_it, wr->journal_tid);
bool missing = bh->is_missing();
bh->snapc = wr->snapc;
-
+
bytes_written += ex_it->length;
if (bh->is_tx()) {
bytes_written_in_flush += ex_it->length;
@@ -1696,7 +1743,7 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace)
}
}
- int r = _wait_for_write(wr, bytes_written, oset, onfreespace);
+ int r = _wait_for_write(wr, bytes_written, oset, &trace, onfreespace);
delete wr;
//verify_stats();
@@ -1706,23 +1753,26 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace)
class ObjectCacher::C_WaitForWrite : public Context {
public:
- C_WaitForWrite(ObjectCacher *oc, uint64_t len, Context *onfinish) :
- m_oc(oc), m_len(len), m_onfinish(onfinish) {}
+ C_WaitForWrite(ObjectCacher *oc, uint64_t len,
+ const ZTracer::Trace &trace, Context *onfinish) :
+ m_oc(oc), m_len(len), m_trace(trace), m_onfinish(onfinish) {}
void finish(int r) override;
private:
ObjectCacher *m_oc;
uint64_t m_len;
+ ZTracer::Trace m_trace;
Context *m_onfinish;
};
void ObjectCacher::C_WaitForWrite::finish(int r)
{
Mutex::Locker l(m_oc->lock);
- m_oc->maybe_wait_for_writeback(m_len);
+ m_oc->maybe_wait_for_writeback(m_len, &m_trace);
m_onfinish->complete(r);
}
-void ObjectCacher::maybe_wait_for_writeback(uint64_t len)
+void ObjectCacher::maybe_wait_for_writeback(uint64_t len,
+ ZTracer::Trace *trace)
{
assert(lock.is_locked());
ceph::mono_time start = ceph::mono_clock::now();
@@ -1735,6 +1785,9 @@ void ObjectCacher::maybe_wait_for_writeback(uint64_t len)
while (get_stat_dirty() + get_stat_tx() > 0 &&
(uint64_t) (get_stat_dirty() + get_stat_tx()) >=
max_dirty + get_stat_dirty_waiting()) {
+ if (blocked == 0) {
+ trace->event("start wait for writeback");
+ }
ldout(cct, 10) << __func__ << " waiting for dirty|tx "
<< (get_stat_dirty() + get_stat_tx()) << " >= max "
<< max_dirty << " + dirty_waiting "
@@ -1746,6 +1799,9 @@ void ObjectCacher::maybe_wait_for_writeback(uint64_t len)
++blocked;
ldout(cct, 10) << __func__ << " woke up" << dendl;
}
+ if (blocked > 0) {
+ trace->event("finish wait for writeback");
+ }
if (blocked && perfcounter) {
perfcounter->inc(l_objectcacher_write_ops_blocked);
perfcounter->inc(l_objectcacher_write_bytes_blocked, len);
@@ -1756,19 +1812,20 @@ void ObjectCacher::maybe_wait_for_writeback(uint64_t len)
// blocking wait for write.
int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset,
- Context *onfreespace)
+ ZTracer::Trace *trace, Context *onfreespace)
{
assert(lock.is_locked());
+ assert(trace != nullptr);
int ret = 0;
if (max_dirty > 0) {
if (block_writes_upfront) {
- maybe_wait_for_writeback(len);
+ maybe_wait_for_writeback(len, trace);
if (onfreespace)
onfreespace->complete(0);
} else {
assert(onfreespace);
- finisher.queue(new C_WaitForWrite(this, len, onfreespace));
+ finisher.queue(new C_WaitForWrite(this, len, *trace, onfreespace));
}
} else {
// write-thru! flush what we just wrote.
@@ -1777,7 +1834,7 @@ int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset,
Context *fin = block_writes_upfront ?
new C_Cond(&cond, &done, &ret) : onfreespace;
assert(fin);
- bool flushed = flush_set(oset, wr->extents, fin);
+ bool flushed = flush_set(oset, wr->extents, trace, fin);
assert(!flushed); // we just dirtied it, and didn't drop our lock!
ldout(cct, 10) << "wait_for_write waiting on write-thru of " << len
<< " bytes" << dendl;
@@ -1816,12 +1873,19 @@ void ObjectCacher::flusher_entry()
<< max_dirty << " max)"
<< dendl;
loff_t actual = get_stat_dirty() + get_stat_dirty_waiting();
+
+ ZTracer::Trace trace;
+ if (cct->_conf->osdc_blkin_trace_all) {
+ trace.init("flusher", &trace_endpoint);
+ trace.event("start");
+ }
+
if (actual > 0 && (uint64_t) actual > target_dirty) {
// flush some dirty pages
ldout(cct, 10) << "flusher " << get_stat_dirty() << " dirty + "
<< get_stat_dirty_waiting() << " dirty_waiting > target "
<< target_dirty << ", flushing some dirty bhs" << dendl;
- flush(actual - target_dirty);
+ flush(&trace, actual - target_dirty);
} else {
// check tail of lru for old dirty items
ceph::real_time cutoff = ceph::real_clock::now();
@@ -1836,17 +1900,20 @@ void ObjectCacher::flusher_entry()
if (scattered_write) {
bh_write_adjacencies(bh, cutoff, NULL, &max);
} else {
- bh_write(bh);
+ bh_write(bh, trace);
--max;
}
}
if (!max) {
// back off the lock to avoid starving other threads
+ trace.event("backoff");
lock.Unlock();
lock.Lock();
continue;
}
}
+
+ trace.event("finish");
if (flusher_stop)
break;
@@ -1943,8 +2010,10 @@ void ObjectCacher::purge(Object *ob)
// true if clean, already flushed.
// false if we wrote something.
// be sloppy about the ranges and flush any buffer it touches
-bool ObjectCacher::flush(Object *ob, loff_t offset, loff_t length)
+bool ObjectCacher::flush(Object *ob, loff_t offset, loff_t length,
+ ZTracer::Trace *trace)
{
+ assert(trace != nullptr);
assert(lock.is_locked());
list<BufferHead*> blist;
bool clean = true;
@@ -1968,7 +2037,7 @@ bool ObjectCacher::flush(Object *ob, loff_t offset, loff_t length)
if (scattered_write)
blist.push_back(bh);
else
- bh_write(bh);
+ bh_write(bh, *trace);
clean = false;
}
if (scattered_write && !blist.empty())
@@ -2044,7 +2113,7 @@ bool ObjectCacher::flush_set(ObjectSet *oset, Context *onfinish)
}
blist.push_back(bh);
} else {
- bh_write(bh);
+ bh_write(bh, {});
}
}
}
@@ -2070,7 +2139,7 @@ bool ObjectCacher::flush_set(ObjectSet *oset, Context *onfinish)
}
blist.push_front(bh);
} else {
- bh_write(bh);
+ bh_write(bh, {});
}
}
if (!backwards)
@@ -2097,9 +2166,10 @@ bool ObjectCacher::flush_set(ObjectSet *oset, Context *onfinish)
// flush. non-blocking, takes callback.
// returns true if already flushed
bool ObjectCacher::flush_set(ObjectSet *oset, vector<ObjectExtent>& exv,
- Context *onfinish)
+ ZTracer::Trace *trace, Context *onfinish)
{
assert(lock.is_locked());
+ assert(trace != nullptr);
assert(onfinish != NULL);
if (oset->objects.empty()) {
ldout(cct, 10) << "flush_set on " << oset << " dne" << dendl;
@@ -2125,7 +2195,7 @@ bool ObjectCacher::flush_set(ObjectSet *oset, vector<ObjectExtent>& exv,
ldout(cct, 20) << "flush_set " << oset << " ex " << ex << " ob " << soid
<< " " << ob << dendl;
- if (!flush(ob, ex.offset, ex.length)) {
+ if (!flush(ob, ex.offset, ex.length, trace)) {
// we'll need to gather...
ldout(cct, 10) << "flush_set " << oset << " will wait for ack tid "
<< ob->last_write_tid << " on " << *ob << dendl;
@@ -2169,7 +2239,7 @@ bool ObjectCacher::flush_all(Context *onfinish)
}
blist.push_back(bh);
} else {
- bh_write(bh);
+ bh_write(bh, {});
}
}
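All of the tracing hooks added to ObjectCacher.cc follow one pattern: a child ZTracer span is created only when the parent span is valid, is named after the operation and object, and is closed with a "finish" event from the completion context (C_ReadFinish / C_WriteCommit). A rough Python sketch of that parent/child span pattern — illustrative only, not the real ZTracer API:

    # Illustrative parent/child span pattern, not the real ZTracer API.
    class Trace:
        def __init__(self, name=None, parent=None):
            self.valid = name is not None
            self.name = name

        def event(self, what):
            if self.valid:
                print('%s: %s' % (self.name, what))

    def bh_write(bh_oid, parent_trace):
        # child span only if the parent is valid, as in the C++ above
        trace = Trace('bh_write ' + bh_oid, parent_trace) \
            if parent_trace.valid else Trace()
        trace.event('start')
        # ... issue the write; the completion context later calls:
        trace.event('finish')

    bh_write('obj.0000', Trace('writex'))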
diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h
index a0305b65213..31201a72354 100644
--- a/src/osdc/ObjectCacher.h
+++ b/src/osdc/ObjectCacher.h
@@ -11,6 +11,7 @@
#include "common/Cond.h"
#include "common/Finisher.h"
#include "common/Thread.h"
+#include "common/zipkin_trace.h"
#include "Objecter.h"
#include "Striper.h"
@@ -350,7 +351,7 @@ class ObjectCacher {
map<loff_t, BufferHead*>& rx,
map<loff_t, BufferHead*>& errors);
BufferHead *map_write(ObjectExtent &ex, ceph_tid_t tid);
-
+
void replace_journal_tid(BufferHead *bh, ceph_tid_t tid);
void truncate(loff_t s);
void discard(loff_t off, loff_t len);
@@ -403,6 +404,8 @@ class ObjectCacher {
ceph::timespan max_dirty_age;
bool block_writes_upfront;
+ ZTracer::Endpoint trace_endpoint;
+
flush_set_callback_t flush_set_callback;
void *flush_set_callback_arg;
@@ -519,14 +522,15 @@ class ObjectCacher {
void bh_remove(Object *ob, BufferHead *bh);
// io
- void bh_read(BufferHead *bh, int op_flags);
- void bh_write(BufferHead *bh);
+ void bh_read(BufferHead *bh, int op_flags,
+ const ZTracer::Trace &parent_trace);
+ void bh_write(BufferHead *bh, const ZTracer::Trace &parent_trace);
void bh_write_scattered(list<BufferHead*>& blist);
void bh_write_adjacencies(BufferHead *bh, ceph::real_time cutoff,
int64_t *amount, int *max_count);
void trim();
- void flush(loff_t amount=0);
+ void flush(ZTracer::Trace *trace, loff_t amount=0);
/**
* flush a range of buffers
@@ -539,7 +543,8 @@ class ObjectCacher {
* @param len extent length, or 0 for entire object
* @return true if object was already clean/flushed.
*/
- bool flush(Object *o, loff_t off, loff_t len);
+ bool flush(Object *o, loff_t off, loff_t len,
+ ZTracer::Trace *trace);
loff_t release(Object *o);
void purge(Object *o);
@@ -547,7 +552,7 @@ class ObjectCacher {
Cond read_cond;
int _readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
- bool external_call);
+ bool external_call, ZTracer::Trace *trace);
void retry_waiting_reads();
public:
@@ -597,16 +602,18 @@ class ObjectCacher {
* @note total read size must be <= INT_MAX, since
* the return value is total bytes read
*/
- int readx(OSDRead *rd, ObjectSet *oset, Context *onfinish);
- int writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace);
+ int readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
+ ZTracer::Trace *parent_trace = nullptr);
+ int writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace,
+ ZTracer::Trace *parent_trace = nullptr);
bool is_cached(ObjectSet *oset, vector<ObjectExtent>& extents,
snapid_t snapid);
private:
// write blocking
int _wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset,
- Context *onfreespace);
- void maybe_wait_for_writeback(uint64_t len);
+ ZTracer::Trace *trace, Context *onfreespace);
+ void maybe_wait_for_writeback(uint64_t len, ZTracer::Trace *trace);
bool _flush_set_finish(C_GatherBuilder *gather, Context *onfinish);
public:
@@ -616,7 +623,7 @@ public:
bool flush_set(ObjectSet *oset, Context *onfinish=0);
bool flush_set(ObjectSet *oset, vector<ObjectExtent>& ex,
- Context *onfinish = 0);
+ ZTracer::Trace *trace, Context *onfinish = 0);
bool flush_all(Context *onfinish = 0);
void purge_set(ObjectSet *oset);
@@ -689,7 +696,8 @@ public:
vector<ObjectExtent> extents;
Striper::file_to_extents(cct, oset->ino, layout, offset, len,
oset->truncate_size, extents);
- return flush_set(oset, extents, onfinish);
+ ZTracer::Trace trace;
+ return flush_set(oset, extents, &trace, onfinish);
}
};
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index 0d8138d8909..0364be92de7 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -242,8 +242,8 @@ void Objecter::init()
if (!logger) {
PerfCountersBuilder pcb(cct, "objecter", l_osdc_first, l_osdc_last);
- pcb.add_u64(l_osdc_op_active, "op_active",
- "Operations active", "actv");
+ pcb.add_u64(l_osdc_op_active, "op_active", "Operations active", "actv",
+ PerfCountersBuilder::PRIO_CRITICAL);
pcb.add_u64(l_osdc_op_laggy, "op_laggy", "Laggy operations");
pcb.add_u64_counter(l_osdc_op_send, "op_send", "Sent operations");
pcb.add_u64_counter(l_osdc_op_send_bytes, "op_send_bytes", "Sent data");
@@ -251,12 +251,12 @@ void Objecter::init()
pcb.add_u64_counter(l_osdc_op_reply, "op_reply", "Operation reply");
pcb.add_u64_counter(l_osdc_op, "op", "Operations");
- pcb.add_u64_counter(l_osdc_op_r, "op_r",
- "Read operations", "read");
- pcb.add_u64_counter(l_osdc_op_w, "op_w",
- "Write operations", "writ");
- pcb.add_u64_counter(l_osdc_op_rmw, "op_rmw",
- "Read-modify-write operations");
+ pcb.add_u64_counter(l_osdc_op_r, "op_r", "Read operations", "rd",
+ PerfCountersBuilder::PRIO_CRITICAL);
+ pcb.add_u64_counter(l_osdc_op_w, "op_w", "Write operations", "wr",
+ PerfCountersBuilder::PRIO_CRITICAL);
+ pcb.add_u64_counter(l_osdc_op_rmw, "op_rmw", "Read-modify-write operations",
+ "rdwr", PerfCountersBuilder::PRIO_INTERESTING);
pcb.add_u64_counter(l_osdc_op_pg, "op_pg", "PG operation");
pcb.add_u64_counter(l_osdc_osdop_stat, "osdop_stat", "Stat operations");
@@ -1331,9 +1331,11 @@ void Objecter::handle_osd_map(MOSDMap *m)
for (map<ceph_tid_t,CommandOp*>::iterator p = need_resend_command.begin();
p != need_resend_command.end(); ++p) {
CommandOp *c = p->second;
- _assign_command_session(c, sul);
- if (c->session && !c->session->is_homeless()) {
- _send_command(c);
+ if (c->target.osd >= 0) {
+ _assign_command_session(c, sul);
+ if (c->session && !c->session->is_homeless()) {
+ _send_command(c);
+ }
}
}
@@ -1396,18 +1398,17 @@ void Objecter::C_Op_Map_Latest::finish(int r)
}
int Objecter::pool_snap_by_name(int64_t poolid, const char *snap_name,
- snapid_t *snap)
+ snapid_t *snap) const
{
shared_lock rl(rwlock);
- const map<int64_t, pg_pool_t>& pools = osdmap->get_pools();
- map<int64_t, pg_pool_t>::const_iterator iter = pools.find(poolid);
+ auto& pools = osdmap->get_pools();
+ auto iter = pools.find(poolid);
if (iter == pools.end()) {
return -ENOENT;
}
const pg_pool_t& pg_pool = iter->second;
- map<snapid_t, pool_snap_info_t>::const_iterator p;
- for (p = pg_pool.snaps.begin();
+ for (auto p = pg_pool.snaps.begin();
p != pg_pool.snaps.end();
++p) {
if (p->second.name == snap_name) {
@@ -1419,17 +1420,17 @@ int Objecter::pool_snap_by_name(int64_t poolid, const char *snap_name,
}
int Objecter::pool_snap_get_info(int64_t poolid, snapid_t snap,
- pool_snap_info_t *info)
+ pool_snap_info_t *info) const
{
shared_lock rl(rwlock);
- const map<int64_t, pg_pool_t>& pools = osdmap->get_pools();
- map<int64_t, pg_pool_t>::const_iterator iter = pools.find(poolid);
+ auto& pools = osdmap->get_pools();
+ auto iter = pools.find(poolid);
if (iter == pools.end()) {
return -ENOENT;
}
const pg_pool_t& pg_pool = iter->second;
- map<snapid_t,pool_snap_info_t>::const_iterator p = pg_pool.snaps.find(snap);
+ auto p = pg_pool.snaps.find(snap);
if (p == pg_pool.snaps.end())
return -ENOENT;
*info = p->second;
@@ -1457,9 +1458,9 @@ void Objecter::_check_op_pool_dne(Op *op, unique_lock *sl)
{
// rwlock is locked unique
- if (op->attempts) {
- // we send a reply earlier, which means that previously the pool
- // existed, and now it does not (i.e., it was deleted).
+ if (op->target.pool_ever_existed) {
+ // the pool previously existed and now it does not, which means it
+ // was deleted.
op->map_dne_bound = osdmap->get_epoch();
ldout(cct, 10) << "check_op_pool_dne tid " << op->tid
<< " pool previously exists but now does not"
@@ -2194,6 +2195,7 @@ void Objecter::op_submit(Op *op, ceph_tid_t *ptid, int *ctx_budget)
ceph_tid_t tid = 0;
if (!ptid)
ptid = &tid;
+ op->trace.event("op submit");
_op_submit_with_budget(op, rl, ptid, ctx_budget);
}
@@ -2748,6 +2750,7 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
<< t->target_oloc << " -> pgid " << pgid << dendl;
ldout(cct,30) << __func__ << " target pi " << pi
<< " pg_num " << pi->get_pg_num() << dendl;
+ t->pool_ever_existed = true;
int size = pi->size;
int min_size = pi->min_size;
@@ -2759,7 +2762,7 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
bool sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE);
unsigned prev_seed = ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask);
pg_t prev_pgid(prev_seed, pgid.pool());
- if (any_change && pg_interval_t::is_new_interval(
+ if (any_change && PastIntervals::is_new_interval(
t->acting_primary,
acting_primary,
t->acting,
@@ -3089,6 +3092,10 @@ MOSDOp *Objecter::_prepare_osd_op(Op *op)
m->set_mtime(op->mtime);
m->set_retry_attempt(op->attempts++);
+ if (!op->trace.valid() && cct->_conf->osdc_blkin_trace_all) {
+ op->trace.init("op", &trace_endpoint);
+ }
+
if (op->priority)
m->set_priority(op->priority);
else
@@ -3172,6 +3179,9 @@ void Objecter::_send_op(Op *op, MOSDOp *m)
m->set_tid(op->tid);
+ if (op->trace.valid()) {
+ m->trace.init("op msg", nullptr, &op->trace);
+ }
op->session->con->send_message(m);
}
@@ -3280,6 +3290,7 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m)
<< " attempt " << m->get_retry_attempt()
<< dendl;
Op *op = iter->second;
+ op->trace.event("osd op reply");
if (retry_writes_after_first_reply && op->attempts == 1 &&
(op->target.flags & CEPH_OSD_FLAG_WRITE)) {
@@ -4727,11 +4738,13 @@ int Objecter::_calc_command_target(CommandOp *c, shunique_lock& sul)
if (!osdmap->exists(c->target_osd)) {
c->map_check_error = -ENOENT;
c->map_check_error_str = "osd dne";
+ c->target.osd = -1;
return RECALC_OP_TARGET_OSD_DNE;
}
if (osdmap->is_down(c->target_osd)) {
c->map_check_error = -ENXIO;
c->map_check_error_str = "osd down";
+ c->target.osd = -1;
return RECALC_OP_TARGET_OSD_DOWN;
}
c->target.osd = c->target_osd;
@@ -4740,10 +4753,12 @@ int Objecter::_calc_command_target(CommandOp *c, shunique_lock& sul)
if (ret == RECALC_OP_TARGET_POOL_DNE) {
c->map_check_error = -ENOENT;
c->map_check_error_str = "pool dne";
+ c->target.osd = -1;
return ret;
} else if (ret == RECALC_OP_TARGET_OSD_DOWN) {
c->map_check_error = -ENXIO;
c->map_check_error_str = "osd down";
+ c->target.osd = -1;
return ret;
}
}
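The _check_op_pool_dne() change above swaps an indirect signal (op->attempts != 0, meaning "we got a reply once, so the pool must have existed") for an explicit pool_ever_existed flag that _calc_target() sets as soon as the op maps to a real pool. The decision then reduces to the following sketch (Python, hypothetical names):

    # Sketch of the pool-DNE decision (hypothetical names).
    def check_op_pool_dne(op, osdmap_epoch):
        if op.pool_ever_existed:
            # the pool mapped earlier in this op's lifetime, so its
            # disappearance means it was deleted: fail the op now
            op.map_dne_bound = osdmap_epoch
        # otherwise keep consuming maps up to map_dne_bound before
        # concluding the pool never existed at all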
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 3ce5e256c24..883ef5bbb1d 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -35,11 +35,11 @@
#include "common/ceph_timer.h"
#include "common/Finisher.h"
#include "common/shunique_lock.h"
+#include "common/zipkin_trace.h"
#include "messages/MOSDOp.h"
#include "osd/OSDMap.h"
-
using namespace std;
class Context;
@@ -282,7 +282,37 @@ struct ObjectOperation {
out_handler[p] = h;
out_rval[p] = prval;
}
- // object data
+ // object cmpext
+ struct C_ObjectOperation_cmpext : public Context {
+ int *prval;
+ C_ObjectOperation_cmpext(int *prval)
+ : prval(prval) {}
+
+ void finish(int r) {
+ if (prval)
+ *prval = r;
+ }
+ };
+
+ void cmpext(uint64_t off, bufferlist& cmp_bl, int *prval) {
+ add_data(CEPH_OSD_OP_CMPEXT, off, cmp_bl.length(), cmp_bl);
+ unsigned p = ops.size() - 1;
+ C_ObjectOperation_cmpext *h = new C_ObjectOperation_cmpext(prval);
+ out_handler[p] = h;
+ out_rval[p] = prval;
+ }
+
+ // Used by C API
+ void cmpext(uint64_t off, uint64_t cmp_len, const char *cmp_buf, int *prval) {
+ bufferlist cmp_bl;
+ cmp_bl.append(cmp_buf, cmp_len);
+ add_data(CEPH_OSD_OP_CMPEXT, off, cmp_len, cmp_bl);
+ unsigned p = ops.size() - 1;
+ C_ObjectOperation_cmpext *h = new C_ObjectOperation_cmpext(prval);
+ out_handler[p] = h;
+ out_rval[p] = prval;
+ }
+
void read(uint64_t off, uint64_t len, bufferlist *pbl, int *prval,
Context* ctx) {
bufferlist bl;
@@ -1143,6 +1173,7 @@ public:
Messenger *messenger;
MonClient *monc;
Finisher *finisher;
+ ZTracer::Endpoint trace_endpoint;
private:
OSDMap *osdmap;
public:
@@ -1210,6 +1241,9 @@ public:
///< true if we are directed at base_pgid, not base_oid
bool precalc_pgid = false;
+ ///< true if we have ever mapped to a valid pool
+ bool pool_ever_existed = false;
+
///< explcit pg target, if any
pg_t base_pgid;
@@ -1312,9 +1346,11 @@ public:
int *data_offset;
osd_reqid_t reqid; // explicitly setting reqid
+ ZTracer::Trace trace;
Op(const object_t& o, const object_locator_t& ol, vector<OSDOp>& op,
- int f, Context *fin, version_t *ov, int *offset = NULL) :
+ int f, Context *fin, version_t *ov, int *offset = NULL,
+ ZTracer::Trace *parent_trace = nullptr) :
session(NULL), incarnation(0),
target(o, ol, f),
con(NULL),
@@ -1347,6 +1383,11 @@ public:
if (target.base_oloc.key == o)
target.base_oloc.key.clear();
+
+ if (parent_trace && parent_trace->valid()) {
+ trace.init("op", nullptr, parent_trace);
+ trace.event("start");
+ }
}
bool operator<(const Op& other) const {
@@ -1365,6 +1406,7 @@ public:
delete out_handler.back();
out_handler.pop_back();
}
+ trace.event("finish");
}
};
@@ -1934,6 +1976,7 @@ private:
double mon_timeout,
double osd_timeout) :
Dispatcher(cct_), messenger(m), monc(mc), finisher(fin),
+ trace_endpoint("0.0.0.0", 0, "Objecter"),
osdmap(new OSDMap), initialized(0), last_tid(0), client_inc(-1),
max_linger_id(0), num_in_flight(0), global_op_flags(0),
keep_balanced_budget(false), honor_osdmap_full(true), osdmap_full_try(false),
@@ -2030,9 +2073,11 @@ private:
void handle_osd_map(class MOSDMap *m);
void wait_for_osd_map();
- int pool_snap_by_name(int64_t poolid, const char *snap_name, snapid_t *snap);
+ int pool_snap_by_name(int64_t poolid,
+ const char *snap_name,
+ snapid_t *snap) const;
int pool_snap_get_info(int64_t poolid, snapid_t snap,
- pool_snap_info_t *info);
+ pool_snap_info_t *info) const;
int pool_snap_list(int64_t poolid, vector<uint64_t> *snaps);
private:
@@ -2142,9 +2187,10 @@ public:
ObjectOperation& op, const SnapContext& snapc,
ceph::real_time mtime, int flags,
Context *oncommit, version_t *objver = NULL,
- osd_reqid_t reqid = osd_reqid_t()) {
+ osd_reqid_t reqid = osd_reqid_t(),
+ ZTracer::Trace *parent_trace = nullptr) {
Op *o = new Op(oid, oloc, op.ops, flags | global_op_flags.read() |
- CEPH_OSD_FLAG_WRITE, oncommit, objver);
+ CEPH_OSD_FLAG_WRITE, oncommit, objver, nullptr, parent_trace);
o->priority = op.priority;
o->mtime = mtime;
o->snapc = snapc;
@@ -2170,9 +2216,10 @@ public:
snapid_t snapid, bufferlist *pbl, int flags,
Context *onack, version_t *objver = NULL,
int *data_offset = NULL,
- uint64_t features = 0) {
+ uint64_t features = 0,
+ ZTracer::Trace *parent_trace = nullptr) {
Op *o = new Op(oid, oloc, op.ops, flags | global_op_flags.read() |
- CEPH_OSD_FLAG_READ, onack, objver, data_offset);
+ CEPH_OSD_FLAG_READ, onack, objver, data_offset, parent_trace);
o->priority = op.priority;
o->snapid = snapid;
o->outbl = pbl;
@@ -2315,7 +2362,8 @@ public:
const object_t& oid, const object_locator_t& oloc,
uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl,
int flags, Context *onfinish, version_t *objver = NULL,
- ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+ ObjectOperation *extra_ops = NULL, int op_flags = 0,
+ ZTracer::Trace *parent_trace = nullptr) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_READ;
@@ -2325,7 +2373,7 @@ public:
ops[i].op.extent.truncate_seq = 0;
ops[i].op.flags = op_flags;
Op *o = new Op(oid, oloc, ops, flags | global_op_flags.read() |
- CEPH_OSD_FLAG_READ, onfinish, objver);
+ CEPH_OSD_FLAG_READ, onfinish, objver, nullptr, parent_trace);
o->snapid = snap;
o->outbl = pbl;
return o;
@@ -2342,6 +2390,38 @@ public:
return tid;
}
+ Op *prepare_cmpext_op(
+ const object_t& oid, const object_locator_t& oloc,
+ uint64_t off, bufferlist &cmp_bl,
+ snapid_t snap, int flags, Context *onfinish, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+ vector<OSDOp> ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_CMPEXT;
+ ops[i].op.extent.offset = off;
+ ops[i].op.extent.length = cmp_bl.length();
+ ops[i].op.extent.truncate_size = 0;
+ ops[i].op.extent.truncate_seq = 0;
+ ops[i].indata = cmp_bl;
+ ops[i].op.flags = op_flags;
+ Op *o = new Op(oid, oloc, ops, flags | global_op_flags.read() |
+ CEPH_OSD_FLAG_READ, onfinish, objver);
+ o->snapid = snap;
+ return o;
+ }
+
+ ceph_tid_t cmpext(
+ const object_t& oid, const object_locator_t& oloc,
+ uint64_t off, bufferlist &cmp_bl,
+ snapid_t snap, int flags, Context *onfinish, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+ Op *o = prepare_cmpext_op(oid, oloc, off, cmp_bl, snap,
+ flags, onfinish, objver, extra_ops, op_flags);
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+
ceph_tid_t read_trunc(const object_t& oid, const object_locator_t& oloc,
uint64_t off, uint64_t len, snapid_t snap,
bufferlist *pbl, int flags, uint64_t trunc_size,
@@ -2448,7 +2528,8 @@ public:
uint64_t off, uint64_t len, const SnapContext& snapc,
const bufferlist &bl, ceph::real_time mtime, int flags,
Context *oncommit, version_t *objver = NULL,
- ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+ ObjectOperation *extra_ops = NULL, int op_flags = 0,
+ ZTracer::Trace *parent_trace = nullptr) {
vector<OSDOp> ops;
int i = init_ops(ops, 1, extra_ops);
ops[i].op.op = CEPH_OSD_OP_WRITE;
@@ -2459,7 +2540,8 @@ public:
ops[i].indata = bl;
ops[i].op.flags = op_flags;
Op *o = new Op(oid, oloc, ops, flags | global_op_flags.read() |
- CEPH_OSD_FLAG_WRITE, oncommit, objver);
+ CEPH_OSD_FLAG_WRITE, oncommit, objver,
+ nullptr, parent_trace);
o->mtime = mtime;
o->snapc = snapc;
return o;
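The new CEPH_OSD_OP_CMPEXT operation added above compares cmp_bl against the object's data at off and fails the op on a mismatch, which (combined with a write in the same ObjectOperation) yields compare-and-write. A pure-Python model of the comparison semantics — the OSD-side error encoding of the mismatch offset is not shown, and the zero-fill of short reads is an assumption for illustration:

    # Pure-Python model of the CMPEXT comparison (illustrative only).
    def cmpext(object_data, off, cmp_buf):
        """Return -1 on match, else the offset of the first mismatching byte."""
        window = object_data[off:off + len(cmp_buf)].ljust(len(cmp_buf), b'\0')
        for i, (have, want) in enumerate(zip(window, cmp_buf)):
            if have != want:
                return i
        return -1

    assert cmpext(b'hello world', 6, b'world') == -1
    assert cmpext(b'hello world', 6, b'worms') == 3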
diff --git a/src/osdc/WritebackHandler.h b/src/osdc/WritebackHandler.h
index c2835322341..bf90fdd0ac0 100644
--- a/src/osdc/WritebackHandler.h
+++ b/src/osdc/WritebackHandler.h
@@ -15,7 +15,8 @@ class WritebackHandler {
virtual void read(const object_t& oid, uint64_t object_no,
const object_locator_t& oloc, uint64_t off, uint64_t len,
snapid_t snapid, bufferlist *pbl, uint64_t trunc_size,
- __u32 trunc_seq, int op_flags, Context *onfinish) = 0;
+ __u32 trunc_seq, int op_flags,
+ const ZTracer::Trace &parent_trace, Context *onfinish) = 0;
/**
* check if a given extent read result may change due to a write
*
@@ -34,7 +35,9 @@ class WritebackHandler {
const SnapContext& snapc,
const bufferlist &bl, ceph::real_time mtime,
uint64_t trunc_size, __u32 trunc_seq,
- ceph_tid_t journal_tid, Context *oncommit) = 0;
+ ceph_tid_t journal_tid,
+ const ZTracer::Trace &parent_trace,
+ Context *oncommit) = 0;
virtual void overwrite_extent(const object_t& oid, uint64_t off, uint64_t len,
ceph_tid_t original_journal_tid,
diff --git a/src/pybind/ceph_daemon.py b/src/pybind/ceph_daemon.py
index c3293d66f57..48bd4e0d7ce 100755
--- a/src/pybind/ceph_daemon.py
+++ b/src/pybind/ceph_daemon.py
@@ -15,7 +15,12 @@ import json
import socket
import struct
import time
-from collections import defaultdict, OrderedDict
+from collections import OrderedDict
+from fcntl import ioctl
+from fnmatch import fnmatch
+from prettytable import PrettyTable, HEADER
+from signal import signal, SIGWINCH
+from termios import TIOCGWINSZ
from ceph_argparse import parse_json_funcsigs, validate_command
@@ -82,6 +87,38 @@ def admin_socket(asok_path, cmd, format=''):
return ret
+def _gettermsize():
+ try:
+ rows, cols = struct.unpack('hhhh', ioctl(0, TIOCGWINSZ, 8*b'\x00'))[0:2]
+ except IOError:
+ return 25, 80
+
+ return rows, cols
+
+
+class Termsize(object):
+
+ def __init__(self):
+ self.rows, self.cols = _gettermsize()
+ self.changed = False
+
+ def update(self):
+ rows, cols = _gettermsize()
+ self.changed = self.changed or (
+ (self.rows != rows) or (self.cols != cols)
+ )
+ self.rows, self.cols = rows, cols
+
+ def reset_changed(self):
+ self.changed = False
+
+ def __str__(self):
+ return '%s(%dx%d, changed %s)' % (self.__class__, self.rows, self.cols, self.changed)
+
+ def __repr__(self):
+ return 'Termsize(%d,%d,%s)' % (self.rows, self.cols, self.changed)
+
+
class DaemonWatcher(object):
"""
Given a Ceph daemon's admin socket path, poll its performance counters
@@ -105,12 +142,16 @@ class DaemonWatcher(object):
BOLD_SEQ = "\033[1m"
UNDERLINE_SEQ = "\033[4m"
- def __init__(self, asok):
+ def __init__(self, asok, statpats=None, min_prio=0):
self.asok_path = asok
self._colored = False
self._stats = None
self._schema = None
+ self._statpats = statpats
+ self._stats_that_fit = dict()
+ self._min_prio = min_prio
+ self.termsize = Termsize()
def supports_color(self, ostr):
"""
@@ -173,13 +214,40 @@ class DaemonWatcher(object):
"""
return max(len(nick), 4)
+ def get_stats_that_fit(self):
+ '''
+ Get a possibly-truncated list of stats to display based on
+ current terminal width. Allow breaking mid-section.
+ '''
+ current_fit = OrderedDict()
+ if self.termsize.changed or not self._stats_that_fit:
+ width = 0
+ for section_name, names in self._stats.items():
+ for name, stat_data in names.items():
+ width += self.col_width(stat_data) + 1
+ if width > self.termsize.cols:
+ break
+ if section_name not in current_fit:
+ current_fit[section_name] = OrderedDict()
+ current_fit[section_name][name] = stat_data
+ if width > self.termsize.cols:
+ break
+
+ self.termsize.reset_changed()
+ changed = current_fit and (current_fit != self._stats_that_fit)
+ if changed:
+ self._stats_that_fit = current_fit
+ return self._stats_that_fit, changed
+
def _print_headers(self, ostr):
"""
Print a header row to `ostr`
"""
header = ""
- for section_name, names in self._stats.items():
- section_width = sum([self.col_width(x)+1 for x in names.values()]) - 1
+ stats, _ = self.get_stats_that_fit()
+ for section_name, names in stats.items():
+ section_width = \
+ sum([self.col_width(x) + 1 for x in names.values()]) - 1
pad = max(section_width - len(section_name), 0)
pad_prefix = pad // 2
header += (pad_prefix * '-')
@@ -190,7 +258,7 @@ class DaemonWatcher(object):
ostr.write(self.colorize(header, self.BLUE, True))
sub_header = ""
- for section_name, names in self._stats.items():
+ for section_name, names in stats.items():
for stat_name, stat_nick in names.items():
sub_header += self.UNDERLINE_SEQ \
+ self.colorize(
@@ -207,7 +275,10 @@ class DaemonWatcher(object):
`last_dump`.
"""
val_row = ""
- for section_name, names in self._stats.items():
+ fit, changed = self.get_stats_that_fit()
+ if changed:
+ self._print_headers(ostr)
+ for section_name, names in fit.items():
for stat_name, stat_nick in names.items():
stat_type = self._schema[section_name][stat_name]['type']
if bool(stat_type & COUNTER):
@@ -233,6 +304,29 @@ class DaemonWatcher(object):
val_row = val_row[0:-len(self.colorize("|", self.BLUE))]
ostr.write("{0}\n".format(val_row))
+ def _should_include(self, sect, name, prio):
+ '''
+ boolean: should we output this stat?
+
+ 1) If self._statpats exists and the name filename-glob-matches
+ anything in the list, and prio is high enough, or
+ 2) If self._statpats doesn't exist and prio is high enough
+
+ then yes.
+ '''
+ if self._statpats:
+ sectname = '.'.join((sect, name))
+ if not any([
+ p for p in self._statpats
+ if fnmatch(name, p) or fnmatch(sectname, p)
+ ]):
+ return False
+
+ if self._min_prio is not None and prio is not None:
+ return (prio >= self._min_prio)
+
+ return True
+
def _load_schema(self):
"""
Populate our instance-local copy of the daemon's performance counter
@@ -242,15 +336,20 @@ class DaemonWatcher(object):
admin_socket(self.asok_path, ["perf", "schema"]).decode('utf-8'),
object_pairs_hook=OrderedDict)
- # Build list of which stats we will display, based on which
- # stats have a nickname
+ # Build list of which stats we will display
self._stats = OrderedDict()
for section_name, section_stats in self._schema.items():
for name, schema_data in section_stats.items():
- if schema_data.get('nick'):
+ prio = schema_data.get('priority', 0)
+ if self._should_include(section_name, name, prio):
if section_name not in self._stats:
self._stats[section_name] = OrderedDict()
self._stats[section_name][name] = schema_data['nick']
+ if not self._stats:
+ raise RuntimeError("no stats selected by filters")
+
+ def _handle_sigwinch(self, signo, frame):
+ self.termsize.update()
def run(self, interval, count=None, ostr=sys.stdout):
"""
@@ -266,12 +365,12 @@ class DaemonWatcher(object):
last_dump = json.loads(admin_socket(self.asok_path, ["perf", "dump"]).decode('utf-8'))
rows_since_header = 0
- term_height = 25
try:
+ signal(SIGWINCH, self._handle_sigwinch)
while True:
dump = json.loads(admin_socket(self.asok_path, ["perf", "dump"]).decode('utf-8'))
- if rows_since_header > term_height - 2:
+ if rows_since_header >= self.termsize.rows - 2:
self._print_headers(ostr)
rows_since_header = 0
self._print_vals(ostr, dump, last_dump)
@@ -281,6 +380,27 @@ class DaemonWatcher(object):
break
rows_since_header += 1
last_dump = dump
- time.sleep(interval)
+
+ # time.sleep() is interrupted by SIGWINCH; avoid that
+ end = time.time() + interval
+ while time.time() < end:
+ time.sleep(max(0, end - time.time()))
+
except KeyboardInterrupt:
return
+
+ def list(self, ostr=sys.stdout):
+ """
+ Show all selected stats with section, full name, nick, and prio
+ """
+ table = PrettyTable(('section', 'name', 'nick', 'prio'))
+ table.align['section'] = 'l'
+ table.align['name'] = 'l'
+ table.align['nick'] = 'l'
+ table.align['prio'] = 'r'
+ self._load_schema()
+ for section_name, section_stats in self._stats.items():
+ for name, nick in section_stats.items():
+ prio = self._schema[section_name][name].get('priority') or 0
+ table.add_row((section_name, name, nick, prio))
+ ostr.write(table.get_string(hrules=HEADER) + '\n')
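With these changes the watcher can filter counters by glob and by the priority field that the PerfCountersBuilder updates above begin populating, and it reflows its headers when the terminal is resized. A typical invocation of the extended class (the admin socket path is a placeholder):

    # Hypothetical usage of the extended DaemonWatcher.
    from ceph_daemon import DaemonWatcher

    w = DaemonWatcher('/var/run/ceph/ceph-osd.0.asok',
                      statpats=['op_*', 'objecter.*'],  # fnmatch globs
                      min_prio=5)                       # priority threshold
    w.list()           # table of section/name/nick/prio for selected stats
    w.run(interval=1)  # headers reprint on SIGWINCH-driven resizes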
diff --git a/src/pybind/cephfs/cephfs.pyx b/src/pybind/cephfs/cephfs.pyx
index b7b406cd95e..806e0ab4c37 100644
--- a/src/pybind/cephfs/cephfs.pyx
+++ b/src/pybind/cephfs/cephfs.pyx
@@ -16,9 +16,7 @@ import os
import sys
# Are we running Python 2.x
-_python2 = sys.hexversion < 0x03000000
-
-if _python2:
+if sys.version_info[0] < 3:
str_type = basestring
else:
str_type = str
@@ -153,51 +151,59 @@ class Error(Exception):
pass
-class PermissionError(Error):
- pass
+class OSError(Error):
+ def __init__(self, errno, strerror):
+ self.errno = errno
+ self.strerror = strerror
+
+ def __str__(self):
+ return '[Errno {0}] {1}'.format(self.errno, self.strerror)
-class ObjectNotFound(Error):
+class PermissionError(OSError):
pass
-class NoData(Error):
+class ObjectNotFound(OSError):
pass
-class ObjectExists(Error):
+class NoData(OSError):
pass
-class IOError(Error):
+class ObjectExists(OSError):
pass
-class NoSpace(Error):
+class IOError(OSError):
pass
-class InvalidValue(Error):
+class NoSpace(OSError):
pass
-class OperationNotSupported(Error):
+class InvalidValue(OSError):
pass
-class IncompleteWriteError(Error):
+class OperationNotSupported(OSError):
pass
class LibCephFSStateError(Error):
pass
-class WouldBlock(Error):
+
+class WouldBlock(OSError):
pass
-class OutOfRange(Error):
+
+class OutOfRange(OSError):
pass
+
IF UNAME_SYSNAME == "FreeBSD":
cdef errno_to_exception = {
errno.EPERM : PermissionError,
@@ -238,9 +244,9 @@ cdef make_ex(ret, msg):
"""
ret = abs(ret)
if ret in errno_to_exception:
- return errno_to_exception[ret](msg)
+ return errno_to_exception[ret](ret, msg)
else:
- return Error(msg + (": error code %d" % ret))
+ return Error(ret, msg + (": error code %d" % ret))
class DirEntry(namedtuple('DirEntry',
@@ -350,7 +356,8 @@ cdef class LibCephFS(object):
self.state = "uninitialized"
if rados_inst is not None:
if auth_id is not None or conffile is not None or conf is not None:
- raise InvalidValue("May not pass RADOS instance as well as other configuration")
+ raise make_ex(errno.EINVAL,
+ "May not pass RADOS instance as well as other configuration")
self.create_with_rados(rados_inst)
else:
@@ -653,16 +660,26 @@ cdef class LibCephFS(object):
if flags == '':
cephfs_flags = os.O_RDONLY
else:
+ access_flags = 0
for c in flags:
if c == 'r':
- cephfs_flags |= os.O_RDONLY
+ access_flags = 1
elif c == 'w':
- cephfs_flags |= os.O_WRONLY | os.O_TRUNC | os.O_CREAT
- elif c == '+':
- cephfs_flags |= os.O_RDWR
+ access_flags = 2
+ cephfs_flags |= os.O_TRUNC | os.O_CREAT
+ elif access_flags > 0 and c == '+':
+ access_flags = 3
else:
- raise OperationNotSupported(
- "open flags doesn't support %s" % c)
+ raise make_ex(errno.EOPNOTSUPP,
+ "open flags doesn't support %s" % c)
+
+ if access_flags == 1:
+ cephfs_flags |= os.O_RDONLY
+ elif access_flags == 2:
+ cephfs_flags |= os.O_WRONLY
+ else:
+ cephfs_flags |= os.O_RDWR
+
elif isinstance(flags, int):
cephfs_flags = flags
else:
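Rebasing the cephfs exceptions onto an errno-carrying OSError means callers can catch the base class and branch on errno instead of matching one class per error; note that cephfs.OSError, IOError, and PermissionError now shadow the Python built-ins of the same names inside the module. A usage sketch (mount details illustrative):

    # Sketch: branching on errno with the reworked cephfs exceptions.
    import errno
    import cephfs

    fs = cephfs.LibCephFS(conffile='')   # config path illustrative
    fs.mount()
    try:
        fd = fs.open('/no/such/file', 'r')
    except cephfs.OSError as e:          # base of ObjectNotFound et al.
        if e.errno == errno.ENOENT:
            print('missing: %s' % e.strerror)
        else:
            raise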
diff --git a/src/pybind/mgr/rest/module.py b/src/pybind/mgr/rest/module.py
index f72b7ce30e4..77a38f54d6e 100644
--- a/src/pybind/mgr/rest/module.py
+++ b/src/pybind/mgr/rest/module.py
@@ -156,7 +156,7 @@ class Module(MgrModule):
return [self._auth_cls()]
def shutdown(self):
- cherrypy.engine.stop()
+ cherrypy.engine.exit()
def serve(self):
self.keys = self._load_keys()
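cherrypy.engine.stop() only halts the server and plugins; a thread parked in engine.block() keeps waiting, so the REST module could hang its host on shutdown. engine.exit() moves the bus to the EXITING state, which is what block() waits for. A minimal standalone illustration of the distinction (plain cherrypy, not Ceph code):

    # Why exit() rather than stop(): block() only returns on EXITING.
    import cherrypy

    cherrypy.engine.start()
    cherrypy.engine.stop()   # server stops, but engine.block() would keep waiting
    cherrypy.engine.exit()   # bus enters EXITING, releasing engine.block()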
diff --git a/src/pybind/rados/rados.pyx b/src/pybind/rados/rados.pyx
index 717be1b50be..73d4d5a1db2 100644
--- a/src/pybind/rados/rados.pyx
+++ b/src/pybind/rados/rados.pyx
@@ -14,6 +14,7 @@ method.
# Copyright 2016 Mehdi Abaakouk <sileht@redhat.com>
from cpython cimport PyObject, ref
+from cpython.pycapsule cimport *
from libc cimport errno
from libc.stdint cimport *
from libc.stdlib cimport malloc, realloc, free
@@ -28,9 +29,7 @@ from functools import partial, wraps
from itertools import chain
# Are we running Python 2.x
-_python2 = sys.hexversion < 0x03000000
-
-if _python2:
+if sys.version_info[0] < 3:
str_type = basestring
else:
str_type = str
@@ -126,6 +125,7 @@ cdef extern from "rados/librados.h" nogil:
void rados_version(int *major, int *minor, int *extra)
int rados_create2(rados_t *pcluster, const char *const clustername,
const char * const name, uint64_t flags)
+ int rados_create_with_context(rados_t *cluster, rados_config_t cct)
int rados_connect(rados_t cluster)
void rados_shutdown(rados_t cluster)
int rados_conf_read_file(rados_t cluster, const char *path)
@@ -297,59 +297,68 @@ LIBRADOS_CREATE_IDEMPOTENT = _LIBRADOS_CREATE_IDEMPOTENT
ANONYMOUS_AUID = 0xffffffffffffffff
ADMIN_AUID = 0
+
class Error(Exception):
""" `Error` class, derived from `Exception` """
+ pass
class InvalidArgumentError(Error):
pass
-class InterruptedOrTimeoutError(Error):
- """ `InterruptedOrTimeoutError` class, derived from `Error` """
- pass
+class OSError(Error):
+ """ `OSError` class, derived from `Error` """
+ def __init__(self, errno, strerror):
+ self.errno = errno
+ self.strerror = strerror
+
+ def __str__(self):
+ return '[Errno {0}] {1}'.format(self.errno, self.strerror)
-class PermissionError(Error):
- """ `PermissionError` class, derived from `Error` """
+class InterruptedOrTimeoutError(OSError):
+ """ `InterruptedOrTimeoutError` class, derived from `OSError` """
pass
-class PermissionDeniedError(Error):
- """ deal with EACCES related. """
+
+class PermissionError(OSError):
+ """ `PermissionError` class, derived from `OSError` """
pass
-class ObjectNotFound(Error):
- """ `ObjectNotFound` class, derived from `Error` """
+
+class PermissionDeniedError(OSError):
+ """ deal with EACCES related. """
pass
-class NoData(Error):
- """ `NoData` class, derived from `Error` """
+class ObjectNotFound(OSError):
+ """ `ObjectNotFound` class, derived from `OSError` """
pass
-class ObjectExists(Error):
- """ `ObjectExists` class, derived from `Error` """
+class NoData(OSError):
+ """ `NoData` class, derived from `OSError` """
pass
-class ObjectBusy(Error):
- """ `ObjectBusy` class, derived from `Error` """
+class ObjectExists(OSError):
+ """ `ObjectExists` class, derived from `OSError` """
pass
-class IOError(Error):
- """ `IOError` class, derived from `Error` """
+class ObjectBusy(OSError):
+ """ `ObjectBusy` class, derived from `IOError` """
pass
-class NoSpace(Error):
- """ `NoSpace` class, derived from `Error` """
+class IOError(OSError):
+ """ `ObjectBusy` class, derived from `OSError` """
pass
-class IncompleteWriteError(Error):
- """ `IncompleteWriteError` class, derived from `Error` """
+class NoSpace(OSError):
+ """ `NoSpace` class, derived from `OSError` """
pass
@@ -362,6 +371,7 @@ class IoctxStateError(Error):
""" `IoctxStateError` class, derived from `Error` """
pass
+
class ObjectStateError(Error):
""" `ObjectStateError` class, derived from `Error` """
pass
@@ -372,8 +382,8 @@ class LogicError(Error):
pass
-class TimedOut(Error):
- """ `TimedOut` class, derived from `Error` """
+class TimedOut(OSError):
+ """ `TimedOut` class, derived from `OSError` """
pass
@@ -419,9 +429,9 @@ cdef make_ex(ret, msg):
"""
ret = abs(ret)
if ret in errno_to_exception:
- return errno_to_exception[ret](msg)
+ return errno_to_exception[ret](ret, msg)
else:
- return Error(msg + (": error code %d" % ret))
+ return Error(ret, msg + (": error code %d" % ret))
# helper to specify an optional argument, where in addition to `cls`, `None`
@@ -568,7 +578,8 @@ cdef class Rados(object):
@requires(('rados_id', opt(str_type)), ('name', opt(str_type)), ('clustername', opt(str_type)),
('conffile', opt(str_type)))
def __setup(self, rados_id=None, name=None, clustername=None,
- conf_defaults=None, conffile=None, conf=None, flags=0):
+ conf_defaults=None, conffile=None, conf=None, flags=0,
+ context=None):
self.monitor_callback = None
self.parsed_args = []
self.conf_defaults = conf_defaults
@@ -592,8 +603,14 @@ cdef class Rados(object):
int _flags = flags
int ret
- with nogil:
- ret = rados_create2(&self.cluster, _clustername, _name, _flags)
+ if context:
+ # Unpack void* (aka rados_config_t) from capsule
+ rados_config = <rados_config_t> PyCapsule_GetPointer(context, NULL)
+ with nogil:
+ ret = rados_create_with_context(&self.cluster, rados_config)
+ else:
+ with nogil:
+ ret = rados_create2(&self.cluster, _clustername, _name, _flags)
if ret != 0:
raise Error("rados_initialize failed with error code: %d" % ret)
diff --git a/src/pybind/rbd/rbd.pyx b/src/pybind/rbd/rbd.pyx
index 08d067bf534..cb03878141d 100644
--- a/src/pybind/rbd/rbd.pyx
+++ b/src/pybind/rbd/rbd.pyx
@@ -180,6 +180,9 @@ cdef extern from "rbd/librbd.h" nogil:
const char *destname)
int rbd_trash_move(rados_ioctx_t io, const char *name, uint64_t delay)
+ int rbd_trash_get(rados_ioctx_t io, const char *id,
+ rbd_trash_image_info_t *info)
+ void rbd_trash_get_cleanup(rbd_trash_image_info_t *info)
int rbd_trash_list(rados_ioctx_t io, rbd_trash_image_info_t *trash_entries,
size_t *num_entries)
void rbd_trash_list_cleanup(rbd_trash_image_info_t *trash_entries,
@@ -229,10 +232,11 @@ cdef extern from "rbd/librbd.h" nogil:
int rbd_get_id(rbd_image_t image, char *id, size_t id_len)
int rbd_get_block_name_prefix(rbd_image_t image, char *prefix,
size_t prefix_len)
- int rbd_get_parent_info(rbd_image_t image,
- char *parent_poolname, size_t ppoolnamelen,
- char *parent_name, size_t pnamelen,
- char *parent_snapname, size_t psnapnamelen)
+ int rbd_get_parent_info2(rbd_image_t image,
+ char *parent_poolname, size_t ppoolnamelen,
+ char *parent_name, size_t pnamelen,
+ char *parent_id, size_t pidlen,
+ char *parent_snapname, size_t psnapnamelen)
int rbd_get_flags(rbd_image_t image, uint64_t *flags)
ssize_t rbd_read2(rbd_image_t image, uint64_t ofs, size_t len,
char *buf, int op_flags)
@@ -381,31 +385,41 @@ class Error(Exception):
pass
-class PermissionError(Error):
+class OSError(Error):
+ """ `OSError` class, derived from `Error` """
+ def __init__(self, errno, strerror):
+ self.errno = errno
+ self.strerror = strerror
+
+ def __str__(self):
+ return '[Errno {0}] {1}'.format(self.errno, self.strerror)
+
+
+class PermissionError(OSError):
pass
-class ImageNotFound(Error):
+class ImageNotFound(OSError):
pass
-class ImageExists(Error):
+class ImageExists(OSError):
pass
-class IOError(Error):
+class IOError(OSError):
pass
-class NoSpace(Error):
+class NoSpace(OSError):
pass
-class IncompleteWriteError(Error):
+class IncompleteWriteError(OSError):
pass
-class InvalidArgument(Error):
+class InvalidArgument(OSError):
pass
@@ -413,34 +427,34 @@ class LogicError(Error):
pass
-class ReadOnlyImage(Error):
+class ReadOnlyImage(OSError):
pass
-class ImageBusy(Error):
+class ImageBusy(OSError):
pass
-class ImageHasSnapshots(Error):
+class ImageHasSnapshots(OSError):
pass
-class FunctionNotSupported(Error):
+class FunctionNotSupported(OSError):
pass
-class ArgumentOutOfRange(Error):
+class ArgumentOutOfRange(OSError):
pass
-class ConnectionShutdown(Error):
+class ConnectionShutdown(OSError):
pass
-class Timeout(Error):
+class Timeout(OSError):
pass
-class DiskQuotaExceeded(Error):
+class DiskQuotaExceeded(OSError):
pass
@@ -473,9 +487,9 @@ cdef make_ex(ret, msg):
"""
ret = abs(ret)
if ret in errno_to_exception:
- return errno_to_exception[ret](msg)
+ return errno_to_exception[ret](ret, msg)
else:
- return Error(msg + (": error code %d" % ret))
+ return Error(ret, msg + (": error code %d" % ret))
cdef rados_ioctx_t convert_ioctx(rados.Ioctx ioctx) except? NULL:
@@ -919,6 +933,49 @@ class RBD(object):
if ret != 0:
raise make_ex(ret, 'error deleting image from trash')
+ def trash_get(self, ioctx, image_id):
+ """
+ Retrieve RBD image info from trash
+ :param ioctx: determines which RADOS pool the image is in
+ :type ioctx: :class:`rados.Ioctx`
+ :param image_id: the id of the image to restore
+ :type image_id: str
+ :returns: dict - contains the following keys:
+
+ * ``id`` (str) - image id
+
+ * ``name`` (str) - image name
+
+ * ``source`` (str) - source of deletion
+
+ * ``deletion_time`` (datetime) - time of deletion
+
+ * ``deferment_end_time`` (datetime) - time that an image is allowed
+ to be removed from trash
+
+ :raises: :class:`ImageNotFound`
+ """
+ image_id = cstr(image_id, 'image_id')
+ cdef:
+ rados_ioctx_t _ioctx = convert_ioctx(ioctx)
+ char *_image_id = image_id
+ rbd_trash_image_info_t c_info
+ with nogil:
+ ret = rbd_trash_get(_ioctx, _image_id, &c_info)
+ if ret != 0:
+ raise make_ex(ret, 'error retrieving image from trash')
+
+ __source_string = ['USER', 'MIRRORING']
+ info = {
+ 'id' : decode_cstr(c_info.id),
+ 'name' : decode_cstr(c_info.name),
+ 'source' : __source_string[c_info.source],
+ 'deletion_time' : datetime.fromtimestamp(c_info.deletion_time),
+ 'deferment_end_time' : datetime.fromtimestamp(c_info.deferment_end_time)
+ }
+ rbd_trash_get_cleanup(&c_info)
+ return info
+
def trash_list(self, ioctx):
"""
Lists all entries from trash.
@@ -1506,8 +1563,8 @@ cdef class Image(object):
name = <char *>realloc_chk(name, size)
snapname = <char *>realloc_chk(snapname, size)
with nogil:
- ret = rbd_get_parent_info(self.image, pool, size, name,
- size, snapname, size)
+ ret = rbd_get_parent_info2(self.image, pool, size, name,
+ size, NULL, 0, snapname, size)
if ret == -errno.ERANGE:
size *= 2
@@ -1519,6 +1576,32 @@ cdef class Image(object):
free(name)
free(snapname)
+ def parent_id(self):
+ """
+ Get image id of a cloned image's parent (if any)
+
+ :returns: str - the parent id
+ :raises: :class:`ImageNotFound` if the image doesn't have a parent
+ """
+ cdef:
+ int ret = -errno.ERANGE
+ size_t size = 32
+ char *parent_id = NULL
+ try:
+ while ret == -errno.ERANGE and size <= 4096:
+ parent_id = <char *>realloc_chk(parent_id, size)
+ with nogil:
+ ret = rbd_get_parent_info2(self.image, NULL, 0, NULL, 0,
+ parent_id, size, NULL, 0)
+ if ret == -errno.ERANGE:
+ size *= 2
+
+ if ret != 0:
+ raise make_ex(ret, 'error getting parent id for image %s' % (self.name,))
+ return decode_cstr(parent_id)
+ finally:
+ free(parent_id)
+
def old_format(self):
"""
Find out whether the image uses the old RBD format.
@@ -2771,8 +2854,8 @@ cdef class SnapIterator(object):
self.num_snaps = 10
while True:
self.snaps = <rbd_snap_info_t*>realloc_chk(self.snaps,
- self.num_snaps *
- sizeof(rbd_snap_info_t))
+ self.num_snaps *
+ sizeof(rbd_snap_info_t))
with nogil:
ret = rbd_snap_list(image.image, self.snaps, &self.num_snaps)
if ret >= 0:
@@ -2822,12 +2905,18 @@ cdef class TrashIterator(object):
def __init__(self, ioctx):
self.ioctx = convert_ioctx(ioctx)
self.num_entries = 1024
- self.entries = <rbd_trash_image_info_t *>realloc_chk(NULL,
- sizeof(rbd_trash_image_info_t) * self.num_entries)
- with nogil:
- ret = rbd_trash_list(self.ioctx, self.entries, &self.num_entries)
- if ret < 0:
- raise make_ex(ret, 'error listing trash entries')
+ self.entries = NULL
+ while True:
+ self.entries = <rbd_trash_image_info_t*>realloc_chk(self.entries,
+ self.num_entries *
+ sizeof(rbd_trash_image_info_t))
+ with nogil:
+ ret = rbd_trash_list(self.ioctx, self.entries, &self.num_entries)
+ if ret >= 0:
+ self.num_entries = ret
+ break
+ elif ret != -errno.ERANGE:
+ raise make_ex(ret, 'error listing trash entries')
__source_string = ['USER', 'MIRRORING']
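Together with the existing trash_move()/trash_restore(), the new trash_get() and Image.parent_id() complete the trash and clone introspection surface of the binding. A sketch against the signatures added above (pool, image id, and image name are placeholders; parent_id() raises ImageNotFound when the image is not a clone):

    # Hypothetical names throughout; API per the bindings added above.
    import rados
    import rbd

    cluster = rados.Rados(conffile='')        # config path illustrative
    cluster.connect()
    ioctx = cluster.open_ioctx('rbd')         # pool name placeholder
    try:
        info = rbd.RBD().trash_get(ioctx, '0a1b2c3d4e5f')  # image id placeholder
        print(info['name'], info['source'], info['deferment_end_time'])
        with rbd.Image(ioctx, 'child-image') as img:       # image name placeholder
            print('parent id: %s' % img.parent_id())
    finally:
        ioctx.close()
        cluster.shutdown()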
diff --git a/src/pybind/rgw/rgw.pyx b/src/pybind/rgw/rgw.pyx
index b492d70123a..f512d33f84a 100644
--- a/src/pybind/rgw/rgw.pyx
+++ b/src/pybind/rgw/rgw.pyx
@@ -183,11 +183,21 @@ class Error(Exception):
pass
-class PermissionError(Error):
+class OSError(Error):
+ """ `OSError` class, derived from `Error` """
+ def __init__(self, errno, strerror):
+ self.errno = errno
+ self.strerror = strerror
+
+ def __str__(self):
+ return '[Errno {0}] {1}'.format(self.errno, self.strerror)
+
+
+class PermissionError(OSError):
pass
-class ObjectNotFound(Error):
+class ObjectNotFound(OSError):
pass
@@ -199,7 +209,7 @@ class ObjectExists(Error):
pass
-class IOError(Error):
+class IOError(OSError):
pass
@@ -299,7 +309,7 @@ cdef make_ex(ret, msg):
"""
ret = abs(ret)
if ret in errno_to_exception:
- return errno_to_exception[ret](msg)
+ return errno_to_exception[ret](ret, msg)
else:
return Error(msg + (": error code %d" % ret))
diff --git a/src/rapidjson b/src/rapidjson
new file mode 160000
+Subproject commit f54b0e47a08782a6131cc3d60f94d038fa6e0a5
diff --git a/src/rbd_replay/ActionTypes.cc b/src/rbd_replay/ActionTypes.cc
index 4e7a297a6c7..eed19fcec1a 100644
--- a/src/rbd_replay/ActionTypes.cc
+++ b/src/rbd_replay/ActionTypes.cc
@@ -26,7 +26,7 @@ void decode_big_endian_string(std::string &str, bufferlist::iterator &it) {
#if defined(CEPH_LITTLE_ENDIAN)
uint32_t length;
::decode(length, it);
- length = swab32(length);
+ length = swab(length);
str.clear();
it.copy(length, str);
#else
@@ -92,8 +92,8 @@ void Dependency::decode(__u8 version, bufferlist::iterator &it) {
::decode(id, it);
::decode(time_delta, it);
if (byte_swap_required(version)) {
- id = swab32(id);
- time_delta = swab64(time_delta);
+ id = swab(id);
+ time_delta = swab(time_delta);
}
}
@@ -125,12 +125,12 @@ void ActionBase::decode(__u8 version, bufferlist::iterator &it) {
}
if (byte_swap_required(version)) {
- id = swab32(id);
- thread_id = swab64(thread_id);
+ id = swab(id);
+ thread_id = swab(thread_id);
uint32_t dep_count;
::decode(dep_count, it);
- dep_count = swab32(dep_count);
+ dep_count = swab(dep_count);
dependencies.resize(dep_count);
for (uint32_t i = 0; i < dep_count; ++i) {
dependencies[i].decode(0, it);
@@ -161,7 +161,7 @@ void ImageActionBase::decode(__u8 version, bufferlist::iterator &it) {
ActionBase::decode(version, it);
::decode(imagectx_id, it);
if (byte_swap_required(version)) {
- imagectx_id = swab64(imagectx_id);
+ imagectx_id = swab(imagectx_id);
}
}
@@ -181,8 +181,8 @@ void IoActionBase::decode(__u8 version, bufferlist::iterator &it) {
::decode(offset, it);
::decode(length, it);
if (byte_swap_required(version)) {
- offset = swab64(offset);
- length = swab64(length);
+ offset = swab(offset);
+ length = swab(length);
}
}
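The swab32()/swab64() call sites above collapse into one overload set chosen by the argument's type, which removes the class of bug where a 64-bit field is swapped with the 32-bit helper. A plausible sketch of such a helper, assuming GCC/Clang builtins (Ceph's actual definition lives in its byteorder header and may differ):

    #include <cstdint>

    // Width-dispatched byte swap: overload resolution picks the right
    // variant from the argument type, so the caller can't mismatch widths.
    inline uint16_t swab(uint16_t v) { return __builtin_bswap16(v); }
    inline uint32_t swab(uint32_t v) { return __builtin_bswap32(v); }
    inline uint64_t swab(uint64_t v) { return __builtin_bswap64(v); }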
diff --git a/src/rgw/CMakeLists.txt b/src/rgw/CMakeLists.txt
index 583b3531cdb..591d136ccae 100644
--- a/src/rgw/CMakeLists.txt
+++ b/src/rgw/CMakeLists.txt
@@ -25,6 +25,19 @@ add_custom_target(civetweb_h
"${CMAKE_BINARY_DIR}/src/include/civetweb"
COMMENT "keep civetweb.h up-to-date")
+find_program(GPERF gperf)
+if(NOT GPERF)
+ message(FATAL_ERROR "Can't find gperf")
+endif()
+function(gperf_generate input output)
+ add_custom_command(
+ OUTPUT ${output}
+ COMMAND ${GPERF} ${input} > ${output}
+ DEPENDS ${input}
+ COMMENT "Generate ${output}"
+ )
+endfunction()
+
set(rgw_a_srcs
rgw_acl.cc
rgw_acl_s3.cc
@@ -103,7 +116,15 @@ set(rgw_a_srcs
rgw_xml_enc.cc
rgw_torrent.cc
rgw_crypt.cc
- rgw_crypt_sanitize.cc)
+ rgw_crypt_sanitize.cc
+ rgw_iam_policy.cc)
+
+gperf_generate(${CMAKE_SOURCE_DIR}/src/rgw/rgw_iam_policy_keywords.gperf
+ rgw_iam_policy_keywords.frag.cc)
+set_source_files_properties(rgw_iam_policy.cc PROPERTIES
+ OBJECT_DEPENDS ${CMAKE_BINARY_DIR}/src/rgw/rgw_iam_policy_keywords.frag.cc
+ COMPILE_FLAGS -I${CMAKE_BINARY_DIR}/src/rgw)
+
if (WITH_RADOSGW_FCGI_FRONTEND)
list(APPEND rgw_a_srcs rgw_fcgi.cc)
@@ -115,12 +136,15 @@ add_dependencies(rgw_a civetweb_h)
target_include_directories(rgw_a PUBLIC
"../Beast/include"
- ${FCGI_INCLUDE_DIR})
+ ${FCGI_INCLUDE_DIR}
+ "../rapidjson/include"
+ )
+target_compile_definitions(rgw_a PUBLIC BOOST_COROUTINES_NO_DEPRECATION_WARNING)
target_link_libraries(rgw_a librados cls_lock_client cls_rgw_client cls_refcount_client
cls_log_client cls_statelog_client cls_timeindex_client cls_version_client
cls_replica_log_client cls_user_client ceph-common common_utf8 global
- ${CURL_LIBRARIES}
+ ${CURL_LIBRARIES} ${Boost_LIBRARIES}
${EXPAT_LIBRARIES}
${OPENLDAP_LIBRARIES} ${CRYPTO_LIBS})
@@ -134,15 +158,15 @@ if (WITH_RADOSGW_FCGI_FRONTEND)
list(APPEND radosgw_srcs rgw_fcgi_process.cc)
endif()
-if (WITH_RADOSGW_ASIO_FRONTEND)
+if (WITH_RADOSGW_BEAST_FRONTEND)
list(APPEND radosgw_srcs
rgw_asio_client.cc
rgw_asio_frontend.cc)
-endif (WITH_RADOSGW_ASIO_FRONTEND)
+endif (WITH_RADOSGW_BEAST_FRONTEND)
add_library(radosgw_a STATIC ${radosgw_srcs}
$<TARGET_OBJECTS:civetweb_common_objs>)
-target_link_libraries(radosgw_a rgw_a)
+target_link_libraries(radosgw_a rgw_a ${SSL_LIBRARIES})
add_executable(radosgw rgw_main.cc)
target_link_libraries(radosgw radosgw_a librados
@@ -150,7 +174,7 @@ target_link_libraries(radosgw radosgw_a librados
cls_log_client cls_statelog_client cls_timeindex_client
cls_version_client cls_replica_log_client cls_user_client
global ${FCGI_LIBRARY} ${LIB_RESOLV}
- ${CURL_LIBRARIES} ${EXPAT_LIBRARIES} ${SSL_LIBRARIES} ${BLKID_LIBRARIES}
+ ${CURL_LIBRARIES} ${EXPAT_LIBRARIES} ${BLKID_LIBRARIES}
${ALLOC_LIBS})
# radosgw depends on cls libraries at runtime, but not as link dependencies
add_dependencies(radosgw cls_rgw cls_lock cls_refcount
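The CMake hunk wires gperf into the build: the keyword list in rgw_iam_policy_keywords.gperf is compiled into a perfect-hash recognizer, and the generated .frag.cc is textually #included by rgw_iam_policy.cc, hence the OBJECT_DEPENDS plus the -I pointing at the build tree rather than a separate library target. A hedged sketch of the interface such generated code exposes (token names invented for illustration; the linear scan below only stands in for gperf's collision-free hash plus one confirming strcmp):

    #include <cstring>

    enum Token { TOK_EFFECT, TOK_ACTION, TOK_RESOURCE, TOK_NONE };

    static Token lookup_keyword(const char *s, size_t len) {
        static const struct { const char *name; Token tok; } table[] = {
            {"Effect", TOK_EFFECT},
            {"Action", TOK_ACTION},
            {"Resource", TOK_RESOURCE},
        };
        for (const auto& e : table)
            if (std::strlen(e.name) == len && std::strcmp(e.name, s) == 0)
                return e.tok;
        return TOK_NONE;
    }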
diff --git a/src/rgw/rgw_acl.h b/src/rgw/rgw_acl.h
index 24567677e8e..26c84d121a2 100644
--- a/src/rgw/rgw_acl.h
+++ b/src/rgw/rgw_acl.h
@@ -15,8 +15,6 @@
#include "rgw_basic_types.h"
-using namespace std;
-
#define RGW_PERM_NONE 0x00
#define RGW_PERM_READ 0x01
#define RGW_PERM_WRITE 0x02
diff --git a/src/rgw/rgw_acl_s3.h b/src/rgw/rgw_acl_s3.h
index 3a52735e091..6991a0b8363 100644
--- a/src/rgw/rgw_acl_s3.h
+++ b/src/rgw/rgw_acl_s3.h
@@ -13,9 +13,6 @@
#include "rgw_xml.h"
#include "rgw_acl.h"
-
-using namespace std;
-
class RGWRados;
class ACLPermission_S3 : public ACLPermission, public XMLObj
diff --git a/src/rgw/rgw_acl_swift.cc b/src/rgw/rgw_acl_swift.cc
index 78b60124ba6..339b93bbe06 100644
--- a/src/rgw/rgw_acl_swift.cc
+++ b/src/rgw/rgw_acl_swift.cc
@@ -179,11 +179,13 @@ int RGWAccessControlPolicy_SWIFT::create(RGWRados* const store,
const rgw_user& id,
const std::string& name,
const std::string& read_list,
- const std::string& write_list)
+ const std::string& write_list,
+ uint32_t& rw_mask)
{
acl.create_default(id, name);
owner.set_id(id);
owner.set_name(name);
+ rw_mask = 0;
if (read_list.size()) {
std::vector<std::string> uids;
@@ -200,6 +202,7 @@ int RGWAccessControlPolicy_SWIFT::create(RGWRados* const store,
<< r << dendl;
return r;
}
+ rw_mask |= SWIFT_PERM_READ;
}
if (write_list.size()) {
std::vector<std::string> uids;
@@ -216,10 +219,45 @@ int RGWAccessControlPolicy_SWIFT::create(RGWRados* const store,
<< r << dendl;
return r;
}
+ rw_mask |= SWIFT_PERM_WRITE;
}
return 0;
}
+void RGWAccessControlPolicy_SWIFT::filter_merge(uint32_t rw_mask,
+ RGWAccessControlPolicy_SWIFT *old)
+{
+ /* rw_mask&SWIFT_PERM_READ => setting read acl,
+ * rw_mask&SWIFT_PERM_WRITE => setting write acl
+ * when a bit is cleared, copy the matching elements from old.
+ */
+ if (rw_mask == (SWIFT_PERM_READ|SWIFT_PERM_WRITE)) {
+ return;
+ }
+ rw_mask ^= (SWIFT_PERM_READ|SWIFT_PERM_WRITE);
+ for (auto &iter: old->acl.get_grant_map()) {
+ ACLGrant& grant = iter.second;
+ uint32_t perm = grant.get_permission().get_permissions();
+ rgw_user id;
+ string url_spec;
+ if (!grant.get_id(id)) {
+ if (grant.get_group() != ACL_GROUP_ALL_USERS) {
+ url_spec = grant.get_referer();
+ if (url_spec.empty()) {
+ continue;
+ }
+ if (perm == 0) {
+ /* We also need to carry negative, HTTP referrer-based ACLs. */
+ perm = SWIFT_PERM_READ;
+ }
+ }
+ }
+ if (perm & rw_mask) {
+ acl.add_grant(&grant);
+ }
+ }
+}
+
void RGWAccessControlPolicy_SWIFT::to_str(string& read, string& write)
{
multimap<string, ACLGrant>& m = acl.get_grant_map();
diff --git a/src/rgw/rgw_acl_swift.h b/src/rgw/rgw_acl_swift.h
index b74a85ecfe1..883b623af18 100644
--- a/src/rgw/rgw_acl_swift.h
+++ b/src/rgw/rgw_acl_swift.h
@@ -27,7 +27,9 @@ public:
const rgw_user& id,
const std::string& name,
const std::string& read_list,
- const std::string& write_list);
+ const std::string& write_list,
+ uint32_t& rw_mask);
+ void filter_merge(uint32_t mask, RGWAccessControlPolicy_SWIFT *policy);
void to_str(std::string& read, std::string& write);
};
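filter_merge() exists so that a Swift request that sets only X-Container-Read does not silently wipe the write ACL (and vice versa): create() now reports via rw_mask which halves were actually supplied, and the bits left clear are filled from the previous policy. A hedged sketch of the intended call sequence, with the surrounding op handler assumed to supply every argument:

    #include "rgw_acl_swift.h"

    int set_container_acls(RGWRados *store, const rgw_user& user,
                           const std::string& name,
                           const std::string& read_list,
                           const std::string& write_list,
                           RGWAccessControlPolicy_SWIFT& policy,
                           RGWAccessControlPolicy_SWIFT& old_policy)
    {
      uint32_t rw_mask = 0;
      int r = policy.create(store, user, name, read_list, write_list, rw_mask);
      if (r < 0)
        return r;
      // rw_mask has SWIFT_PERM_READ / SWIFT_PERM_WRITE set only for the
      // halves this request actually supplied; merge the old grants back
      // in for the clear bits instead of dropping them.
      policy.filter_merge(rw_mask, &old_policy);
      return 0;
    }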
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index 46a9d3f933c..69ff573fcde 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -421,6 +421,7 @@ enum {
OPT_METADATA_SYNC_INIT,
OPT_METADATA_SYNC_RUN,
OPT_MDLOG_LIST,
+ OPT_MDLOG_AUTOTRIM,
OPT_MDLOG_TRIM,
OPT_MDLOG_FETCH,
OPT_MDLOG_STATUS,
@@ -819,6 +820,8 @@ static int get_cmd(const char *cmd, const char *prev_cmd, const char *prev_prev_
} else if (strcmp(prev_cmd, "mdlog") == 0) {
if (strcmp(cmd, "list") == 0)
return OPT_MDLOG_LIST;
+ if (strcmp(cmd, "autotrim") == 0)
+ return OPT_MDLOG_AUTOTRIM;
if (strcmp(cmd, "trim") == 0)
return OPT_MDLOG_TRIM;
if (strcmp(cmd, "fetch") == 0)
@@ -1511,32 +1514,46 @@ int do_check_object_locator(const string& tenant_name, const string& bucket_name
return 0;
}
-#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response
-static int send_to_remote_gateway(const string& remote, req_info& info,
- bufferlist& in_data, JSONParser& parser)
+/// search for a matching zone/zonegroup id and return a connection if found
+static boost::optional<RGWRESTConn> get_remote_conn(RGWRados *store,
+ const RGWZoneGroup& zonegroup,
+ const std::string& remote)
{
- bufferlist response;
- RGWRESTConn *conn;
- if (remote.empty()) {
- if (!store->rest_master_conn) {
- cerr << "Invalid rest master connection" << std::endl;
- return -EINVAL;
- }
- conn = store->rest_master_conn;
+ boost::optional<RGWRESTConn> conn;
+ if (remote == zonegroup.get_id()) {
+ conn.emplace(store->ctx(), store, remote, zonegroup.endpoints);
} else {
- // check zonegroups
- auto iter = store->zonegroup_conn_map.find(remote);
- if (iter == store->zonegroup_conn_map.end()) {
- // check zones
- iter = store->zone_conn_map.find(remote);
- if (iter == store->zone_conn_map.end()) {
- cerr << "could not find connection for zone or zonegroup id: "
- << remote << std::endl;
- return -ENOENT;
+ for (const auto& z : zonegroup.zones) {
+ const auto& zone = z.second;
+ if (remote == zone.id) {
+ conn.emplace(store->ctx(), store, remote, zone.endpoints);
+ break;
}
}
- conn = iter->second;
}
+ return conn;
+}
+
+/// search each zonegroup for a connection
+static boost::optional<RGWRESTConn> get_remote_conn(RGWRados *store,
+ const RGWPeriodMap& period_map,
+ const std::string& remote)
+{
+ boost::optional<RGWRESTConn> conn;
+ for (const auto& zg : period_map.zonegroups) {
+ conn = get_remote_conn(store, zg.second, remote);
+ if (conn) {
+ break;
+ }
+ }
+ return conn;
+}
+
+#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response
+static int send_to_remote_gateway(RGWRESTConn* conn, req_info& info,
+ bufferlist& in_data, JSONParser& parser)
+{
+ bufferlist response;
rgw_user user;
int ret = conn->forward(user, info, NULL, MAX_REST_RESPONSE, &in_data, &response);
@@ -1574,20 +1591,21 @@ static int send_to_url(const string& url, const string& access,
return ret;
}
-static int send_to_remote_or_url(const string& remote, const string& url,
+static int send_to_remote_or_url(RGWRESTConn *conn, const string& url,
const string& access, const string& secret,
req_info& info, bufferlist& in_data,
JSONParser& parser)
{
if (url.empty()) {
- return send_to_remote_gateway(remote, info, in_data, parser);
+ return send_to_remote_gateway(conn, info, in_data, parser);
}
return send_to_url(url, access, secret, info, in_data, parser);
}
static int commit_period(RGWRealm& realm, RGWPeriod& period,
string remote, const string& url,
- const string& access, const string& secret)
+ const string& access, const string& secret,
+ bool force)
{
const string& master_zone = period.get_master_zone();
if (master_zone.empty()) {
@@ -1605,7 +1623,7 @@ static int commit_period(RGWRealm& realm, RGWPeriod& period,
return ret;
}
// the master zone can commit locally
- ret = period.commit(realm, current_period, cerr);
+ ret = period.commit(realm, current_period, cerr, force);
if (ret < 0) {
cerr << "failed to commit period: " << cpp_strerror(-ret) << std::endl;
}
@@ -1617,6 +1635,17 @@ static int commit_period(RGWRealm& realm, RGWPeriod& period,
remote = master_zone;
cout << "Sending period to new master zone " << remote << std::endl;
}
+ boost::optional<RGWRESTConn> conn;
+ RGWRESTConn *remote_conn = nullptr;
+ if (!remote.empty()) {
+ conn = get_remote_conn(store, period.get_map(), remote);
+ if (!conn) {
+ cerr << "failed to find a zone or zonegroup for remote "
+ << remote << std::endl;
+ return -ENOENT;
+ }
+ remote_conn = &*conn;
+ }
// push period to the master with an empty period id
period.set_id("");
@@ -1633,7 +1662,7 @@ static int commit_period(RGWRealm& realm, RGWPeriod& period,
jf.flush(bl);
JSONParser p;
- int ret = send_to_remote_or_url(remote, url, access, secret, info, bl, p);
+ int ret = send_to_remote_or_url(remote_conn, url, access, secret, info, bl, p);
if (ret < 0) {
cerr << "request failed: " << cpp_strerror(-ret) << std::endl;
@@ -1682,7 +1711,7 @@ static int update_period(const string& realm_id, const string& realm_name,
const string& period_id, const string& period_epoch,
bool commit, const string& remote, const string& url,
const string& access, const string& secret,
- Formatter *formatter)
+ Formatter *formatter, bool force)
{
RGWRealm realm(realm_id, realm_name);
int ret = realm.init(g_ceph_context, store);
@@ -1713,7 +1742,7 @@ static int update_period(const string& realm_id, const string& realm_name,
return ret;
}
if (commit) {
- ret = commit_period(realm, period, remote, url, access, secret);
+ ret = commit_period(realm, period, remote, url, access, secret, force);
if (ret < 0) {
cerr << "failed to commit period: " << cpp_strerror(-ret) << std::endl;
return ret;
@@ -1739,8 +1768,10 @@ static int init_bucket_for_sync(const string& tenant, const string& bucket_name,
return 0;
}
-static int do_period_pull(const string& remote, const string& url, const string& access_key, const string& secret_key,
- const string& realm_id, const string& realm_name, const string& period_id, const string& period_epoch,
+static int do_period_pull(RGWRESTConn *remote_conn, const string& url,
+ const string& access_key, const string& secret_key,
+ const string& realm_id, const string& realm_name,
+ const string& period_id, const string& period_epoch,
RGWPeriod *period)
{
RGWEnv env;
@@ -1760,7 +1791,7 @@ static int do_period_pull(const string& remote, const string& url, const string&
bufferlist bl;
JSONParser p;
- int ret = send_to_remote_or_url(remote, url, access_key, secret_key,
+ int ret = send_to_remote_or_url(remote_conn, url, access_key, secret_key,
info, bl, p);
if (ret < 0) {
cerr << "request failed: " << cpp_strerror(-ret) << std::endl;
@@ -1826,14 +1857,13 @@ static void get_md_sync_status(list<string>& status)
return;
}
- ret = sync.read_sync_status();
+ rgw_meta_sync_status sync_status;
+ ret = sync.read_sync_status(&sync_status);
if (ret < 0) {
status.push_back(string("failed to read sync status: ") + cpp_strerror(-ret));
return;
}
- const rgw_meta_sync_status& sync_status = sync.get_sync_status();
-
string status_str;
switch (sync_status.sync_info.state) {
case rgw_meta_sync_info::StateInit:
@@ -1912,7 +1942,8 @@ static void get_md_sync_status(list<string>& status)
continue;
}
auto master_marker = iter->second.marker;
- if (master_marker > local_iter.second.marker) {
+ if (local_iter.second.state == rgw_meta_sync_marker::SyncState::IncrementalSync &&
+ master_marker > local_iter.second.marker) {
shards_behind[shard_id] = local_iter.second.marker;
}
}
@@ -2059,7 +2090,8 @@ static void get_data_sync_status(const string& source_zone, list<string>& status
continue;
}
auto master_marker = iter->second.marker;
- if (master_marker > local_iter.second.marker) {
+ if (local_iter.second.state == rgw_data_sync_marker::SyncState::IncrementalSync &&
+ master_marker > local_iter.second.marker) {
shards_behind[shard_id] = local_iter.second.marker;
}
}
@@ -2823,15 +2855,6 @@ int main(int argc, const char **argv)
++i;
}
}
- if (tenant.empty()) {
- tenant = user_id.tenant;
- } else {
- if (user_id.empty()) {
- cerr << "ERROR: --tenant is set, but there's no user ID" << std::endl;
- return EINVAL;
- }
- user_id.tenant = tenant;
- }
if (args.empty()) {
return usage();
@@ -2871,6 +2894,23 @@ int main(int argc, const char **argv)
}
}
+ if (tenant.empty()) {
+ tenant = user_id.tenant;
+ } else {
+ if (user_id.empty() && opt_cmd != OPT_ROLE_CREATE
+ && opt_cmd != OPT_ROLE_DELETE
+ && opt_cmd != OPT_ROLE_GET
+ && opt_cmd != OPT_ROLE_MODIFY
+ && opt_cmd != OPT_ROLE_LIST
+ && opt_cmd != OPT_ROLE_POLICY_PUT
+ && opt_cmd != OPT_ROLE_POLICY_LIST
+ && opt_cmd != OPT_ROLE_POLICY_GET
+ && opt_cmd != OPT_ROLE_POLICY_DELETE) {
+ cerr << "ERROR: --tenant is set, but there's no user ID" << std::endl;
+ return EINVAL;
+ }
+ user_id.tenant = tenant;
+ }
/* check key parameter conflict */
if ((!access_key.empty()) && gen_access_key) {
cerr << "ERROR: key parameter conflict, --access-key & --gen-access-key" << std::endl;
@@ -2905,9 +2945,6 @@ int main(int argc, const char **argv)
// not a raw op if 'period update' needs to commit to master
bool raw_period_update = opt_cmd == OPT_PERIOD_UPDATE && !commit;
- // not a raw op if 'period pull' needs to look up remotes
- bool raw_period_pull = opt_cmd == OPT_PERIOD_PULL && remote.empty() && !url.empty();
-
std::set<int> raw_storage_ops_list = {OPT_ZONEGROUP_ADD, OPT_ZONEGROUP_CREATE, OPT_ZONEGROUP_DELETE,
OPT_ZONEGROUP_GET, OPT_ZONEGROUP_LIST,
OPT_ZONEGROUP_SET, OPT_ZONEGROUP_DEFAULT,
@@ -2923,6 +2960,7 @@ int main(int argc, const char **argv)
OPT_ZONE_PLACEMENT_MODIFY, OPT_ZONE_PLACEMENT_LIST,
OPT_REALM_CREATE,
OPT_PERIOD_DELETE, OPT_PERIOD_GET,
+ OPT_PERIOD_PULL,
OPT_PERIOD_GET_CURRENT, OPT_PERIOD_LIST,
OPT_GLOBAL_QUOTA_GET, OPT_GLOBAL_QUOTA_SET,
OPT_GLOBAL_QUOTA_ENABLE, OPT_GLOBAL_QUOTA_DISABLE,
@@ -2934,7 +2972,7 @@ int main(int argc, const char **argv)
bool raw_storage_op = (raw_storage_ops_list.find(opt_cmd) != raw_storage_ops_list.end() ||
- raw_period_update || raw_period_pull);
+ raw_period_update);
if (raw_storage_op) {
store = RGWStoreManager::get_raw_storage(g_ceph_context);
@@ -3040,20 +3078,45 @@ int main(int argc, const char **argv)
{
int ret = update_period(realm_id, realm_name, period_id, period_epoch,
commit, remote, url, access_key, secret_key,
- formatter);
+ formatter, yes_i_really_mean_it);
if (ret < 0) {
return -ret;
}
}
break;
- case OPT_PERIOD_PULL: // period pull --url
+ case OPT_PERIOD_PULL:
{
+ boost::optional<RGWRESTConn> conn;
+ RGWRESTConn *remote_conn = nullptr;
if (url.empty()) {
- cerr << "A --url or --remote must be provided." << std::endl;
- return EINVAL;
+ // load current period for endpoints
+ RGWRealm realm(realm_id, realm_name);
+ int ret = realm.init(g_ceph_context, store);
+ if (ret < 0) {
+ cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ RGWPeriod current_period(realm.get_current_period());
+ ret = current_period.init(g_ceph_context, store);
+ if (ret < 0) {
+ cerr << "failed to init current period: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ if (remote.empty()) {
+ // use realm master zone as remote
+ remote = current_period.get_master_zone();
+ }
+ conn = get_remote_conn(store, current_period.get_map(), remote);
+ if (!conn) {
+ cerr << "failed to find a zone or zonegroup for remote "
+ << remote << std::endl;
+ return -ENOENT;
+ }
+ remote_conn = &*conn;
}
+
RGWPeriod period;
- int ret = do_period_pull(remote, url, access_key, secret_key,
+ int ret = do_period_pull(remote_conn, url, access_key, secret_key,
realm_id, realm_name, period_id, period_epoch,
&period);
if (ret < 0) {
@@ -3399,7 +3462,7 @@ int main(int argc, const char **argv)
auto& current_period = realm.get_current_period();
if (!current_period.empty()) {
// pull the latest epoch of the realm's current period
- ret = do_period_pull(remote, url, access_key, secret_key,
+ ret = do_period_pull(nullptr, url, access_key, secret_key,
realm_id, realm_name, current_period, "",
&period);
if (ret < 0) {
@@ -3642,7 +3705,6 @@ int main(int argc, const char **argv)
}
if (need_update) {
- zonegroup.post_process_params();
ret = zonegroup.update();
if (ret < 0) {
cerr << "failed to update zonegroup: " << cpp_strerror(-ret) << std::endl;
@@ -4533,7 +4595,7 @@ int main(int argc, const char **argv)
jf.flush(bl);
JSONParser p;
- ret = send_to_remote_or_url(remote, url, access_key, secret_key,
+ ret = send_to_remote_or_url(nullptr, url, access_key, secret_key,
info, bl, p);
if (ret < 0) {
cerr << "request failed: " << cpp_strerror(-ret) << std::endl;
@@ -4541,43 +4603,11 @@ int main(int argc, const char **argv)
}
}
return 0;
- case OPT_PERIOD_PULL: // period pull --remote
- {
- if (remote.empty()) {
- /* use realm master zonegroup as remote */
- RGWRealm realm(realm_id, realm_name);
- int ret = realm.init(g_ceph_context, store);
- if (ret < 0) {
- cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl;
- return -ret;
- }
- RGWPeriod current_period(realm.get_current_period());
- ret = current_period.init(g_ceph_context, store);
- if (ret < 0) {
- cerr << "failed to init current period: " << cpp_strerror(-ret) << std::endl;
- return -ret;
- }
- remote = current_period.get_master_zonegroup();
- }
- RGWPeriod period;
- int ret = do_period_pull(remote, url, access_key, secret_key,
- realm_id, realm_name, period_id, period_epoch,
- &period);
- if (ret < 0) {
- cerr << "period pull failed: " << cpp_strerror(-ret) << std::endl;
- return -ret;
- }
-
- encode_json("period", period, formatter);
- formatter->flush(cout);
- cout << std::endl;
- }
- return 0;
case OPT_PERIOD_UPDATE:
{
int ret = update_period(realm_id, realm_name, period_id, period_epoch,
commit, remote, url, access_key, secret_key,
- formatter);
+ formatter, yes_i_really_mean_it);
if (ret < 0) {
return -ret;
}
@@ -4598,7 +4628,8 @@ int main(int argc, const char **argv)
cerr << "period init failed: " << cpp_strerror(-ret) << std::endl;
return -ret;
}
- ret = commit_period(realm, period, remote, url, access_key, secret_key);
+ ret = commit_period(realm, period, remote, url, access_key, secret_key,
+ yes_i_really_mean_it);
if (ret < 0) {
cerr << "failed to commit period: " << cpp_strerror(-ret) << std::endl;
return -ret;
@@ -4611,10 +4642,13 @@ int main(int argc, const char **argv)
return 0;
case OPT_ROLE_CREATE:
{
- string uid;
- user_id.to_str(uid);
- if (role_name.empty() || assume_role_doc.empty() || uid.empty()) {
- cerr << "ERROR: one of role name or assume role policy document or uid is empty" << std::endl;
+ if (role_name.empty()) {
+ cerr << "ERROR: role name is empty" << std::endl;
+ return -EINVAL;
+ }
+
+ if (assume_role_doc.empty()) {
+ cerr << "ERROR: assume role policy document is empty" << std::endl;
return -EINVAL;
}
/* The following two calls will be replaced by read_decode_json or something
@@ -4631,7 +4665,7 @@ int main(int argc, const char **argv)
return -EINVAL;
}
string trust_policy = bl.to_str();
- RGWRole role(g_ceph_context, store, role_name, path, trust_policy, uid);
+ RGWRole role(g_ceph_context, store, role_name, path, trust_policy, tenant);
ret = role.create(true);
if (ret < 0) {
return -ret;
@@ -4645,7 +4679,7 @@ int main(int argc, const char **argv)
cerr << "ERROR: empty role name" << std::endl;
return -EINVAL;
}
- RGWRole role(g_ceph_context, store, role_name);
+ RGWRole role(g_ceph_context, store, role_name, tenant);
ret = role.delete_obj();
if (ret < 0) {
return -ret;
@@ -4659,7 +4693,7 @@ int main(int argc, const char **argv)
cerr << "ERROR: empty role name" << std::endl;
return -EINVAL;
}
- RGWRole role(g_ceph_context, store, role_name);
+ RGWRole role(g_ceph_context, store, role_name, tenant);
ret = role.get();
if (ret < 0) {
return -ret;
@@ -4669,10 +4703,16 @@ int main(int argc, const char **argv)
}
case OPT_ROLE_MODIFY:
{
- if (role_name.empty() || assume_role_doc.empty()) {
- cerr << "ERROR: one of role name or assume role policy document is empty" << std::endl;
+ if (role_name.empty()) {
+ cerr << "ERROR: role name is empty" << std::endl;
return -EINVAL;
}
+
+ if (assume_role_doc.empty()) {
+ cerr << "ERROR: assume role policy document is empty" << std::endl;
+ return -EINVAL;
+ }
+
/* The following two calls will be replaced by read_decode_json or something
similar when the code for AWS Policies is in place */
bufferlist bl;
@@ -4687,7 +4727,7 @@ int main(int argc, const char **argv)
return -EINVAL;
}
string trust_policy = bl.to_str();
- RGWRole role(g_ceph_context, store, role_name);
+ RGWRole role(g_ceph_context, store, role_name, tenant);
ret = role.get();
if (ret < 0) {
return -ret;
@@ -4703,7 +4743,7 @@ int main(int argc, const char **argv)
case OPT_ROLE_LIST:
{
vector<RGWRole> result;
- ret = RGWRole::get_roles_by_path_prefix(store, g_ceph_context, path_prefix, result);
+ ret = RGWRole::get_roles_by_path_prefix(store, g_ceph_context, path_prefix, tenant, result);
if (ret < 0) {
return -ret;
}
@@ -4712,10 +4752,21 @@ int main(int argc, const char **argv)
}
case OPT_ROLE_POLICY_PUT:
{
- if (role_name.empty() || policy_name.empty() || perm_policy_doc.empty()) {
- cerr << "One of role name, policy name or permission policy document is empty" << std::endl;
+ if (role_name.empty()) {
+ cerr << "role name is empty" << std::endl;
+ return -EINVAL;
+ }
+
+ if (policy_name.empty()) {
+ cerr << "policy name is empty" << std::endl;
+ return -EINVAL;
+ }
+
+ if (perm_policy_doc.empty()) {
+ cerr << "permission policy document is empty" << std::endl;
return -EINVAL;
}
+
/* The following two calls will be replaced by read_decode_json or something
similar, when code for AWS Policies is in place.*/
bufferlist bl;
@@ -4732,7 +4783,7 @@ int main(int argc, const char **argv)
string perm_policy;
perm_policy = bl.c_str();
- RGWRole role(g_ceph_context, store, role_name);
+ RGWRole role(g_ceph_context, store, role_name, tenant);
ret = role.get();
if (ret < 0) {
return -ret;
@@ -4751,7 +4802,7 @@ int main(int argc, const char **argv)
cerr << "ERROR: Role name is empty" << std::endl;
return -EINVAL;
}
- RGWRole role(g_ceph_context, store, role_name);
+ RGWRole role(g_ceph_context, store, role_name, tenant);
ret = role.get();
if (ret < 0) {
return -ret;
@@ -4762,11 +4813,16 @@ int main(int argc, const char **argv)
}
case OPT_ROLE_POLICY_GET:
{
- if (role_name.empty() || policy_name.empty()) {
- cerr << "ERROR: One of role name or policy name is empty" << std::endl;
+ if (role_name.empty()) {
+ cerr << "ERROR: role name is empty" << std::endl;
return -EINVAL;
}
- RGWRole role(g_ceph_context, store, role_name);
+
+ if (policy_name.empty()) {
+ cerr << "ERROR: policy name is empty" << std::endl;
+ return -EINVAL;
+ }
+ RGWRole role(g_ceph_context, store, role_name, tenant);
int ret = role.get();
if (ret < 0) {
return -ret;
@@ -4781,11 +4837,16 @@ int main(int argc, const char **argv)
}
case OPT_ROLE_POLICY_DELETE:
{
- if (role_name.empty() || policy_name.empty()) {
- cerr << "ERROR: One of role name or policy name is empty" << std::endl;
+ if (role_name.empty()) {
+ cerr << "ERROR: role name is empty" << std::endl;
+ return -EINVAL;
+ }
+
+ if (policy_name.empty()) {
+ cerr << "ERROR: policy name is empty" << std::endl;
return -EINVAL;
}
- RGWRole role(g_ceph_context, store, role_name);
+ RGWRole role(g_ceph_context, store, role_name, tenant);
ret = role.get();
if (ret < 0) {
return -ret;
@@ -6170,6 +6231,26 @@ next:
formatter->flush(cout);
}
+ if (opt_cmd == OPT_MDLOG_AUTOTRIM) {
+ // need a full history for purging old mdlog periods
+ store->meta_mgr->init_oldest_log_period();
+
+ RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry());
+ RGWHTTPManager http(store->ctx(), crs.get_completion_mgr());
+ int ret = http.set_threaded();
+ if (ret < 0) {
+ cerr << "failed to initialize http client with " << cpp_strerror(ret) << std::endl;
+ return -ret;
+ }
+
+ auto num_shards = g_conf->rgw_md_log_max_shards;
+ ret = crs.run(create_admin_meta_log_trim_cr(store, &http, num_shards));
+ if (ret < 0) {
+ cerr << "automated mdlog trim failed with " << cpp_strerror(ret) << std::endl;
+ return -ret;
+ }
+ }
+
if (opt_cmd == OPT_MDLOG_TRIM) {
utime_t start_time, end_time;
@@ -6212,14 +6293,13 @@ next:
return -ret;
}
- ret = sync.read_sync_status();
+ rgw_meta_sync_status sync_status;
+ ret = sync.read_sync_status(&sync_status);
if (ret < 0) {
cerr << "ERROR: sync.read_sync_status() returned ret=" << ret << std::endl;
return -ret;
}
- const rgw_meta_sync_status& sync_status = sync.get_sync_status();
-
formatter->open_object_section("summary");
encode_json("sync_status", sync_status, formatter);
@@ -6255,7 +6335,7 @@ next:
}
ret = sync.init_sync_status();
if (ret < 0) {
- cerr << "ERROR: sync.get_sync_status() returned ret=" << ret << std::endl;
+ cerr << "ERROR: sync.init_sync_status() returned ret=" << ret << std::endl;
return -ret;
}
}
@@ -6336,7 +6416,7 @@ next:
ret = sync.init_sync_status();
if (ret < 0) {
- cerr << "ERROR: sync.get_sync_status() returned ret=" << ret << std::endl;
+ cerr << "ERROR: sync.init_sync_status() returned ret=" << ret << std::endl;
return -ret;
}
}
@@ -6384,7 +6464,7 @@ next:
}
ret = sync.init_sync_status();
if (ret < 0) {
- cerr << "ERROR: sync.get_sync_status() returned ret=" << ret << std::endl;
+ cerr << "ERROR: sync.init_sync_status() returned ret=" << ret << std::endl;
return -ret;
}
}
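A recurring idiom in the rgw_admin.cc changes above: get_remote_conn() returns boost::optional<RGWRESTConn>, so "no match" needs no sentinel object, and the caller materializes a raw pointer with &*conn only when the optional is engaged, leaving nullptr to mean "send to --url instead". Reduced to a standalone sketch:

    #include <boost/optional.hpp>
    #include <iostream>
    #include <string>

    struct Conn { std::string endpoint; };

    // stand-in for get_remote_conn(): engaged only when a match exists
    boost::optional<Conn> find_conn(const std::string& remote) {
        if (remote == "zone1")
            return Conn{"http://zone1:8000"};
        return boost::none;
    }

    int main() {
        auto conn = find_conn("zone1");
        Conn *remote_conn = nullptr;   // nullptr => fall back to the URL path
        if (conn)
            remote_conn = &*conn;      // borrowed; `conn` must outlive the use
        if (remote_conn)
            std::cout << remote_conn->endpoint << '\n';
    }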
diff --git a/src/rgw/rgw_asio_client.cc b/src/rgw/rgw_asio_client.cc
index 219a36b6837..63de2d27e75 100644
--- a/src/rgw/rgw_asio_client.cc
+++ b/src/rgw/rgw_asio_client.cc
@@ -3,6 +3,7 @@
#include <boost/algorithm/string/predicate.hpp>
#include <boost/asio/write.hpp>
+#include <beast/http/read.hpp>
#include "rgw_asio_client.h"
@@ -12,22 +13,23 @@
#undef dout_prefix
#define dout_prefix (*_dout << "asio: ")
+using namespace rgw::asio;
-RGWAsioClientIO::RGWAsioClientIO(tcp::socket&& socket,
- request_type&& request)
- : socket(std::move(socket)),
- request(std::move(request)),
- txbuf(*this) {
+ClientIO::ClientIO(tcp::socket& socket,
+ parser_type& parser,
+ beast::flat_streambuf& buffer)
+ : socket(socket), parser(parser), buffer(buffer), txbuf(*this)
+{
}
-RGWAsioClientIO::~RGWAsioClientIO() = default;
+ClientIO::~ClientIO() = default;
-void RGWAsioClientIO::init_env(CephContext *cct)
+void ClientIO::init_env(CephContext *cct)
{
env.init(cct);
- body_iter = request.body.begin();
- const auto& headers = request.headers;
+ const auto& request = parser.get();
+ const auto& headers = request.fields;
for (auto header = headers.begin(); header != headers.end(); ++header) {
const auto& name = header->name();
const auto& value = header->value();
@@ -80,42 +82,58 @@ void RGWAsioClientIO::init_env(CephContext *cct)
// TODO: set REMOTE_USER if authenticated
}
-size_t RGWAsioClientIO::write_data(const char* const buf,
- const size_t len)
+size_t ClientIO::write_data(const char* buf, size_t len)
{
boost::system::error_code ec;
auto bytes = boost::asio::write(socket, boost::asio::buffer(buf, len), ec);
if (ec) {
derr << "write_data failed: " << ec.message() << dendl;
throw rgw::io::Exception(ec.value(), std::system_category());
- } else {
- /* According to the documentation of boost::asio::write if there is
- * no error (signalised by ec), then bytes == len. We don't need to
- * take care of partial writes in such situation. */
- return bytes;
}
+ /* According to the documentation of boost::asio::write if there is
+ * no error (signalised by ec), then bytes == len. We don't need to
+ * take care of partial writes in such a situation. */
+ return bytes;
}
-size_t RGWAsioClientIO::read_data(char* const buf, const size_t max)
+size_t ClientIO::read_data(char* buf, size_t max)
{
- // read data from the body's bufferlist
- auto bytes = std::min<unsigned>(max, body_iter.get_remaining());
- body_iter.copy(bytes, buf);
- return bytes;
+ auto& message = parser.get();
+ auto& body_remaining = message.body;
+ body_remaining = boost::asio::mutable_buffer{buf, max};
+
+ boost::system::error_code ec;
+
+ dout(30) << this << " read_data for " << max << " with "
+ << buffer.size() << " bytes buffered" << dendl;
+
+ while (boost::asio::buffer_size(body_remaining) && !parser.is_complete()) {
+ auto bytes = beast::http::read_some(socket, buffer, parser, ec);
+ buffer.consume(bytes);
+ if (ec == boost::asio::error::connection_reset ||
+ ec == boost::asio::error::eof ||
+ ec == beast::http::error::partial_message) {
+ break;
+ }
+ if (ec) {
+ derr << "failed to read body: " << ec.message() << dendl;
+ throw rgw::io::Exception(ec.value(), std::system_category());
+ }
+ }
+ return max - boost::asio::buffer_size(body_remaining);
}
-size_t RGWAsioClientIO::complete_request()
+size_t ClientIO::complete_request()
{
return 0;
}
-void RGWAsioClientIO::flush()
+void ClientIO::flush()
{
txbuf.pubsync();
}
-size_t RGWAsioClientIO::send_status(const int status,
- const char* const status_name)
+size_t ClientIO::send_status(int status, const char* status_name)
{
static constexpr size_t STATUS_BUF_SIZE = 128;
@@ -126,7 +144,7 @@ size_t RGWAsioClientIO::send_status(const int status,
return txbuf.sputn(statusbuf, statuslen);
}
-size_t RGWAsioClientIO::send_100_continue()
+size_t ClientIO::send_100_continue()
{
const char HTTTP_100_CONTINUE[] = "HTTP/1.1 100 CONTINUE\r\n\r\n";
const size_t sent = txbuf.sputn(HTTTP_100_CONTINUE,
@@ -148,7 +166,7 @@ static size_t dump_date_header(char (&timestr)[TIME_BUF_SIZE])
"Date: %a, %d %b %Y %H:%M:%S %Z\r\n", tmp);
}
-size_t RGWAsioClientIO::complete_header()
+size_t ClientIO::complete_header()
{
size_t sent = 0;
@@ -172,8 +190,8 @@ size_t RGWAsioClientIO::complete_header()
return sent;
}
-size_t RGWAsioClientIO::send_header(const boost::string_ref& name,
- const boost::string_ref& value)
+size_t ClientIO::send_header(const boost::string_ref& name,
+ const boost::string_ref& value)
{
static constexpr char HEADER_SEP[] = ": ";
static constexpr char HEADER_END[] = "\r\n";
@@ -188,7 +206,7 @@ size_t RGWAsioClientIO::send_header(const boost::string_ref& name,
return sent;
}
-size_t RGWAsioClientIO::send_content_length(const uint64_t len)
+size_t ClientIO::send_content_length(uint64_t len)
{
static constexpr size_t CONLEN_BUF_SIZE = 128;
diff --git a/src/rgw/rgw_asio_client.h b/src/rgw/rgw_asio_client.h
index c87fd5f720b..513a3ef0ca2 100644
--- a/src/rgw/rgw_asio_client.h
+++ b/src/rgw/rgw_asio_client.h
@@ -4,36 +4,58 @@
#define RGW_ASIO_CLIENT_H
#include <boost/asio/ip/tcp.hpp>
-#include <beast/http/body_type.hpp>
-#include <beast/http/concepts.hpp>
-#include <beast/http/message_v1.hpp>
+#include <beast/http/message.hpp>
+#include <beast/http/message_parser.hpp>
+#include <beast/core/flat_streambuf.hpp>
#include "include/assert.h"
#include "rgw_client_io.h"
-// bufferlist to represent the message body
-class RGWBufferlistBody {
- public:
- using value_type = ceph::bufferlist;
+namespace rgw {
+namespace asio {
- class reader;
- class writer;
+/// streaming message body interface
+struct streaming_body {
+ using value_type = boost::asio::mutable_buffer;
- template <bool isRequest, typename Headers>
- using message_type = beast::http::message<isRequest, RGWBufferlistBody,
- Headers>;
-};
+ class reader {
+ value_type& buffer;
+ public:
+ using mutable_buffers_type = boost::asio::mutable_buffers_1;
-class RGWAsioClientIO : public rgw::io::RestfulClient,
- public rgw::io::BuffererSink {
- using tcp = boost::asio::ip::tcp;
- tcp::socket socket;
+ static const bool is_direct{true}; // reads directly into user buffer
- using body_type = RGWBufferlistBody;
- using request_type = beast::http::request_v1<body_type>;
- request_type request;
+ template<bool isRequest, class Fields>
+ explicit reader(beast::http::message<isRequest, streaming_body, Fields>& m)
+ : buffer(m.body)
+ {}
+
+ void init() {}
+ void init(uint64_t content_length) {}
+ void finish() {}
+
+ mutable_buffers_type prepare(size_t n) {
+ n = std::min(n, boost::asio::buffer_size(buffer));
+ auto position = boost::asio::buffer_cast<char*>(buffer);
+ return {position, n};
+ }
- bufferlist::const_iterator body_iter;
+ void commit(size_t n) {
+ buffer = buffer + n;
+ }
+ };
+};
+
+using header_type = beast::http::fields;
+using parser_type = beast::http::message_parser<true, streaming_body, header_type>;
+
+class ClientIO : public io::RestfulClient,
+ public io::BuffererSink {
+ private:
+ using tcp = boost::asio::ip::tcp;
+ tcp::socket& socket;
+ parser_type& parser;
+ beast::flat_streambuf& buffer; ///< parse buffer
bool conn_keepalive{false};
bool conn_close{false};
@@ -45,8 +67,11 @@ class RGWAsioClientIO : public rgw::io::RestfulClient,
size_t read_data(char *buf, size_t max);
public:
- RGWAsioClientIO(tcp::socket&& socket, request_type&& request);
- ~RGWAsioClientIO() override;
+ ClientIO(tcp::socket& socket, parser_type& parser,
+ beast::flat_streambuf& buffer);
+ ~ClientIO() override;
+
+ bool get_conn_close() const { return conn_close; }
void init_env(CephContext *cct) override;
size_t complete_request() override;
@@ -71,45 +96,7 @@ class RGWAsioClientIO : public rgw::io::RestfulClient,
}
};
-// used by beast::http::read() to read the body into a bufferlist
-class RGWBufferlistBody::reader {
- value_type& bl;
- public:
- template<bool isRequest, typename Headers>
- explicit reader(message_type<isRequest, Headers>& m) : bl(m.body) {}
-
- void write(const char* data, size_t size, boost::system::error_code&) {
- bl.append(data, size);
- }
-};
-
-// used by beast::http::write() to write the buffered body
-class RGWBufferlistBody::writer {
- const value_type& bl;
- public:
- template<bool isRequest, typename Headers>
- explicit writer(const message_type<isRequest, Headers>& msg)
- : bl(msg.body) {}
-
- void init(boost::system::error_code& ec) {}
- uint64_t content_length() const { return bl.length(); }
-
- template<typename Write>
- boost::tribool operator()(beast::http::resume_context&&,
- boost::system::error_code&, Write&& write) {
- // translate from bufferlist to a ConstBufferSequence for beast
- std::vector<boost::asio::const_buffer> buffers;
- buffers.reserve(bl.get_num_buffers());
- for (auto& ptr : bl.buffers()) {
- buffers.emplace_back(ptr.c_str(), ptr.length());
- }
- write(buffers);
- return true;
- }
-};
-static_assert(beast::http::is_ReadableBody<RGWBufferlistBody>{},
- "RGWBufferlistBody does not satisfy ReadableBody");
-static_assert(beast::http::is_WritableBody<RGWBufferlistBody>{},
- "RGWBufferlistBody does not satisfy WritableBody");
+} // namespace asio
+} // namespace rgw
#endif // RGW_ASIO_CLIENT_H
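streaming_body declares is_direct = true, so Beast does not buffer the body itself: the parser asks the reader to prepare() a window of the caller's buffer, writes parsed octets straight into it, and commit()s how many were consumed; ClientIO::read_data() above drives this with read_some() until the user buffer fills or the message completes. A simplified sketch of that parser-side handshake (illustrative only; the real driver lives inside Beast):

    #include <boost/asio/buffer.hpp>
    #include <cstring>

    // Feed n bytes of already-received body data through a "direct" reader.
    template <class Reader>
    size_t deliver_body(Reader& reader, const char *wire, size_t n) {
        size_t delivered = 0;
        while (delivered < n) {
            auto window = reader.prepare(n - delivered);  // slice of user buffer
            size_t len = boost::asio::buffer_size(window);
            if (len == 0)
                break;                                    // user buffer is full
            std::memcpy(boost::asio::buffer_cast<char*>(window),
                        wire + delivered, len);
            reader.commit(len);                           // advance the buffer
            delivered += len;
        }
        return delivered;
    }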
diff --git a/src/rgw/rgw_asio_frontend.cc b/src/rgw/rgw_asio_frontend.cc
index ff2d7806759..5fc1deec42e 100644
--- a/src/rgw/rgw_asio_frontend.cc
+++ b/src/rgw/rgw_asio_frontend.cc
@@ -7,13 +7,11 @@
#include <vector>
#include <boost/asio.hpp>
-#include <boost/optional.hpp>
+#include <boost/asio/spawn.hpp>
#include <beast/core/placeholders.hpp>
-#include <beast/core/streambuf.hpp>
-#include <beast/http/empty_body.hpp>
-#include <beast/http/parse_error.hpp>
#include <beast/http/read.hpp>
+#include <beast/http/string_body.hpp>
#include <beast/http/write.hpp>
#include "rgw_asio_frontend.h"
@@ -71,28 +69,47 @@ void Pauser::wait()
using tcp = boost::asio::ip::tcp;
-class AsioConnection : public std::enable_shared_from_this<AsioConnection> {
- RGWProcessEnv& env;
- boost::asio::io_service::strand strand;
- tcp::socket socket;
- tcp::endpoint endpoint;
- beast::streambuf buf;
- beast::http::request_v1<RGWBufferlistBody> request;
+// coroutine to handle a client connection to completion
+static void handle_connection(RGWProcessEnv& env, tcp::socket socket,
+ boost::asio::yield_context yield)
+{
+ auto cct = env.store->ctx();
+ boost::system::error_code ec;
- public:
- void on_read(boost::system::error_code ec) {
- auto cct = env.store->ctx();
+ beast::flat_streambuf buffer{1024};
+
+ // read messages from the socket until eof
+ for (;;) {
+ // parse the header
+ rgw::asio::parser_type parser;
+ do {
+ auto bytes = beast::http::async_read_some(socket, buffer, parser, yield[ec]);
+ buffer.consume(bytes);
+ } while (!ec && !parser.got_header());
+
+ if (ec == boost::asio::error::connection_reset ||
+ ec == boost::asio::error::eof) {
+ return;
+ }
if (ec) {
- if (ec.category() == beast::http::get_parse_error_category()) {
- ldout(cct, 1) << "parse failed: " << ec.message() << dendl;
- } else {
- ldout(cct, 1) << "read failed: " << ec.message() << dendl;
- }
- write_bad_request();
+ auto& message = parser.get();
+ ldout(cct, 1) << "read failed: " << ec.message() << dendl;
+ ldout(cct, 1) << "====== req done http_status=400 ======" << dendl;
+ beast::http::response<beast::http::string_body> response;
+ response.status = 400;
+ response.reason = "Bad Request";
+ response.version = message.version == 10 ? 10 : 11;
+ beast::http::prepare(response);
+ beast::http::async_write(socket, std::move(response), yield[ec]);
+ // ignore ec
return;
}
+
+ // process the request
RGWRequest req{env.store->get_new_req_id()};
- RGWAsioClientIO real_client{std::move(socket), std::move(request)};
+
+ rgw::asio::ClientIO real_client{socket, parser, buffer};
+
auto real_client_io = rgw::io::add_reordering(
rgw::io::add_buffering(
rgw::io::add_chunking(
@@ -101,40 +118,12 @@ class AsioConnection : public std::enable_shared_from_this<AsioConnection> {
RGWRestfulIO client(&real_client_io);
process_request(env.store, env.rest, &req, env.uri_prefix,
*env.auth_registry, &client, env.olog);
- }
-
- void write_bad_request() {
- beast::http::response_v1<beast::http::empty_body> response;
- response.status = 400;
- response.reason = "Bad Request";
- /* If the request is so terribly malformed that we can't extract even
- * the protocol version, we will use HTTP/1.1 as a fallback. */
- response.version = request.version ? request.version : 11;
- beast::http::prepare(response);
- beast::http::async_write(socket, std::move(response),
- std::bind(&AsioConnection::on_write,
- shared_from_this(),
- beast::asio::placeholders::error));
- }
- void on_write(boost::system::error_code ec) {
- auto cct = env.store->ctx();
- if (ec) {
- ldout(cct, 1) << "write failed: " << ec.message() << dendl;
+ if (real_client.get_conn_close()) {
+ return;
}
}
-
- public:
- AsioConnection(RGWProcessEnv& env, tcp::socket&& socket)
- : env(env), strand(socket.get_io_service()), socket(std::move(socket))
- {}
-
- void read() {
- beast::http::async_read(socket, buf, request, strand.wrap(
- std::bind(&AsioConnection::on_read, shared_from_this(),
- beast::asio::placeholders::error)));
- }
-};
+}
class AsioFrontend {
RGWProcessEnv env;
@@ -168,9 +157,19 @@ int AsioFrontend::init()
auto ep = tcp::endpoint{tcp::v4(), static_cast<unsigned short>(env.port)};
ldout(ctx(), 4) << "frontend listening on " << ep << dendl;
- acceptor.open(ep.protocol());
+ boost::system::error_code ec;
+ acceptor.open(ep.protocol(), ec);
+ if (ec) {
+ lderr(ctx()) << "failed to open socket: " << ec.message() << dendl;
+ return -ec.value();
+ }
acceptor.set_option(tcp::acceptor::reuse_address(true));
- acceptor.bind(ep);
+ acceptor.bind(ep, ec);
+ if (ec) {
+ lderr(ctx()) << "failed to bind address " << ep <<
+ ": " << ec.message() << dendl;
+ return -ec.value();
+ }
acceptor.listen(boost::asio::socket_base::max_connections);
acceptor.async_accept(peer_socket,
[this] (boost::system::error_code ec) {
@@ -189,13 +188,15 @@ void AsioFrontend::accept(boost::system::error_code ec)
throw ec;
}
auto socket = std::move(peer_socket);
-
+ // spawn a coroutine to handle the connection
+ boost::asio::spawn(service,
+ [&] (boost::asio::yield_context yield) {
+ handle_connection(env, std::move(socket), yield);
+ });
acceptor.async_accept(peer_socket,
[this] (boost::system::error_code ec) {
return accept(ec);
});
-
- std::make_shared<AsioConnection>(env, std::move(socket))->read();
}
int AsioFrontend::run()
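The frontend above drops the shared_from_this AsioConnection state machine in favor of one stackful coroutine per connection: boost::asio::spawn() hands handle_connection a yield_context, every async operation suspends the coroutine via yield[ec] instead of chaining callbacks, and "loop until EOF" becomes a plain for(;;). A minimal standalone sketch of the pattern, with echo standing in for HTTP:

    #include <boost/asio.hpp>
    #include <boost/asio/spawn.hpp>
    #include <array>
    #include <memory>

    using tcp = boost::asio::ip::tcp;

    // one coroutine per connection: sequential-looking, async underneath
    static void handle_connection(tcp::socket socket,
                                  boost::asio::yield_context yield) {
        std::array<char, 4096> data;
        boost::system::error_code ec;
        for (;;) {
            size_t n = socket.async_read_some(boost::asio::buffer(data),
                                              yield[ec]);
            if (ec) return;            // eof/connection_reset ends the loop
            boost::asio::async_write(socket, boost::asio::buffer(data, n),
                                     yield[ec]);
            if (ec) return;
        }
    }

    // at the accept site (cf. AsioFrontend::accept), hand the socket off:
    static void on_accept(boost::asio::io_service& service, tcp::socket&& peer) {
        auto s = std::make_shared<tcp::socket>(std::move(peer));
        boost::asio::spawn(service, [s](boost::asio::yield_context yield) {
            handle_connection(std::move(*s), yield);
        });
    }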
diff --git a/src/rgw/rgw_auth.cc b/src/rgw/rgw_auth.cc
index 6311d4b16ab..c63db0ccec5 100644
--- a/src/rgw/rgw_auth.cc
+++ b/src/rgw/rgw_auth.cc
@@ -56,6 +56,21 @@ transform_old_authinfo(const req_state* const s)
return id == acct_id;
}
+ bool is_identity(const idset_t& ids) const override {
+ for (auto& p : ids) {
+ if (p.is_wildcard()) {
+ return true;
+ } else if (p.is_tenant() && p.get_tenant() == id.tenant) {
+ return true;
+ } else if (p.is_user() &&
+ (p.get_tenant() == id.tenant) &&
+ (p.get_id() == id.id)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
uint32_t get_perm_mask() const override {
return perm_mask;
}
@@ -291,6 +306,29 @@ bool rgw::auth::RemoteApplier::is_owner_of(const rgw_user& uid) const
return info.acct_user == uid;
}
+bool rgw::auth::RemoteApplier::is_identity(const idset_t& ids) const {
+ for (auto& id : ids) {
+ if (id.is_wildcard()) {
+ return true;
+
+ // We also need to cover cases where rgw_keystone_implicit_tenants
+ // was enabled.
+ } else if (id.is_tenant() &&
+ (info.acct_user.tenant.empty() ?
+ info.acct_user.id :
+ info.acct_user.tenant) == id.get_tenant()) {
+ return true;
+ } else if (id.is_user() &&
+ info.acct_user.id == id.get_id() &&
+ (info.acct_user.tenant.empty() ?
+ info.acct_user.id :
+ info.acct_user.tenant) == id.get_tenant()) {
+ return true;
+ }
+ }
+ return false;
+}
+
void rgw::auth::RemoteApplier::to_str(std::ostream& out) const
{
out << "rgw::auth::RemoteApplier(acct_user=" << info.acct_user
@@ -383,8 +421,23 @@ bool rgw::auth::LocalApplier::is_owner_of(const rgw_user& uid) const
return uid == user_info.user_id;
}
-void rgw::auth::LocalApplier::to_str(std::ostream& out) const
-{
+bool rgw::auth::LocalApplier::is_identity(const idset_t& ids) const {
+ for (auto& id : ids) {
+ if (id.is_wildcard()) {
+ return true;
+ } else if (id.is_tenant() &&
+ id.get_tenant() == user_info.user_id.tenant) {
+ return true;
+ } else if (id.is_user() &&
+ (id.get_tenant() == user_info.user_id.tenant) &&
+ (id.get_id() == user_info.user_id.id)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+void rgw::auth::LocalApplier::to_str(std::ostream& out) const {
out << "rgw::auth::LocalApplier(acct_user=" << user_info.user_id
<< ", acct_name=" << user_info.display_name
<< ", subuser=" << subuser
diff --git a/src/rgw/rgw_auth.h b/src/rgw/rgw_auth.h
index 8118ac150d3..819a3380385 100644
--- a/src/rgw/rgw_auth.h
+++ b/src/rgw/rgw_auth.h
@@ -16,7 +16,6 @@
#define RGW_USER_ANON_ID "anonymous"
-
namespace rgw {
namespace auth {
@@ -28,6 +27,7 @@ using Exception = std::system_error;
class Identity {
public:
typedef std::map<std::string, int> aclspec_t;
+ using idset_t = boost::container::flat_set<Principal>;
virtual ~Identity() = default;
@@ -64,6 +64,10 @@ public:
}
virtual void to_str(std::ostream& out) const = 0;
+
+ /* Verify whether a given identity corresponds to an identity in the
+ provided set */
+ virtual bool is_identity(const idset_t& ids) const = 0;
};
inline std::ostream& operator<<(std::ostream& out,
@@ -404,6 +408,8 @@ public:
uint32_t get_perms_from_aclspec(const aclspec_t& aclspec) const override;
bool is_admin_of(const rgw_user& uid) const override;
bool is_owner_of(const rgw_user& uid) const override;
+ bool is_identity(const idset_t& ids) const override;
+
uint32_t get_perm_mask() const override { return info.perm_mask; }
void to_str(std::ostream& out) const override;
void load_acct_info(RGWUserInfo& user_info) const override; /* out */
@@ -449,6 +455,7 @@ public:
uint32_t get_perms_from_aclspec(const aclspec_t& aclspec) const override;
bool is_admin_of(const rgw_user& uid) const override;
bool is_owner_of(const rgw_user& uid) const override;
+ bool is_identity(const idset_t& ids) const override;
uint32_t get_perm_mask() const override {
return get_perm_mask(subuser, user_info);
}
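is_identity() is the hook the new bucket-policy machinery uses to evaluate a policy's Principal list against whoever authenticated: a wildcard matches anyone, a tenant principal matches every user in that tenant, and a user principal must match tenant and id exactly. A hedged sketch of a call site (tenant/user names invented; the Identity comes from the auth pipeline):

    #include <boost/container/flat_set.hpp>
    #include "rgw_auth.h"

    bool principal_matches(const rgw::auth::Identity& identity) {
      boost::container::flat_set<rgw::auth::Principal> ids;
      ids.insert(rgw::auth::Principal::tenant("acme"));        // whole tenant
      ids.insert(rgw::auth::Principal::user("acme", "alice")); // exact user
      // ids.insert(rgw::auth::Principal::wildcard());  // would match anyone
      return identity.is_identity(ids);
    }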
diff --git a/src/rgw/rgw_auth_filters.h b/src/rgw/rgw_auth_filters.h
index 204a43e73f8..78846fc28d9 100644
--- a/src/rgw/rgw_auth_filters.h
+++ b/src/rgw/rgw_auth_filters.h
@@ -80,6 +80,11 @@ public:
return get_decoratee().get_perm_mask();
}
+ bool is_identity(
+ const boost::container::flat_set<Principal>& ids) const override {
+ return get_decoratee().is_identity(ids);
+ }
+
void to_str(std::ostream& out) const override {
get_decoratee().to_str(out);
}
diff --git a/src/rgw/rgw_basic_types.cc b/src/rgw/rgw_basic_types.cc
index 5ebf1cfe446..c16d920f21b 100644
--- a/src/rgw/rgw_basic_types.cc
+++ b/src/rgw/rgw_basic_types.cc
@@ -1,6 +1,13 @@
+#include <iostream>
+#include <sstream>
+#include <string>
+
#include "rgw_basic_types.h"
#include "common/ceph_json.h"
+using std::string;
+using std::stringstream;
+
void decode_json_obj(rgw_user& val, JSONObj *obj)
{
string s = obj->get_data();
@@ -12,3 +19,24 @@ void encode_json(const char *name, const rgw_user& val, Formatter *f)
string s = val.to_str();
f->dump_string(name, s);
}
+
+namespace rgw {
+namespace auth {
+ostream& operator <<(ostream& m, const Principal& p) {
+ if (p.is_wildcard()) {
+ return m << "*";
+ }
+
+ m << "arn:aws:iam:" << p.get_tenant() << ":";
+ if (p.is_tenant()) {
+ return m << "root";
+ }
+ return m << (p.is_user() ? "user/" : "role/") << p.get_id();
+}
+string to_string(const Principal& p) {
+ stringstream s;
+ s << p;
+ return s.str();
+}
+}
+}
diff --git a/src/rgw/rgw_basic_types.h b/src/rgw/rgw_basic_types.h
index 261b201369a..31e9d3a32ac 100644
--- a/src/rgw/rgw_basic_types.h
+++ b/src/rgw/rgw_basic_types.h
@@ -1,3 +1,5 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
#ifndef CEPH_RGW_BASIC_TYPES_H
#define CEPH_RGW_BASIC_TYPES_H
@@ -18,6 +20,10 @@ struct rgw_user {
: tenant(tenant),
id(id) {
}
+ rgw_user(std::string&& tenant, std::string&& id)
+ : tenant(std::move(tenant)),
+ id(std::move(id)) {
+ }
void encode(bufferlist& bl) const {
ENCODE_START(1, 1, bl);
@@ -100,6 +106,82 @@ struct rgw_user {
};
WRITE_CLASS_ENCODER(rgw_user)
+// Represents an identity. This is more wide-ranging than a
+// 'User'. Its purpose is to be matched against by an
+// IdentityApplier. The internal representation will doubtless change as
+// more types are added. We may want to expose the type enum and make
+// the member public so people can switch/case on it.
+
+namespace rgw {
+namespace auth {
+class Principal {
+ enum types { User, Role, Tenant, Wildcard };
+ types t;
+ rgw_user u;
+
+ Principal(types t)
+ : t(t) {}
+
+ Principal(types t, std::string&& n, std::string i)
+ : t(t), u(std::move(n), std::move(i)) {}
+
+public:
+
+ static Principal wildcard() {
+ return Principal(Wildcard);
+ }
+
+ static Principal user(std::string&& t, std::string&& u) {
+ return Principal(User, std::move(t), std::move(u));
+ }
+
+ static Principal role(std::string&& t, std::string&& u) {
+ return Principal(Role, std::move(t), std::move(u));
+ }
+
+ static Principal tenant(std::string&& t) {
+ return Principal(Tenant, std::move(t), {});
+ }
+
+ bool is_wildcard() const {
+ return t == Wildcard;
+ }
+
+ bool is_user() const {
+ return t == User;
+ }
+
+ bool is_role() const {
+ return t == Role;
+ }
+
+ bool is_tenant() const {
+ return t == Tenant;
+ }
+
+ const std::string& get_tenant() const {
+ ceph_assert(t != Wildcard);
+ return u.tenant;
+ }
+
+ const std::string& get_id() const {
+ ceph_assert(t != Wildcard && t != Tenant);
+ return u.id;
+ }
+
+ bool operator ==(const Principal& o) const {
+ return (t == o.t) && (u == o.u);
+ }
+
+ bool operator <(const Principal& o) const {
+ return (t < o.t) || ((t == o.t) && (u < o.u));
+ }
+};
+
+std::ostream& operator <<(std::ostream& m, const Principal& p);
+std::string to_string(const Principal& p);
+}
+}
class JSONObj;
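The new stream operator renders principals in AWS ARN form, which is what to_string() and the policy logs show. For instance (tenant, user, and role names illustrative):

    #include <iostream>
    #include "rgw_basic_types.h"

    int main() {
      using rgw::auth::Principal;
      std::cout << Principal::wildcard() << '\n';             // *
      std::cout << Principal::tenant("acme") << '\n';         // arn:aws:iam:acme:root
      std::cout << Principal::user("acme", "alice") << '\n';  // arn:aws:iam:acme:user/alice
      std::cout << Principal::role("acme", "deploy") << '\n'; // arn:aws:iam:acme:role/deploy
    }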
diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc
index 049ab4a294b..7aeee21ad25 100644
--- a/src/rgw/rgw_bucket.cc
+++ b/src/rgw/rgw_bucket.cc
@@ -51,18 +51,9 @@ void rgw_get_buckets_obj(const rgw_user& user_id, string& buckets_obj_id)
* acceptable in bucket names and thus qualified buckets cannot conflict
* with the legacy or S3 buckets.
*/
-void rgw_make_bucket_entry_name(const string& tenant_name, const string& bucket_name, string& bucket_entry) {
- if (bucket_name.empty()) {
- bucket_entry.clear();
- } else if (tenant_name.empty()) {
- bucket_entry = bucket_name;
- } else {
- bucket_entry = tenant_name + "/" + bucket_name;
- }
-}
-
-string rgw_make_bucket_entry_name(const string& tenant_name, const string& bucket_name) {
- string bucket_entry;
+std::string rgw_make_bucket_entry_name(const std::string& tenant_name,
+ const std::string& bucket_name) {
+ std::string bucket_entry;
if (bucket_name.empty()) {
bucket_entry.clear();
@@ -1959,11 +1950,11 @@ int RGWDataChangesLog::trim_entries(const real_time& start_time, const real_time
bool RGWDataChangesLog::going_down()
{
- return (down_flag.read() != 0);
+ return down_flag;
}
RGWDataChangesLog::~RGWDataChangesLog() {
- down_flag.set(1);
+ down_flag = true;
renew_thread->stop();
renew_thread->join();
delete renew_thread;
@@ -2243,7 +2234,7 @@ public:
bci.info.bucket.name = bucket_name;
bci.info.bucket.bucket_id = bucket_instance;
bci.info.bucket.tenant = tenant_name;
- ret = store->select_bucket_location_by_rule(bci.info.placement_rule, bci.info.bucket, &rule_info);
+ ret = store->select_bucket_location_by_rule(bci.info.placement_rule, &rule_info);
if (ret < 0) {
ldout(store->ctx(), 0) << "ERROR: select_bucket_placement() returned " << ret << dendl;
return ret;
diff --git a/src/rgw/rgw_bucket.h b/src/rgw/rgw_bucket.h
index e691a1178a6..14c472faa8c 100644
--- a/src/rgw/rgw_bucket.h
+++ b/src/rgw/rgw_bucket.h
@@ -20,9 +20,6 @@
#include "common/ceph_time.h"
#include "rgw_formats.h"
-
-using namespace std;
-
// define as static when RGWBucket implementation is complete
extern void rgw_get_buckets_obj(const rgw_user& user_id, string& buckets_obj_id);
@@ -49,11 +46,14 @@ extern int rgw_bucket_delete_bucket_obj(RGWRados *store,
extern int rgw_bucket_sync_user_stats(RGWRados *store, const rgw_user& user_id, const RGWBucketInfo& bucket_info);
extern int rgw_bucket_sync_user_stats(RGWRados *store, const string& tenant_name, const string& bucket_name);
-extern void rgw_make_bucket_entry_name(const string& tenant_name,
- const string& bucket_name,
- string& bucket_entry);
-extern string rgw_make_bucket_entry_name(const string& tenant_name,
- const string& bucket_name);
+extern std::string rgw_make_bucket_entry_name(const std::string& tenant_name,
+ const std::string& bucket_name);
+static inline void rgw_make_bucket_entry_name(const string& tenant_name,
+ const string& bucket_name,
+ std::string& bucket_entry) {
+ bucket_entry = rgw_make_bucket_entry_name(tenant_name, bucket_name);
+}
+
extern void rgw_parse_url_bucket(const string& bucket,
const string& auth_tenant,
string &tenant_name, string &bucket_name);
@@ -401,7 +401,7 @@ class RGWDataChangesLog {
RWLock modified_lock;
map<int, set<string> > modified_shards;
- atomic_t down_flag;
+ std::atomic<bool> down_flag = { false };
struct ChangeStatus {
real_time cur_expiration;
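down_flag moves from Ceph's atomic_t (read()/set()) to std::atomic<bool>, so going_down() becomes a plain sequentially-consistent load. The stop-flag idiom the destructor relies on, as a standalone sketch:

    #include <atomic>
    #include <chrono>
    #include <thread>

    std::atomic<bool> down_flag{false};

    bool going_down() { return down_flag; }  // implicit load(), seq_cst

    void renew_loop() {
        while (!going_down()) {
            // ... do periodic work, e.g. renew log entries ...
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
        }
    }

    int main() {
        std::thread renew_thread(renew_loop);
        down_flag = true;                    // implicit store(), seq_cst
        renew_thread.join();                 // mirrors the destructor's order
    }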
diff --git a/src/rgw/rgw_cache.h b/src/rgw/rgw_cache.h
index 32068d39073..6bc9ef14903 100644
--- a/src/rgw/rgw_cache.h
+++ b/src/rgw/rgw_cache.h
@@ -231,7 +231,8 @@ public:
bufferlist& data,
RGWObjVersionTracker *objv_tracker,
real_time set_mtime) override;
- int put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl, off_t ofs, bool exclusive) override;
+ int put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl, off_t ofs, bool exclusive,
+ RGWObjVersionTracker *objv_tracker = nullptr) override;
int get_system_obj(RGWObjectCtx& obj_ctx, RGWRados::SystemObject::Read::GetObjState& read_state,
RGWObjVersionTracker *objv_tracker, rgw_raw_obj& obj,
@@ -422,7 +423,8 @@ int RGWCache<T>::put_system_obj_impl(rgw_raw_obj& obj, uint64_t size, real_time
}
template <class T>
-int RGWCache<T>::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& data, off_t ofs, bool exclusive)
+int RGWCache<T>::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& data, off_t ofs, bool exclusive,
+ RGWObjVersionTracker *objv_tracker)
{
rgw_pool pool;
string oid;
@@ -436,7 +438,11 @@ int RGWCache<T>::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& da
info.status = 0;
info.flags = CACHE_FLAG_DATA;
}
- int ret = T::put_system_obj_data(ctx, obj, data, ofs, exclusive);
+ if (objv_tracker) {
+ info.version = objv_tracker->write_version;
+ info.flags |= CACHE_FLAG_OBJV;
+ }
+ int ret = T::put_system_obj_data(ctx, obj, data, ofs, exclusive, objv_tracker);
if (cacheable) {
string name = normal_name(pool, oid);
if (ret >= 0) {
diff --git a/src/rgw/rgw_civetweb_frontend.cc b/src/rgw/rgw_civetweb_frontend.cc
index c564d0e4920..a18a6c5da41 100644
--- a/src/rgw/rgw_civetweb_frontend.cc
+++ b/src/rgw/rgw_civetweb_frontend.cc
@@ -66,32 +66,30 @@ int RGWCivetWebFrontend::run()
/* Prepare options for CivetWeb. */
const std::set<boost::string_ref> rgw_opts = { "port", "prefix" };
- const size_t CW_NUM_OPTS = 2 * (conf_map.size() - rgw_opts.size()) + 1;
- const char *options[CW_NUM_OPTS];
- size_t i = 0;
+
+ std::vector<const char*> options;
for (const auto& pair : conf_map) {
if (! rgw_opts.count(pair.first)) {
/* CivetWeb doesn't understand configurables of the glue layer between
* it and RadosGW. We need to strip them out. Otherwise CivetWeb would
* signalise an error. */
- options[i + 0] = pair.first.c_str();
- options[i + 1] = pair.second.c_str();
+ options.push_back(pair.first.c_str());
+ options.push_back(pair.second.c_str());
- dout(20) << "civetweb config: " << options[i] << ": "
- << (options[i + 1] ? options[i + 1] : "<null>") << dendl;
- i += 2;
+ dout(20) << "civetweb config: " << pair.first
+ << ": " << pair.second << dendl;
}
}
- options[i] = nullptr;
+ options.push_back(nullptr);
/* Initialize the CivetWeb right now. */
struct mg_callbacks cb;
memset((void *)&cb, 0, sizeof(cb));
cb.begin_request = civetweb_callback;
cb.log_message = rgw_civetweb_log_callback;
cb.log_access = rgw_civetweb_log_access_callback;
- ctx = mg_start(&cb, this, (const char **)&options);
+ ctx = mg_start(&cb, this, options.data());
return ! ctx ? -EIO : 0;
} /* RGWCivetWebFrontend::run */
diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc
index fbff872ee21..009918b76ff 100644
--- a/src/rgw/rgw_common.cc
+++ b/src/rgw/rgw_common.cc
@@ -11,10 +11,12 @@
#include "json_spirit/json_spirit.h"
#include "common/ceph_json.h"
+#include "rgw_op.h"
#include "rgw_common.h"
#include "rgw_acl.h"
#include "rgw_string.h"
#include "rgw_rados.h"
+#include "rgw_http_errors.h"
#include "common/ceph_crypto.h"
#include "common/armor.h"
@@ -32,15 +34,87 @@
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_rgw
-#define POLICY_ACTION 0x01
-#define POLICY_RESOURCE 0x02
-#define POLICY_ARN 0x04
-#define POLICY_STRING 0x08
+using boost::none;
+using boost::optional;
+
+using rgw::IAM::ARN;
+using rgw::IAM::Effect;
+using rgw::IAM::op_to_perm;
+using rgw::IAM::Policy;
PerfCounters *perfcounter = NULL;
const uint32_t RGWBucketInfo::NUM_SHARDS_BLIND_BUCKET(UINT32_MAX);
+rgw_http_errors rgw_http_s3_errors({
+ { 0, {200, "" }},
+ { STATUS_CREATED, {201, "Created" }},
+ { STATUS_ACCEPTED, {202, "Accepted" }},
+ { STATUS_NO_CONTENT, {204, "NoContent" }},
+ { STATUS_PARTIAL_CONTENT, {206, "" }},
+ { ERR_PERMANENT_REDIRECT, {301, "PermanentRedirect" }},
+ { ERR_WEBSITE_REDIRECT, {301, "WebsiteRedirect" }},
+ { STATUS_REDIRECT, {303, "" }},
+ { ERR_NOT_MODIFIED, {304, "NotModified" }},
+ { EINVAL, {400, "InvalidArgument" }},
+ { ERR_INVALID_REQUEST, {400, "InvalidRequest" }},
+ { ERR_INVALID_DIGEST, {400, "InvalidDigest" }},
+ { ERR_BAD_DIGEST, {400, "BadDigest" }},
+ { ERR_INVALID_BUCKET_NAME, {400, "InvalidBucketName" }},
+ { ERR_INVALID_OBJECT_NAME, {400, "InvalidObjectName" }},
+ { ERR_UNRESOLVABLE_EMAIL, {400, "UnresolvableGrantByEmailAddress" }},
+ { ERR_INVALID_PART, {400, "InvalidPart" }},
+ { ERR_INVALID_PART_ORDER, {400, "InvalidPartOrder" }},
+ { ERR_REQUEST_TIMEOUT, {400, "RequestTimeout" }},
+ { ERR_TOO_LARGE, {400, "EntityTooLarge" }},
+ { ERR_TOO_SMALL, {400, "EntityTooSmall" }},
+ { ERR_TOO_MANY_BUCKETS, {400, "TooManyBuckets" }},
+ { ERR_MALFORMED_XML, {400, "MalformedXML" }},
+ { ERR_AMZ_CONTENT_SHA256_MISMATCH, {400, "XAmzContentSHA256Mismatch" }},
+ { ERR_LENGTH_REQUIRED, {411, "MissingContentLength" }},
+ { EACCES, {403, "AccessDenied" }},
+ { EPERM, {403, "AccessDenied" }},
+ { ERR_SIGNATURE_NO_MATCH, {403, "SignatureDoesNotMatch" }},
+ { ERR_INVALID_ACCESS_KEY, {403, "InvalidAccessKeyId" }},
+ { ERR_USER_SUSPENDED, {403, "UserSuspended" }},
+ { ERR_REQUEST_TIME_SKEWED, {403, "RequestTimeTooSkewed" }},
+ { ERR_QUOTA_EXCEEDED, {403, "QuotaExceeded" }},
+ { ENOENT, {404, "NoSuchKey" }},
+ { ERR_NO_SUCH_BUCKET, {404, "NoSuchBucket" }},
+ { ERR_NO_SUCH_WEBSITE_CONFIGURATION, {404, "NoSuchWebsiteConfiguration" }},
+ { ERR_NO_SUCH_UPLOAD, {404, "NoSuchUpload" }},
+ { ERR_NOT_FOUND, {404, "Not Found"}},
+ { ERR_NO_SUCH_LC, {404, "NoSuchLifecycleConfiguration"}},
+ { ERR_METHOD_NOT_ALLOWED, {405, "MethodNotAllowed" }},
+ { ETIMEDOUT, {408, "RequestTimeout" }},
+ { EEXIST, {409, "BucketAlreadyExists" }},
+ { ERR_USER_EXIST, {409, "UserAlreadyExists" }},
+ { ERR_EMAIL_EXIST, {409, "EmailExists" }},
+ { ERR_KEY_EXIST, {409, "KeyExists"}},
+ { ERR_INVALID_SECRET_KEY, {400, "InvalidSecretKey"}},
+ { ERR_INVALID_KEY_TYPE, {400, "InvalidKeyType"}},
+ { ERR_INVALID_CAP, {400, "InvalidCapability"}},
+ { ERR_INVALID_TENANT_NAME, {400, "InvalidTenantName" }},
+ { ENOTEMPTY, {409, "BucketNotEmpty" }},
+ { ERR_PRECONDITION_FAILED, {412, "PreconditionFailed" }},
+ { ERANGE, {416, "InvalidRange" }},
+ { ERR_UNPROCESSABLE_ENTITY, {422, "UnprocessableEntity" }},
+ { ERR_LOCKED, {423, "Locked" }},
+ { ERR_INTERNAL_ERROR, {500, "InternalError" }},
+ { ERR_NOT_IMPLEMENTED, {501, "NotImplemented" }},
+ { ERR_SERVICE_UNAVAILABLE, {503, "ServiceUnavailable"}},
+});
+
+rgw_http_errors rgw_http_swift_errors({
+ { EACCES, {403, "AccessDenied" }},
+ { EPERM, {401, "AccessDenied" }},
+ { ERR_USER_SUSPENDED, {401, "UserSuspended" }},
+ { ERR_INVALID_UTF8, {412, "Invalid UTF8" }},
+ { ERR_BAD_URL, {412, "Bad URL" }},
+ { ERR_NOT_SLO_MANIFEST, {400, "Not an SLO manifest" }},
+ { ERR_QUOTA_EXCEEDED, {413, "QuotaExceeded" }},
+});
+
int rgw_perf_start(CephContext *cct)
{
PerfCountersBuilder plb(cct, cct->_conf->name.to_str(), l_rgw_first, l_rgw_last);
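
[Editor's note: the old header kept the S3 and Swift error tables as C arrays scanned linearly (see the rgw_http_errors.h hunk further below); here they become std::map initializers keyed by errno, so a lookup is logarithmic and the tables live in one translation unit. A standalone sketch of the lookup, with toy error codes in place of RGW's:]

    #include <iostream>
    #include <map>
    #include <string>
    #include <utility>

    // Toy version of the new table type:
    // errno -> { HTTP status, protocol error code }.
    using http_errors = std::map<int, std::pair<int, const char*>>;

    const http_errors s3_errors = {
      {0,  {200, ""}},
      {13, {403, "AccessDenied"}},  // EACCES
      {2,  {404, "NoSuchKey"}},     // ENOENT
    };

    // Same shape as the diff's search_err(): fill the outputs on a hit.
    bool search_err(const http_errors& errs, int err_no,
                    int& http_ret, std::string& code) {
      auto r = errs.find(err_no);
      if (r == errs.end())
        return false;
      http_ret = r->second.first;
      code = r->second.second;
      return true;
    }

    int main() {
      int http_ret = 500;
      std::string code = "UnknownError";  // the diff's fallback
      if (!search_err(s3_errors, 13, http_ret, code))
        std::cout << "WARNING: resorting to 500\n";
      std::cout << http_ret << " " << code << "\n";  // 403 AccessDenied
    }
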
@@ -84,12 +158,6 @@ rgw_err()
clear();
}
-rgw_err::
-rgw_err(int http, const std::string& s3)
- : http_ret(http), ret(0), s3_code(s3)
-{
-}
-
void rgw_err::
clear()
{
@@ -224,6 +292,77 @@ req_state::~req_state() {
delete object_acl;
}
+bool search_err(rgw_http_errors& errs, int err_no, bool is_website_redirect, int& http_ret, string& code)
+{
+ auto r = errs.find(err_no);
+ if (r != errs.end()) {
+ if (! is_website_redirect)
+ http_ret = r->second.first;
+ code = r->second.second;
+ return true;
+ }
+ return false;
+}
+
+void set_req_state_err(struct rgw_err& err, /* out */
+ int err_no, /* in */
+ const int prot_flags) /* in */
+{
+ if (err_no < 0)
+ err_no = -err_no;
+
+ err.ret = -err_no;
+ bool is_website_redirect = false;
+
+ if (prot_flags & RGW_REST_SWIFT) {
+ if (search_err(rgw_http_swift_errors, err_no, is_website_redirect, err.http_ret, err.s3_code))
+ return;
+ }
+
+ //Default to searching in s3 errors
+ is_website_redirect |= (prot_flags & RGW_REST_WEBSITE)
+ && err_no == ERR_WEBSITE_REDIRECT && err.is_clear();
+ if (search_err(rgw_http_s3_errors, err_no, is_website_redirect, err.http_ret, err.s3_code))
+ return;
+ dout(0) << "WARNING: set_req_state_err err_no=" << err_no
+ << " resorting to 500" << dendl;
+
+ err.http_ret = 500;
+ err.s3_code = "UnknownError";
+}
+
+void set_req_state_err(struct req_state* s, int err_no, const string& err_msg)
+{
+ if (s) {
+ set_req_state_err(s, err_no);
+ s->err.message = err_msg;
+ }
+}
+
+void set_req_state_err(struct req_state* s, int err_no)
+{
+ if (s) {
+ set_req_state_err(s->err, err_no, s->prot_flags);
+ }
+}
+
+void dump(struct req_state* s)
+{
+ if (s->format != RGW_FORMAT_HTML)
+ s->formatter->open_object_section("Error");
+ if (!s->err.s3_code.empty())
+ s->formatter->dump_string("Code", s->err.s3_code);
+ if (!s->err.message.empty())
+ s->formatter->dump_string("Message", s->err.message);
+ if (!s->bucket_name.empty()) // TODO: connect to expose_bucket
+ s->formatter->dump_string("BucketName", s->bucket_name);
+ if (!s->trans_id.empty()) // TODO: connect to expose_bucket or another toggle
+ s->formatter->dump_string("RequestId", s->trans_id);
+ s->formatter->dump_string("HostId", s->host_id);
+ if (s->format != RGW_FORMAT_HTML)
+ s->formatter->close_section();
+}
+
struct str_len {
const char *str;
int len;
@@ -938,17 +1077,39 @@ bool verify_requester_payer_permission(struct req_state *s)
}
bool verify_bucket_permission(struct req_state * const s,
+ const rgw_bucket& bucket,
RGWAccessControlPolicy * const user_acl,
RGWAccessControlPolicy * const bucket_acl,
- const int perm)
+ const optional<Policy>& bucket_policy,
+ const uint64_t op)
{
- if (!bucket_acl)
+ if (!verify_requester_payer_permission(s))
return false;
- if ((perm & (int)s->perm_mask) != perm)
+ if (bucket_policy) {
+ auto r = bucket_policy->eval(s->env, *s->auth.identity, op, ARN(bucket));
+ if (r == Effect::Allow)
+ // It looks like S3 ACLs only GRANT permissions rather than
+ // denying them, so this should be safe.
+ return true;
+ else if (r == Effect::Deny)
+ return false;
+ }
+
+ const auto perm = op_to_perm(op);
+
+ return verify_bucket_permission_no_policy(s, user_acl, bucket_acl, perm);
+}
+
+bool verify_bucket_permission_no_policy(struct req_state * const s,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ const int perm)
+{
+ if (!bucket_acl)
return false;
- if (!verify_requester_payer_permission(s))
+ if ((perm & (int)s->perm_mask) != perm)
return false;
if (bucket_acl->verify_permission(*s->auth.identity, perm, perm,
@@ -961,35 +1122,76 @@ bool verify_bucket_permission(struct req_state * const s,
return user_acl->verify_permission(*s->auth.identity, perm, perm);
}
-bool verify_bucket_permission(struct req_state * const s, const int perm)
+bool verify_bucket_permission_no_policy(struct req_state * const s, const int perm)
+{
+ if (!verify_requester_payer_permission(s))
+ return false;
+
+ return verify_bucket_permission_no_policy(s,
+ s->user_acl.get(),
+ s->bucket_acl,
+ perm);
+}
+
+bool verify_bucket_permission(struct req_state * const s, const uint64_t op)
{
return verify_bucket_permission(s,
+ s->bucket,
s->user_acl.get(),
s->bucket_acl,
- perm);
+ s->iam_policy,
+ op);
+}
+
+static inline bool check_deferred_bucket_perms(struct req_state * const s,
+ const rgw_bucket& bucket,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ const optional<Policy>& bucket_policy,
+ const uint8_t deferred_check,
+ const uint64_t op)
+{
+ return (s->defer_to_bucket_acls == deferred_check \
+ && verify_bucket_permission(s, bucket, user_acl, bucket_acl, bucket_policy, op));
}
-static inline bool check_deferred_bucket_acl(struct req_state * const s,
- RGWAccessControlPolicy * const user_acl,
- RGWAccessControlPolicy * const bucket_acl,
- const uint8_t deferred_check,
- const int perm)
+static inline bool check_deferred_bucket_only_acl(struct req_state * const s,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ const uint8_t deferred_check,
+ const int perm)
{
return (s->defer_to_bucket_acls == deferred_check \
- && verify_bucket_permission(s, user_acl, bucket_acl, perm));
+ && verify_bucket_permission_no_policy(s, user_acl, bucket_acl, perm));
}
bool verify_object_permission(struct req_state * const s,
+ const rgw_obj& obj,
RGWAccessControlPolicy * const user_acl,
RGWAccessControlPolicy * const bucket_acl,
RGWAccessControlPolicy * const object_acl,
- const int perm)
+ const optional<Policy>& bucket_policy,
+ const uint64_t op)
{
if (!verify_requester_payer_permission(s))
return false;
- if (check_deferred_bucket_acl(s, user_acl, bucket_acl, RGW_DEFER_TO_BUCKET_ACLS_RECURSE, perm) ||
- check_deferred_bucket_acl(s, user_acl, bucket_acl, RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL, RGW_PERM_FULL_CONTROL)) {
+ if (bucket_policy) {
+ auto r = bucket_policy->eval(s->env, *s->auth.identity, op, ARN(obj));
+ if (r == Effect::Allow)
+ // It looks like S3 ACLs only GRANT permissions rather than
+ // denying them, so this should be safe.
+ return true;
+ else if (r == Effect::Deny)
+ return false;
+ }
+
+ const auto perm = op_to_perm(op);
+
+ if (check_deferred_bucket_perms(s, obj.bucket, user_acl, bucket_acl, bucket_policy,
+ RGW_DEFER_TO_BUCKET_ACLS_RECURSE, op) ||
+ check_deferred_bucket_perms(s, obj.bucket, user_acl, bucket_acl, bucket_policy,
+ RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL, rgw::IAM::s3All)) {
return true;
}
@@ -1029,13 +1231,72 @@ bool verify_object_permission(struct req_state * const s,
return user_acl->verify_permission(*s->auth.identity, swift_perm, swift_perm);
}
-bool verify_object_permission(struct req_state *s, int perm)
+bool verify_object_permission_no_policy(struct req_state * const s,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ RGWAccessControlPolicy * const object_acl,
+ const int perm)
+{
+ if (check_deferred_bucket_only_acl(s, user_acl, bucket_acl, RGW_DEFER_TO_BUCKET_ACLS_RECURSE, perm) ||
+ check_deferred_bucket_only_acl(s, user_acl, bucket_acl, RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL, RGW_PERM_FULL_CONTROL)) {
+ return true;
+ }
+
+ if (!object_acl) {
+ return false;
+ }
+
+ bool ret = object_acl->verify_permission(*s->auth.identity, s->perm_mask, perm);
+ if (ret) {
+ return true;
+ }
+
+ if (!s->cct->_conf->rgw_enforce_swift_acls)
+ return ret;
+
+ if ((perm & (int)s->perm_mask) != perm)
+ return false;
+
+ int swift_perm = 0;
+ if (perm & (RGW_PERM_READ | RGW_PERM_READ_ACP))
+ swift_perm |= RGW_PERM_READ_OBJS;
+ if (perm & RGW_PERM_WRITE)
+ swift_perm |= RGW_PERM_WRITE_OBJS;
+
+ if (!swift_perm)
+ return false;
+
+ /* we already verified the user mask above, so we pass swift_perm as the mask here,
+ otherwise the mask might not cover the swift permissions bits */
+ if (bucket_acl->verify_permission(*s->auth.identity, swift_perm, swift_perm,
+ s->info.env->get("HTTP_REFERER")))
+ return true;
+
+ if (!user_acl)
+ return false;
+
+ return user_acl->verify_permission(*s->auth.identity, swift_perm, swift_perm);
+}
+
+bool verify_object_permission_no_policy(struct req_state *s, int perm)
+{
+ if (!verify_requester_payer_permission(s))
+ return false;
+
+ return verify_object_permission_no_policy(s, s->user_acl.get(),
+ s->bucket_acl, s->object_acl,
+ perm);
+}
+
+bool verify_object_permission(struct req_state *s, uint64_t op)
{
return verify_object_permission(s,
- s->user_acl.get(),
+ rgw_obj(s->bucket, s->object),
+ s->user_acl.get(),
s->bucket_acl,
s->object_acl,
- perm);
+ s->iam_policy,
+ op);
}
class HexTable
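
[Editor's note: the permission checks above now consult a bucket policy before the ACLs: an explicit Allow short-circuits to success, an explicit Deny to failure, and anything else falls through to the pre-existing ACL path (the new *_no_policy variants). A minimal sketch of that three-way evaluation order, with hypothetical Effect/Policy/ACL stand-ins and std::optional in place of boost::optional:]

    #include <iostream>
    #include <optional>  // assumes C++17; the diff itself uses boost::optional

    enum class Effect { Allow, Deny, Pass };

    struct Policy {
      Effect effect;
      Effect eval() const { return effect; }
    };

    // Stand-in for verify_*_permission_no_policy().
    bool acl_allows(int perm, int perm_mask) {
      return (perm & perm_mask) == perm;
    }

    bool verify_permission(const std::optional<Policy>& policy,
                           int perm, int perm_mask) {
      if (policy) {
        auto r = policy->eval();
        if (r == Effect::Allow)  // a policy grant wins outright
          return true;
        if (r == Effect::Deny)   // an explicit deny is final
          return false;
        // Effect::Pass: the policy says nothing; fall back to ACLs.
      }
      return acl_allows(perm, perm_mask);
    }

    int main() {
      std::cout << verify_permission(Policy{Effect::Deny}, 1, 1) << "\n";  // 0
      std::cout << verify_permission(std::nullopt, 1, 1) << "\n";          // 1
    }
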
@@ -1613,7 +1874,7 @@ static int matchignorecase(const char& c1, const char& c2)
return 0;
}
-int match(const string& pattern, const string& input, int flag)
+int match(const string& pattern, const string& input, uint32_t flag)
{
auto last_pos_input = 0, last_pos_pattern = 0;
@@ -1625,7 +1886,9 @@ int match(const string& pattern, const string& input, int flag)
string substr_pattern = pattern.substr(last_pos_pattern, cur_pos_pattern);
int res;
- if (flag & POLICY_ACTION || flag & POLICY_ARN) {
+ if (substr_pattern == "*") {
+ res = 1;
+ } else if (flag & MATCH_POLICY_ACTION || flag & MATCH_POLICY_ARN) {
res = match_internal(substr_pattern, substr_input, &matchignorecase);
} else {
res = match_internal(substr_pattern, substr_input, &matchcase);
@@ -1636,7 +1899,7 @@ int match(const string& pattern, const string& input, int flag)
if (cur_pos_pattern == string::npos && cur_pos_input == string::npos)
return 1;
else if ((cur_pos_pattern == string::npos && cur_pos_input != string::npos) ||
- (cur_pos_pattern != string::npos && cur_pos_input == string::npos))
+ (cur_pos_pattern != string::npos && cur_pos_input == string::npos))
return 0;
last_pos_pattern = cur_pos_pattern + 1;
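
[Editor's note: match() now treats a lone "*" segment as an unconditional hit and takes a uint32_t flag (the renamed MATCH_POLICY_* constants from rgw_common.h below) selecting case-insensitive comparison for actions and ARNs. A standalone sketch of that per-segment dispatch, with a simplified equality check standing in for RGW's match_internal():]

    #include <algorithm>
    #include <cctype>
    #include <cstdint>
    #include <iostream>
    #include <string>

    constexpr std::uint32_t MATCH_POLICY_ACTION = 0x01;
    constexpr std::uint32_t MATCH_POLICY_ARN    = 0x04;

    bool equals_nocase(const std::string& a, const std::string& b) {
      return a.size() == b.size() &&
             std::equal(a.begin(), a.end(), b.begin(), [](char x, char y) {
               return std::tolower((unsigned char)x) ==
                      std::tolower((unsigned char)y);
             });
    }

    // Simplified segment comparison (RGW's match_internal also handles
    // '*' and '?' inside a segment; equality is enough to show the dispatch).
    bool match_segment(const std::string& pattern, const std::string& input,
                       std::uint32_t flag) {
      if (pattern == "*")
        return true;  // the new early-out in the diff
      if (flag & (MATCH_POLICY_ACTION | MATCH_POLICY_ARN))
        return equals_nocase(pattern, input);
      return pattern == input;
    }

    int main() {
      std::cout << match_segment("*", "anything", 0) << "\n";    // 1
      std::cout << match_segment("s3:GetObject", "s3:getobject",
                                 MATCH_POLICY_ACTION) << "\n";   // 1
      std::cout << match_segment("bucket", "Bucket", 0) << "\n"; // 0
    }
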
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index 5ad7637b44c..6e24bb1d419 100644
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -21,6 +21,7 @@
#include "acconfig.h"
#include "rgw_acl.h"
#include "rgw_cors.h"
+#include "rgw_iam_policy.h"
#include "rgw_quota.h"
#include "rgw_string.h"
#include "rgw_website.h"
@@ -29,8 +30,6 @@
#include "cls/rgw/cls_rgw_types.h"
#include "include/rados/librados.hpp"
-using namespace std;
-
namespace ceph {
class Formatter;
}
@@ -99,6 +98,10 @@ using ceph::crypto::MD5;
#define RGW_ATTR_COMPRESSION RGW_ATTR_PREFIX "compression"
+/* IAM Policy */
+#define RGW_ATTR_IAM_POLICY RGW_ATTR_PREFIX "iam-policy"
+
+
/* RGW File Attributes */
#define RGW_ATTR_UNIX_KEY1 RGW_ATTR_PREFIX "unix-key1"
#define RGW_ATTR_UNIX1 RGW_ATTR_PREFIX "unix1"
@@ -202,6 +205,8 @@ using ceph::crypto::MD5;
#define UINT32_MAX (0xffffffffu)
#endif
+struct req_state;
+
typedef void *RGWAccessHandle;
@@ -263,7 +268,6 @@ enum RGWObjCategory {
/** Store error returns for output at a different point in the program */
struct rgw_err {
rgw_err();
- rgw_err(int http, const std::string &s3);
void clear();
bool is_clear() const;
bool is_err() const;
@@ -275,6 +279,8 @@ struct rgw_err {
std::string message;
};
+
+
/* Helper class used for RGWHTTPArgs parsing */
class NameVal
{
@@ -451,6 +457,9 @@ enum RGWOpType {
RGW_OP_GET_ROLE_POLICY,
RGW_OP_LIST_ROLE_POLICIES,
RGW_OP_DELETE_ROLE_POLICY,
+ RGW_OP_PUT_BUCKET_POLICY,
+ RGW_OP_GET_BUCKET_POLICY,
+ RGW_OP_DELETE_BUCKET_POLICY,
/* rgw specific */
RGW_OP_ADMIN_SET_METADATA,
@@ -1338,8 +1347,6 @@ struct RGWStorageStats
void dump(Formatter *f) const;
};
-struct req_state;
-
class RGWEnv;
/* Namespaced forward declarations. */
@@ -1348,12 +1355,14 @@ namespace rgw {
namespace s3 {
class RGWGetPolicyV2Extractor;
}
+ class Completer;
}
namespace io {
class BasicClient;
}
}
+
struct req_info {
RGWEnv *env;
RGWHTTPArgs args;
@@ -1701,7 +1710,7 @@ struct req_state {
const char *length;
int64_t content_length;
map<string, string> generic_attrs;
- struct rgw_err err;
+ rgw_err err;
bool expect_cont;
bool header_ended;
uint64_t obj_size;
@@ -1770,6 +1779,9 @@ struct req_state {
RGWAccessControlPolicy *bucket_acl;
RGWAccessControlPolicy *object_acl;
+ rgw::IAM::Environment env;
+ boost::optional<rgw::IAM::Policy> iam_policy;
+
/* Is the request made by an user marked as a system one?
* Being system user means we also have the admin status. */
bool system_request;
@@ -1805,8 +1817,15 @@ struct req_state {
req_state(CephContext* _cct, RGWEnv* e, RGWUserInfo* u);
~req_state();
+
+ bool is_err() const { return err.is_err(); }
};
+void set_req_state_err(struct req_state*, int);
+void set_req_state_err(struct req_state*, int, const string&);
+void set_req_state_err(struct rgw_err&, int, const int);
+void dump(struct req_state*);
+
/** Store basic data on bucket */
struct RGWBucketEnt {
rgw_bucket bucket;
@@ -2132,17 +2151,38 @@ bool verify_user_permission(struct req_state * const s,
const int perm);
bool verify_user_permission(struct req_state * const s,
const int perm);
-extern bool verify_bucket_permission(struct req_state * s,
- RGWAccessControlPolicy * user_acl,
- RGWAccessControlPolicy * bucket_acl,
- int perm);
-extern bool verify_bucket_permission(struct req_state *s, int perm);
-extern bool verify_object_permission(struct req_state *s,
- RGWAccessControlPolicy * user_acl,
- RGWAccessControlPolicy * bucket_acl,
- RGWAccessControlPolicy * object_acl,
- int perm);
-extern bool verify_object_permission(struct req_state *s, int perm);
+bool verify_bucket_permission(
+ struct req_state * const s,
+ const rgw_bucket& bucket,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ const boost::optional<rgw::IAM::Policy>& bucket_policy,
+ const uint64_t op);
+bool verify_bucket_permission(struct req_state * const s, const uint64_t op);
+bool verify_bucket_permission_no_policy(
+ struct req_state * const s,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ const int perm);
+bool verify_bucket_permission_no_policy(struct req_state * const s,
+ const int perm);
+extern bool verify_object_permission(
+ struct req_state * const s,
+ const rgw_obj& obj,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ RGWAccessControlPolicy * const object_acl,
+ const boost::optional<rgw::IAM::Policy>& bucket_policy,
+ const uint64_t op);
+extern bool verify_object_permission(struct req_state *s, uint64_t op);
+extern bool verify_object_permission_no_policy(
+ struct req_state * const s,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ RGWAccessControlPolicy * const object_acl,
+ int perm);
+extern bool verify_object_permission_no_policy(struct req_state *s,
+ int perm);
/** Convert an input URL into a sane object name
* by converting %-escaped strings into characters, etc*/
extern void rgw_uri_escape_char(char c, string& dst);
@@ -2168,5 +2208,12 @@ extern string calc_hash_sha256_close_stream(SHA256 **hash);
extern int rgw_parse_op_type_list(const string& str, uint32_t *perm);
-int match(const string& pattern, const string& input, int flag);
+namespace {
+ constexpr uint32_t MATCH_POLICY_ACTION = 0x01;
+ constexpr uint32_t MATCH_POLICY_RESOURCE = 0x02;
+ constexpr uint32_t MATCH_POLICY_ARN = 0x04;
+ constexpr uint32_t MATCH_POLICY_STRING = 0x08;
+}
+
+int match(const std::string& pattern, const std::string& input, uint32_t flag);
#endif
diff --git a/src/rgw/rgw_coroutine.cc b/src/rgw/rgw_coroutine.cc
index 85ad290d020..4bb48da49dc 100644
--- a/src/rgw/rgw_coroutine.cc
+++ b/src/rgw/rgw_coroutine.cc
@@ -1,10 +1,11 @@
-
#include "common/ceph_json.h"
#include "rgw_coroutine.h"
#include "rgw_boost_asio_yield.h"
+// re-include our assert to clobber the system one; fix dout:
+#include "include/assert.h"
#define dout_subsys ceph_subsys_rgw
@@ -68,7 +69,7 @@ int RGWCompletionManager::get_next(void **user_info)
Mutex::Locker l(lock);
while (complete_reqs.empty()) {
cond.Wait(lock);
- if (going_down.read() != 0) {
+ if (going_down) {
return -ECANCELED;
}
}
@@ -94,7 +95,7 @@ void RGWCompletionManager::go_down()
for (auto cn : cns) {
cn->unregister();
}
- going_down.set(1);
+ going_down = true;
cond.Signal();
}
@@ -460,7 +461,7 @@ int RGWCoroutinesManager::run(list<RGWCoroutinesStack *>& stacks)
bool canceled = false; // set on going_down
RGWCoroutinesEnv env;
- uint64_t run_context = run_context_count.inc();
+ uint64_t run_context = ++run_context_count;
lock.get_write();
set<RGWCoroutinesStack *>& context_stacks = run_contexts[run_context];
@@ -475,7 +476,7 @@ int RGWCoroutinesManager::run(list<RGWCoroutinesStack *>& stacks)
env.manager = this;
env.scheduled_stacks = &scheduled_stacks;
- for (list<RGWCoroutinesStack *>::iterator iter = scheduled_stacks.begin(); iter != scheduled_stacks.end() && !going_down.read();) {
+ for (list<RGWCoroutinesStack *>::iterator iter = scheduled_stacks.begin(); iter != scheduled_stacks.end() && !going_down;) {
lock.get_write();
RGWCoroutinesStack *stack = *iter;
@@ -566,7 +567,7 @@ int RGWCoroutinesManager::run(list<RGWCoroutinesStack *>& stacks)
if (ret < 0) {
ldout(cct, 0) << "ERROR: failed to clone shard, completion_mgr.get_next() returned ret=" << ret << dendl;
}
- if (going_down.read() > 0) {
+ if (going_down) {
ldout(cct, 5) << __func__ << "(): was stopped, exiting" << dendl;
ret = -ECANCELED;
canceled = true;
@@ -585,7 +586,7 @@ int RGWCoroutinesManager::run(list<RGWCoroutinesStack *>& stacks)
}
lock.get_write();
- if (!context_stacks.empty() && !going_down.read()) {
+ if (!context_stacks.empty() && !going_down) {
JSONFormatter formatter(true);
formatter.open_array_section("context_stacks");
for (auto& s : context_stacks) {
@@ -595,7 +596,7 @@ int RGWCoroutinesManager::run(list<RGWCoroutinesStack *>& stacks)
lderr(cct) << __func__ << "(): ERROR: deadlock detected, dumping remaining coroutines:\n";
formatter.flush(*_dout);
*_dout << dendl;
- assert(context_stacks.empty() || going_down.read()); // assert on deadlock
+ assert(context_stacks.empty() || going_down); // assert on deadlock
}
for (auto stack : context_stacks) {
diff --git a/src/rgw/rgw_coroutine.h b/src/rgw/rgw_coroutine.h
index 821ff55889a..66afa74b375 100644
--- a/src/rgw/rgw_coroutine.h
+++ b/src/rgw/rgw_coroutine.h
@@ -22,6 +22,8 @@
#include "rgw_common.h"
#include "rgw_boost_asio_coroutine.h"
+#include <atomic>
+
#define RGW_ASYNC_OPS_MGR_WINDOW 100
class RGWCoroutinesStack;
@@ -39,7 +41,7 @@ class RGWCompletionManager : public RefCountedObject {
SafeTimer timer;
- atomic_t going_down;
+ std::atomic<bool> going_down = { false };
map<void *, void *> waiters;
@@ -506,9 +508,9 @@ public:
class RGWCoroutinesManager {
CephContext *cct;
- atomic_t going_down;
+ std::atomic<bool> going_down = { false };
- atomic64_t run_context_count;
+ std::atomic<int64_t> run_context_count = { 0 };
map<uint64_t, set<RGWCoroutinesStack *> > run_contexts;
RWLock lock;
@@ -542,7 +544,8 @@ public:
int run(list<RGWCoroutinesStack *>& ops);
int run(RGWCoroutine *op);
void stop() {
- if (going_down.inc() == 1) {
+ bool expected = false;
+ if (going_down.compare_exchange_strong(expected, true)) {
completion_mgr->go_down();
}
}
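
[Editor's note: these hunks migrate from Ceph's atomic_t to std::atomic. The stop() change is worth noting: the old code relied on inc() == 1 to run shutdown exactly once, and the new code gets the same once-only guarantee from compare_exchange_strong on a bool. A standalone sketch:]

    #include <atomic>
    #include <iostream>

    class Manager {
      std::atomic<bool> going_down{false};
     public:
      bool is_going_down() const { return going_down; }
      void stop() {
        bool expected = false;
        // Only the first caller flips false -> true and runs shutdown;
        // concurrent or repeated stop() calls are harmless no-ops.
        if (going_down.compare_exchange_strong(expected, true))
          std::cout << "shutting down completion manager\n";
      }
    };

    int main() {
      Manager m;
      m.stop();  // prints once
      m.stop();  // no-op
      std::cout << m.is_going_down() << "\n";  // 1
    }
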
diff --git a/src/rgw/rgw_cors_s3.h b/src/rgw/rgw_cors_s3.h
index cad423c10a2..1e60e651512 100644
--- a/src/rgw/rgw_cors_s3.h
+++ b/src/rgw/rgw_cors_s3.h
@@ -18,15 +18,12 @@
#include <map>
#include <string>
#include <iosfwd>
-#include <expat.h>
#include <include/types.h>
#include <common/Formatter.h>
#include "rgw_xml.h"
#include "rgw_cors.h"
-using namespace std;
-
class RGWCORSRule_S3 : public RGWCORSRule, public XMLObj
{
public:
diff --git a/src/rgw/rgw_cors_swift.h b/src/rgw/rgw_cors_swift.h
index 6aef5e13561..d2516dd0357 100644
--- a/src/rgw/rgw_cors_swift.h
+++ b/src/rgw/rgw_cors_swift.h
@@ -23,8 +23,6 @@
#include "rgw_cors.h"
-using namespace std;
-
class RGWCORSConfiguration_SWIFT : public RGWCORSConfiguration
{
public:
diff --git a/src/rgw/rgw_cr_rados.cc b/src/rgw/rgw_cr_rados.cc
index 489ef589172..0e0be84c82c 100644
--- a/src/rgw/rgw_cr_rados.cc
+++ b/src/rgw/rgw_cr_rados.cc
@@ -65,7 +65,7 @@ void RGWAsyncRadosProcessor::start() {
}
void RGWAsyncRadosProcessor::stop() {
- going_down.set(1);
+ going_down = true;
m_tp.drain(&req_wq);
m_tp.stop();
for (auto iter = m_req_queue.begin(); iter != m_req_queue.end(); ++iter) {
@@ -116,14 +116,14 @@ int RGWSimpleRadosReadAttrsCR::request_complete()
int RGWAsyncPutSystemObj::_send_request()
{
- return store->put_system_obj_data(NULL, obj, bl, -1, exclusive);
+ return store->put_system_obj_data(NULL, obj, bl, -1, exclusive, objv_tracker);
}
RGWAsyncPutSystemObj::RGWAsyncPutSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store,
- const rgw_raw_obj& _obj, bool _exclusive,
- bufferlist& _bl) : RGWAsyncRadosRequest(caller, cn), store(_store),
- obj(_obj), exclusive(_exclusive),
- bl(_bl)
+ RGWObjVersionTracker *_objv_tracker, rgw_raw_obj& _obj,
+ bool _exclusive, bufferlist& _bl)
+ : RGWAsyncRadosRequest(caller, cn), store(_store), objv_tracker(_objv_tracker),
+ obj(_obj), exclusive(_exclusive), bl(_bl)
{
}
@@ -315,6 +315,40 @@ int RGWRadosRemoveOmapKeysCR::send_request() {
return ref.ioctx.aio_operate(ref.oid, cn->completion(), &op);
}
+RGWRadosRemoveCR::RGWRadosRemoveCR(RGWRados *store, const rgw_raw_obj& obj)
+ : RGWSimpleCoroutine(store->ctx()), store(store), obj(obj)
+{
+ set_description() << "remove dest=" << obj;
+}
+
+int RGWRadosRemoveCR::send_request()
+{
+ auto rados = store->get_rados_handle();
+ int r = rados->ioctx_create(obj.pool.name.c_str(), ioctx);
+ if (r < 0) {
+ lderr(cct) << "ERROR: failed to open pool (" << obj.pool.name << ") ret=" << r << dendl;
+ return r;
+ }
+ ioctx.locator_set_key(obj.loc);
+
+ set_status() << "send request";
+
+ librados::ObjectWriteOperation op;
+ op.remove();
+
+ cn = stack->create_completion_notifier();
+ return ioctx.aio_operate(obj.oid, cn->completion(), &op);
+}
+
+int RGWRadosRemoveCR::request_complete()
+{
+ int r = cn->completion()->get_return_value();
+
+ set_status() << "request complete; ret=" << r;
+
+ return r;
+}
+
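
[Editor's note: RGWRadosRemoveCR follows the usual RGWSimpleCoroutine shape: send_request() issues an asynchronous operation and returns immediately, and request_complete() later harvests the return value from the completion. A librados-free sketch of that two-phase pattern, with std::async playing the role of aio_operate():]

    #include <future>
    #include <iostream>
    #include <string>
    #include <utility>

    // Stand-in for the asynchronous remove; pretend 0 means success.
    int do_remove(const std::string& oid) { return oid.empty() ? -22 : 0; }

    class RemoveCR {
      std::string oid;
      std::future<int> completion;  // plays the role of the aio completion
     public:
      explicit RemoveCR(std::string o) : oid(std::move(o)) {}

      int send_request() {          // fire the async op, don't wait
        completion = std::async(std::launch::async, do_remove, oid);
        return 0;
      }
      int request_complete() {      // later: collect the result
        return completion.get();
      }
    };

    int main() {
      RemoveCR cr("obj.0001");
      cr.send_request();
      std::cout << "ret=" << cr.request_complete() << "\n";  // ret=0
    }
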
RGWSimpleRadosLockCR::RGWSimpleRadosLockCR(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store,
const rgw_raw_obj& _obj,
const string& _lock_name,
@@ -507,7 +541,6 @@ int RGWAsyncFetchRemoteObj::_send_request()
&key.instance, /* string *version_id, */
NULL, /* string *ptag, */
NULL, /* string *petag, */
- NULL, /* struct rgw_err *err, */
NULL, /* void (*progress_cb)(off_t, void *), */
NULL); /* void *progress_data*); */
@@ -627,7 +660,7 @@ int RGWContinuousLeaseCR::operate()
return set_cr_done();
}
reenter(this) {
- while (!going_down.read()) {
+ while (!going_down) {
yield call(new RGWSimpleRadosLockCR(async_rados, store, obj, lock_name, cookie, interval));
caller->set_sleeping(false); /* will only be relevant when we return, that's why we can do it early */
@@ -722,6 +755,29 @@ int RGWRadosTimelogTrimCR::request_complete()
return r;
}
+
+RGWSyncLogTrimCR::RGWSyncLogTrimCR(RGWRados *store, const std::string& oid,
+ const std::string& to_marker,
+ std::string *last_trim_marker)
+ : RGWRadosTimelogTrimCR(store, oid, real_time{}, real_time{},
+ std::string{}, to_marker),
+ cct(store->ctx()), last_trim_marker(last_trim_marker)
+{
+}
+
+int RGWSyncLogTrimCR::request_complete()
+{
+ int r = RGWRadosTimelogTrimCR::request_complete();
+ if (r < 0 && r != -ENODATA) {
+ return r;
+ }
+ if (*last_trim_marker < to_marker) {
+ *last_trim_marker = to_marker;
+ }
+ return 0;
+}
+
+
int RGWAsyncStatObj::_send_request()
{
rgw_raw_obj raw_obj;
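
[Editor's note: RGWSyncLogTrimCR (the renamed LastTimelogTrimCR from rgw_data_sync.cc, further below) treats -ENODATA, meaning nothing left to trim, as success, and only ever advances the caller's last_trim marker. A minimal standalone version of that completion logic:]

    #include <cerrno>
    #include <iostream>
    #include <string>

    // Mirrors RGWSyncLogTrimCR::request_complete(): -ENODATA is not an
    // error, and the marker moves forward monotonically.
    int on_trim_complete(int r, const std::string& to_marker,
                         std::string* last_trim_marker) {
      if (r < 0 && r != -ENODATA)
        return r;                       // a real failure propagates
      if (*last_trim_marker < to_marker)
        *last_trim_marker = to_marker;  // never move the marker backwards
      return 0;
    }

    int main() {
      std::string last = "00000001";
      std::cout << on_trim_complete(-ENODATA, "00000007", &last) << "\n"; // 0
      std::cout << last << "\n";                               // 00000007
      std::cout << on_trim_complete(0, "00000003", &last) << "\n";        // 0
      std::cout << last << "\n";                               // unchanged
    }
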
diff --git a/src/rgw/rgw_cr_rados.h b/src/rgw/rgw_cr_rados.h
index 8496201e0cd..a892b2a6db0 100644
--- a/src/rgw/rgw_cr_rados.h
+++ b/src/rgw/rgw_cr_rados.h
@@ -1,11 +1,15 @@
#ifndef CEPH_RGW_CR_RADOS_H
#define CEPH_RGW_CR_RADOS_H
+#include <boost/intrusive_ptr.hpp>
+#include "include/assert.h"
#include "rgw_coroutine.h"
#include "rgw_rados.h"
#include "common/WorkQueue.h"
#include "common/Throttle.h"
+#include <atomic>
+
class RGWAsyncRadosRequest : public RefCountedObject {
RGWCoroutine *caller;
RGWAioCompletionNotifier *notifier;
@@ -57,7 +61,7 @@ public:
class RGWAsyncRadosProcessor {
deque<RGWAsyncRadosRequest *> m_req_queue;
- atomic_t going_down;
+ std::atomic<bool> going_down = { false };
protected:
RGWRados *store;
ThreadPool m_tp;
@@ -91,7 +95,7 @@ public:
void queue(RGWAsyncRadosRequest *req);
bool is_going_down() {
- return (going_down.read() != 0);
+ return going_down;
}
};
@@ -117,6 +121,7 @@ public:
class RGWAsyncPutSystemObj : public RGWAsyncRadosRequest {
RGWRados *store;
+ RGWObjVersionTracker *objv_tracker;
rgw_raw_obj obj;
bool exclusive;
bufferlist bl;
@@ -125,8 +130,8 @@ protected:
int _send_request() override;
public:
RGWAsyncPutSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store,
- const rgw_raw_obj& _obj, bool _exclusive,
- bufferlist& _bl);
+ RGWObjVersionTracker *_objv_tracker, rgw_raw_obj& _obj,
+ bool _exclusive, bufferlist& _bl);
};
class RGWAsyncPutSystemObjAttrs : public RGWAsyncRadosRequest {
@@ -187,16 +192,18 @@ class RGWSimpleRadosReadCR : public RGWSimpleCoroutine {
T *result;
/// on ENOENT, call handle_data() with an empty object instead of failing
const bool empty_on_enoent;
+ RGWObjVersionTracker *objv_tracker;
RGWAsyncGetSystemObj *req{nullptr};
public:
RGWSimpleRadosReadCR(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store,
const rgw_raw_obj& _obj,
- T *_result, bool empty_on_enoent = true)
+ T *_result, bool empty_on_enoent = true,
+ RGWObjVersionTracker *objv_tracker = nullptr)
: RGWSimpleCoroutine(_store->ctx()), async_rados(_async_rados), store(_store),
obj_ctx(store), obj(_obj), result(_result),
- empty_on_enoent(empty_on_enoent) {}
+ empty_on_enoent(empty_on_enoent), objv_tracker(objv_tracker) {}
~RGWSimpleRadosReadCR() override {
request_cleanup();
}
@@ -220,7 +227,7 @@ template <class T>
int RGWSimpleRadosReadCR<T>::send_request()
{
req = new RGWAsyncGetSystemObj(this, stack->create_completion_notifier(),
- store, &obj_ctx, NULL,
+ store, &obj_ctx, objv_tracker,
obj,
&bl, 0, -1);
if (pattrs) {
@@ -303,17 +310,16 @@ class RGWSimpleRadosWriteCR : public RGWSimpleCoroutine {
bufferlist bl;
rgw_raw_obj obj;
+ RGWObjVersionTracker *objv_tracker;
- RGWAsyncPutSystemObj *req;
+ RGWAsyncPutSystemObj *req{nullptr};
public:
RGWSimpleRadosWriteCR(RGWAsyncRadosProcessor *_async_rados, RGWRados *_store,
const rgw_raw_obj& _obj,
- const T& _data) : RGWSimpleCoroutine(_store->ctx()),
- async_rados(_async_rados),
- store(_store),
- obj(_obj),
- req(NULL) {
+ const T& _data, RGWObjVersionTracker *objv_tracker = nullptr)
+ : RGWSimpleCoroutine(_store->ctx()), async_rados(_async_rados),
+ store(_store), obj(_obj), objv_tracker(objv_tracker) {
::encode(_data, bl);
}
@@ -330,7 +336,7 @@ public:
int send_request() override {
req = new RGWAsyncPutSystemObj(this, stack->create_completion_notifier(),
- store, obj, false, bl);
+ store, objv_tracker, obj, false, bl);
async_rados->queue(req);
return 0;
}
@@ -462,6 +468,19 @@ public:
}
};
+class RGWRadosRemoveCR : public RGWSimpleCoroutine {
+ RGWRados *store;
+ librados::IoCtx ioctx;
+ const rgw_raw_obj obj;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+ RGWRadosRemoveCR(RGWRados *store, const rgw_raw_obj& obj);
+
+ int send_request();
+ int request_complete();
+};
+
class RGWSimpleRadosLockCR : public RGWSimpleCoroutine {
RGWAsyncRadosProcessor *async_rados;
RGWRados *store;
@@ -1014,7 +1033,7 @@ class RGWContinuousLeaseCR : public RGWCoroutine {
int interval;
Mutex lock;
- atomic_t going_down;
+ std::atomic<bool> going_down = { false };
bool locked{false};
RGWCoroutine *caller;
@@ -1044,7 +1063,7 @@ public:
}
void go_down() {
- going_down.set(1);
+ going_down = true;
wakeup();
}
@@ -1091,6 +1110,16 @@ class RGWRadosTimelogTrimCR : public RGWSimpleCoroutine {
int request_complete() override;
};
+// wrapper to update last_trim_marker on success
+class RGWSyncLogTrimCR : public RGWRadosTimelogTrimCR {
+ CephContext *cct;
+ std::string *last_trim_marker;
+ public:
+ RGWSyncLogTrimCR(RGWRados *store, const std::string& oid,
+ const std::string& to_marker, std::string *last_trim_marker);
+ int request_complete() override;
+};
+
class RGWAsyncStatObj : public RGWAsyncRadosRequest {
RGWRados *store;
RGWBucketInfo bucket_info;
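
[Editor's note: several constructors in this header gain an RGWObjVersionTracker* that defaults to nullptr, so every existing call site compiles unchanged while new callers can opt into version tracking. A tiny sketch of that API-evolution pattern with a hypothetical write function:]

    #include <iostream>

    struct VersionTracker { unsigned long write_version = 0; };

    // New trailing parameter defaults to nullptr: old callers are unaffected.
    int write_obj(const char* oid, const char* data,
                  VersionTracker* objv_tracker = nullptr) {
      if (objv_tracker)
        ++objv_tracker->write_version;  // only track when asked to
      std::cout << "wrote " << oid << "\n";
      return 0;
    }

    int main() {
      write_obj("a", "x");            // legacy call site
      VersionTracker tracker;
      write_obj("b", "y", &tracker);  // new, version-aware call site
      std::cout << tracker.write_version << "\n";  // 1
    }
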
diff --git a/src/rgw/rgw_crypt.cc b/src/rgw/rgw_crypt.cc
index cf48e27751f..0d57f4da270 100644
--- a/src/rgw/rgw_crypt.cc
+++ b/src/rgw/rgw_crypt.cc
@@ -1053,7 +1053,9 @@ static const crypt_option_names crypt_options[] = {
static boost::string_ref get_crypt_attribute(
RGWEnv* env,
- map<string, post_form_part, const ltstr_nocase>* parts,
+ std::map<std::string,
+ RGWPostObj_ObjStore::post_form_part,
+ const ltstr_nocase>* parts,
crypt_option_e option)
{
static_assert(
@@ -1079,10 +1081,12 @@ static boost::string_ref get_crypt_attribute(
int rgw_s3_prepare_encrypt(struct req_state* s,
- map<string, bufferlist>& attrs,
- map<string, post_form_part, const ltstr_nocase>* parts,
- std::unique_ptr<BlockCrypt>* block_crypt,
- std::map<std::string, std::string>& crypt_http_responses)
+ std::map<std::string, ceph::bufferlist>& attrs,
+ std::map<std::string,
+ RGWPostObj_ObjStore::post_form_part,
+ const ltstr_nocase>* parts,
+ std::unique_ptr<BlockCrypt>* block_crypt,
+ std::map<std::string, std::string>& crypt_http_responses)
{
int res = 0;
crypt_http_responses.clear();
diff --git a/src/rgw/rgw_crypt.h b/src/rgw/rgw_crypt.h
index a6b7df0a42a..1774983758d 100644
--- a/src/rgw/rgw_crypt.h
+++ b/src/rgw/rgw_crypt.h
@@ -7,6 +7,7 @@
#define CEPH_RGW_CRYPT_H
#include <rgw/rgw_op.h>
+#include <rgw/rgw_rest.h>
#include <rgw/rgw_rest_s3.h>
#include <boost/utility/string_ref.hpp>
@@ -136,14 +137,18 @@ public:
int rgw_s3_prepare_encrypt(struct req_state* s,
- map<string, bufferlist>& attrs,
- map<string, post_form_part, const ltstr_nocase>* parts,
- std::unique_ptr<BlockCrypt>* block_crypt,
- std::map<std::string, std::string>& crypt_http_responses);
+ std::map<std::string, ceph::bufferlist>& attrs,
+ std::map<std::string,
+ RGWPostObj_ObjStore::post_form_part,
+ const ltstr_nocase>* parts,
+ std::unique_ptr<BlockCrypt>* block_crypt,
+ std::map<std::string,
+ std::string>& crypt_http_responses);
int rgw_s3_prepare_decrypt(struct req_state* s,
- map<string, bufferlist>& attrs,
- std::unique_ptr<BlockCrypt>* block_crypt,
- std::map<std::string, std::string>& crypt_http_responses);
+ std::map<std::string, ceph::bufferlist>& attrs,
+ std::unique_ptr<BlockCrypt>* block_crypt,
+ std::map<std::string,
+ std::string>& crypt_http_responses);
#endif
diff --git a/src/rgw/rgw_data_sync.cc b/src/rgw/rgw_data_sync.cc
index cbe3a91d8e1..e3f3b079365 100644
--- a/src/rgw/rgw_data_sync.cc
+++ b/src/rgw/rgw_data_sync.cc
@@ -456,22 +456,25 @@ bool RGWListRemoteDataLogCR::spawn_next() {
}
class RGWInitDataSyncStatusCoroutine : public RGWCoroutine {
+ static constexpr uint32_t lock_duration = 30;
RGWDataSyncEnv *sync_env;
-
RGWRados *store;
+ const rgw_pool& pool;
+ const uint32_t num_shards;
string sync_status_oid;
string lock_name;
string cookie;
- rgw_data_sync_info status;
+ rgw_data_sync_status *status;
map<int, RGWDataChangesLogInfo> shards_info;
public:
- RGWInitDataSyncStatusCoroutine(RGWDataSyncEnv *_sync_env,
- uint32_t _num_shards) : RGWCoroutine(_sync_env->cct),
- sync_env(_sync_env), store(sync_env->store) {
+ RGWInitDataSyncStatusCoroutine(RGWDataSyncEnv *_sync_env, uint32_t num_shards,
+ rgw_data_sync_status *status)
+ : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), store(sync_env->store),
+ pool(store->get_zone_params().log_pool),
+ num_shards(num_shards), status(status) {
lock_name = "sync_lock";
- status.num_shards = _num_shards;
#define COOKIE_LEN 16
char buf[COOKIE_LEN + 1];
@@ -485,33 +488,32 @@ public:
int operate() override {
int ret;
reenter(this) {
- yield {
- uint32_t lock_duration = 30;
- call(new RGWSimpleRadosLockCR(sync_env->async_rados, store,
- rgw_raw_obj(store->get_zone_params().log_pool, sync_status_oid),
- lock_name, cookie, lock_duration));
- if (retcode < 0) {
- ldout(cct, 0) << "ERROR: failed to take a lock on " << sync_status_oid << dendl;
- return set_cr_error(retcode);
- }
+ using LockCR = RGWSimpleRadosLockCR;
+ yield call(new LockCR(sync_env->async_rados, store,
+ rgw_raw_obj{pool, sync_status_oid},
+ lock_name, cookie, lock_duration));
+ if (retcode < 0) {
+ ldout(cct, 0) << "ERROR: failed to take a lock on " << sync_status_oid << dendl;
+ return set_cr_error(retcode);
}
- yield {
- call(new RGWSimpleRadosWriteCR<rgw_data_sync_info>(sync_env->async_rados,
- store,
- rgw_raw_obj(store->get_zone_params().log_pool, sync_status_oid),
- status));
- }
- yield { /* take lock again, we just recreated the object */
- uint32_t lock_duration = 30;
- call(new RGWSimpleRadosLockCR(sync_env->async_rados,
- store,
- rgw_raw_obj(store->get_zone_params().log_pool, sync_status_oid),
- lock_name, cookie, lock_duration));
- if (retcode < 0) {
- ldout(cct, 0) << "ERROR: failed to take a lock on " << sync_status_oid << dendl;
- return set_cr_error(retcode);
- }
+ using WriteInfoCR = RGWSimpleRadosWriteCR<rgw_data_sync_info>;
+ yield call(new WriteInfoCR(sync_env->async_rados, store,
+ rgw_raw_obj{pool, sync_status_oid},
+ status->sync_info));
+ if (retcode < 0) {
+ ldout(cct, 0) << "ERROR: failed to write sync status info with " << retcode << dendl;
+ return set_cr_error(retcode);
+ }
+
+ /* take lock again, we just recreated the object */
+ yield call(new LockCR(sync_env->async_rados, store,
+ rgw_raw_obj{pool, sync_status_oid},
+ lock_name, cookie, lock_duration));
+ if (retcode < 0) {
+ ldout(cct, 0) << "ERROR: failed to take a lock on " << sync_status_oid << dendl;
+ return set_cr_error(retcode);
}
+
/* fetch current position in logs */
yield {
RGWRESTConn *conn = store->get_zone_conn_by_id(sync_env->source_zone);
@@ -519,46 +521,48 @@ public:
ldout(cct, 0) << "ERROR: connection to zone " << sync_env->source_zone << " does not exist!" << dendl;
return set_cr_error(-EIO);
}
- for (int i = 0; i < (int)status.num_shards; i++) {
+ for (uint32_t i = 0; i < num_shards; i++) {
spawn(new RGWReadRemoteDataLogShardInfoCR(sync_env, i, &shards_info[i]), true);
- }
+ }
}
while (collect(&ret, NULL)) {
- if (ret < 0) {
- return set_state(RGWCoroutine_Error);
- }
+ if (ret < 0) {
+ ldout(cct, 0) << "ERROR: failed to read remote data log shards" << dendl;
+ return set_state(RGWCoroutine_Error);
+ }
yield;
}
yield {
- for (int i = 0; i < (int)status.num_shards; i++) {
- rgw_data_sync_marker marker;
+ for (uint32_t i = 0; i < num_shards; i++) {
RGWDataChangesLogInfo& info = shards_info[i];
- marker.next_step_marker = info.marker;
- marker.timestamp = info.last_update;
- spawn(new RGWSimpleRadosWriteCR<rgw_data_sync_marker>(sync_env->async_rados, store,
- rgw_raw_obj(store->get_zone_params().log_pool, RGWDataSyncStatusManager::shard_obj_name(sync_env->source_zone, i)),
- marker), true);
+ auto& marker = status->sync_markers[i];
+ marker.next_step_marker = info.marker;
+ marker.timestamp = info.last_update;
+ const auto& oid = RGWDataSyncStatusManager::shard_obj_name(sync_env->source_zone, i);
+ using WriteMarkerCR = RGWSimpleRadosWriteCR<rgw_data_sync_marker>;
+ spawn(new WriteMarkerCR(sync_env->async_rados, store,
+ rgw_raw_obj{pool, oid}, marker), true);
}
}
- yield {
- status.state = rgw_data_sync_info::StateBuildingFullSyncMaps;
- call(new RGWSimpleRadosWriteCR<rgw_data_sync_info>(sync_env->async_rados, store,
- rgw_raw_obj(store->get_zone_params().log_pool, sync_status_oid),
- status));
- }
- yield { /* unlock */
- call(new RGWSimpleRadosUnlockCR(sync_env->async_rados,
- store,
- rgw_raw_obj(store->get_zone_params().log_pool, sync_status_oid),
- lock_name, cookie));
- }
while (collect(&ret, NULL)) {
- if (ret < 0) {
- return set_state(RGWCoroutine_Error);
- }
+ if (ret < 0) {
+ ldout(cct, 0) << "ERROR: failed to write data sync status markers" << dendl;
+ return set_state(RGWCoroutine_Error);
+ }
yield;
}
- drain_all();
+
+ status->sync_info.state = rgw_data_sync_info::StateBuildingFullSyncMaps;
+ yield call(new WriteInfoCR(sync_env->async_rados, store,
+ rgw_raw_obj{pool, sync_status_oid},
+ status->sync_info));
+ if (retcode < 0) {
+ ldout(cct, 0) << "ERROR: failed to write sync status info with " << retcode << dendl;
+ return set_cr_error(retcode);
+ }
+ yield call(new RGWSimpleRadosUnlockCR(sync_env->async_rados, store,
+ rgw_raw_obj{pool, sync_status_oid},
+ lock_name, cookie));
return set_cr_done();
}
return 0;
@@ -666,6 +670,7 @@ int RGWRemoteDataLog::read_sync_status(rgw_data_sync_status *sync_status)
int RGWRemoteDataLog::init_sync_status(int num_shards)
{
+ rgw_data_sync_status sync_status;
RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry());
RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr());
int ret = http_manager.set_threaded();
@@ -675,7 +680,7 @@ int RGWRemoteDataLog::init_sync_status(int num_shards)
}
RGWDataSyncEnv sync_env_local = sync_env;
sync_env_local.http_manager = &http_manager;
- ret = crs.run(new RGWInitDataSyncStatusCoroutine(&sync_env_local, num_shards));
+ ret = crs.run(new RGWInitDataSyncStatusCoroutine(&sync_env_local, num_shards, &sync_status));
http_manager.stop();
return ret;
}
@@ -1240,11 +1245,6 @@ public:
int incremental_sync() {
reenter(&incremental_cr) {
- error_repo = new RGWOmapAppend(sync_env->async_rados, sync_env->store,
- rgw_raw_obj(pool, error_oid),
- 1 /* no buffer */);
- error_repo->get();
- spawn(error_repo, false);
yield init_lease_cr();
while (!lease_cr->is_locked()) {
if (lease_cr->is_done()) {
@@ -1256,6 +1256,11 @@ public:
yield;
}
set_status("lease acquired");
+ error_repo = new RGWOmapAppend(sync_env->async_rados, sync_env->store,
+ rgw_raw_obj(pool, error_oid),
+ 1 /* no buffer */);
+ error_repo->get();
+ spawn(error_repo, false);
logger.log("inc sync");
set_marker_tracker(new RGWDataSyncShardMarkerTrack(sync_env, status_oid, sync_marker));
do {
@@ -1458,20 +1463,12 @@ public:
/* state: init status */
if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateInit) {
ldout(sync_env->cct, 20) << __func__ << "(): init" << dendl;
- yield call(new RGWInitDataSyncStatusCoroutine(sync_env, sync_status.sync_info.num_shards));
+ yield call(new RGWInitDataSyncStatusCoroutine(sync_env, num_shards, &sync_status));
if (retcode < 0) {
ldout(sync_env->cct, 0) << "ERROR: failed to init sync, retcode=" << retcode << dendl;
return set_cr_error(retcode);
}
- sync_status.sync_info.num_shards = num_shards;
- sync_status.sync_info.state = rgw_data_sync_info::StateBuildingFullSyncMaps;
- /* update new state */
- yield call(set_sync_info_cr());
-
- if (retcode < 0) {
- ldout(sync_env->cct, 0) << "ERROR: failed to write sync status, retcode=" << retcode << dendl;
- return set_cr_error(retcode);
- }
+ // sets state = StateBuildingFullSyncMaps
*reset_backoff = true;
}
@@ -2894,6 +2891,7 @@ string RGWBucketSyncStatusManager::status_oid(const string& source_zone,
}
+// TODO: move into rgw_data_sync_trim.cc
#undef dout_prefix
#define dout_prefix (*_dout << "data trim: ")
@@ -2939,28 +2937,7 @@ void take_min_markers(IterIn first, IterIn last, IterOut dest)
}
}
-// wrapper to update last_trim_marker on success
-class LastTimelogTrimCR : public RGWRadosTimelogTrimCR {
- CephContext *cct;
- std::string *last_trim_marker;
- public:
- LastTimelogTrimCR(RGWRados *store, const std::string& oid,
- const std::string& to_marker, std::string *last_trim_marker)
- : RGWRadosTimelogTrimCR(store, oid, real_time{}, real_time{},
- std::string{}, to_marker),
- cct(store->ctx()), last_trim_marker(last_trim_marker)
- {}
- int request_complete() override {
- int r = RGWRadosTimelogTrimCR::request_complete();
- if (r < 0 && r != -ENODATA) {
- ldout(cct, 1) << "failed to trim datalog: " << cpp_strerror(r) << dendl;
- return r;
- }
- ldout(cct, 10) << "datalog trimmed to marker " << to_marker << dendl;
- *last_trim_marker = to_marker;
- return 0;
- }
-};
+} // anonymous namespace
class DataLogTrimCR : public RGWCoroutine {
RGWRados *store;
@@ -3039,7 +3016,7 @@ int DataLogTrimCR::operate()
ldout(cct, 10) << "trimming log shard " << i
<< " at marker=" << stable
<< " last_trim=" << last_trim[i] << dendl;
- using TrimCR = LastTimelogTrimCR;
+ using TrimCR = RGWSyncLogTrimCR;
spawn(new TrimCR(store, store->data_log->get_oid(i),
stable, &last_trim[i]),
true);
@@ -3103,8 +3080,6 @@ int DataLogTrimPollCR::operate()
return 0;
}
-} // anonymous namespace
-
RGWCoroutine* create_data_log_trim_cr(RGWRados *store,
RGWHTTPManager *http,
int num_shards, utime_t interval)
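
[Editor's note: the reworked RGWInitDataSyncStatusCoroutine writes into a caller-owned rgw_data_sync_status and checks retcode after every step, where the old version silently ignored a failed status write. A compressed, coroutine-free sketch of the sequence — lock, write info, re-lock, write per-shard markers, flip the state, unlock — with toy step functions standing in for the CR calls:]

    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    struct SyncStatus { int state = 0; std::vector<std::string> markers; };

    // Toy steps standing in for the RGWSimpleRados*CR coroutine calls.
    int lock_obj()                 { return 0; }
    int unlock_obj()               { return 0; }
    int write_info(SyncStatus&)    { return 0; }
    int write_marker(std::string&) { return 0; }

    int init_sync_status(std::uint32_t num_shards, SyncStatus* status) {
      status->markers.resize(num_shards);
      int r = lock_obj();
      if (r < 0) return r;
      r = write_info(*status);
      if (r < 0) {
        std::cerr << "ERROR: failed to write sync status info\n";
        return r;                     // the old code skipped this check
      }
      r = lock_obj();                 // take the lock again: the write
      if (r < 0) return r;            // just recreated the object
      for (auto& m : status->markers) {
        r = write_marker(m);
        if (r < 0) return r;
      }
      status->state = 1;              // StateBuildingFullSyncMaps
      r = write_info(*status);
      if (r < 0) return r;
      return unlock_obj();
    }

    int main() {
      SyncStatus st;
      std::cout << init_sync_status(4, &st) << " state=" << st.state << "\n";
    }
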
diff --git a/src/rgw/rgw_file.cc b/src/rgw/rgw_file.cc
index be3d356ec41..f79dd0a78b7 100644
--- a/src/rgw/rgw_file.cc
+++ b/src/rgw/rgw_file.cc
@@ -27,6 +27,8 @@
#include "rgw_file.h"
#include "rgw_lib_frontend.h"
+#include <atomic>
+
#define dout_subsys ceph_subsys_rgw
using namespace rgw;
@@ -37,7 +39,7 @@ namespace rgw {
const string RGWFileHandle::root_name = "/";
- atomic<uint32_t> RGWLibFS::fs_inst_counter;
+ std::atomic<uint32_t> RGWLibFS::fs_inst_counter;
uint32_t RGWLibFS::write_completion_interval_s = 10;
@@ -1148,6 +1150,8 @@ namespace rgw {
int rc = write_finish(FLAG_LOCKED);
flags &= ~FLAG_OPEN;
+ flags &= ~FLAG_STATELESS_OPEN;
+
return rc;
} /* RGWFileHandle::close */
@@ -1417,7 +1421,6 @@ int rgw_umount(struct rgw_fs *rgw_fs, uint32_t flags)
{
RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
fs->close();
- fs->rele();
return 0;
}
diff --git a/src/rgw/rgw_file.h b/src/rgw/rgw_file.h
index 5d964f9c2aa..1edc37a9f60 100644
--- a/src/rgw/rgw_file.h
+++ b/src/rgw/rgw_file.h
@@ -282,7 +282,7 @@ namespace rgw {
variant_type = directory();
flags |= FLAG_BUCKET;
} else {
- bucket = (parent->flags & FLAG_BUCKET) ? parent
+ bucket = parent->is_bucket() ? parent
: parent->bucket;
if (flags & FLAG_DIRECTORY) {
fh.fh_type = RGW_FS_TYPE_DIRECTORY;
@@ -410,7 +410,7 @@ namespace rgw {
const std::string& bucket_name() const {
if (is_root())
return root_name;
- if (flags & FLAG_BUCKET)
+ if (is_bucket())
return name;
return bucket->object_name();
}
@@ -506,7 +506,7 @@ namespace rgw {
int open(uint32_t gsh_flags) {
lock_guard guard(mtx);
- if (! (flags & FLAG_OPEN)) {
+ if (! is_open()) {
if (gsh_flags & RGW_OPEN_FLAG_V3) {
flags |= FLAG_STATELESS_OPEN;
}
@@ -745,7 +745,7 @@ namespace rgw {
RGWUserInfo user;
RGWAccessKey key; // XXXX acc_key
- static atomic<uint32_t> fs_inst_counter;
+ static std::atomic<uint32_t> fs_inst_counter;
static uint32_t write_completion_interval_s;
std::string fsid;
@@ -778,7 +778,7 @@ namespace rgw {
}
void operator()() {
- rgw_fh.write_finish();
+ rgw_fh.close(); /* will finish in-progress write */
rgw_fh.get_fs()->unref(&rgw_fh);
}
};
@@ -1449,9 +1449,9 @@ public:
op = this;
}
- virtual bool only_bucket() override { return false; }
+ bool only_bucket() override { return false; }
- virtual int op_init() override {
+ int op_init() override {
// assign store, s, and dialect_handler
RGWObjectCtx* rados_ctx
= static_cast<RGWObjectCtx*>(get_state()->obj_ctx);
@@ -1462,7 +1462,7 @@ public:
return 0;
}
- virtual int header_init() override {
+ int header_init() override {
struct req_state* s = get_state();
s->info.method = "GET";
s->op = OP_GET;
@@ -1484,12 +1484,12 @@ public:
return 0;
}
- virtual int get_params() override {
+ int get_params() override {
max = default_max;
return 0;
}
- virtual void send_response() override {
+ void send_response() override {
valid = true;
if ((objs.size() > 1) ||
(! objs.empty() &&
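
[Editor's note: rgw_file.h replaces raw flag tests such as (flags & FLAG_BUCKET) with the existing is_bucket()/is_open() predicates, and drops the redundant virtual keyword on functions already marked override. A small sketch of the predicate-helper style, with hypothetical flag values:]

    #include <cstdint>
    #include <iostream>

    class FileHandle {
      static constexpr std::uint32_t FLAG_OPEN   = 0x1;
      static constexpr std::uint32_t FLAG_BUCKET = 0x2;
      std::uint32_t flags = 0;
     public:
      // Named predicates read better than repeated mask arithmetic
      // and keep the bit values private to the class.
      bool is_open() const   { return flags & FLAG_OPEN; }
      bool is_bucket() const { return flags & FLAG_BUCKET; }

      void open()        { if (!is_open()) flags |= FLAG_OPEN; }
      void close()       { flags &= ~FLAG_OPEN; }
      void make_bucket() { flags |= FLAG_BUCKET; }
    };

    int main() {
      FileHandle fh;
      fh.open();
      fh.make_bucket();
      std::cout << fh.is_open() << fh.is_bucket() << "\n";  // 11
      fh.close();
      std::cout << fh.is_open() << "\n";                    // 0
    }
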
diff --git a/src/rgw/rgw_gc.cc b/src/rgw/rgw_gc.cc
index c46f693daea..8fb461292dc 100644
--- a/src/rgw/rgw_gc.cc
+++ b/src/rgw/rgw_gc.cc
@@ -253,7 +253,7 @@ int RGWGC::process()
bool RGWGC::going_down()
{
- return (down_flag.read() != 0);
+ return down_flag;
}
void RGWGC::start_processor()
@@ -264,7 +264,7 @@ void RGWGC::start_processor()
void RGWGC::stop_processor()
{
- down_flag.set(1);
+ down_flag = true;
if (worker) {
worker->stop();
worker->join();
diff --git a/src/rgw/rgw_gc.h b/src/rgw/rgw_gc.h
index ca48a6e75e5..491796b50e8 100644
--- a/src/rgw/rgw_gc.h
+++ b/src/rgw/rgw_gc.h
@@ -6,7 +6,6 @@
#include "include/types.h"
-#include "include/atomic.h"
#include "include/rados/librados.hpp"
#include "common/Mutex.h"
#include "common/Cond.h"
@@ -15,12 +14,14 @@
#include "rgw_rados.h"
#include "cls/rgw/cls_rgw_types.h"
+#include <atomic>
+
class RGWGC {
CephContext *cct;
RGWRados *store;
int max_objs;
string *obj_names;
- atomic_t down_flag;
+ std::atomic<bool> down_flag = { false };
int tag_index(const string& tag);
diff --git a/src/rgw/rgw_http_client.cc b/src/rgw/rgw_http_client.cc
index 070110e5099..81c4c6ef6bc 100644
--- a/src/rgw/rgw_http_client.cc
+++ b/src/rgw/rgw_http_client.cc
@@ -16,6 +16,8 @@
#include "rgw_coroutine.h"
+#include <atomic>
+
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_rgw
@@ -24,7 +26,7 @@ struct rgw_http_req_data : public RefCountedObject {
curl_slist *h;
uint64_t id;
int ret;
- atomic_t done;
+ std::atomic<bool> done = { false };
RGWHTTPClient *client;
void *user_info;
bool registered;
@@ -58,12 +60,12 @@ struct rgw_http_req_data : public RefCountedObject {
easy_handle = NULL;
h = NULL;
- done.set(1);
+ done = true;
cond.Signal();
}
bool is_done() {
- return done.read() != 0;
+ return done;
}
int get_retcode() {
@@ -900,14 +902,14 @@ int RGWHTTPManager::set_threaded()
void RGWHTTPManager::stop()
{
- if (is_stopped.read()) {
+ if (is_stopped) {
return;
}
- is_stopped.set(1);
+ is_stopped = true;
if (is_threaded) {
- going_down.set(1);
+ going_down = true;
signal_thread();
reqs_thread->join();
delete reqs_thread;
@@ -935,7 +937,7 @@ void *RGWHTTPManager::reqs_thread_entry()
ldout(cct, 20) << __func__ << ": start" << dendl;
- while (!going_down.read()) {
+ while (!going_down) {
int ret = do_curl_wait(cct, (CURLM *)multi_handle, thread_pipe[0]);
if (ret < 0) {
dout(0) << "ERROR: do_curl_wait() returned: " << ret << dendl;
diff --git a/src/rgw/rgw_http_client.h b/src/rgw/rgw_http_client.h
index 4266b7b50ba..cbe4f3d0312 100644
--- a/src/rgw/rgw_http_client.h
+++ b/src/rgw/rgw_http_client.h
@@ -6,10 +6,11 @@
#include "common/RWLock.h"
#include "common/Cond.h"
-#include "include/atomic.h"
#include "rgw_common.h"
#include "rgw_string.h"
+#include <atomic>
+
using param_pair_t = pair<string, string>;
using param_vec_t = vector<param_pair_t>;
@@ -33,7 +34,7 @@ class RGWHTTPClient
string last_url;
bool verify_ssl; // Do not validate self signed certificates, default to false
- atomic_t stopped;
+ std::atomic<unsigned> stopped { 0 };
protected:
CephContext *cct;
@@ -219,8 +220,8 @@ class RGWHTTPManager {
RGWCompletionManager *completion_mgr;
void *multi_handle;
bool is_threaded;
- atomic_t going_down;
- atomic_t is_stopped;
+ std::atomic<unsigned> going_down { 0 };
+ std::atomic<unsigned> is_stopped { 0 };
RWLock reqs_lock;
map<uint64_t, rgw_http_req_data *> reqs;
diff --git a/src/rgw/rgw_http_errors.h b/src/rgw/rgw_http_errors.h
index a0423bb48fd..e2aec310c88 100644
--- a/src/rgw/rgw_http_errors.h
+++ b/src/rgw/rgw_http_errors.h
@@ -6,144 +6,11 @@
#include "rgw_common.h"
-struct rgw_http_errors {
- int err_no;
- int http_ret;
- const char *s3_code;
-};
+typedef const std::map<int,const std::pair<int, const char*>> rgw_http_errors;
-const static struct rgw_http_errors RGW_HTTP_ERRORS[] = {
- { 0, 200, "" },
- { STATUS_CREATED, 201, "Created" },
- { STATUS_ACCEPTED, 202, "Accepted" },
- { STATUS_NO_CONTENT, 204, "NoContent" },
- { STATUS_PARTIAL_CONTENT, 206, "" },
- { ERR_PERMANENT_REDIRECT, 301, "PermanentRedirect" },
- { ERR_WEBSITE_REDIRECT, 301, "WebsiteRedirect" },
- { STATUS_REDIRECT, 303, "" },
- { ERR_NOT_MODIFIED, 304, "NotModified" },
- { EINVAL, 400, "InvalidArgument" },
- { ERR_INVALID_REQUEST, 400, "InvalidRequest" },
- { ERR_INVALID_DIGEST, 400, "InvalidDigest" },
- { ERR_BAD_DIGEST, 400, "BadDigest" },
- { ERR_INVALID_BUCKET_NAME, 400, "InvalidBucketName" },
- { ERR_INVALID_OBJECT_NAME, 400, "InvalidObjectName" },
- { ERR_UNRESOLVABLE_EMAIL, 400, "UnresolvableGrantByEmailAddress" },
- { ERR_INVALID_PART, 400, "InvalidPart" },
- { ERR_INVALID_PART_ORDER, 400, "InvalidPartOrder" },
- { ERR_REQUEST_TIMEOUT, 400, "RequestTimeout" },
- { ERR_TOO_LARGE, 400, "EntityTooLarge" },
- { ERR_TOO_SMALL, 400, "EntityTooSmall" },
- { ERR_TOO_MANY_BUCKETS, 400, "TooManyBuckets" },
- { ERR_MALFORMED_XML, 400, "MalformedXML" },
- { ERR_AMZ_CONTENT_SHA256_MISMATCH, 400, "XAmzContentSHA256Mismatch" },
- { ERR_MALFORMED_DOC, 400, "MalformedPolicyDocument" },
- { ERR_LENGTH_REQUIRED, 411, "MissingContentLength" },
- { EACCES, 403, "AccessDenied" },
- { EPERM, 403, "AccessDenied" },
- { ERR_SIGNATURE_NO_MATCH, 403, "SignatureDoesNotMatch" },
- { ERR_INVALID_ACCESS_KEY, 403, "InvalidAccessKeyId" },
- { ERR_USER_SUSPENDED, 403, "UserSuspended" },
- { ERR_REQUEST_TIME_SKEWED, 403, "RequestTimeTooSkewed" },
- { ERR_QUOTA_EXCEEDED, 403, "QuotaExceeded" },
- { ENOENT, 404, "NoSuchKey" },
- { ERR_NO_SUCH_BUCKET, 404, "NoSuchBucket" },
- { ERR_NO_SUCH_WEBSITE_CONFIGURATION, 404, "NoSuchWebsiteConfiguration" },
- { ERR_NO_SUCH_UPLOAD, 404, "NoSuchUpload" },
- { ERR_NOT_FOUND, 404, "Not Found"},
- { ERR_NO_SUCH_LC, 404, "NoSuchLifecycleConfiguration"},
- { ERR_NO_ROLE_FOUND, 404, "NoSuchEntity"},
- { ERR_METHOD_NOT_ALLOWED, 405, "MethodNotAllowed" },
- { ETIMEDOUT, 408, "RequestTimeout" },
- { EEXIST, 409, "BucketAlreadyExists" },
- { ERR_USER_EXIST, 409, "UserAlreadyExists" },
- { ERR_EMAIL_EXIST, 409, "EmailExists" },
- { ERR_KEY_EXIST, 409, "KeyExists"},
- { ERR_ROLE_EXISTS, 409, "EntityAlreadyExists"},
- { ERR_DELETE_CONFLICT, 409, "DeleteConflict"},
- { ERR_INVALID_SECRET_KEY, 400, "InvalidSecretKey"},
- { ERR_INVALID_KEY_TYPE, 400, "InvalidKeyType"},
- { ERR_INVALID_CAP, 400, "InvalidCapability"},
- { ERR_INVALID_TENANT_NAME, 400, "InvalidTenantName" },
- { ENOTEMPTY, 409, "BucketNotEmpty" },
- { ERR_PRECONDITION_FAILED, 412, "PreconditionFailed" },
- { ERANGE, 416, "InvalidRange" },
- { ERR_UNPROCESSABLE_ENTITY, 422, "UnprocessableEntity" },
- { ERR_LOCKED, 423, "Locked" },
- { ERR_INTERNAL_ERROR, 500, "InternalError" },
- { ERR_NOT_IMPLEMENTED, 501, "NotImplemented" },
- { ERR_SERVICE_UNAVAILABLE, 503, "ServiceUnavailable"}
-};
-
-const static struct rgw_http_errors RGW_HTTP_SWIFT_ERRORS[] = {
- { EACCES, 403, "AccessDenied" },
- { EPERM, 401, "AccessDenied" },
- { ERR_USER_SUSPENDED, 401, "UserSuspended" },
- { ERR_INVALID_UTF8, 412, "Invalid UTF8" },
- { ERR_BAD_URL, 412, "Bad URL" },
- { ERR_NOT_SLO_MANIFEST, 400, "Not an SLO manifest" },
- { ERR_QUOTA_EXCEEDED, 413, "QuotaExceeded" }
-};
-
-struct rgw_http_status_code {
- int code;
- const char *name;
-};
-
-const static struct rgw_http_status_code http_codes[] = {
- { 100, "Continue" },
- { 200, "OK" },
- { 201, "Created" },
- { 202, "Accepted" },
- { 204, "No Content" },
- { 205, "Reset Content" },
- { 206, "Partial Content" },
- { 207, "Multi Status" },
- { 208, "Already Reported" },
- { 300, "Multiple Choices" },
- { 301, "Moved Permanently" },
- { 302, "Found" },
- { 303, "See Other" },
- { 304, "Not Modified" },
- { 305, "User Proxy" },
- { 306, "Switch Proxy" },
- { 307, "Temporary Redirect" },
- { 308, "Permanent Redirect" },
- { 400, "Bad Request" },
- { 401, "Unauthorized" },
- { 402, "Payment Required" },
- { 403, "Forbidden" },
- { 404, "Not Found" },
- { 405, "Method Not Allowed" },
- { 406, "Not Acceptable" },
- { 407, "Proxy Authentication Required" },
- { 408, "Request Timeout" },
- { 409, "Conflict" },
- { 410, "Gone" },
- { 411, "Length Required" },
- { 412, "Precondition Failed" },
- { 413, "Request Entity Too Large" },
- { 414, "Request-URI Too Long" },
- { 415, "Unsupported Media Type" },
- { 416, "Requested Range Not Satisfiable" },
- { 417, "Expectation Failed" },
- { 422, "Unprocessable Entity" },
- { 500, "Internal Server Error" },
- { 501, "Not Implemented" },
- { 0, NULL },
-};
-
-#define ARRAY_LEN(arr) (sizeof(arr) / sizeof(arr[0]))
-
-static inline const struct rgw_http_errors *search_err(int err_no, const struct rgw_http_errors *errs, int len)
-{
- for (int i = 0; i < len; ++i, ++errs) {
- if (err_no == errs->err_no)
- return errs;
- }
- return NULL;
-}
+extern rgw_http_errors rgw_http_s3_errors;
+extern rgw_http_errors rgw_http_swift_errors;
static inline int rgw_http_error_to_errno(int http_err)
{
diff --git a/src/rgw/rgw_iam_policy.cc b/src/rgw/rgw_iam_policy.cc
new file mode 100644
index 00000000000..c12e243f1e0
--- /dev/null
+++ b/src/rgw/rgw_iam_policy.cc
@@ -0,0 +1,1514 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+
+#include <cstring>
+#include <regex>
+#include <sstream>
+#include <stack>
+#include <utility>
+
+#include "rapidjson/reader.h"
+
+#include "rgw_auth.h"
+#include "rgw_iam_policy.h"
+
+namespace {
+constexpr int dout_subsys = ceph_subsys_rgw;
+}
+
+using std::bitset;
+using std::find;
+using std::int64_t;
+using std::move;
+using std::pair;
+using std::regex;
+using std::regex_match;
+using std::size_t;
+using std::smatch;
+using std::string;
+using std::stringstream;
+using std::ostream;
+using std::uint16_t;
+using std::uint64_t;
+using std::unordered_map;
+
+using boost::container::flat_set;
+using boost::none;
+using boost::optional;
+
+using rapidjson::BaseReaderHandler;
+using rapidjson::UTF8;
+using rapidjson::SizeType;
+using rapidjson::Reader;
+using rapidjson::kParseCommentsFlag;
+using rapidjson::kParseNumbersAsStringsFlag;
+using rapidjson::StringStream;
+using rapidjson::ParseResult;
+
+using rgw::auth::Principal;
+
+namespace rgw {
+namespace IAM {
+#include "rgw_iam_policy_keywords.frag.cc"
+
+struct actpair {
+ const char* name;
+ const uint64_t bit;
+};
+
+namespace {
+optional<Partition> to_partition(const smatch::value_type& p,
+ bool wildcards) {
+ if (p == "aws") {
+ return Partition::aws;
+ } else if (p == "aws-cn") {
+ return Partition::aws_cn;
+ } else if (p == "aws-us-gov") {
+ return Partition::aws_us_gov;
+ } else if (p == "*" && wildcards) {
+ return Partition::wildcard;
+ } else {
+ return none;
+ }
+
+ ceph_abort();
+}
+
+optional<Service> to_service(const smatch::value_type& s,
+ bool wildcards) {
+ static const unordered_map<string, Service> services = {
+ { "acm", Service::acm },
+ { "apigateway", Service::apigateway },
+ { "appstream", Service::appstream },
+ { "artifact", Service::artifact },
+ { "autoscaling", Service::autoscaling },
+ { "aws-marketplace", Service::aws_marketplace },
+ { "aws-marketplace-management",
+ Service::aws_marketplace_management },
+ { "aws-portal", Service::aws_portal },
+ { "cloudformation", Service::cloudformation },
+ { "cloudfront", Service::cloudfront },
+ { "cloudhsm", Service::cloudhsm },
+ { "cloudsearch", Service::cloudsearch },
+ { "cloudtrail", Service::cloudtrail },
+ { "cloudwatch", Service::cloudwatch },
+ { "codebuild", Service::codebuild },
+ { "codecommit", Service::codecommit },
+ { "codedeploy", Service::codedeploy },
+ { "codepipeline", Service::codepipeline },
+ { "cognito-identity", Service::cognito_identity },
+ { "cognito-idp", Service::cognito_idp },
+ { "cognito-sync", Service::cognito_sync },
+ { "config", Service::config },
+ { "datapipeline", Service::datapipeline },
+ { "devicefarm", Service::devicefarm },
+ { "directconnect", Service::directconnect },
+ { "dms", Service::dms },
+ { "ds", Service::ds },
+ { "dynamodb", Service::dynamodb },
+ { "ec2", Service::ec2 },
+ { "ecr", Service::ecr },
+ { "ecs", Service::ecs },
+ { "elasticache", Service::elasticache },
+ { "elasticbeanstalk", Service::elasticbeanstalk },
+ { "elasticfilesystem", Service::elasticfilesystem },
+ { "elasticloadbalancing", Service::elasticloadbalancing },
+ { "elasticmapreduce", Service::elasticmapreduce },
+ { "elastictranscoder", Service::elastictranscoder },
+ { "es", Service::es },
+ { "events", Service::events },
+ { "firehose", Service::firehose },
+ { "gamelift", Service::gamelift },
+ { "glacier", Service::glacier },
+ { "health", Service::health },
+ { "iam", Service::iam },
+ { "importexport", Service::importexport },
+ { "inspector", Service::inspector },
+ { "iot", Service::iot },
+ { "kinesis", Service::kinesis },
+ { "kinesisanalytics", Service::kinesisanalytics },
+ { "kms", Service::kms },
+ { "lambda", Service::lambda },
+ { "lightsail", Service::lightsail },
+ { "logs", Service::logs },
+ { "machinelearning", Service::machinelearning },
+ { "mobileanalytics", Service::mobileanalytics },
+ { "mobilehub", Service::mobilehub },
+ { "opsworks", Service::opsworks },
+ { "opsworks-cm", Service::opsworks_cm },
+ { "polly", Service::polly },
+ { "rds", Service::rds },
+ { "redshift", Service::redshift },
+ { "route53", Service::route53 },
+ { "route53domains", Service::route53domains },
+ { "s3", Service::s3 },
+ { "sdb", Service::sdb },
+ { "servicecatalog", Service::servicecatalog },
+ { "ses", Service::ses },
+ { "sns", Service::sns },
+ { "sqs", Service::sqs },
+ { "ssm", Service::ssm },
+ { "states", Service::states },
+ { "storagegateway", Service::storagegateway },
+ { "sts", Service::sts },
+ { "support", Service::support },
+ { "swf", Service::swf },
+ { "trustedadvisor", Service::trustedadvisor },
+ { "waf", Service::waf },
+ { "workmail", Service::workmail },
+ { "workspaces", Service::workspaces }};
+
+ if (wildcards && s == "*") {
+ return Service::wildcard;
+ }
+
+ auto i = services.find(s);
+ if (i == services.end()) {
+ return none;
+ } else {
+ return i->second;
+ }
+}
+}
+
+ARN::ARN(const rgw_obj& o)
+ : partition(Partition::aws),
+ service(Service::s3),
+ region(),
+ account(o.bucket.tenant),
+ resource(o.bucket.name)
+{
+ resource.push_back('/');
+ resource.append(o.key.name);
+}
+
+ARN::ARN(const rgw_bucket& b)
+ : partition(Partition::aws),
+ service(Service::s3),
+ region(),
+ account(b.tenant),
+ resource(b.name) { }
+
+ARN::ARN(const rgw_bucket& b, const string& o)
+ : partition(Partition::aws),
+ service(Service::s3),
+ region(),
+ account(b.tenant),
+ resource(b.name) {
+ resource.push_back('/');
+ resource.append(o);
+}
+
+optional<ARN> ARN::parse(const string& s, bool wildcards) {
+ static const regex rx_wild("arn:([^:]*):([^:]*):([^:]*):([^:]*):([^:]*)",
+ std::regex_constants::ECMAScript |
+ std::regex_constants::optimize);
+ static const regex rx_no_wild(
+ "arn:([^:*]*):([^:*]*):([^:*]*):([^:*]*):([^:*]*)",
+ std::regex_constants::ECMAScript |
+ std::regex_constants::optimize);
+
+ smatch match;
+
+ if ((s == "*") && wildcards) {
+ return ARN(Partition::wildcard, Service::wildcard, "*", "*", "*");
+ } else if (regex_match(s, match, wildcards ? rx_wild : rx_no_wild)) {
+ ceph_assert(match.size() == 6);
+
+ ARN a;
+ {
+ auto p = to_partition(match[1], wildcards);
+ if (!p)
+ return none;
+
+ a.partition = *p;
+ }
+ {
+ auto s = to_service(match[2], wildcards);
+ if (!s) {
+ return none;
+ }
+ a.service = *s;
+ }
+
+ a.region = match[3];
+ a.account = match[4];
+ a.resource = match[5];
+
+ return a;
+ }
+ return none;
+}
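+// Illustrative examples (names made up): with wildcards enabled,
+// "arn:aws:s3:::mybucket/*" parses to partition aws, service s3, empty
+// region and account, and resource "mybucket/*"; the same string fails
+// against rx_no_wild, whose character classes exclude '*'.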
+
+string ARN::to_string() const {
+ string s;
+
+ if (partition == Partition::aws) {
+ s.append("aws:");
+ } else if (partition == Partition::aws_cn) {
+ s.append("aws-cn:");
+ } else if (partition == Partition::aws_us_gov) {
+ s.append("aws-us-gov:");
+ } else {
+ s.append("*:");
+ }
+
+ static const unordered_map<Service, string> services = {
+ { Service::acm, "acm" },
+ { Service::apigateway, "apigateway" },
+ { Service::appstream, "appstream" },
+ { Service::artifact, "artifact" },
+ { Service::autoscaling, "autoscaling" },
+ { Service::aws_marketplace, "aws-marketplace" },
+ { Service::aws_marketplace_management, "aws-marketplace-management" },
+ { Service::aws_portal, "aws-portal" },
+ { Service::cloudformation, "cloudformation" },
+ { Service::cloudfront, "cloudfront" },
+ { Service::cloudhsm, "cloudhsm" },
+ { Service::cloudsearch, "cloudsearch" },
+ { Service::cloudtrail, "cloudtrail" },
+ { Service::cloudwatch, "cloudwatch" },
+ { Service::codebuild, "codebuild" },
+ { Service::codecommit, "codecommit" },
+ { Service::codedeploy, "codedeploy" },
+ { Service::codepipeline, "codepipeline" },
+ { Service::cognito_identity, "cognito-identity" },
+ { Service::cognito_idp, "cognito-idp" },
+ { Service::cognito_sync, "cognito-sync" },
+ { Service::config, "config" },
+ { Service::datapipeline, "datapipeline" },
+ { Service::devicefarm, "devicefarm" },
+ { Service::directconnect, "directconnect" },
+ { Service::dms, "dms" },
+ { Service::ds, "ds" },
+ { Service::dynamodb, "dynamodb" },
+ { Service::ec2, "ec2" },
+ { Service::ecr, "ecr" },
+ { Service::ecs, "ecs" },
+ { Service::elasticache, "elasticache" },
+ { Service::elasticbeanstalk, "elasticbeanstalk" },
+ { Service::elasticfilesystem, "elasticfilesystem" },
+ { Service::elasticloadbalancing, "elasticloadbalancing" },
+ { Service::elasticmapreduce, "elasticmapreduce" },
+ { Service::elastictranscoder, "elastictranscoder" },
+ { Service::es, "es" },
+ { Service::events, "events" },
+ { Service::firehose, "firehose" },
+ { Service::gamelift, "gamelift" },
+ { Service::glacier, "glacier" },
+ { Service::health, "health" },
+ { Service::iam, "iam" },
+ { Service::importexport, "importexport" },
+ { Service::inspector, "inspector" },
+ { Service::iot, "iot" },
+ { Service::kinesis, "kinesis" },
+ { Service::kinesisanalytics, "kinesisanalytics" },
+ { Service::kms, "kms" },
+ { Service::lambda, "lambda" },
+ { Service::lightsail, "lightsail" },
+ { Service::logs, "logs" },
+ { Service::machinelearning, "machinelearning" },
+ { Service::mobileanalytics, "mobileanalytics" },
+ { Service::mobilehub, "mobilehub" },
+ { Service::opsworks, "opsworks" },
+ { Service::opsworks_cm, "opsworks-cm" },
+ { Service::polly, "polly" },
+ { Service::rds, "rds" },
+ { Service::redshift, "redshift" },
+ { Service::route53, "route53" },
+ { Service::route53domains, "route53domains" },
+ { Service::s3, "s3" },
+ { Service::sdb, "sdb" },
+ { Service::servicecatalog, "servicecatalog" },
+ { Service::ses, "ses" },
+ { Service::sns, "sns" },
+ { Service::sqs, "sqs" },
+ { Service::ssm, "ssm" },
+ { Service::states, "states" },
+ { Service::storagegateway, "storagegateway" },
+ { Service::sts, "sts" },
+ { Service::support, "support" },
+ { Service::swf, "swf" },
+ { Service::trustedadvisor, "trustedadvisor" },
+ { Service::waf, "waf" },
+ { Service::workmail, "workmail" },
+ { Service::workspaces, "workspaces" }};
+
+ auto i = services.find(service);
+ if (i != services.end()) {
+ s.append(i->second);
+ } else {
+ s.push_back('*');
+ }
+ s.push_back(':');
+
+ s.append(region);
+ s.push_back(':');
+
+ s.append(account);
+ s.push_back(':');
+
+ s.append(resource);
+
+ return s;
+}
+
+bool operator ==(const ARN& l, const ARN& r) {
+ return ((l.partition == r.partition) &&
+ (l.service == r.service) &&
+ (l.region == r.region) &&
+ (l.account == r.account) &&
+ (l.resource == r.resource));
+}
+bool operator <(const ARN& l, const ARN& r) {
+  // Lexicographic ordering over (partition, service, region, account,
+  // resource); flat_set requires a strict weak ordering, so each field
+  // is only consulted when all earlier fields compare equal.
+  if (l.partition != r.partition) {
+    return l.partition < r.partition;
+  }
+  if (l.service != r.service) {
+    return l.service < r.service;
+  }
+  if (l.region != r.region) {
+    return l.region < r.region;
+  }
+  if (l.account != r.account) {
+    return l.account < r.account;
+  }
+  return l.resource < r.resource;
+}
+
+// The candidate is not allowed to have wildcards. The only way to
+// do that sanely would be to use unification rather than matching.
+bool ARN::match(const ARN& candidate) const {
+ if ((candidate.partition == Partition::wildcard) ||
+ (partition != candidate.partition && partition
+ != Partition::wildcard)) {
+ return false;
+ }
+
+ if ((candidate.service == Service::wildcard) ||
+ (service != candidate.service && service != Service::wildcard)) {
+ return false;
+ }
+
+ if (!::match(region, candidate.region, MATCH_POLICY_ARN)) {
+ return false;
+ }
+
+ if (!::match(account, candidate.account, MATCH_POLICY_ARN)) {
+ return false;
+ }
+
+ if (!::match(resource, candidate.resource, MATCH_POLICY_ARN)) {
+ return false;
+ }
+
+ return true;
+}
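+// For example, a pattern of "arn:aws:s3:::mybucket/*" matches a
+// candidate of "arn:aws:s3:::mybucket/photos/cat.jpg", assuming the
+// MATCH_POLICY_ARN matcher lets '*' span path segments; partition and
+// service must agree exactly unless the pattern's are wildcards.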
+
+static const actpair actpairs[] =
+{{ "s3:AbortMultipartUpload", s3AbortMultipartUpload },
+ { "s3:CreateBucket", s3CreateBucket },
+ { "s3:DeleteBucketPolicy", s3DeleteBucketPolicy },
+ { "s3:DeleteBucket", s3DeleteBucket },
+ { "s3:DeleteBucketWebsite", s3DeleteBucketWebsite },
+ { "s3:DeleteObject", s3DeleteObject },
+ { "s3:DeleteObjectVersion", s3DeleteObjectVersion },
+ { "s3:DeleteReplicationConfiguration", s3DeleteReplicationConfiguration },
+ { "s3:GetAccelerateConfiguration", s3GetAccelerateConfiguration },
+ { "s3:GetBucketAcl", s3GetBucketAcl },
+ { "s3:GetBucketCORS", s3GetBucketCORS },
+ { "s3:GetBucketLocation", s3GetBucketLocation },
+ { "s3:GetBucketLogging", s3GetBucketLogging },
+ { "s3:GetBucketNotification", s3GetBucketNotification },
+ { "s3:GetBucketPolicy", s3GetBucketPolicy },
+ { "s3:GetBucketRequestPayment", s3GetBucketRequestPayment },
+ { "s3:GetBucketTagging", s3GetBucketTagging },
+ { "s3:GetBucketVersioning", s3GetBucketVersioning },
+ { "s3:GetBucketWebsite", s3GetBucketWebsite },
+ { "s3:GetLifecycleConfiguration", s3GetLifecycleConfiguration },
+ { "s3:GetObjectAcl", s3GetObjectAcl },
+ { "s3:GetObject", s3GetObject },
+ { "s3:GetObjectTorrent", s3GetObjectTorrent },
+ { "s3:GetObjectVersionAcl", s3GetObjectVersionAcl },
+ { "s3:GetObjectVersion", s3GetObjectVersion },
+ { "s3:GetObjectVersionTorrent", s3GetObjectVersionTorrent },
+ { "s3:GetReplicationConfiguration", s3GetReplicationConfiguration },
+ { "s3:ListAllMyBuckets", s3ListAllMyBuckets },
+ { "s3:ListBucketMultiPartUploads", s3ListBucketMultiPartUploads },
+ { "s3:ListBucket", s3ListBucket },
+ { "s3:ListBucketVersions", s3ListBucketVersions },
+ { "s3:ListMultipartUploadParts", s3ListMultipartUploadParts },
+ { "s3:PutAccelerateConfiguration", s3PutAccelerateConfiguration },
+ { "s3:PutBucketAcl", s3PutBucketAcl },
+ { "s3:PutBucketCORS", s3PutBucketCORS },
+ { "s3:PutBucketLogging", s3PutBucketLogging },
+ { "s3:PutBucketNotification", s3PutBucketNotification },
+ { "s3:PutBucketPolicy", s3PutBucketPolicy },
+ { "s3:PutBucketRequestPayment", s3PutBucketRequestPayment },
+ { "s3:PutBucketTagging", s3PutBucketTagging },
+ { "s3:PutBucketVersioning", s3PutBucketVersioning },
+ { "s3:PutBucketWebsite", s3PutBucketWebsite },
+ { "s3:PutLifecycleConfiguration", s3PutLifecycleConfiguration },
+ { "s3:PutObjectAcl", s3PutObjectAcl },
+ { "s3:PutObject", s3PutObject },
+ { "s3:PutObjectVersionAcl", s3PutObjectVersionAcl },
+ { "s3:PutReplicationConfiguration", s3PutReplicationConfiguration },
+ { "s3:RestoreObject", s3RestoreObject }};
+
+struct PolicyParser;
+
+const Keyword top[1]{"<Top>", TokenKind::pseudo, TokenID::Top, 0, false,
+ false};
+const Keyword cond_key[1]{"<Condition Key>", TokenKind::cond_key,
+ TokenID::CondKey, 0, true, false};
+
+struct ParseState {
+ PolicyParser* pp;
+ const Keyword* w;
+
+ bool arraying = false;
+ bool objecting = false;
+
+ void reset();
+
+ ParseState(PolicyParser* pp, const Keyword* w)
+ : pp(pp), w(w) {}
+
+ bool obj_start();
+
+ bool obj_end();
+
+ bool array_start() {
+ if (w->arrayable && !arraying) {
+ arraying = true;
+ return true;
+ }
+ return false;
+ }
+
+ bool array_end();
+
+ bool key(const char* s, size_t l);
+ bool do_string(CephContext* cct, const char* s, size_t l);
+ bool number(const char* str, size_t l);
+};
+
+// If this confuses you, look up the Curiously Recurring Template Pattern
+struct PolicyParser : public BaseReaderHandler<UTF8<>, PolicyParser> {
+ keyword_hash tokens;
+ std::vector<ParseState> s;
+ CephContext* cct;
+ const string& tenant;
+ Policy& policy;
+
+ uint32_t seen = 0;
+
+ uint32_t dex(TokenID in) const {
+ switch (in) {
+ case TokenID::Version:
+ return 0x1;
+ case TokenID::Id:
+ return 0x2;
+ case TokenID::Statement:
+ return 0x4;
+ case TokenID::Sid:
+ return 0x8;
+ case TokenID::Effect:
+ return 0x10;
+ case TokenID::Principal:
+ return 0x20;
+ case TokenID::NotPrincipal:
+ return 0x40;
+ case TokenID::Action:
+ return 0x80;
+ case TokenID::NotAction:
+ return 0x100;
+ case TokenID::Resource:
+ return 0x200;
+ case TokenID::NotResource:
+ return 0x400;
+ case TokenID::Condition:
+ return 0x800;
+ case TokenID::AWS:
+ return 0x1000;
+ case TokenID::Federated:
+ return 0x2000;
+ case TokenID::Service:
+ return 0x4000;
+ case TokenID::CanonicalUser:
+ return 0x8000;
+ default:
+ ceph_abort();
+ }
+ }
+ bool test(TokenID in) {
+ return seen & dex(in);
+ }
+ void set(TokenID in) {
+ seen |= dex(in);
+ }
+ void set(std::initializer_list<TokenID> l) {
+ for (auto in : l) {
+ seen |= dex(in);
+ }
+ }
+ void reset(TokenID in) {
+ seen &= ~dex(in);
+ }
+ void reset(std::initializer_list<TokenID> l) {
+ for (auto in : l) {
+ seen &= ~dex(in);
+ }
+ }
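+
+  // `seen` is a one-hot membership set over the keys above; e.g. after
+  // set(TokenID::Effect) and set(TokenID::Action), seen == 0x90, and
+  // test() reports whether a key has already been consumed at the
+  // current nesting level.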
+
+ PolicyParser(CephContext* cct, const string& tenant, Policy& policy)
+ : cct(cct), tenant(tenant), policy(policy) {}
+ PolicyParser(const PolicyParser& policy) = delete;
+
+ bool StartObject() {
+ if (s.empty()) {
+ s.push_back({this, top});
+ s.back().objecting = true;
+ return true;
+ }
+
+ return s.back().obj_start();
+ }
+ bool EndObject(SizeType memberCount) {
+ if (s.empty()) {
+ return false;
+ }
+
+ return s.back().obj_end();
+ }
+ bool Key(const char* str, SizeType length, bool copy) {
+ if (s.empty()) {
+ return false;
+ }
+
+ return s.back().key(str, length);
+ }
+
+ bool String(const char* str, SizeType length, bool copy) {
+ if (s.empty()) {
+ return false;
+ }
+
+ return s.back().do_string(cct, str, length);
+ }
+ bool RawNumber(const char* str, SizeType length, bool copy) {
+ if (s.empty()) {
+ return false;
+ }
+
+ return s.back().number(str, length);
+ }
+ bool StartArray() {
+ if (s.empty()) {
+ return false;
+ }
+
+ return s.back().array_start();
+ }
+ bool EndArray(SizeType) {
+ if (s.empty()) {
+ return false;
+ }
+
+ return s.back().array_end();
+ }
+
+ bool Default() {
+ return false;
+ }
+};
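+// A sketch of the event flow, for orientation: parsing
+// {"Version": "2012-10-17"} produces the rapidjson SAX calls
+// StartObject(), Key("Version", 7, ...), String("2012-10-17", 10, ...),
+// EndObject(1), each forwarded to the ParseState on top of `s`. Because
+// we parse with kParseNumbersAsStringsFlag, numeric literals arrive via
+// RawNumber() rather than typed number callbacks.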
+
+
+// I really despise this misfeature of C++.
+//
+bool ParseState::obj_end() {
+ if (objecting) {
+ objecting = false;
+ if (!arraying) {
+ pp->s.pop_back();
+ } else {
+ reset();
+ }
+ return true;
+ }
+ return false;
+}
+
+bool ParseState::key(const char* s, size_t l) {
+ auto k = pp->tokens.lookup(s, l);
+
+ if (!k) {
+ if (w->kind == TokenKind::cond_op) {
+ auto& t = pp->policy.statements.back();
+ pp->s.emplace_back(pp, cond_key);
+ t.conditions.emplace_back(w->id, s, l);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ // If the token we're going with belongs within the condition at the
+ // top of the stack and we haven't already encountered it, push it
+ // on the stack
+
+ // Top
+ if ((((w->id == TokenID::Top) && (k->kind == TokenKind::top)) ||
+ // Statement
+ ((w->id == TokenID::Statement) && (k->kind == TokenKind::statement)) ||
+
+ /// Principal
+ ((w->id == TokenID::Principal || w->id == TokenID::NotPrincipal) &&
+ (k->kind == TokenKind::princ_type))) &&
+
+ // Check that it hasn't been encountered. Note that this
+ // conjoins with the run of disjunctions above.
+ !pp->test(k->id)) {
+ pp->set(k->id);
+ pp->s.emplace_back(pp, k);
+ return true;
+ } else if ((w->id == TokenID::Condition) &&
+ (k->kind == TokenKind::cond_op)) {
+ pp->s.emplace_back(pp, k);
+ return true;
+ }
+ return false;
+}
+
+// I should just rewrite a few helper functions to use iterators,
+// which will make all of this ever so much nicer.
+static optional<Principal> parse_principal(CephContext* cct, TokenID t,
+ string&& s) {
+ // Wildcard!
+ if ((t == TokenID::AWS) && (s == "*")) {
+ return Principal::wildcard();
+
+ // Do nothing for now.
+ } else if (t == TokenID::CanonicalUser) {
+
+ // AWS ARNs
+  } else if (t == TokenID::AWS) {
+    auto a = ARN::parse(s);
+    if (!a) {
+      if (std::none_of(s.begin(), s.end(),
+                       [](const char& c) {
+                         return (c == ':') || (c == '/');
+                       })) {
+        // Since tenants are simply prefixes, there's no really good
+        // way to see if one exists or not. So we return the thing and
+        // let them try to match against it.
+        return Principal::tenant(std::move(s));
+      }
+      // Neither an ARN nor a bare tenant name: fall through to the
+      // failure path below rather than dereferencing an empty optional.
+    } else if (a->resource == "root") {
+      return Principal::tenant(std::move(a->account));
+    } else {
+      static const regex rx("([^/]*)/(.*)",
+                            std::regex_constants::ECMAScript |
+                            std::regex_constants::optimize);
+      smatch match;
+      if (regex_match(a->resource, match, rx)) {
+        // Whole match plus two capture groups.
+        ceph_assert(match.size() == 3);
+
+        if (match[1] == "user") {
+          return Principal::user(std::move(a->account),
+                                 match[2]);
+        }
+
+        if (match[1] == "role") {
+          return Principal::role(std::move(a->account),
+                                 match[2]);
+        }
+      }
+    }
+  }
+
+ ldout(cct, 0) << "Supplied principal is discarded: " << s << dendl;
+ return boost::none;
+}
+
+bool ParseState::do_string(CephContext* cct, const char* s, size_t l) {
+ auto k = pp->tokens.lookup(s, l);
+ Policy& p = pp->policy;
+ Statement* t = p.statements.empty() ? nullptr : &(p.statements.back());
+
+ // Top level!
+ if ((w->id == TokenID::Version) && k &&
+ k->kind == TokenKind::version_key) {
+ p.version = static_cast<Version>(k->specific);
+ } else if (w->id == TokenID::Id) {
+ p.id = string(s, l);
+
+ // Statement
+
+ } else if (w->id == TokenID::Sid) {
+ t->sid.emplace(s, l);
+  } else if ((w->id == TokenID::Effect) && k &&
+             k->kind == TokenKind::effect_key) {
+ t->effect = static_cast<Effect>(k->specific);
+ } else if (w->id == TokenID::Principal && s && *s == '*') {
+ t->princ.emplace(Principal::wildcard());
+ } else if (w->id == TokenID::NotPrincipal && s && *s == '*') {
+ t->noprinc.emplace(Principal::wildcard());
+ } else if ((w->id == TokenID::Action) ||
+ (w->id == TokenID::NotAction)) {
+ for (auto& p : actpairs) {
+ if (match({s, l}, p.name, MATCH_POLICY_ACTION)) {
+ (w->id == TokenID::Action ? t->action : t->notaction) |= p.bit;
+ }
+ }
+ } else if (w->id == TokenID::Resource || w->id == TokenID::NotResource) {
+ auto a = ARN::parse({s, l}, true);
+ // You can't specify resources for someone ELSE'S account.
+ if (a && (a->account.empty() || a->account == pp->tenant ||
+ a->account == "*")) {
+ if (a->account.empty() || a->account == "*")
+ a->account = pp->tenant;
+ (w->id == TokenID::Resource ? t->resource : t->notresource)
+ .emplace(std::move(*a));
+    } else {
+      ldout(cct, 0) << "Supplied resource is discarded: " << string(s, l)
+                    << dendl;
+    }
+ } else if (w->kind == TokenKind::cond_key) {
+ auto& t = pp->policy.statements.back();
+ t.conditions.back().vals.emplace_back(s, l);
+
+ // Principals
+
+ } else if (w->kind == TokenKind::princ_type) {
+ ceph_assert(pp->s.size() > 1);
+ auto& pri = pp->s[pp->s.size() - 2].w->id == TokenID::Principal ?
+ t->princ : t->noprinc;
+
+ auto o = parse_principal(pp->cct, w->id, string(s, l));
+ if (o)
+ pri.emplace(std::move(*o));
+
+ // Failure
+
+ } else {
+ return false;
+ }
+
+ if (!arraying) {
+ pp->s.pop_back();
+ }
+
+ return true;
+}
+
+bool ParseState::number(const char* s, size_t l) {
+ // Top level!
+ if (w->kind == TokenKind::cond_key) {
+ auto& t = pp->policy.statements.back();
+ t.conditions.back().vals.emplace_back(s, l);
+
+ // Failure
+
+ } else {
+ return false;
+ }
+
+ if (!arraying) {
+ pp->s.pop_back();
+ }
+
+ return true;
+}
+
+void ParseState::reset() {
+ pp->reset({TokenID::Sid, TokenID::Effect, TokenID::Principal,
+ TokenID::NotPrincipal, TokenID::Action, TokenID::NotAction,
+ TokenID::Resource, TokenID::NotResource, TokenID::Condition});
+}
+
+bool ParseState::obj_start() {
+ if (w->objectable && !objecting) {
+ objecting = true;
+ if (w->id == TokenID::Statement) {
+ pp->policy.statements.push_back({});
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+
+bool ParseState::array_end() {
+ if (arraying && !objecting) {
+ pp->s.pop_back();
+ return true;
+ }
+
+ return false;
+}
+
+ostream& operator <<(ostream& m, const MaskedIP& ip) {
+  // Extract byte i of the address; byte 15 (v6) or byte 3 (v4) holds
+  // the most significant bits.
+  auto b = [&ip](int i) {
+    uint8_t v = 0;
+    for (int j = 7; j >= 0; --j) {
+      v |= (ip.addr[(i * 8) + j] << j);
+    }
+    return v;
+  };
+  if (ip.v6) {
+    // Sixteen-bit groups, most significant first; no zero-compression.
+    auto f = m.flags();
+    for (int i = 15; i > 0; i -= 2) {
+      m << std::hex << ((b(i) << 8) | b(i - 1));
+      if (i != 1) {
+        m << ":";
+      }
+    }
+    m.flags(f);
+  } else {
+    // Dotted quad, most significant octet first.
+    for (int i = 3; i >= 0; --i) {
+      m << unsigned(b(i));
+      if (i != 0) {
+        m << ".";
+      }
+    }
+  }
+  m << "/" << ip.prefix;
+  return m;
+}
+
+string to_string(const MaskedIP& m) {
+ stringstream ss;
+ ss << m;
+ return ss.str();
+}
+
+bool Condition::eval(const Environment& env) const {
+ auto i = env.find(key);
+  if (op == TokenID::Null) {
+    return i == env.end();
+  }
+
+ if (i == env.end()) {
+ return false;
+ }
+ const auto& s = i->second;
+
+ switch (op) {
+ // String!
+ case TokenID::StringEquals:
+ return orrible(std::equal_to<std::string>(), s, vals);
+
+ case TokenID::StringNotEquals:
+ return orrible(std::not2(std::equal_to<std::string>()),
+ s, vals);
+
+ case TokenID::StringEqualsIgnoreCase:
+ return orrible(ci_equal_to(), s, vals);
+
+ case TokenID::StringNotEqualsIgnoreCase:
+ return orrible(std::not2(ci_equal_to()), s, vals);
+
+ // Implement actual StringLike with wildcarding later
+ case TokenID::StringLike:
+ return orrible(std::equal_to<std::string>(), s, vals);
+ case TokenID::StringNotLike:
+ return orrible(std::not2(std::equal_to<std::string>()),
+ s, vals);
+
+ // Numeric
+ case TokenID::NumericEquals:
+ return shortible(std::equal_to<double>(), as_number, s, vals);
+
+ case TokenID::NumericNotEquals:
+ return shortible(std::not2(std::equal_to<double>()),
+ as_number, s, vals);
+
+
+ case TokenID::NumericLessThan:
+ return shortible(std::less<double>(), as_number, s, vals);
+
+
+ case TokenID::NumericLessThanEquals:
+ return shortible(std::less_equal<double>(), as_number, s, vals);
+
+ case TokenID::NumericGreaterThan:
+ return shortible(std::greater<double>(), as_number, s, vals);
+
+ case TokenID::NumericGreaterThanEquals:
+ return shortible(std::greater_equal<double>(), as_number, s, vals);
+
+ // Date!
+ case TokenID::DateEquals:
+ return shortible(std::equal_to<ceph::real_time>(), as_date, s, vals);
+
+ case TokenID::DateNotEquals:
+ return shortible(std::not2(std::equal_to<ceph::real_time>()),
+ as_date, s, vals);
+
+ case TokenID::DateLessThan:
+ return shortible(std::less<ceph::real_time>(), as_date, s, vals);
+
+
+ case TokenID::DateLessThanEquals:
+ return shortible(std::less_equal<ceph::real_time>(), as_date, s, vals);
+
+ case TokenID::DateGreaterThan:
+ return shortible(std::greater<ceph::real_time>(), as_date, s, vals);
+
+ case TokenID::DateGreaterThanEquals:
+ return shortible(std::greater_equal<ceph::real_time>(), as_date, s,
+ vals);
+
+ // Bool!
+ case TokenID::Bool:
+ return shortible(std::equal_to<bool>(), as_bool, s, vals);
+
+ // Binary!
+ case TokenID::BinaryEquals:
+ return shortible(std::equal_to<ceph::bufferlist>(), as_binary, s,
+ vals);
+
+ // IP Address!
+ case TokenID::IpAddress:
+ return shortible(std::equal_to<MaskedIP>(), as_network, s, vals);
+
+ case TokenID::NotIpAddress:
+ return shortible(std::not2(std::equal_to<MaskedIP>()), as_network, s,
+ vals);
+
+#if 0
+ // Amazon Resource Names! (Does S3 need this?)
+ TokenID::ArnEquals, TokenID::ArnNotEquals, TokenID::ArnLike,
+ TokenID::ArnNotLike,
+#endif
+
+ default:
+ return false;
+ }
+}
+
+optional<MaskedIP> Condition::as_network(const string& s) {
+  MaskedIP m;
+  if (s.empty()) {
+    return none;
+  }
+
+  m.v6 = (s.find(':') != string::npos);
+  auto slash = s.find('/');
+  if (slash == string::npos) {
+    m.prefix = m.v6 ? 128 : 32;
+  } else {
+    char* end = 0;
+    m.prefix = strtoul(s.data() + slash + 1, &end, 10);
+    if (*end != 0 || (m.v6 && m.prefix > 128) ||
+        (!m.v6 && m.prefix > 32)) {
+      return none;
+    }
+  }
+
+  string t;
+  auto p = &s;
+
+  if (slash != string::npos) {
+    t.assign(s, 0, slash);
+    p = &t;
+  }
+
+  if (m.v6) {
+    struct in6_addr a;
+    if (inet_pton(AF_INET6, p->c_str(), static_cast<void*>(&a)) != 1) {
+      return none;
+    }
+
+    // s6_addr[0] is the most significant byte; store it in the high
+    // bits so operator== can shift host bits off the bottom.
+    for (int i = 0; i < 16; ++i) {
+      m.addr |= Address(a.s6_addr[i]) << ((15 - i) * 8);
+    }
+  } else {
+    struct in_addr a;
+    if (inet_pton(AF_INET, p->c_str(), static_cast<void*>(&a)) != 1) {
+      return none;
+    }
+    m.addr = ntohl(a.s_addr);
+  }
+
+  return m;
+}
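+// For instance, as_network("10.1.2.0/24") yields v6 == false,
+// prefix == 24, and addr == 0x0a010200, while a bare "::1" gets the
+// full /128 prefix.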
+
+namespace {
+const char* condop_string(const TokenID t) {
+ switch (t) {
+ case TokenID::StringEquals:
+ return "StringEquals";
+
+ case TokenID::StringNotEquals:
+ return "StringNotEquals";
+
+ case TokenID::StringEqualsIgnoreCase:
+ return "StringEqualsIgnoreCase";
+
+ case TokenID::StringNotEqualsIgnoreCase:
+ return "StringNotEqualsIgnoreCase";
+
+ case TokenID::StringLike:
+ return "StringLike";
+
+ case TokenID::StringNotLike:
+ return "StringNotLike";
+
+ // Numeric!
+ case TokenID::NumericEquals:
+ return "NumericEquals";
+
+ case TokenID::NumericNotEquals:
+ return "NumericNotEquals";
+
+ case TokenID::NumericLessThan:
+ return "NumericLessThan";
+
+ case TokenID::NumericLessThanEquals:
+ return "NumericLessThanEquals";
+
+ case TokenID::NumericGreaterThan:
+ return "NumericGreaterThan";
+
+ case TokenID::NumericGreaterThanEquals:
+ return "NumericGreaterThanEquals";
+
+ case TokenID::DateEquals:
+ return "DateEquals";
+
+ case TokenID::DateNotEquals:
+ return "DateNotEquals";
+
+ case TokenID::DateLessThan:
+ return "DateLessThan";
+
+ case TokenID::DateLessThanEquals:
+ return "DateLessThanEquals";
+
+ case TokenID::DateGreaterThan:
+ return "DateGreaterThan";
+
+ case TokenID::DateGreaterThanEquals:
+ return "DateGreaterThanEquals";
+
+ case TokenID::Bool:
+ return "Bool";
+
+ case TokenID::BinaryEquals:
+ return "BinaryEquals";
+
+  case TokenID::IpAddress:
+    return "IpAddress";
+
+ case TokenID::NotIpAddress:
+ return "NotIpAddress";
+
+ case TokenID::ArnEquals:
+ return "ArnEquals";
+
+ case TokenID::ArnNotEquals:
+ return "ArnNotEquals";
+
+ case TokenID::ArnLike:
+ return "ArnLike";
+
+ case TokenID::ArnNotLike:
+ return "ArnNotLike";
+
+ case TokenID::Null:
+ return "Null";
+
+ default:
+ return "InvalidConditionOperator";
+ }
+}
+
+template<typename Iterator>
+ostream& print_array(ostream& m, Iterator begin, Iterator end) {
+ if (begin == end) {
+ m << "[";
+ } else {
+ auto beforelast = end - 1;
+ m << "[ ";
+ for (auto i = begin; i != end; ++i) {
+ m << *i;
+ if (i != beforelast) {
+ m << ", ";
+ } else {
+ m << " ";
+ }
+ }
+ }
+ m << "]";
+ return m;
+}
+}
+
+ostream& operator <<(ostream& m, const Condition& c) {
+  m << "{ " << condop_string(c.op) << ": { " << c.key;
+  if (c.ifexists) {
+    m << "IfExists";
+  }
+  m << ": ";
+  print_array(m, c.vals.cbegin(), c.vals.cend());
+  return m << " } }";
+}
+
+string to_string(const Condition& c) {
+ stringstream ss;
+ ss << c;
+ return ss.str();
+}
+
+Effect Statement::eval(const Environment& e,
+ optional<const rgw::auth::Identity&> ida,
+ uint64_t act, const ARN& res) const {
+ if (ida && (!ida->is_identity(princ) || ida->is_identity(noprinc))) {
+ return Effect::Pass;
+ }
+
+
+ if (!std::any_of(resource.begin(), resource.end(),
+ [&res](const ARN& pattern) {
+ return pattern.match(res);
+ }) ||
+ (std::any_of(notresource.begin(), notresource.end(),
+ [&res](const ARN& pattern) {
+ return pattern.match(res);
+ }))) {
+ return Effect::Pass;
+ }
+
+ if (!(action & act) || (notaction & act)) {
+ return Effect::Pass;
+ }
+
+ if (std::all_of(conditions.begin(),
+ conditions.end(),
+ [&e](const Condition& c) { return c.eval(e);})) {
+ return effect;
+ }
+
+ return Effect::Pass;
+}
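+// Illustration of the semantics: a statement only returns its own
+// effect when principal, resource, action, and all conditions match;
+// any mismatch yields Effect::Pass so other statements may still
+// apply. E.g. act == s3GetObject against action == (s3GetObject |
+// s3PutObject) passes the action test, but a notaction containing
+// s3GetObject would force a Pass.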
+
+namespace {
+const char* action_bit_string(uint64_t action) {
+ switch (action) {
+ case s3GetObject:
+ return "s3:GetObject";
+
+ case s3GetObjectVersion:
+ return "s3:GetObjectVersion";
+
+ case s3PutObject:
+ return "s3:PutObject";
+
+ case s3GetObjectAcl:
+ return "s3:GetObjectAcl";
+
+ case s3GetObjectVersionAcl:
+ return "s3:GetObjectVersionAcl";
+
+ case s3PutObjectAcl:
+ return "s3:PutObjectAcl";
+
+ case s3PutObjectVersionAcl:
+ return "s3:PutObjectVersionAcl";
+
+ case s3DeleteObject:
+ return "s3:DeleteObject";
+
+ case s3DeleteObjectVersion:
+ return "s3:DeleteObjectVersion";
+
+ case s3ListMultipartUploadParts:
+ return "s3:ListMultipartUploadParts";
+
+ case s3AbortMultipartUpload:
+ return "s3:AbortMultipartUpload";
+
+ case s3GetObjectTorrent:
+ return "s3:GetObjectTorrent";
+
+ case s3GetObjectVersionTorrent:
+ return "s3:GetObjectVersionTorrent";
+
+ case s3RestoreObject:
+ return "s3:RestoreObject";
+
+ case s3CreateBucket:
+ return "s3:CreateBucket";
+
+ case s3DeleteBucket:
+ return "s3:DeleteBucket";
+
+ case s3ListBucket:
+ return "s3:ListBucket";
+
+ case s3ListBucketVersions:
+ return "s3:ListBucketVersions";
+ case s3ListAllMyBuckets:
+ return "s3:ListAllMyBuckets";
+
+ case s3ListBucketMultiPartUploads:
+ return "s3:ListBucketMultiPartUploads";
+
+ case s3GetAccelerateConfiguration:
+ return "s3:GetAccelerateConfiguration";
+
+ case s3PutAccelerateConfiguration:
+ return "s3:PutAccelerateConfiguration";
+
+ case s3GetBucketAcl:
+ return "s3:GetBucketAcl";
+
+ case s3PutBucketAcl:
+ return "s3:PutBucketAcl";
+
+ case s3GetBucketCORS:
+ return "s3:GetBucketCORS";
+
+ case s3PutBucketCORS:
+ return "s3:PutBucketCORS";
+
+ case s3GetBucketVersioning:
+ return "s3:GetBucketVersioning";
+
+ case s3PutBucketVersioning:
+ return "s3:PutBucketVersioning";
+
+ case s3GetBucketRequestPayment:
+ return "s3:GetBucketRequestPayment";
+
+ case s3PutBucketRequestPayment:
+ return "s3:PutBucketRequestPayment";
+
+ case s3GetBucketLocation:
+ return "s3:GetBucketLocation";
+
+ case s3GetBucketPolicy:
+ return "s3:GetBucketPolicy";
+
+ case s3DeleteBucketPolicy:
+ return "s3:DeleteBucketPolicy";
+
+ case s3PutBucketPolicy:
+ return "s3:PutBucketPolicy";
+
+ case s3GetBucketNotification:
+ return "s3:GetBucketNotification";
+
+ case s3PutBucketNotification:
+ return "s3:PutBucketNotification";
+
+ case s3GetBucketLogging:
+ return "s3:GetBucketLogging";
+
+ case s3PutBucketLogging:
+ return "s3:PutBucketLogging";
+
+ case s3GetBucketTagging:
+ return "s3:GetBucketTagging";
+
+ case s3PutBucketTagging:
+ return "s3:PutBucketTagging";
+
+ case s3GetBucketWebsite:
+ return "s3:GetBucketWebsite";
+
+ case s3PutBucketWebsite:
+ return "s3:PutBucketWebsite";
+
+ case s3DeleteBucketWebsite:
+ return "s3:DeleteBucketWebsite";
+
+ case s3GetLifecycleConfiguration:
+ return "s3:GetLifecycleConfiguration";
+
+ case s3PutLifecycleConfiguration:
+ return "s3:PutLifecycleConfiguration";
+
+ case s3PutReplicationConfiguration:
+ return "s3:PutReplicationConfiguration";
+
+ case s3GetReplicationConfiguration:
+ return "s3:GetReplicationConfiguration";
+
+ case s3DeleteReplicationConfiguration:
+ return "s3:DeleteReplicationConfiguration";
+ }
+ return "s3Invalid";
+}
+
+ostream& print_actions(ostream& m, const uint64_t a) {
+ bool begun = false;
+ m << "[ ";
+  for (auto i = 0U; i < s3Count; ++i) {
+    if (a & (1ULL << i)) {
+      if (begun) {
+        m << ", ";
+      } else {
+        begun = true;
+      }
+      m << action_bit_string(1ULL << i);
+ }
+ }
+ if (begun) {
+ m << " ]";
+ } else {
+ m << "]";
+ }
+ return m;
+}
+}
+
+ostream& operator <<(ostream& m, const Statement& s) {
+ m << "{ ";
+ if (s.sid) {
+ m << "Sid: " << *s.sid << ", ";
+ }
+ if (!s.princ.empty()) {
+ m << "Principal: ";
+ print_array(m, s.princ.cbegin(), s.princ.cend());
+ m << ", ";
+ }
+ if (!s.noprinc.empty()) {
+ m << "NotPrincipal: ";
+ print_array(m, s.noprinc.cbegin(), s.noprinc.cend());
+ m << ", ";
+ }
+
+ m << "Effect: " <<
+ (s.effect == Effect::Allow ?
+ (const char*) "Allow" :
+ (const char*) "Deny");
+
+ if (s.action || s.notaction || !s.resource.empty() ||
+ !s.notresource.empty() || !s.conditions.empty()) {
+ m << ", ";
+ }
+
+ if (s.action) {
+ m << "Action: ";
+ print_actions(m, s.action);
+
+ if (s.notaction || !s.resource.empty() ||
+ !s.notresource.empty() || !s.conditions.empty()) {
+ m << ", ";
+ }
+ }
+
+ if (s.notaction) {
+ m << "NotAction: ";
+ print_actions(m, s.notaction);
+
+ if (!s.resource.empty() || !s.notresource.empty() ||
+ !s.conditions.empty()) {
+ m << ", ";
+ }
+ }
+
+ if (!s.resource.empty()) {
+ m << "Resource: ";
+ print_array(m, s.resource.cbegin(), s.resource.cend());
+
+ if (!s.notresource.empty() || !s.conditions.empty()) {
+ m << ", ";
+ }
+ }
+
+ if (!s.notresource.empty()) {
+ m << "NotResource: ";
+ print_array(m, s.notresource.cbegin(), s.notresource.cend());
+
+ if (!s.conditions.empty()) {
+ m << ", ";
+ }
+ }
+
+ if (!s.conditions.empty()) {
+ m << "Condition: ";
+ print_array(m, s.conditions.cbegin(), s.conditions.cend());
+ }
+
+ return m << " }";
+}
+
+string to_string(const Statement& s) {
+ stringstream m;
+ m << s;
+ return m.str();
+}
+
+Policy::Policy(CephContext* cct, const string& tenant,
+ const bufferlist& _text)
+ : text(_text.to_str()) {
+ StringStream ss(text.data());
+ PolicyParser pp(cct, tenant, *this);
+ auto pr = Reader{}.Parse<kParseNumbersAsStringsFlag |
+ kParseCommentsFlag>(ss, pp);
+ if (!pr) {
+ throw PolicyParseException(std::move(pr));
+ }
+}
+
+Effect Policy::eval(const Environment& e,
+ optional<const rgw::auth::Identity&> ida,
+ std::uint64_t action, const ARN& resource) const {
+ auto allowed = false;
+ for (auto& s : statements) {
+ auto g = s.eval(e, ida, action, resource);
+ if (g == Effect::Deny) {
+ return g;
+ } else if (g == Effect::Allow) {
+ allowed = true;
+ }
+ }
+ return allowed ? Effect::Allow : Effect::Pass;
+}
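+// Evaluation is deny-overrides: any Deny wins immediately, any Allow
+// (absent a Deny) wins at the end, and a policy that says nothing
+// about the request yields Pass. An illustrative minimal policy this
+// parser accepts (bucket and user names are made up):
+//
+// {"Version": "2012-10-17",
+//  "Statement": {
+//    "Effect": "Allow",
+//    "Principal": {"AWS": ["arn:aws:iam::usfolks:user/fred"]},
+//    "Action": "s3:PutObjectAcl",
+//    "Resource": ["arn:aws:s3:::happybucket/*"]}}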
+
+ostream& operator <<(ostream& m, const Policy& p) {
+ m << "{ Version: "
+ << (p.version == Version::v2008_10_17 ? "2008-10-17" : "2012-10-17");
+
+ if (p.id || !p.statements.empty()) {
+ m << ", ";
+ }
+
+ if (p.id) {
+ m << "Id: " << *p.id;
+ if (!p.statements.empty()) {
+ m << ", ";
+ }
+ }
+
+  if (!p.statements.empty()) {
+    m << "Statements: ";
+    print_array(m, p.statements.cbegin(), p.statements.cend());
+  }
+  return m << " }";
+}
+
+string to_string(const Policy& p) {
+ stringstream s;
+ s << p;
+ return s.str();
+}
+
+}
+}
diff --git a/src/rgw/rgw_iam_policy.h b/src/rgw/rgw_iam_policy.h
new file mode 100644
index 00000000000..4429a574309
--- /dev/null
+++ b/src/rgw/rgw_iam_policy.h
@@ -0,0 +1,466 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RGW_IAM_POLICY_H
+#define CEPH_RGW_IAM_POLICY_H
+
+#include <bitset>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <string>
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+#include <boost/optional.hpp>
+#include <boost/thread/shared_mutex.hpp>
+#include <boost/utility/string_ref.hpp>
+#include <boost/variant.hpp>
+
+#include "common/ceph_time.h"
+#include "common/iso_8601.h"
+
+#include "rapidjson/error/error.h"
+#include "rapidjson/error/en.h"
+
+#include "fnmatch.h"
+
+#include "rgw_basic_types.h"
+#include "rgw_iam_policy_keywords.h"
+
+#include "include/assert.h" // razzin' frazzin' ...grrr.
+
+class RGWRados;
+namespace rgw {
+namespace auth {
+class Identity;
+}
+}
+struct rgw_obj;
+struct rgw_bucket;
+
+namespace rgw {
+namespace IAM {
+static constexpr std::uint64_t s3None = 0;
+static constexpr std::uint64_t s3GetObject = 1ULL << 0;
+static constexpr std::uint64_t s3GetObjectVersion = 1ULL << 1;
+static constexpr std::uint64_t s3PutObject = 1ULL << 2;
+static constexpr std::uint64_t s3GetObjectAcl = 1ULL << 3;
+static constexpr std::uint64_t s3GetObjectVersionAcl = 1ULL << 4;
+static constexpr std::uint64_t s3PutObjectAcl = 1ULL << 5;
+static constexpr std::uint64_t s3PutObjectVersionAcl = 1ULL << 6;
+static constexpr std::uint64_t s3DeleteObject = 1ULL << 7;
+static constexpr std::uint64_t s3DeleteObjectVersion = 1ULL << 8;
+static constexpr std::uint64_t s3ListMultipartUploadParts = 1ULL << 9;
+static constexpr std::uint64_t s3AbortMultipartUpload = 1ULL << 10;
+static constexpr std::uint64_t s3GetObjectTorrent = 1ULL << 11;
+static constexpr std::uint64_t s3GetObjectVersionTorrent = 1ULL << 12;
+static constexpr std::uint64_t s3RestoreObject = 1ULL << 13;
+static constexpr std::uint64_t s3CreateBucket = 1ULL << 14;
+static constexpr std::uint64_t s3DeleteBucket = 1ULL << 15;
+static constexpr std::uint64_t s3ListBucket = 1ULL << 16;
+static constexpr std::uint64_t s3ListBucketVersions = 1ULL << 17;
+static constexpr std::uint64_t s3ListAllMyBuckets = 1ULL << 18;
+static constexpr std::uint64_t s3ListBucketMultiPartUploads = 1ULL << 19;
+static constexpr std::uint64_t s3GetAccelerateConfiguration = 1ULL << 20;
+static constexpr std::uint64_t s3PutAccelerateConfiguration = 1ULL << 21;
+static constexpr std::uint64_t s3GetBucketAcl = 1ULL << 22;
+static constexpr std::uint64_t s3PutBucketAcl = 1ULL << 23;
+static constexpr std::uint64_t s3GetBucketCORS = 1ULL << 24;
+static constexpr std::uint64_t s3PutBucketCORS = 1ULL << 25;
+static constexpr std::uint64_t s3GetBucketVersioning = 1ULL << 26;
+static constexpr std::uint64_t s3PutBucketVersioning = 1ULL << 27;
+static constexpr std::uint64_t s3GetBucketRequestPayment = 1ULL << 28;
+static constexpr std::uint64_t s3PutBucketRequestPayment = 1ULL << 29;
+static constexpr std::uint64_t s3GetBucketLocation = 1ULL << 30;
+static constexpr std::uint64_t s3GetBucketPolicy = 1ULL << 31;
+static constexpr std::uint64_t s3DeleteBucketPolicy = 1ULL << 32;
+static constexpr std::uint64_t s3PutBucketPolicy = 1ULL << 33;
+static constexpr std::uint64_t s3GetBucketNotification = 1ULL << 34;
+static constexpr std::uint64_t s3PutBucketNotification = 1ULL << 35;
+static constexpr std::uint64_t s3GetBucketLogging = 1ULL << 36;
+static constexpr std::uint64_t s3PutBucketLogging = 1ULL << 37;
+static constexpr std::uint64_t s3GetBucketTagging = 1ULL << 38;
+static constexpr std::uint64_t s3PutBucketTagging = 1ULL << 39;
+static constexpr std::uint64_t s3GetBucketWebsite = 1ULL << 40;
+static constexpr std::uint64_t s3PutBucketWebsite = 1ULL << 41;
+static constexpr std::uint64_t s3DeleteBucketWebsite = 1ULL << 42;
+static constexpr std::uint64_t s3GetLifecycleConfiguration = 1ULL << 43;
+static constexpr std::uint64_t s3PutLifecycleConfiguration = 1ULL << 44;
+static constexpr std::uint64_t s3PutReplicationConfiguration = 1ULL << 45;
+static constexpr std::uint64_t s3GetReplicationConfiguration = 1ULL << 46;
+static constexpr std::uint64_t s3DeleteReplicationConfiguration = 1ULL << 47;
+static constexpr std::uint64_t s3Count = 48;
+static constexpr std::uint64_t s3All = (1ULL << s3Count) - 1;
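+
+// These constants are combined as bitmasks: a statement granting
+// object reads might carry action == (s3GetObject | s3GetObjectVersion),
+// and Statement::eval tests a single requested bit with `action & act`.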
+
+namespace {
+inline int op_to_perm(std::uint64_t op) {
+ switch (op) {
+ case s3GetObject:
+ case s3GetObjectTorrent:
+ case s3GetObjectVersion:
+ case s3GetObjectVersionTorrent:
+ case s3ListAllMyBuckets:
+ case s3ListBucket:
+ case s3ListBucketMultiPartUploads:
+ case s3ListBucketVersions:
+ case s3ListMultipartUploadParts:
+ return RGW_PERM_READ;
+
+ case s3AbortMultipartUpload:
+ case s3CreateBucket:
+ case s3DeleteBucket:
+ case s3DeleteObject:
+ case s3DeleteObjectVersion:
+ case s3PutObject:
+ case s3RestoreObject:
+ return RGW_PERM_WRITE;
+
+ case s3GetAccelerateConfiguration:
+ case s3GetBucketAcl:
+ case s3GetBucketCORS:
+ case s3GetBucketLocation:
+ case s3GetBucketLogging:
+ case s3GetBucketNotification:
+ case s3GetBucketPolicy:
+ case s3GetBucketRequestPayment:
+ case s3GetBucketTagging:
+ case s3GetBucketVersioning:
+ case s3GetBucketWebsite:
+ case s3GetLifecycleConfiguration:
+ case s3GetObjectAcl:
+ case s3GetObjectVersionAcl:
+ case s3GetReplicationConfiguration:
+ return RGW_PERM_READ_ACP;
+
+ case s3DeleteBucketPolicy:
+ case s3DeleteBucketWebsite:
+ case s3DeleteReplicationConfiguration:
+ case s3PutAccelerateConfiguration:
+ case s3PutBucketAcl:
+ case s3PutBucketCORS:
+ case s3PutBucketLogging:
+ case s3PutBucketNotification:
+ case s3PutBucketPolicy:
+ case s3PutBucketRequestPayment:
+ case s3PutBucketTagging:
+ case s3PutBucketVersioning:
+ case s3PutBucketWebsite:
+ case s3PutLifecycleConfiguration:
+ case s3PutObjectAcl:
+ case s3PutObjectVersionAcl:
+ case s3PutReplicationConfiguration:
+ return RGW_PERM_WRITE_ACP;
+
+ case s3All:
+ return RGW_PERM_FULL_CONTROL;
+ }
+ return RGW_PERM_INVALID;
+}
+}
+
+using Environment = boost::container::flat_map<std::string, std::string>;
+
+enum struct Partition {
+ aws, aws_cn, aws_us_gov, wildcard
+ // If we wanted our own ARNs for principal type unique to us
+ // (maybe to integrate better with Swift) or for anything else we
+ // provide that doesn't map onto S3, we could add an 'rgw'
+ // partition type.
+};
+
+enum struct Service {
+ apigateway, appstream, artifact, autoscaling, aws_portal, acm,
+ cloudformation, cloudfront, cloudhsm, cloudsearch, cloudtrail,
+ cloudwatch, events, logs, codebuild, codecommit, codedeploy,
+ codepipeline, cognito_idp, cognito_identity, cognito_sync,
+ config, datapipeline, dms, devicefarm, directconnect,
+ ds, dynamodb, ec2, ecr, ecs, ssm, elasticbeanstalk, elasticfilesystem,
+ elasticloadbalancing, elasticmapreduce, elastictranscoder, elasticache,
+ es, gamelift, glacier, health, iam, importexport, inspector, iot,
+ kms, kinesisanalytics, firehose, kinesis, lambda, lightsail,
+ machinelearning, aws_marketplace, aws_marketplace_management,
+ mobileanalytics, mobilehub, opsworks, opsworks_cm, polly,
+ redshift, rds, route53, route53domains, sts, servicecatalog,
+ ses, sns, sqs, s3, swf, sdb, states, storagegateway, support,
+ trustedadvisor, waf, workmail, workspaces, wildcard
+};
+
+struct ARN {
+ Partition partition;
+ Service service;
+ std::string region;
+  // Once we reify tenant, we should probably use that instead of a
+  // string.
+ std::string account;
+ std::string resource;
+
+ ARN()
+ : partition(Partition::wildcard), service(Service::wildcard) {}
+ ARN(Partition partition, Service service, std::string region,
+ std::string account, std::string resource)
+ : partition(partition), service(service), region(std::move(region)),
+ account(std::move(account)), resource(std::move(resource)) {}
+ ARN(const rgw_obj& o);
+ ARN(const rgw_bucket& b);
+ ARN(const rgw_bucket& b, const std::string& o);
+
+ static boost::optional<ARN> parse(const std::string& s,
+ bool wildcard = false);
+ std::string to_string() const;
+
+ // `this` is the pattern
+ bool match(const ARN& candidate) const;
+};
+
+inline std::string to_string(const ARN& a) {
+ return a.to_string();
+}
+
+inline std::ostream& operator <<(std::ostream& m, const ARN& a) {
+ return m << to_string(a);
+}
+
+bool operator ==(const ARN& l, const ARN& r);
+bool operator <(const ARN& l, const ARN& r);
+
+using Address = std::bitset<128>;
+struct MaskedIP {
+ bool v6;
+ Address addr;
+ // Since we're mapping IPv6 to IPv4 addresses, we may want to
+ // consider making the prefix always be in terms of a v6 address
+ // and just use the v6 bit to rewrite it as a v4 prefix for
+ // output.
+ unsigned int prefix;
+};
+
+std::ostream& operator <<(std::ostream& m, const MaskedIP& ip);
+std::string to_string(const MaskedIP& m);
+
+inline bool operator ==(const MaskedIP& l, const MaskedIP& r) {
+  // Compare only the network bits; a /32 or /128 host address makes
+  // the shift zero, which is fine.
+  auto shift = std::max((l.v6 ? 128 : 32) - int(l.prefix),
+                        (r.v6 ? 128 : 32) - int(r.prefix));
+  ceph_assert(shift >= 0);
+  return (l.addr >> shift) == (r.addr >> shift);
+}
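+// E.g. {192.168.1.17, /24} == {192.168.1.0, /24}: shift is 8 on both
+// sides, so only the upper 24 network bits are compared.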
+
+struct Condition {
+ TokenID op;
+ // Originally I was going to use a perfect hash table, but Marcus
+ // says keys are to be added at run-time not compile time.
+
+ // In future development, use symbol internment.
+ std::string key;
+ bool ifexists = false;
+ // Much to my annoyance there is no actual way to do this in a
+ // typed way that is compatible with AWS. I know this because I've
+ // seen examples where the same value is used as a string in one
+ // context and a date in another.
+ std::vector<std::string> vals;
+
+ Condition() = default;
+ Condition(TokenID op, const char* s, std::size_t len) : op(op) {
+ static constexpr char ifexistr[] = "IfExists";
+ auto l = static_cast<const char*>(memmem(static_cast<const void*>(s), len,
+ static_cast<const void*>(ifexistr),
+ sizeof(ifexistr) -1));
+ if (l && ((l + sizeof(ifexistr) - 1 == (s + len)))) {
+ ifexists = true;
+ key.assign(s, static_cast<const char*>(l) - s);
+ } else {
+ key.assign(s, len);
+ }
+ }
+
+ bool eval(const Environment& e) const;
+
+ static boost::optional<double> as_number(const std::string& s) {
+ std::size_t p = 0;
+
+ try {
+ double d = std::stod(s, &p);
+ if (p < s.length()) {
+ return boost::none;
+ }
+
+ return d;
+ } catch (const std::logic_error& e) {
+ return boost::none;
+ }
+ }
+
+ static boost::optional<ceph::real_time> as_date(const std::string& s) {
+ std::size_t p = 0;
+
+ try {
+ double d = std::stod(s, &p);
+ if (p == s.length()) {
+ return ceph::real_time(
+ std::chrono::seconds(static_cast<uint64_t>(d)) +
+ std::chrono::nanoseconds(
+ static_cast<uint64_t>((d - static_cast<uint64_t>(d))
+ * 1000000000)));
+ }
+
+ return from_iso_8601(boost::string_ref(s), false);
+ } catch (const std::logic_error& e) {
+ return boost::none;
+ }
+ }
+
+ static boost::optional<bool> as_bool(const std::string& s) {
+ std::size_t p = 0;
+
+ if (s.empty() || boost::iequals(s, "false")) {
+ return false;
+ }
+
+ try {
+ double d = std::stod(s, &p);
+ if (p == s.length()) {
+        // +0.0 == -0.0, so one comparison covers both signed zeros.
+        return !((d == 0.0) || std::isnan(d));
+ }
+ } catch (const std::logic_error& e) {
+ // Fallthrough
+ }
+
+ return true;
+ }
+
+ static boost::optional<ceph::bufferlist> as_binary(const std::string& s) {
+ // In a just world
+ ceph::bufferlist base64;
+ // I could populate a bufferlist
+ base64.push_back(buffer::create_static(
+ s.length(),
+ const_cast<char*>(s.data()))); // Yuck
+ // From a base64 encoded std::string.
+ ceph::bufferlist bin;
+
+ try {
+ base64.decode_base64(bin);
+ } catch (const ceph::buffer::malformed_input& e) {
+ return boost::none;
+ }
+ return bin;
+ }
+
+ static boost::optional<MaskedIP> as_network(const std::string& s);
+
+
+ struct ci_equal_to : public std::binary_function<const std::string,
+ const std::string,
+ bool> {
+ bool operator ()(const std::string& s1,
+ const std::string& s2) const {
+ return boost::iequals(s1, s2);
+ }
+ };
+
+
+ template<typename F>
+ static bool orrible(F&& f, const std::string& c,
+ const std::vector<std::string>& v) {
+ for (const auto& d : v) {
+ if (std::forward<F>(f)(c, d)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ template<typename F, typename X>
+ static bool shortible(F&& f, X& x, const std::string& c,
+ const std::vector<std::string>& v) {
+ auto xc = std::forward<X>(x)(c);
+ if (!xc) {
+ return false;
+ }
+
+ for (const auto& d : v) {
+ auto xd = std::forward<X>(x)(d);
+ if (!xd) {
+ continue;
+ }
+
+ if (std::forward<F>(f)(*xc, *xd)) {
+ return true;
+ }
+ }
+ return false;
+ }
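+
+  // Usage sketch: shortible(std::less<double>(), as_number, "5",
+  // {"banana", "10"}) converts "5" once, skips the unconvertible
+  // "banana", and returns true because 5 < 10. orrible is the same
+  // any-of fold without the conversion step.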
+};
+
+std::ostream& operator <<(std::ostream& m, const Condition& c);
+
+std::string to_string(const Condition& c);
+
+struct Statement {
+ boost::optional<std::string> sid = boost::none;
+
+ boost::container::flat_set<rgw::auth::Principal> princ;
+ boost::container::flat_set<rgw::auth::Principal> noprinc;
+
+ // Every statement MUST provide an effect. I just initialize it to
+ // deny as defensive programming.
+ Effect effect = Effect::Deny;
+
+ std::uint64_t action = 0;
+ std::uint64_t notaction = 0;
+
+ boost::container::flat_set<ARN> resource;
+ boost::container::flat_set<ARN> notresource;
+
+ std::vector<Condition> conditions;
+
+ Effect eval(const Environment& e,
+ boost::optional<const rgw::auth::Identity&> ida,
+ std::uint64_t action, const ARN& resource) const;
+};
+
+std::ostream& operator <<(std::ostream& m, const Statement& s);
+std::string to_string(const Statement& s);
+
+struct PolicyParseException : public std::exception {
+ rapidjson::ParseResult pr;
+
+ PolicyParseException(rapidjson::ParseResult&& pr)
+ : pr(pr) { }
+ const char* what() const noexcept override {
+ return rapidjson::GetParseError_En(pr.Code());
+ }
+};
+
+struct Policy {
+ std::string text;
+ Version version = Version::v2008_10_17;
+ boost::optional<std::string> id = boost::none;
+
+ std::vector<Statement> statements;
+
+ Policy(CephContext* cct, const std::string& tenant,
+ const bufferlist& text);
+
+ Effect eval(const Environment& e,
+ boost::optional<const rgw::auth::Identity&> ida,
+ std::uint64_t action, const ARN& resource) const;
+};
+
+std::ostream& operator <<(std::ostream& m, const Policy& p);
+std::string to_string(const Policy& p);
+}
+}
+
+namespace std {
+template<>
+struct hash<::rgw::IAM::Service> {
+ size_t operator()(const ::rgw::IAM::Service& s) const noexcept {
+ // Invoke a default-constructed hash object for int.
+ return hash<int>()(static_cast<int>(s));
+ }
+};
+}
+
+#endif
diff --git a/src/rgw/rgw_iam_policy_keywords.gperf b/src/rgw/rgw_iam_policy_keywords.gperf
new file mode 100644
index 00000000000..d37fa6aff65
--- /dev/null
+++ b/src/rgw/rgw_iam_policy_keywords.gperf
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+%language=C++
+%define class-name keyword_hash
+%define lookup-function-name lookup
+%struct-type
+struct Keyword {
+ const char* name;
+ TokenKind kind;
+ TokenID id;
+ uint64_t specific;
+ bool arrayable;
+ bool objectable;
+};
+%%
+# Top-level
+#
+Version, TokenKind::top, TokenID::Version, 0, false, false
+Id, TokenKind::top, TokenID::Id, 0, false, false
+Statement, TokenKind::top, TokenID::Statement, 0, true, true
+#
+# Statement level
+#
+Sid, TokenKind::statement, TokenID::Sid, 0, false, false
+Effect, TokenKind::statement, TokenID::Effect, 0, false, false
+Principal, TokenKind::statement, TokenID::Principal, 0, false, true
+NotPrincipal, TokenKind::statement, TokenID::NotPrincipal, 0, true, true
+Action, TokenKind::statement, TokenID::Action, 0, true, false
+NotAction, TokenKind::statement, TokenID::NotAction, 0, true, false
+Resource, TokenKind::statement, TokenID::Resource, 0, true, false
+NotResource, TokenKind::statement, TokenID::NotResource, 0, true, false
+Condition, TokenKind::statement, TokenID::Condition, 0, true, true
+#
+# Condition operators
+#
+# String
+StringEquals, TokenKind::cond_op, TokenID::StringEquals, (uint64_t) Type::string, true, true
+StringNotEquals, TokenKind::cond_op, TokenID::StringNotEquals, (uint64_t) Type::string, true, true
+StringEqualsIgnoreCase, TokenKind::cond_op, TokenID::StringEqualsIgnoreCase, (uint64_t) Type::string, true, true
+StringNotEqualsIgnoreCase, TokenKind::cond_op, TokenID::StringNotEqualsIgnoreCase, (uint64_t) Type::string, true, true
+StringLike, TokenKind::cond_op, TokenID::StringLike, (uint64_t) Type::string, true, true
+StringNotLike, TokenKind::cond_op, TokenID::StringNotLike, (uint64_t) Type::string, true, true
+# Numeric
+NumericEquals, TokenKind::cond_op, TokenID::NumericEquals, (uint64_t) Type::number, true, true
+NumericNotEquals, TokenKind::cond_op, TokenID::NumericNotEquals, (uint64_t) Type::number, true, true
+NumericLessThan, TokenKind::cond_op, TokenID::NumericLessThan, (uint64_t) Type::number, true, true
+NumericLessThanEquals, TokenKind::cond_op, TokenID::NumericLessThanEquals, (uint64_t) Type::number, true, true
+NumericGreaterThan, TokenKind::cond_op, TokenID::NumericGreaterThan, (uint64_t) Type::number, true, true
+NumericGreaterThanEquals, TokenKind::cond_op, TokenID::NumericGreaterThanEquals, (uint64_t) Type::number, true, true
+# Date
+DateEquals, TokenKind::cond_op, TokenID::DateEquals, (uint64_t) Type::date, true, true
+DateNotEquals, TokenKind::cond_op, TokenID::DateNotEquals, (uint64_t) Type::date, true, true
+DateLessThan, TokenKind::cond_op, TokenID::DateLessThan, (uint64_t) Type::date, true, true
+DateLessThanEquals, TokenKind::cond_op, TokenID::DateLessThanEquals, (uint64_t) Type::date, true, true
+DateGreaterThan, TokenKind::cond_op, TokenID::DateGreaterThan, (uint64_t) Type::date, true, true
+DateGreaterThanEquals, TokenKind::cond_op, TokenID::DateGreaterThanEquals, (uint64_t) Type::date, true, true
+# Bool
+Bool, TokenKind::cond_op, TokenID::Bool, (uint64_t) Type::boolean, true, true
+# Binary
+BinaryEquals, TokenKind::cond_op, TokenID::BinaryEquals, (uint64_t) Type::binary, true, true
+# IP Address
+IpAddress, TokenKind::cond_op, TokenID::IpAddress, (uint64_t) Type::ipaddr, true, true
+NotIpAddress, TokenKind::cond_op, TokenID::NotIpAddress, (uint64_t) Type::ipaddr, true, true
+# Amazon Resource Names
+ArnEquals, TokenKind::cond_op, TokenID::ArnEquals, (uint64_t) Type::arn, true, true
+ArnNotEquals, TokenKind::cond_op, TokenID::ArnNotEquals, (uint64_t) Type::arn, true, true
+ArnLike, TokenKind::cond_op, TokenID::ArnLike, (uint64_t) Type::arn, true, true
+ArnNotLike, TokenKind::cond_op, TokenID::ArnNotLike, (uint64_t) Type::arn, true, true
+# Null
+Null, TokenKind::cond_op, TokenID::Null, (uint64_t) Type::null, true, true
+#
+# Condition keys
+#
+# AWS
+#aws:CurrentTime, TokenKind::cond_key, TokenID::awsCurrentTime, (uint64_t) Type::date, true, false
+#aws:EpochTime, TokenKind::cond_key, TokenID::awsEpochTime, (uint64_t) Type::date, true, false
+#aws:TokenIssueTime, TokenKind::cond_key, TokenID::awsTokenIssueTime, (uint64_t) Type::date, true, false
+#aws:MultiFactorAuthPresent, TokenKind::cond_key, TokenID::awsMultiFactorAuthPresent, (uint64_t) Type::boolean, true, false
+#aws:MultiFactorAuthAge, TokenKind::cond_key, TokenID::awsMultiFactorAuthAge, (uint64_t) Type::number, true, false
+#aws:PrincipalType, TokenKind::cond_key, TokenID::awsPrincipalType, (uint64_t) Type::string, true, false
+#aws:Referer, TokenKind::cond_key, TokenID::awsReferer, (uint64_t) Type::string, true, false
+#aws:SecureTransport, TokenKind::cond_key, TokenID::awsSecureTransport, (uint64_t) Type::boolean, true, false
+#aws:SourceArn, TokenKind::cond_key, TokenID::awsSourceArn, (uint64_t) Type::arn, true, false
+#aws:SourceIp, TokenKind::cond_key, TokenID::awsSourceIp, (uint64_t) Type::ipaddr, true, false
+#aws:SourceVpc, TokenKind::cond_key, TokenID::awsSourceVpc, (uint64_t) Type::string, true, false
+#aws:SourceVpce, TokenKind::cond_key, TokenID::awsSourceVpce, (uint64_t) Type::string, true, false
+#aws:UserAgent, TokenKind::cond_key, TokenID::awsUserAgent, (uint64_t) Type::string, true, false
+#aws:userid, TokenKind::cond_key, TokenID::awsuserid, (uint64_t) Type::string, true, false
+#aws:username, TokenKind::cond_key, TokenID::awsusername, (uint64_t) Type::string, true, false
+# S3
+#s3:x-amz-acl, TokenKind::cond_key, TokenID::s3x_amz_acl, (uint64_t) Type::string, true, false
+#s3:x-amz-grant-read, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false
+#s3:x-amz-grant-write, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false
+#s3:x-amz-grant-read-acp, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false
+#s3:x-amz-grant-write-acp, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false
+#s3:x-amz-grant-full-control, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false
+#s3:x-amz-copy-source, TokenKind::cond_key, TokenID::s3x_amz_copy_source, (uint64_t) Type::string, true, false
+#s3:x-amz-server-side-encryption, TokenKind::cond_key, TokenID::s3x_amz_server_side_encryption, (uint64_t) Type::boolean, true, false
+#s3:x-amz-server-side-encryption-aws-kms-key-id, TokenKind::cond_key, TokenID::s3x_amz_server_side_encryption_aws_kms_key_id, (uint64_t) Type::arn, true, false
+#s3:x-amz-metadata-directive, TokenKind::cond_key, TokenID::s3x_amz_metadata_directive, (uint64_t) Type::string, true, false
+#s3:x-amz-storage-class, TokenKind::cond_key, TokenID::s3x_amz_storage_class, (uint64_t) Type::string, true, false
+#s3:VersionId, TokenKind::cond_key, TokenID::s3VersionId, (uint64_t) Type::string, true, false
+#s3:LocationConstraint, TokenKind::cond_key, TokenID::s3LocationConstraint, (uint64_t) Type::string, true, false
+#s3:prefix, TokenKind::cond_key, TokenID::s3prefix, (uint64_t) Type::string, true, false
+#s3:delimiter, TokenKind::cond_key, TokenID::s3delimiter, (uint64_t) Type::string, true, false
+#s3:max-keys, TokenKind::cond_key, TokenID::s3max_keys, (uint64_t) Type::number, true, false
+#s3:signatureversion, TokenKind::cond_key, TokenID::s3signatureversion, (uint64_t) Type::string, true, false
+#s3:authType, TokenKind::cond_key, TokenID::s3authType, (uint64_t) Type::string, true, false
+#s3:signatureAge, TokenKind::cond_key, TokenID::s3signatureAge, (uint64_t) Type::number, true, false
+#s3:x-amz-content-sha256, TokenKind::cond_key, TokenID::s3x_amz_content_sha256, (uint64_t) Type::string, true, false
+#
+# Version Keywords
+#
+2008-10-17, TokenKind::version_key, TokenID::v2008_10_17, (uint64_t) Version::v2008_10_17, false, false
+2012-10-17, TokenKind::version_key, TokenID::v2012_10_17, (uint64_t) Version::v2012_10_17, false, false
+#
+# Effect Keywords
+#
+Allow, TokenKind::effect_key, TokenID::Allow, (uint64_t) Effect::Allow, false, false
+Deny, TokenKind::effect_key, TokenID::Deny, (uint64_t) Effect::Deny, false, false
+#
+# Principal types
+#
+AWS, TokenKind::princ_type, TokenID::AWS, 0, true, false
+Federated, TokenKind::princ_type, TokenID::Federated, 0, true, false
+Service, TokenKind::princ_type, TokenID::Service, 0, true, false
+CanonicalUser, TokenKind::princ_type, TokenID::CanonicalUser, 0, true, false
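+#
+# Row layout: keyword, TokenKind, TokenID, payload (the Type/Version/Effect
+# value cast to uint64_t, or 0 when unused), and two boolean flags. The exact
+# meaning of the flags is defined by the consumer of this table (assumed to
+# be the generated keyword parser, which is not part of this hunk).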
diff --git a/src/rgw/rgw_iam_policy_keywords.h b/src/rgw/rgw_iam_policy_keywords.h
new file mode 100644
index 00000000000..a0cd34b6286
--- /dev/null
+++ b/src/rgw/rgw_iam_policy_keywords.h
@@ -0,0 +1,139 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RGW_POLICY_S3V2_KEYWORDS_H
+#define CEPH_RGW_POLICY_S3V2_KEYWORDS_H
+
+namespace rgw {
+namespace IAM {
+
+enum class TokenKind {
+ pseudo, top, statement, cond_op, cond_key, version_key, effect_key,
+ princ_type
+};
+
+enum class TokenID {
+ /// Pseudo-token
+ Top,
+
+ /// Top-level tokens
+ Version, Id, Statement,
+
+ /// Statement level tokens
+ Sid, Effect, Principal, NotPrincipal, Action, NotAction,
+ Resource, NotResource, Condition,
+
+ /// Condition Operators!
+ /// Any of these, except Null, can have an IfExists variant.
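+  /// (Illustration, assuming the usual AWS semantics: a condition such as
+  ///   {"StringEqualsIfExists": {"s3:prefix": "docs/"}}
+  /// also matches requests where s3:prefix is absent, whereas plain
+  /// StringEquals would not.)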
+
+ // String!
+ StringEquals, StringNotEquals, StringEqualsIgnoreCase,
+ StringNotEqualsIgnoreCase, StringLike, StringNotLike,
+
+ // Numeric!
+ NumericEquals, NumericNotEquals, NumericLessThan, NumericLessThanEquals,
+ NumericGreaterThan, NumericGreaterThanEquals,
+
+ // Date!
+ DateEquals, DateNotEquals, DateLessThan, DateLessThanEquals,
+ DateGreaterThan, DateGreaterThanEquals,
+
+ // Bool!
+ Bool,
+
+ // Binary!
+ BinaryEquals,
+
+ // IP Address!
+ IpAddress, NotIpAddress,
+
+ // Amazon Resource Names! (Does S3 need this?)
+ ArnEquals, ArnNotEquals, ArnLike, ArnNotLike,
+
+ // Null!
+ Null,
+
+#if 0 // Keys are done at runtime now
+
+ /// Condition Keys!
+ awsCurrentTime,
+ awsEpochTime,
+ awsTokenIssueTime,
+ awsMultiFactorAuthPresent,
+ awsMultiFactorAuthAge,
+ awsPrincipalType,
+ awsReferer,
+ awsSecureTransport,
+ awsSourceArn,
+ awsSourceIp,
+ awsSourceVpc,
+ awsSourceVpce,
+ awsUserAgent,
+ awsuserid,
+ awsusername,
+ s3x_amz_acl,
+ s3x_amz_grant_permission,
+ s3x_amz_copy_source,
+ s3x_amz_server_side_encryption,
+ s3x_amz_server_side_encryption_aws_kms_key_id,
+ s3x_amz_metadata_directive,
+ s3x_amz_storage_class,
+ s3VersionId,
+ s3LocationConstraint,
+ s3prefix,
+ s3delimiter,
+ s3max_keys,
+ s3signatureversion,
+ s3authType,
+ s3signatureAge,
+ s3x_amz_content_sha256,
+#else
+ CondKey,
+#endif
+
+ ///
+ /// Versions!
+ ///
+ v2008_10_17,
+ v2012_10_17,
+
+ ///
+ /// Effects!
+ ///
+ Allow,
+ Deny,
+
+ /// Principal Types!
+ AWS,
+ Federated,
+ Service,
+ CanonicalUser
+};
+
+
+enum class Version {
+ v2008_10_17,
+ v2012_10_17
+};
+
+
+enum class Effect {
+ Allow,
+ Deny,
+ Pass
+};
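+// Effect::Pass is the "no applicable statement" result; callers in rgw_op.cc
+// (e.g. RGWPutObj::verify_permission) fall back to the ACL check when
+// evaluation yields Pass.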
+
+enum class Type {
+ string,
+ number,
+ date,
+ boolean,
+ binary,
+ ipaddr,
+ arn,
+ null
+};
+}
+}
+
+#endif // CEPH_RGW_POLICY_S3V2_KEYWORDS_H
diff --git a/src/rgw/rgw_json_enc.cc b/src/rgw/rgw_json_enc.cc
index 067ad6fadbb..2a183b59195 100644
--- a/src/rgw/rgw_json_enc.cc
+++ b/src/rgw/rgw_json_enc.cc
@@ -1290,6 +1290,7 @@ void rgw_meta_sync_marker::decode_json(JSONObj *obj)
utime_t ut;
JSONDecoder::decode_json("timestamp", ut, obj);
timestamp = ut.to_real_time();
+ JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
}
void rgw_meta_sync_marker::dump(Formatter *f) const
@@ -1300,6 +1301,7 @@ void rgw_meta_sync_marker::dump(Formatter *f) const
encode_json("total_entries", total_entries, f);
encode_json("pos", pos, f);
encode_json("timestamp", utime_t(timestamp), f);
+ encode_json("realm_epoch", realm_epoch, f);
}
void rgw_meta_sync_status::decode_json(JSONObj *obj)
diff --git a/src/rgw/rgw_keystone.cc b/src/rgw/rgw_keystone.cc
index 933308240e2..3294380ba10 100644
--- a/src/rgw/rgw_keystone.cc
+++ b/src/rgw/rgw_keystone.cc
@@ -613,7 +613,7 @@ int TokenCache::RevokeThread::check_revoked()
bool TokenCache::going_down() const
{
- return (down_flag.read() != 0);
+ return down_flag;
}
void* TokenCache::RevokeThread::entry()
diff --git a/src/rgw/rgw_keystone.h b/src/rgw/rgw_keystone.h
index 5bacfb0da5e..df5650c5927 100644
--- a/src/rgw/rgw_keystone.h
+++ b/src/rgw/rgw_keystone.h
@@ -13,6 +13,8 @@
#include "rgw_http_client.h"
#include "common/Cond.h"
+#include <atomic>
+
int rgw_open_cms_envelope(CephContext *cct,
const std::string& src,
std::string& dst); /* out */
@@ -216,7 +218,7 @@ class TokenCache {
list<string>::iterator lru_iter;
};
- atomic_t down_flag;
+ std::atomic<bool> down_flag = { false };
class RevokeThread : public Thread {
friend class TokenCache;
@@ -271,7 +273,7 @@ class TokenCache {
}
~TokenCache() {
- down_flag.set(1);
+ down_flag = true;
revocator.stop();
revocator.join();
diff --git a/src/rgw/rgw_lc.cc b/src/rgw/rgw_lc.cc
index 61d4cc94427..b70d2596f26 100644
--- a/src/rgw/rgw_lc.cc
+++ b/src/rgw/rgw_lc.cc
@@ -2,11 +2,13 @@
#include <iostream>
#include <map>
+#include <boost/algorithm/string/split.hpp>
+#include <boost/algorithm/string.hpp>
+
#include "common/Formatter.h"
#include <common/errno.h>
#include "auth/Crypto.h"
#include "cls/rgw/cls_rgw_client.h"
-#include "cls/refcount/cls_refcount_client.h"
#include "cls/lock/cls_lock_client.h"
#include "rgw_common.h"
#include "rgw_bucket.h"
@@ -198,21 +200,6 @@ bool RGWLC::if_already_run_today(time_t& start_date)
return false;
}
-static std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
- std::stringstream ss(s);
- std::string item;
- while (std::getline(ss, item, delim)) {
- elems.push_back(item);
- }
- return elems;
-}
-
-static std::vector<std::string> split(const std::string &s, char delim) {
- std::vector<std::string> elems;
- split(s, delim, elems);
- return elems;
-}
-
int RGWLC::bucket_lc_prepare(int index)
{
map<string, int > entries;
@@ -324,7 +311,7 @@ int RGWLC::bucket_lc_process(string& shard_id)
vector<rgw_bucket_dir_entry> objs;
RGWObjectCtx obj_ctx(store);
vector<std::string> result;
- result = split(shard_id, ':');
+ boost::split(result, shard_id, boost::is_any_of(":"));
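+  // e.g. a shard_id of "tenant:bucket:id" splits into {tenant, bucket, id};
+  // boost::split replaces the hand-rolled split() helpers removed above.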
string bucket_tenant = result[0];
string bucket_name = result[1];
string bucket_id = result[2];
@@ -496,8 +483,7 @@ int RGWLC::bucket_lc_process(string& shard_id)
return ret;
}
-int RGWLC::bucket_lc_post(int index, int max_lock_sec, cls_rgw_lc_obj_head& head,
- pair<string, int >& entry, int& result)
+int RGWLC::bucket_lc_post(int index, int max_lock_sec, pair<string, int >& entry, int& result)
{
utime_t lock_duration(cct->_conf->rgw_lc_lock_max_time, 0);
@@ -637,7 +623,7 @@ int RGWLC::process(int index, int max_lock_secs)
}
l.unlock(&store->lc_pool_ctx, obj_names[index]);
ret = bucket_lc_process(entry.first);
- ret = bucket_lc_post(index, max_lock_secs, head, entry, ret);
+ bucket_lc_post(index, max_lock_secs, entry, ret);
return 0;
exit:
l.unlock(&store->lc_pool_ctx, obj_names[index]);
@@ -655,7 +641,7 @@ void RGWLC::start_processor()
void RGWLC::stop_processor()
{
- down_flag.set(1);
+ down_flag = true;
if (worker) {
worker->stop();
worker->join();
@@ -672,7 +658,7 @@ void RGWLC::LCWorker::stop()
bool RGWLC::going_down()
{
- return (down_flag.read() != 0);
+ return down_flag;
}
bool RGWLC::LCWorker::should_work(utime_t& now)
diff --git a/src/rgw/rgw_lc.h b/src/rgw/rgw_lc.h
index 61edd78aac7..480ba3fe513 100644
--- a/src/rgw/rgw_lc.h
+++ b/src/rgw/rgw_lc.h
@@ -9,7 +9,6 @@
#include "common/debug.h"
#include "include/types.h"
-#include "include/atomic.h"
#include "include/rados/librados.hpp"
#include "common/Mutex.h"
#include "common/Cond.h"
@@ -19,7 +18,8 @@
#include "rgw_multi.h"
#include "cls/rgw/cls_rgw_types.h"
-using namespace std;
+#include <atomic>
+
#define HASH_PRIME 7877
#define MAX_ID_LEN 255
static string lc_oid_prefix = "lc";
@@ -227,7 +227,7 @@ class RGWLC {
RGWRados *store;
int max_objs;
string *obj_names;
- atomic_t down_flag;
+ std::atomic<bool> down_flag = { false };
string cookie;
class LCWorker : public Thread {
@@ -261,8 +261,7 @@ class RGWLC {
int list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map);
int bucket_lc_prepare(int index);
int bucket_lc_process(string& shard_id);
- int bucket_lc_post(int index, int max_lock_sec, cls_rgw_lc_obj_head& head,
- pair<string, int >& entry, int& result);
+ int bucket_lc_post(int index, int max_lock_sec, pair<string, int >& entry, int& result);
bool going_down();
void start_processor();
void stop_processor();
diff --git a/src/rgw/rgw_lc_s3.h b/src/rgw/rgw_lc_s3.h
index c969ac979c9..52f6987e487 100644
--- a/src/rgw/rgw_lc_s3.h
+++ b/src/rgw/rgw_lc_s3.h
@@ -6,16 +6,10 @@
#include <iostream>
#include <include/types.h>
-#include <expat.h>
-
#include "include/str_list.h"
#include "rgw_lc.h"
#include "rgw_xml.h"
-
-
-using namespace std;
-
class LCID_S3 : public XMLObj
{
public:
diff --git a/src/rgw/rgw_ldap.cc b/src/rgw/rgw_ldap.cc
index d07b295153e..a39afa6c081 100644
--- a/src/rgw/rgw_ldap.cc
+++ b/src/rgw/rgw_ldap.cc
@@ -94,6 +94,7 @@ namespace rgw {
if (ret != LDAP_SUCCESS) {
ldout(g_ceph_context, 10)
<< __func__ << " simple_bind failed uid=" << uid
+            << " ldap err=" << ret
<< dendl;
}
ldap_memfree(dn);
diff --git a/src/rgw/rgw_loadgen_process.cc b/src/rgw/rgw_loadgen_process.cc
index 7f003facbb7..23e6fefca73 100644
--- a/src/rgw/rgw_loadgen_process.cc
+++ b/src/rgw/rgw_loadgen_process.cc
@@ -13,6 +13,8 @@
#include "rgw_loadgen.h"
#include "rgw_client_io.h"
+#include <atomic>
+
#define dout_subsys ceph_subsys_rgw
extern void signal_shutdown();
@@ -37,7 +39,7 @@ void RGWLoadGenProcess::run()
vector<string> buckets(num_buckets);
- atomic_t failed;
+ std::atomic<long int> failed = { 0 };
for (i = 0; i < num_buckets; i++) {
buckets[i] = "/loadgen";
@@ -51,7 +53,7 @@ void RGWLoadGenProcess::run()
string *objs = new string[num_objs];
- if (failed.read()) {
+ if (failed) {
derr << "ERROR: bucket creation failed" << dendl;
goto done;
}
@@ -69,7 +71,7 @@ void RGWLoadGenProcess::run()
checkpoint();
- if (failed.read()) {
+ if (failed) {
derr << "ERROR: bucket creation failed" << dendl;
goto done;
}
@@ -102,7 +104,7 @@ done:
void RGWLoadGenProcess::gen_request(const string& method,
const string& resource,
- int content_length, atomic_t* fail_flag)
+ int content_length, std::atomic<long int>* fail_flag)
{
RGWLoadGenRequest* req =
new RGWLoadGenRequest(store->get_new_req_id(), method, resource,
@@ -138,7 +140,7 @@ void RGWLoadGenProcess::handle_request(RGWRequest* r)
dout(20) << "process_request() returned " << ret << dendl;
if (req->fail_flag) {
- req->fail_flag->inc();
+    (*req->fail_flag)++; // bump the atomic counter, not the pointer
}
}
diff --git a/src/rgw/rgw_log.cc b/src/rgw/rgw_log.cc
index 153ddb060d7..51dbd1bbe93 100644
--- a/src/rgw/rgw_log.cc
+++ b/src/rgw/rgw_log.cc
@@ -218,7 +218,7 @@ static void log_usage(struct req_state *s, const string& op_name)
rgw_usage_data data(bytes_sent, bytes_received);
data.ops = 1;
- if (!error)
+ if (!s->is_err())
data.successful_ops = 1;
entry.add(op_name, data);
diff --git a/src/rgw/rgw_log.h b/src/rgw/rgw_log.h
index 5e612b7ca22..25e99e096de 100644
--- a/src/rgw/rgw_log.h
+++ b/src/rgw/rgw_log.h
@@ -3,7 +3,6 @@
#ifndef CEPH_RGW_LOG_H
#define CEPH_RGW_LOG_H
-
#include <boost/container/flat_map.hpp>
#include "rgw_common.h"
#include "include/utime.h"
diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc
index 1bec56553a6..01ef5556c22 100644
--- a/src/rgw/rgw_main.cc
+++ b/src/rgw/rgw_main.cc
@@ -53,13 +53,14 @@
#include "rgw_request.h"
#include "rgw_process.h"
#include "rgw_frontend.h"
-#if defined(WITH_RADOSGW_ASIO_FRONTEND)
+#if defined(WITH_RADOSGW_BEAST_FRONTEND)
#include "rgw_asio_frontend.h"
-#endif /* WITH_RADOSGW_ASIO_FRONTEND */
+#endif /* WITH_RADOSGW_BEAST_FRONTEND */
#include <map>
#include <string>
#include <vector>
+#include <atomic>
#include "include/types.h"
#include "common/BackTrace.h"
@@ -77,11 +78,11 @@ static sig_t sighandler_alrm;
class RGWProcess;
static int signal_fd[2] = {0, 0};
-static atomic_t disable_signal_fd;
+static std::atomic<int64_t> disable_signal_fd = { 0 };
void signal_shutdown()
{
- if (!disable_signal_fd.read()) {
+ if (!disable_signal_fd) {
int val = 0;
int ret = write(signal_fd[0], (char *)&val, sizeof(val));
if (ret < 0) {
@@ -470,8 +471,6 @@ int main(int argc, const char **argv)
RGWFrontend *fe = NULL;
if (framework == "civetweb" || framework == "mongoose") {
- int port;
- config->get_val("port", 80, &port);
std::string uri_prefix;
config->get_val("prefix", "", &uri_prefix);
@@ -489,9 +488,9 @@ int main(int argc, const char **argv)
fe = new RGWLoadGenFrontend(env, config);
}
-#if defined(WITH_RADOSGW_ASIO_FRONTEND)
- else if ((framework == "asio") &&
- cct->check_experimental_feature_enabled("rgw-asio-frontend")) {
+#if defined(WITH_RADOSGW_BEAST_FRONTEND)
+ else if ((framework == "beast") &&
+ cct->check_experimental_feature_enabled("rgw-beast-frontend")) {
int port;
config->get_val("port", 80, &port);
std::string uri_prefix;
@@ -499,7 +498,7 @@ int main(int argc, const char **argv)
RGWProcessEnv env{ store, &rest, olog, port, uri_prefix, auth_registry };
fe = new RGWAsioFrontend(env);
}
-#endif /* WITH_RADOSGW_ASIO_FRONTEND */
+#endif /* WITH_RADOSGW_BEAST_FRONTEND */
#if defined(WITH_RADOSGW_FCGI_FRONTEND)
else if (framework == "fastcgi" || framework == "fcgi") {
std::string uri_prefix;
diff --git a/src/rgw/rgw_meta_sync_status.h b/src/rgw/rgw_meta_sync_status.h
index e913e8ffb21..e34bb05be1d 100644
--- a/src/rgw/rgw_meta_sync_status.h
+++ b/src/rgw/rgw_meta_sync_status.h
@@ -55,28 +55,33 @@ struct rgw_meta_sync_marker {
uint64_t total_entries;
uint64_t pos;
real_time timestamp;
+  epoch_t realm_epoch{0}; ///< realm_epoch of period marker
rgw_meta_sync_marker() : state(FullSync), total_entries(0), pos(0) {}
void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(state, bl);
::encode(marker, bl);
::encode(next_step_marker, bl);
::encode(total_entries, bl);
::encode(pos, bl);
::encode(timestamp, bl);
+ ::encode(realm_epoch, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
- DECODE_START(1, bl);
+ DECODE_START(2, bl);
::decode(state, bl);
::decode(marker, bl);
::decode(next_step_marker, bl);
::decode(total_entries, bl);
::decode(pos, bl);
::decode(timestamp, bl);
+ if (struct_v >= 2) {
+ ::decode(realm_epoch, bl);
+ }
DECODE_FINISH(bl);
}
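+  // Encoding note for the change above: ENCODE_START(2, 1, bl) bumps the
+  // struct version to 2 for realm_epoch while keeping compat at 1, so old
+  // decoders still parse the common prefix and new decoders only read
+  // realm_epoch when struct_v >= 2.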
diff --git a/src/rgw/rgw_metadata.cc b/src/rgw/rgw_metadata.cc
index e26201bc3be..0a0f3dec324 100644
--- a/src/rgw/rgw_metadata.cc
+++ b/src/rgw/rgw_metadata.cc
@@ -313,28 +313,6 @@ public:
static RGWMetadataTopHandler md_top_handler;
-static const std::string mdlog_history_oid = "meta.history";
-
-struct RGWMetadataLogHistory {
- epoch_t oldest_realm_epoch;
- std::string oldest_period_id;
-
- void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
- ::encode(oldest_realm_epoch, bl);
- ::encode(oldest_period_id, bl);
- ENCODE_FINISH(bl);
- }
- void decode(bufferlist::iterator& p) {
- DECODE_START(1, p);
- ::decode(oldest_realm_epoch, p);
- ::decode(oldest_period_id, p);
- DECODE_FINISH(p);
- }
-};
-WRITE_CLASS_ENCODER(RGWMetadataLogHistory)
-
-
RGWMetadataManager::RGWMetadataManager(CephContext *_cct, RGWRados *_store)
: cct(_cct), store(_store)
{
@@ -351,15 +329,18 @@ RGWMetadataManager::~RGWMetadataManager()
handlers.clear();
}
+const std::string RGWMetadataLogHistory::oid = "meta.history";
+
namespace {
-int read_history(RGWRados *store, RGWMetadataLogHistory *state)
+int read_history(RGWRados *store, RGWMetadataLogHistory *state,
+ RGWObjVersionTracker *objv_tracker)
{
RGWObjectCtx ctx{store};
auto& pool = store->get_zone_params().log_pool;
- const auto& oid = mdlog_history_oid;
+ const auto& oid = RGWMetadataLogHistory::oid;
bufferlist bl;
- int ret = rgw_get_system_obj(store, ctx, pool, oid, bl, nullptr, nullptr);
+ int ret = rgw_get_system_obj(store, ctx, pool, oid, bl, objv_tracker, nullptr);
if (ret < 0) {
return ret;
}
@@ -375,19 +356,141 @@ int read_history(RGWRados *store, RGWMetadataLogHistory *state)
}
int write_history(RGWRados *store, const RGWMetadataLogHistory& state,
- bool exclusive = false)
+ RGWObjVersionTracker *objv_tracker, bool exclusive = false)
{
bufferlist bl;
state.encode(bl);
auto& pool = store->get_zone_params().log_pool;
- const auto& oid = mdlog_history_oid;
+ const auto& oid = RGWMetadataLogHistory::oid;
return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
- exclusive, nullptr, real_time{});
+ exclusive, objv_tracker, real_time{});
}
using Cursor = RGWPeriodHistory::Cursor;
+/// read the mdlog history and use it to initialize the given cursor
+class ReadHistoryCR : public RGWCoroutine {
+ RGWRados *store;
+ Cursor *cursor;
+ RGWObjVersionTracker *objv_tracker;
+ RGWMetadataLogHistory state;
+ public:
+ ReadHistoryCR(RGWRados *store, Cursor *cursor,
+ RGWObjVersionTracker *objv_tracker)
+ : RGWCoroutine(store->ctx()), store(store), cursor(cursor),
+ objv_tracker(objv_tracker)
+ {}
+
+ int operate() {
+ reenter(this) {
+ yield {
+ rgw_raw_obj obj{store->get_zone_params().log_pool,
+ RGWMetadataLogHistory::oid};
+ constexpr bool empty_on_enoent = false;
+
+ using ReadCR = RGWSimpleRadosReadCR<RGWMetadataLogHistory>;
+ call(new ReadCR(store->get_async_rados(), store, obj,
+ &state, empty_on_enoent, objv_tracker));
+ }
+ if (retcode < 0) {
+ ldout(cct, 1) << "failed to read mdlog history: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ *cursor = store->period_history->lookup(state.oldest_realm_epoch);
+ if (!*cursor) {
+ return set_cr_error(cursor->get_error());
+ }
+
+ ldout(cct, 10) << "read mdlog history with oldest period id="
+ << state.oldest_period_id << " realm_epoch="
+ << state.oldest_realm_epoch << dendl;
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
+/// write the given cursor to the mdlog history
+class WriteHistoryCR : public RGWCoroutine {
+ RGWRados *store;
+ Cursor cursor;
+ RGWObjVersionTracker *objv;
+ RGWMetadataLogHistory state;
+ public:
+ WriteHistoryCR(RGWRados *store, const Cursor& cursor,
+ RGWObjVersionTracker *objv)
+ : RGWCoroutine(store->ctx()), store(store), cursor(cursor), objv(objv)
+ {}
+
+ int operate() {
+ reenter(this) {
+ state.oldest_period_id = cursor.get_period().get_id();
+ state.oldest_realm_epoch = cursor.get_epoch();
+
+ yield {
+ rgw_raw_obj obj{store->get_zone_params().log_pool,
+ RGWMetadataLogHistory::oid};
+
+ using WriteCR = RGWSimpleRadosWriteCR<RGWMetadataLogHistory>;
+ call(new WriteCR(store->get_async_rados(), store, obj, state, objv));
+ }
+ if (retcode < 0) {
+ ldout(cct, 1) << "failed to write mdlog history: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+
+ ldout(cct, 10) << "wrote mdlog history with oldest period id="
+ << state.oldest_period_id << " realm_epoch="
+ << state.oldest_realm_epoch << dendl;
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
+/// update the mdlog history to reflect trimmed logs
+class TrimHistoryCR : public RGWCoroutine {
+ RGWRados *store;
+  const Cursor cursor; ///< cursor to trimmed period
+  RGWObjVersionTracker *objv; ///< to prevent racing updates
+  Cursor next; ///< target cursor for oldest log period
+  Cursor existing; ///< existing cursor read from disk
+
+ public:
+ TrimHistoryCR(RGWRados *store, Cursor cursor, RGWObjVersionTracker *objv)
+ : RGWCoroutine(store->ctx()),
+ store(store), cursor(cursor), objv(objv), next(cursor)
+ {
+ next.next(); // advance past cursor
+ }
+
+ int operate() {
+ reenter(this) {
+ // read an existing history, and write the new history if it's newer
+ yield call(new ReadHistoryCR(store, &existing, objv));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ // reject older trims with ECANCELED
+ if (cursor.get_epoch() < existing.get_epoch()) {
+ ldout(cct, 4) << "found oldest log epoch=" << existing.get_epoch()
+ << ", rejecting trim at epoch=" << cursor.get_epoch() << dendl;
+ return set_cr_error(-ECANCELED);
+ }
+ // overwrite with updated history
+ yield call(new WriteHistoryCR(store, next, objv));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
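+// A note on atomicity (assumed RADOS semantics; the callers are not shown
+// here): the same RGWObjVersionTracker flows through ReadHistoryCR and
+// WriteHistoryCR, so a trimmer that loses a race fails the version check on
+// write rather than clobbering a newer history.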
+
// traverse all the way back to the beginning of the period history, and
// return a cursor to the first period in a fully attached history
Cursor find_oldest_period(RGWRados *store)
@@ -431,7 +534,8 @@ Cursor RGWMetadataManager::init_oldest_log_period()
{
// read the mdlog history
RGWMetadataLogHistory state;
- int ret = read_history(store, &state);
+ RGWObjVersionTracker objv;
+ int ret = read_history(store, &state, &objv);
if (ret == -ENOENT) {
// initialize the mdlog history and write it
@@ -446,7 +550,7 @@ Cursor RGWMetadataManager::init_oldest_log_period()
state.oldest_period_id = cursor.get_period().get_id();
constexpr bool exclusive = true; // don't overwrite
- int ret = write_history(store, state, exclusive);
+ int ret = write_history(store, state, &objv, exclusive);
if (ret < 0 && ret != -EEXIST) {
ldout(cct, 1) << "failed to write mdlog history: "
<< cpp_strerror(ret) << dendl;
@@ -486,7 +590,7 @@ Cursor RGWMetadataManager::init_oldest_log_period()
Cursor RGWMetadataManager::read_oldest_log_period() const
{
RGWMetadataLogHistory state;
- int ret = read_history(store, &state);
+ int ret = read_history(store, &state, nullptr);
if (ret < 0) {
ldout(store->ctx(), 1) << "failed to read mdlog history: "
<< cpp_strerror(ret) << dendl;
@@ -500,6 +604,18 @@ Cursor RGWMetadataManager::read_oldest_log_period() const
return store->period_history->lookup(state.oldest_realm_epoch);
}
+RGWCoroutine* RGWMetadataManager::read_oldest_log_period_cr(Cursor *period,
+ RGWObjVersionTracker *objv) const
+{
+ return new ReadHistoryCR(store, period, objv);
+}
+
+RGWCoroutine* RGWMetadataManager::trim_log_period_cr(Cursor period,
+ RGWObjVersionTracker *objv) const
+{
+ return new TrimHistoryCR(store, period, objv);
+}
+
int RGWMetadataManager::init(const std::string& current_period)
{
// open a log for the current period
diff --git a/src/rgw/rgw_metadata.h b/src/rgw/rgw_metadata.h
index 8b7526399a8..4d077e8f888 100644
--- a/src/rgw/rgw_metadata.h
+++ b/src/rgw/rgw_metadata.h
@@ -18,6 +18,7 @@
class RGWRados;
+class RGWCoroutine;
class JSONObj;
struct RGWObjVersionTracker;
@@ -265,6 +266,27 @@ struct RGWMetadataLogData {
};
WRITE_CLASS_ENCODER(RGWMetadataLogData)
+struct RGWMetadataLogHistory {
+ epoch_t oldest_realm_epoch;
+ std::string oldest_period_id;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ ::encode(oldest_realm_epoch, bl);
+ ::encode(oldest_period_id, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::iterator& p) {
+ DECODE_START(1, p);
+ ::decode(oldest_realm_epoch, p);
+ ::decode(oldest_period_id, p);
+ DECODE_FINISH(p);
+ }
+
+ static const std::string oid;
+};
+WRITE_CLASS_ENCODER(RGWMetadataLogHistory)
+
class RGWMetadataManager {
map<string, RGWMetadataHandler *> handlers;
CephContext *cct;
@@ -303,6 +325,16 @@ public:
/// period history
RGWPeriodHistory::Cursor read_oldest_log_period() const;
+ /// read the oldest log period asynchronously and write its result to the
+ /// given cursor pointer
+ RGWCoroutine* read_oldest_log_period_cr(RGWPeriodHistory::Cursor *period,
+ RGWObjVersionTracker *objv) const;
+
+ /// try to advance the oldest log period when the given period is trimmed,
+ /// using a rados lock to provide atomicity
+ RGWCoroutine* trim_log_period_cr(RGWPeriodHistory::Cursor period,
+ RGWObjVersionTracker *objv) const;
+
/// find or create the metadata log for the given period
RGWMetadataLog* get_log(const std::string& period);
diff --git a/src/rgw/rgw_object_expirer_core.cc b/src/rgw/rgw_object_expirer_core.cc
index 3bc033f0a68..fb0444186c9 100644
--- a/src/rgw/rgw_object_expirer_core.cc
+++ b/src/rgw/rgw_object_expirer_core.cc
@@ -230,7 +230,7 @@ bool RGWObjectExpirer::inspect_all_shards(const utime_t& last_run,
bool RGWObjectExpirer::going_down()
{
- return (down_flag.read() != 0);
+ return down_flag;
}
void RGWObjectExpirer::start_processor()
@@ -241,7 +241,7 @@ void RGWObjectExpirer::start_processor()
void RGWObjectExpirer::stop_processor()
{
- down_flag.set(1);
+ down_flag = true;
if (worker) {
worker->stop();
worker->join();
diff --git a/src/rgw/rgw_object_expirer_core.h b/src/rgw/rgw_object_expirer_core.h
index 83f0dd30754..6fe8d1410bf 100644
--- a/src/rgw/rgw_object_expirer_core.h
+++ b/src/rgw/rgw_object_expirer_core.h
@@ -8,6 +8,7 @@
#include <iostream>
#include <sstream>
#include <string>
+#include <atomic>
#include "auth/Crypto.h"
@@ -37,6 +38,8 @@
#include "rgw_usage.h"
#include "rgw_replica_log.h"
class RGWObjectExpirer {
protected:
RGWRados *store;
@@ -65,7 +68,7 @@ protected:
};
OEWorker *worker;
- atomic_t down_flag;
+ std::atomic<bool> down_flag = { false };
public:
explicit RGWObjectExpirer(RGWRados *_store)
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 465f1e82b2d..6ddd6748a1b 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -3,12 +3,15 @@
#include <errno.h>
#include <stdlib.h>
+#include <system_error>
#include <unistd.h>
#include <sstream>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/optional.hpp>
+#include <boost/utility/in_place_factory.hpp>
+#include <boost/bind.hpp>
#include "common/Clock.h"
#include "common/armor.h"
@@ -43,6 +46,7 @@
#include "compressor/Compressor.h"
+#include "rgw_acl_swift.h"
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_rgw
@@ -50,12 +54,21 @@
using namespace std;
using namespace librados;
using ceph::crypto::MD5;
+using boost::optional;
+using boost::none;
+
+using rgw::IAM::ARN;
+using rgw::IAM::Effect;
+using rgw::IAM::Policy;
static string mp_ns = RGW_OBJ_NS_MULTIPART;
static string shadow_ns = RGW_OBJ_NS_SHADOW;
-static int forward_request_to_master(struct req_state *s, obj_version *objv, RGWRados *store, bufferlist& in_data, JSONParser *jp);
+static void forward_req_info(CephContext *cct, req_info& info, const std::string& bucket_name);
+static int forward_request_to_master(struct req_state *s, obj_version *objv, RGWRados *store,
+ bufferlist& in_data, JSONParser *jp, req_info *forward_info = nullptr);
static MultipartMetaFilter mp_filter;
@@ -137,9 +150,9 @@ static int decode_policy(CephContext *cct,
static int get_user_policy_from_attr(CephContext * const cct,
- RGWRados * const store,
- map<string, bufferlist>& attrs,
- RGWAccessControlPolicy& policy /* out */)
+ RGWRados * const store,
+ map<string, bufferlist>& attrs,
+ RGWAccessControlPolicy& policy /* out */)
{
auto aiter = attrs.find(RGW_ATTR_ACL);
if (aiter != attrs.end()) {
@@ -155,11 +168,11 @@ static int get_user_policy_from_attr(CephContext * const cct,
}
static int get_bucket_instance_policy_from_attr(CephContext *cct,
- RGWRados *store,
- RGWBucketInfo& bucket_info,
- map<string, bufferlist>& bucket_attrs,
- RGWAccessControlPolicy *policy,
- rgw_raw_obj& obj)
+ RGWRados *store,
+ RGWBucketInfo& bucket_info,
+ map<string, bufferlist>& bucket_attrs,
+ RGWAccessControlPolicy *policy,
+ rgw_raw_obj& obj)
{
map<string, bufferlist>::iterator aiter = bucket_attrs.find(RGW_ATTR_ACL);
@@ -181,12 +194,12 @@ static int get_bucket_instance_policy_from_attr(CephContext *cct,
}
static int get_obj_policy_from_attr(CephContext *cct,
- RGWRados *store,
- RGWObjectCtx& obj_ctx,
- RGWBucketInfo& bucket_info,
- map<string, bufferlist>& bucket_attrs,
- RGWAccessControlPolicy *policy,
- rgw_obj& obj)
+ RGWRados *store,
+ RGWObjectCtx& obj_ctx,
+ RGWBucketInfo& bucket_info,
+ map<string, bufferlist>& bucket_attrs,
+ RGWAccessControlPolicy *policy,
+ rgw_obj& obj)
{
bufferlist bl;
int ret = 0;
@@ -221,15 +234,27 @@ static int get_obj_policy_from_attr(CephContext *cct,
* Returns: 0 on success, -ERR# otherwise.
*/
static int get_bucket_policy_from_attr(CephContext *cct,
- RGWRados *store,
- RGWBucketInfo& bucket_info,
- map<string, bufferlist>& bucket_attrs,
- RGWAccessControlPolicy *policy)
+ RGWRados *store,
+ RGWBucketInfo& bucket_info,
+ map<string, bufferlist>& bucket_attrs,
+ RGWAccessControlPolicy *policy)
{
rgw_raw_obj instance_obj;
store->get_bucket_instance_obj(bucket_info.bucket, instance_obj);
return get_bucket_instance_policy_from_attr(cct, store, bucket_info, bucket_attrs,
- policy, instance_obj);
+ policy, instance_obj);
+}
+
+static optional<Policy> get_iam_policy_from_attr(CephContext* cct,
+ RGWRados* store,
+ map<string, bufferlist>& attrs,
+ const string& tenant) {
+ auto i = attrs.find(RGW_ATTR_IAM_POLICY);
+ if (i != attrs.end()) {
+ return Policy(cct, tenant, i->second);
+ } else {
+ return none;
+ }
}
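+// Note that the Policy constructor parses the stored document and can throw
+// on malformed input; rgw_build_bucket_policies() below wraps this call in a
+// try/catch and maps failures to -EACCES for that reason.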
static int get_obj_attrs(RGWRados *store, struct req_state *s, rgw_obj& obj, map<string, bufferlist>& attrs)
@@ -238,7 +263,6 @@ static int get_obj_attrs(RGWRados *store, struct req_state *s, rgw_obj& obj, map
RGWRados::Object::Read read_op(&op_target);
read_op.params.attrs = &attrs;
- read_op.params.perr = &s->err;
return read_op.prepare();
}
@@ -250,7 +274,6 @@ static int modify_obj_attr(RGWRados *store, struct req_state *s, rgw_obj& obj, c
RGWRados::Object::Read read_op(&op_target);
read_op.params.attrs = &attrs;
- read_op.params.perr = &s->err;
int r = read_op.prepare();
if (r < 0) {
@@ -302,7 +325,8 @@ static int read_obj_policy(RGWRados *store,
struct req_state *s,
RGWBucketInfo& bucket_info,
map<string, bufferlist>& bucket_attrs,
- RGWAccessControlPolicy *policy,
+ RGWAccessControlPolicy* acl,
+ optional<Policy>& policy,
rgw_bucket& bucket,
rgw_obj_key& object)
{
@@ -324,9 +348,11 @@ static int read_obj_policy(RGWRados *store,
} else {
obj = rgw_obj(bucket, object);
}
+ policy = get_iam_policy_from_attr(s->cct, store, bucket_attrs, bucket.tenant);
+
RGWObjectCtx *obj_ctx = static_cast<RGWObjectCtx *>(s->obj_ctx);
int ret = get_obj_policy_from_attr(s->cct, store, *obj_ctx,
- bucket_info, bucket_attrs, policy, obj);
+ bucket_info, bucket_attrs, acl, obj);
if (ret == -ENOENT) {
/* object does not exist checking the bucket's ACL to make sure
that we send a proper error code */
@@ -361,7 +387,7 @@ int rgw_build_bucket_policies(RGWRados* store, struct req_state* s)
int ret = 0;
rgw_obj_key obj;
RGWUserInfo bucket_owner_info;
- RGWObjectCtx& obj_ctx = *static_cast<RGWObjectCtx *>(s->obj_ctx);
+ RGWObjectCtx obj_ctx(store);
string bi = s->info.args.get(RGW_SYS_PARAM_PREFIX "bucket-instance");
if (!bi.empty()) {
@@ -374,8 +400,13 @@ int rgw_build_bucket_policies(RGWRados* store, struct req_state* s)
if(s->dialect.compare("s3") == 0) {
s->bucket_acl = new RGWAccessControlPolicy_S3(s->cct);
} else if(s->dialect.compare("swift") == 0) {
- s->user_acl = std::unique_ptr<RGWAccessControlPolicy>(
- new RGWAccessControlPolicy_SWIFTAcct(s->cct));
+ /* We aren't allocating the account policy for those operations using
+ * the Swift's infrastructure that don't really need req_state::user.
+ * Typical example here is the implementation of /info. */
+ if (!s->user->user_id.empty()) {
+ s->user_acl = std::unique_ptr<RGWAccessControlPolicy>(
+ new RGWAccessControlPolicy_SWIFTAcct(s->cct));
+ }
s->bucket_acl = new RGWAccessControlPolicy_SWIFT(s->cct);
} else {
s->bucket_acl = new RGWAccessControlPolicy(s->cct);
@@ -497,6 +528,16 @@ int rgw_build_bucket_policies(RGWRados* store, struct req_state* s)
}
}
+ try {
+ s->iam_policy = get_iam_policy_from_attr(s->cct, store, s->bucket_attrs,
+ s->bucket_tenant);
+ } catch (const std::exception& e) {
+ // Really this is a can't happen condition. We parse the policy
+ // when it's given to us, so perhaps we should abort or otherwise
+ // raise bloody murder.
+ lderr(s->cct) << "Error reading IAM Policy: " << e.what() << dendl;
+ ret = -EACCES;
+ }
return ret;
}
@@ -524,12 +565,72 @@ int rgw_build_object_policies(RGWRados *store, struct req_state *s,
if (prefetch_data) {
store->set_prefetch_data(s->obj_ctx, obj);
}
- ret = read_obj_policy(store, s, s->bucket_info, s->bucket_attrs, s->object_acl, s->bucket, s->object);
+ ret = read_obj_policy(store, s, s->bucket_info, s->bucket_attrs, s->object_acl, s->iam_policy, s->bucket, s->object);
}
return ret;
}
+rgw::IAM::Environment rgw_build_iam_environment(RGWRados* store,
+ struct req_state* s)
+{
+ rgw::IAM::Environment e;
+ const auto& m = s->info.env->get_map();
+ auto t = ceph::real_clock::now();
+  e.emplace(std::piecewise_construct,
+            std::forward_as_tuple("aws:CurrentTime"),
+            std::forward_as_tuple(ceph::to_iso_8601(t)));
+  e.emplace(std::piecewise_construct,
+            std::forward_as_tuple("aws:EpochTime"),
+            std::forward_as_tuple(std::to_string(
+                                    ceph::real_clock::to_time_t(t))));
+  // TODO: This is fine for now, but once we have STS we'll need to revisit
+  // this. It also won't work with the IdentityApplier model, since we need
+  // to know the actual credential.
+ e.emplace(std::piecewise_construct,
+ std::forward_as_tuple("aws:PrincipalType"),
+ std::forward_as_tuple("User"));
+
+ auto i = m.find("HTTP_REFERER");
+ if (i != m.end()) {
+ e.emplace(std::piecewise_construct,
+ std::forward_as_tuple("aws:Referer"),
+ std::forward_as_tuple(i->second));
+ }
+
+  // These seem to be the semantics, judging from rgw_rest_s3.cc
+ i = m.find("SERVER_PORT_SECURE");
+ if (i != m.end()) {
+ e.emplace(std::piecewise_construct,
+ std::forward_as_tuple("aws:SecureTransport"),
+ std::forward_as_tuple("true"));
+ }
+
+  // aws:SourceIp wants the client address; REMOTE_ADDR is the CGI-style key
+  // the frontends populate (HTTP_HOST is the Host header, not an IP).
+  i = m.find("REMOTE_ADDR");
+  if (i != m.end()) {
+    e.emplace(std::piecewise_construct,
+              std::forward_as_tuple("aws:SourceIp"),
+              std::forward_as_tuple(i->second));
+  }
+
+  i = m.find("HTTP_USER_AGENT");
+  if (i != m.end()) {
+    e.emplace(std::piecewise_construct,
+              std::forward_as_tuple("aws:UserAgent"),
+              std::forward_as_tuple(i->second));
+  }
+
+ if (s->user) {
+    // What to do about aws:userid? A user can have multiple access
+    // keys, so those aren't really suitable. Do we have a durable
+    // identifier that persists through name changes?
+ e.emplace(std::piecewise_construct,
+ std::forward_as_tuple("aws:username"),
+ std::forward_as_tuple(s->user->user_id.id));
+ }
+ return e;
+}
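+// Usage sketch (assumed wiring; the caller sits outside this hunk): the
+// result becomes the s->env consulted by the eval() calls in the
+// verify_permission() methods below, e.g.
+//   s->env = rgw_build_iam_environment(store, s);
+//   auto e = s->iam_policy->eval(s->env, *s->auth.identity,
+//                                rgw::IAM::s3GetObject, ARN(obj));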
+
static void rgw_bucket_object_pre_exec(struct req_state *s)
{
if (s->expect_cont)
@@ -546,7 +647,21 @@ int RGWGetObj::verify_permission()
store->set_prefetch_data(s->obj_ctx, obj);
}
- if (!verify_object_permission(s, RGW_PERM_READ)) {
+ if (torrent.get_flag()) {
+ if (obj.key.instance.empty()) {
+ action = rgw::IAM::s3GetObjectTorrent;
+ } else {
+ action = rgw::IAM::s3GetObjectVersionTorrent;
+ }
+ } else {
+ if (obj.key.instance.empty()) {
+ action = rgw::IAM::s3GetObject;
+ } else {
+ action = rgw::IAM::s3GetObjectVersion;
+ }
+ }
+
+ if (!verify_object_permission(s, action)) {
return -EACCES;
}
@@ -786,7 +901,8 @@ bool RGWOp::generate_cors_headers(string& origin, string& method, string& header
int RGWGetObj::read_user_manifest_part(rgw_bucket& bucket,
const rgw_bucket_dir_entry& ent,
- RGWAccessControlPolicy * const bucket_policy,
+ RGWAccessControlPolicy * const bucket_acl,
+ const optional<Policy>& bucket_policy,
const off_t start_ofs,
const off_t end_ofs)
{
@@ -817,7 +933,6 @@ int RGWGetObj::read_user_manifest_part(rgw_bucket& bucket,
read_op.conds.if_match = ent.meta.etag.c_str();
read_op.params.attrs = &attrs;
read_op.params.obj_size = &obj_size;
- read_op.params.perr = &s->err;
op_ret = read_op.prepare();
if (op_ret < 0)
@@ -862,8 +977,8 @@ int RGWGetObj::read_user_manifest_part(rgw_bucket& bucket,
ldout(s->cct, 2) << "overriding permissions due to system operation" << dendl;
} else if (s->auth.identity->is_admin_of(s->user->user_id)) {
ldout(s->cct, 2) << "overriding permissions due to admin operation" << dendl;
- } else if (!verify_object_permission(s, s->user_acl.get(), bucket_policy,
- &obj_policy, RGW_PERM_READ)) {
+ } else if (!verify_object_permission(s, part, s->user_acl.get(), bucket_acl,
+ &obj_policy, bucket_policy, action)) {
return -EPERM;
}
@@ -885,13 +1000,15 @@ static int iterate_user_manifest_parts(CephContext * const cct,
const off_t end,
RGWBucketInfo *pbucket_info,
const string& obj_prefix,
- RGWAccessControlPolicy * const bucket_policy,
+ RGWAccessControlPolicy * const bucket_acl,
+ const optional<Policy>& bucket_policy,
uint64_t * const ptotal_len,
uint64_t * const pobj_size,
string * const pobj_sum,
int (*cb)(rgw_bucket& bucket,
const rgw_bucket_dir_entry& ent,
- RGWAccessControlPolicy * const bucket_policy,
+ RGWAccessControlPolicy * const bucket_acl,
+ const optional<Policy>& bucket_policy,
off_t start_ofs,
off_t end_ofs,
void *param),
@@ -947,7 +1064,7 @@ static int iterate_user_manifest_parts(CephContext * const cct,
len_count += end_ofs - start_ofs;
if (cb) {
- r = cb(bucket, ent, bucket_policy, start_ofs, end_ofs, cb_param);
+ r = cb(bucket, ent, bucket_acl, bucket_policy, start_ofs, end_ofs, cb_param);
if (r < 0) {
return r;
}
@@ -973,13 +1090,12 @@ static int iterate_user_manifest_parts(CephContext * const cct,
}
struct rgw_slo_part {
- RGWAccessControlPolicy *bucket_policy;
+ RGWAccessControlPolicy *bucket_acl = nullptr;
+ Policy* bucket_policy = nullptr;
rgw_bucket bucket;
string obj_name;
- uint64_t size;
+ uint64_t size = 0;
string etag;
-
- rgw_slo_part() : bucket_policy(NULL), size(0) {}
};
static int iterate_slo_parts(CephContext *cct,
@@ -989,7 +1105,8 @@ static int iterate_slo_parts(CephContext *cct,
map<uint64_t, rgw_slo_part>& slo_parts,
int (*cb)(rgw_bucket& bucket,
const rgw_bucket_dir_entry& ent,
- RGWAccessControlPolicy *bucket_policy,
+ RGWAccessControlPolicy *bucket_acl,
+ const optional<Policy>& bucket_policy,
off_t start_ofs,
off_t end_ofs,
void *param),
@@ -1038,8 +1155,12 @@ static int iterate_slo_parts(CephContext *cct,
if (found_start) {
if (cb) {
- int r = cb(part.bucket, ent, part.bucket_policy, start_ofs, end_ofs, cb_param);
- if (r < 0)
+ // SLO is a Swift thing, and Swift has no knowledge of S3 Policies.
+ int r = cb(part.bucket, ent, part.bucket_acl,
+ (part.bucket_policy ?
+ optional<Policy>(*part.bucket_policy) : none),
+ start_ofs, end_ofs, cb_param);
+ if (r < 0)
return r;
}
}
@@ -1052,13 +1173,14 @@ static int iterate_slo_parts(CephContext *cct,
static int get_obj_user_manifest_iterate_cb(rgw_bucket& bucket,
const rgw_bucket_dir_entry& ent,
- RGWAccessControlPolicy * const bucket_policy,
+ RGWAccessControlPolicy * const bucket_acl,
+ const optional<Policy>& bucket_policy,
const off_t start_ofs,
const off_t end_ofs,
void * const param)
{
RGWGetObj *op = static_cast<RGWGetObj *>(param);
- return op->read_user_manifest_part(bucket, ent, bucket_policy, start_ofs, end_ofs);
+ return op->read_user_manifest_part(bucket, ent, bucket_acl, bucket_policy, start_ofs, end_ofs);
}
int RGWGetObj::handle_user_manifest(const char *prefix)
@@ -1080,8 +1202,10 @@ int RGWGetObj::handle_user_manifest(const char *prefix)
rgw_bucket bucket;
- RGWAccessControlPolicy _bucket_policy(s->cct);
- RGWAccessControlPolicy *bucket_policy;
+ RGWAccessControlPolicy _bucket_acl(s->cct);
+ RGWAccessControlPolicy *bucket_acl;
+ optional<Policy> _bucket_policy;
+ optional<Policy>* bucket_policy;
RGWBucketInfo bucket_info;
RGWBucketInfo *pbucket_info;
@@ -1098,16 +1222,20 @@ int RGWGetObj::handle_user_manifest(const char *prefix)
}
bucket = bucket_info.bucket;
pbucket_info = &bucket_info;
- bucket_policy = &_bucket_policy;
- r = read_bucket_policy(store, s, bucket_info, bucket_attrs, bucket_policy, bucket);
+ bucket_acl = &_bucket_acl;
+ r = read_bucket_policy(store, s, bucket_info, bucket_attrs, bucket_acl, bucket);
if (r < 0) {
ldout(s->cct, 0) << "failed to read bucket policy" << dendl;
return r;
}
+ _bucket_policy = get_iam_policy_from_attr(s->cct, store, bucket_attrs,
+ bucket_info.bucket.tenant);
+ bucket_policy = &_bucket_policy;
} else {
bucket = s->bucket;
pbucket_info = &s->bucket_info;
- bucket_policy = s->bucket_acl;
+ bucket_acl = s->bucket_acl;
+ bucket_policy = &s->iam_policy;
}
/* dry run to find out:
@@ -1115,7 +1243,7 @@ int RGWGetObj::handle_user_manifest(const char *prefix)
* - overall DLO's content size,
* - md5 sum of overall DLO's content (for etag of Swift API). */
int r = iterate_user_manifest_parts(s->cct, store, ofs, end,
- pbucket_info, obj_prefix, bucket_policy,
+ pbucket_info, obj_prefix, bucket_acl, *bucket_policy,
&total_len, &s->obj_size, &lo_etag,
nullptr /* cb */, nullptr /* cb arg */);
if (r < 0) {
@@ -1129,7 +1257,7 @@ int RGWGetObj::handle_user_manifest(const char *prefix)
}
r = iterate_user_manifest_parts(s->cct, store, ofs, end,
- pbucket_info, obj_prefix, bucket_policy,
+ pbucket_info, obj_prefix, bucket_acl, *bucket_policy,
nullptr, nullptr, nullptr,
get_obj_user_manifest_iterate_cb, (void *)this);
if (r < 0) {
@@ -1156,8 +1284,8 @@ int RGWGetObj::handle_slo_manifest(bufferlist& bl)
}
ldout(s->cct, 2) << "RGWGetObj::handle_slo_manifest()" << dendl;
- list<RGWAccessControlPolicy> allocated_policies;
- map<string, RGWAccessControlPolicy *> policies;
+ vector<RGWAccessControlPolicy> allocated_acls;
+ map<string, pair<RGWAccessControlPolicy *, optional<Policy>>> policies;
map<string, rgw_bucket> buckets;
map<uint64_t, rgw_slo_part> slo_parts;
@@ -1189,16 +1317,18 @@ int RGWGetObj::handle_slo_manifest(bufferlist& bl)
string obj_name = path.substr(pos_sep + 1);
rgw_bucket bucket;
- RGWAccessControlPolicy *bucket_policy;
+ RGWAccessControlPolicy *bucket_acl;
+ Policy* bucket_policy;
if (bucket_name.compare(s->bucket.name) != 0) {
const auto& piter = policies.find(bucket_name);
if (piter != policies.end()) {
- bucket_policy = piter->second;
- bucket = buckets[bucket_name];
+ bucket_acl = piter->second.first;
+ bucket_policy = piter->second.second.get_ptr();
+ bucket = buckets[bucket_name];
} else {
- allocated_policies.push_back(RGWAccessControlPolicy(s->cct));
- RGWAccessControlPolicy& _bucket_policy = allocated_policies.back();
+ allocated_acls.push_back(RGWAccessControlPolicy(s->cct));
+ RGWAccessControlPolicy& _bucket_acl = allocated_acls.back();
RGWBucketInfo bucket_info;
map<string, bufferlist> bucket_attrs;
@@ -1212,23 +1342,28 @@ int RGWGetObj::handle_slo_manifest(bufferlist& bl)
return r;
}
bucket = bucket_info.bucket;
- bucket_policy = &_bucket_policy;
- r = read_bucket_policy(store, s, bucket_info, bucket_attrs, bucket_policy,
+ bucket_acl = &_bucket_acl;
+ r = read_bucket_policy(store, s, bucket_info, bucket_attrs, bucket_acl,
bucket);
if (r < 0) {
- ldout(s->cct, 0) << "failed to read bucket policy for bucket "
+ ldout(s->cct, 0) << "failed to read bucket ACL for bucket "
<< bucket << dendl;
return r;
- }
- buckets[bucket_name] = bucket;
- policies[bucket_name] = bucket_policy;
+ }
+ auto _bucket_policy = get_iam_policy_from_attr(
+ s->cct, store, bucket_attrs, bucket_info.bucket.tenant);
+ bucket_policy = _bucket_policy.get_ptr();
+ buckets[bucket_name] = bucket;
+ policies[bucket_name] = make_pair(bucket_acl, _bucket_policy);
}
} else {
bucket = s->bucket;
- bucket_policy = s->bucket_acl;
+ bucket_acl = s->bucket_acl;
+ bucket_policy = s->iam_policy.get_ptr();
}
rgw_slo_part part;
+ part.bucket_acl = bucket_acl;
part.bucket_policy = bucket_policy;
part.bucket = bucket;
part.obj_name = obj_name;
@@ -1377,7 +1512,6 @@ void RGWGetObj::execute()
read_op.params.attrs = &attrs;
read_op.params.lastmod = &lastmod;
read_op.params.obj_size = &s->obj_size;
- read_op.params.perr = &s->err;
op_ret = read_op.prepare();
if (op_ret < 0)
@@ -1871,7 +2005,8 @@ void RGWDeleteBucketWebsite::execute()
int RGWStatBucket::verify_permission()
{
- if (!verify_bucket_permission(s, RGW_PERM_READ)) {
+ // This (a HEAD request on a bucket) is governed by the s3:ListBucket permission.
+ if (!verify_bucket_permission(s, rgw::IAM::s3ListBucket)) {
return -EACCES;
}
@@ -1910,7 +2045,15 @@ void RGWStatBucket::execute()
int RGWListBucket::verify_permission()
{
- if (!verify_bucket_permission(s, RGW_PERM_READ)) {
+ op_ret = get_params();
+ if (op_ret < 0) {
+ return op_ret;
+ }
+
+ if (!verify_bucket_permission(s,
+ list_versions ?
+ rgw::IAM::s3ListBucketVersions :
+ rgw::IAM::s3ListBucket)) {
return -EACCES;
}
@@ -1948,10 +2091,6 @@ void RGWListBucket::execute()
return;
}
- op_ret = get_params();
- if (op_ret < 0)
- return;
-
if (need_container_stats()) {
map<string, RGWBucketEnt> m;
m[s->bucket.name] = RGWBucketEnt();
@@ -2043,7 +2182,7 @@ int RGWCreateBucket::verify_permission()
static int forward_request_to_master(struct req_state *s, obj_version *objv,
RGWRados *store, bufferlist& in_data,
- JSONParser *jp)
+ JSONParser *jp, req_info *forward_info)
{
if (!store->rest_master_conn) {
ldout(s->cct, 0) << "rest connection is invalid" << dendl;
@@ -2053,9 +2192,8 @@ static int forward_request_to_master(struct req_state *s, obj_version *objv,
bufferlist response;
string uid_str = s->user->user_id.to_str();
#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response
- int ret = store->rest_master_conn->forward(uid_str, s->info, objv,
- MAX_REST_RESPONSE, &in_data,
- &response);
+ int ret = store->rest_master_conn->forward(uid_str, (forward_info ? *forward_info : s->info),
+ objv, MAX_REST_RESPONSE, &in_data, &response);
if (ret < 0)
return ret;
@@ -2327,7 +2465,7 @@ void RGWCreateBucket::execute()
bucket.name = s->bucket_name;
op_ret = store->select_bucket_placement(*(s->user), zonegroup_id,
placement_rule,
- bucket, &selected_placement_rule, nullptr);
+ &selected_placement_rule, nullptr);
if (selected_placement_rule != s->bucket_info.placement_rule) {
op_ret = -EEXIST;
return;
@@ -2475,7 +2613,7 @@ void RGWCreateBucket::execute()
int RGWDeleteBucket::verify_permission()
{
- if (!verify_bucket_permission(s, RGW_PERM_WRITE)) {
+ if (!verify_bucket_permission(s, rgw::IAM::s3DeleteBucket)) {
return -EACCES;
}
@@ -2572,7 +2710,8 @@ int RGWPutObj::verify_permission()
{
if (copy_source) {
- RGWAccessControlPolicy cs_policy(s->cct);
+ RGWAccessControlPolicy cs_acl(s->cct);
+ optional<Policy> policy;
map<string, bufferlist> cs_attrs;
rgw_bucket cs_bucket(copy_source_bucket_info.bucket);
rgw_obj_key cs_object(copy_source_object_name, copy_source_version_id);
@@ -2582,19 +2721,45 @@ int RGWPutObj::verify_permission()
store->set_prefetch_data(s->obj_ctx, obj);
/* check source object permissions */
- if (read_obj_policy(store, s, copy_source_bucket_info, cs_attrs, &cs_policy, cs_bucket, cs_object) < 0) {
+ if (read_obj_policy(store, s, copy_source_bucket_info, cs_attrs, &cs_acl, policy,
+ cs_bucket, cs_object) < 0) {
return -EACCES;
}
/* admin request overrides permission checks */
- if (! s->auth.identity->is_admin_of(cs_policy.get_owner().get_id()) &&
- ! cs_policy.verify_permission(*s->auth.identity, s->perm_mask, RGW_PERM_READ)) {
- return -EACCES;
+ if (! s->auth.identity->is_admin_of(cs_acl.get_owner().get_id())) {
+ if (policy) {
+ auto e = policy->eval(s->env, *s->auth.identity,
+ cs_object.instance.empty() ?
+ rgw::IAM::s3GetObject :
+ rgw::IAM::s3GetObjectVersion,
+ rgw::IAM::ARN(obj));
+ if (e == Effect::Deny) {
+ return -EACCES;
+ } else if (e == Effect::Pass &&
+ !cs_acl.verify_permission(*s->auth.identity, s->perm_mask,
+ RGW_PERM_READ)) {
+ return -EACCES;
+ }
+ } else if (!cs_acl.verify_permission(*s->auth.identity, s->perm_mask,
+ RGW_PERM_READ)) {
+ return -EACCES;
+ }
}
+ }
+ if (s->iam_policy) {
+ auto e = s->iam_policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3PutObject,
+ rgw_obj(s->bucket, s->object));
+ if (e == Effect::Allow) {
+ return 0;
+ } else if (e == Effect::Deny) {
+ return -EACCES;
+ }
}
- if (!verify_bucket_permission(s, RGW_PERM_WRITE)) {
+ if (!verify_bucket_permission_no_policy(s, RGW_PERM_WRITE)) {
return -EACCES;
}
@@ -2667,7 +2832,7 @@ int RGWPutObjProcessor_Multipart::do_complete(size_t accounted_size,
map<string, bufferlist>& attrs,
real_time delete_at,
const char *if_match,
- const char *if_nomatch)
+ const char *if_nomatch, const string *user_data)
{
complete_writing_data();
@@ -3027,7 +3192,7 @@ void RGWPutObj::execute()
}
bufferlist &data = data_in;
- if (s->aws4_auth_streaming_mode) {
+ if (len && s->aws4_auth_streaming_mode) {
/* use unwrapped data */
data = s->aws4_auth->bl;
len = data.length();
@@ -3221,7 +3386,8 @@ void RGWPutObj::execute()
}
op_ret = processor->complete(s->obj_size, etag, &mtime, real_time(), attrs,
- (delete_at ? *delete_at : real_time()), if_match, if_nomatch);
+ (delete_at ? *delete_at : real_time()), if_match, if_nomatch,
+ (user_data.empty() ? nullptr : &user_data));
/* produce torrent */
if (s->cct->_conf->rgw_torrent_flag && (ofs == torrent.get_data_len()))
@@ -3271,16 +3437,10 @@ void RGWPostObj::pre_exec()
void RGWPostObj::execute()
{
RGWPutObjDataProcessor *filter = nullptr;
- std::unique_ptr<RGWPutObjDataProcessor> encrypt;
- char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
- unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
- MD5 hash;
- buffer::list bl, aclbl;
- int len = 0;
boost::optional<RGWPutObj_Compress> compressor;
CompressorRef plugin;
- // read in the data from the POST form
+ /* Read in the data from the POST form. */
op_ret = get_params();
if (op_ret < 0) {
return;
@@ -3291,121 +3451,151 @@ void RGWPostObj::execute()
return;
}
- if (!verify_bucket_permission(s, RGW_PERM_WRITE)) {
+ if (s->iam_policy) {
+ auto e = s->iam_policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3PutObject,
+ rgw_obj(s->bucket, s->object));
+ if (e == Effect::Deny) {
+ op_ret = -EACCES;
+ return;
+ } else if (e == Effect::Pass && !verify_bucket_permission_no_policy(s, RGW_PERM_WRITE)) {
+ op_ret = -EACCES;
+ return;
+ }
+ } else if (!verify_bucket_permission_no_policy(s, RGW_PERM_WRITE)) {
op_ret = -EACCES;
return;
}
- op_ret = store->check_quota(s->bucket_owner.get_id(), s->bucket,
- user_quota, bucket_quota, s->content_length);
- if (op_ret < 0) {
- return;
- }
-
- RGWPutObjProcessor_Atomic processor(*static_cast<RGWObjectCtx *>(s->obj_ctx),
- s->bucket_info,
- s->bucket,
- s->object.name,
- /* part size */
- s->cct->_conf->rgw_obj_stripe_size,
- s->req_id,
- s->bucket_info.versioning_enabled());
+  /* Start iterating over the data fields. This is necessary because Swift's
+   * FormPost can carry multiple files in a single form. */
+ do {
+ std::unique_ptr<RGWPutObjDataProcessor> encrypt;
+ char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
+ unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ MD5 hash;
+ ceph::buffer::list bl, aclbl;
+ int len = 0;
+
+ op_ret = store->check_quota(s->bucket_owner.get_id(),
+ s->bucket,
+ user_quota,
+ bucket_quota,
+ s->content_length);
+ if (op_ret < 0) {
+ return;
+ }
- // no filters by default
- filter = &processor;
+ RGWPutObjProcessor_Atomic processor(*static_cast<RGWObjectCtx *>(s->obj_ctx),
+ s->bucket_info,
+ s->bucket,
+ get_current_filename(),
+ /* part size */
+ s->cct->_conf->rgw_obj_stripe_size,
+ s->req_id,
+ s->bucket_info.versioning_enabled());
+ /* No filters by default. */
+ filter = &processor;
- op_ret = processor.prepare(store, nullptr);
- if (op_ret < 0)
- return;
+ op_ret = processor.prepare(store, nullptr);
+ if (op_ret < 0) {
+ return;
+ }
- op_ret = get_encrypt_filter(&encrypt, filter);
- if (op_ret < 0) {
- return;
- }
- if (encrypt != nullptr) {
- filter = encrypt.get();
- } else {
- const auto& compression_type = store->get_zone_params().get_compression_type(
- s->bucket_info.placement_rule);
- if (compression_type != "none") {
- plugin = Compressor::create(s->cct, compression_type);
- if (!plugin) {
- ldout(s->cct, 1) << "Cannot load plugin for compression type "
- << compression_type << dendl;
- } else {
- compressor.emplace(s->cct, plugin, filter);
- filter = &*compressor;
+ op_ret = get_encrypt_filter(&encrypt, filter);
+ if (op_ret < 0) {
+ return;
+ }
+ if (encrypt != nullptr) {
+ filter = encrypt.get();
+ } else {
+ const auto& compression_type = store->get_zone_params().get_compression_type(
+ s->bucket_info.placement_rule);
+ if (compression_type != "none") {
+ plugin = Compressor::create(s->cct, compression_type);
+ if (!plugin) {
+ ldout(s->cct, 1) << "Cannot load plugin for compression type "
+ << compression_type << dendl;
+ } else {
+ compressor.emplace(s->cct, plugin, filter);
+ filter = &*compressor;
+ }
}
}
- }
- while (data_pending) {
- bufferlist data;
- len = get_data(data);
+ bool again;
+ do {
+ ceph::bufferlist data;
+ len = get_data(data, again);
- if (len < 0) {
- op_ret = len;
- return;
- }
+ if (len < 0) {
+ op_ret = len;
+ return;
+ }
- if (!len)
- break;
+ if (!len) {
+ break;
+ }
- hash.Update((const byte *)data.c_str(), data.length());
- op_ret = put_data_and_throttle(filter, data, ofs, false);
+ hash.Update((const byte *)data.c_str(), data.length());
+ op_ret = put_data_and_throttle(filter, data, ofs, false);
- ofs += len;
+ ofs += len;
- if (ofs > max_len) {
- op_ret = -ERR_TOO_LARGE;
- return;
- }
- }
- {
- bufferlist flush;
- op_ret = put_data_and_throttle(filter, flush, ofs, false);
- }
- if (len < min_len) {
- op_ret = -ERR_TOO_SMALL;
- return;
- }
+ if (ofs > max_len) {
+ op_ret = -ERR_TOO_LARGE;
+ return;
+ }
+ } while (again);
- s->obj_size = ofs;
+ {
+ bufferlist flush;
+ op_ret = put_data_and_throttle(filter, flush, ofs, false);
+ }
- op_ret = store->check_quota(s->bucket_owner.get_id(), s->bucket,
- user_quota, bucket_quota, s->obj_size);
- if (op_ret < 0) {
- return;
- }
+ if (len < min_len) {
+ op_ret = -ERR_TOO_SMALL;
+ return;
+ }
- hash.Final(m);
- buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
+ s->obj_size = ofs;
- etag = calc_md5;
- bl.append(etag.c_str(), etag.size() + 1);
- emplace_attr(RGW_ATTR_ETAG, std::move(bl));
+ op_ret = store->check_quota(s->bucket_owner.get_id(), s->bucket,
+ user_quota, bucket_quota, s->obj_size);
+ if (op_ret < 0) {
+ return;
+ }
- policy.encode(aclbl);
- emplace_attr(RGW_ATTR_ACL, std::move(aclbl));
+ hash.Final(m);
+ buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
- if (content_type.size()) {
- bufferlist ct_bl;
- ct_bl.append(content_type.c_str(), content_type.size() + 1);
- emplace_attr(RGW_ATTR_CONTENT_TYPE, std::move(ct_bl));
- }
+ etag = calc_md5;
+ bl.append(etag.c_str(), etag.size() + 1);
+ emplace_attr(RGW_ATTR_ETAG, std::move(bl));
- if (compressor && compressor->is_compressed()) {
- bufferlist tmp;
- RGWCompressionInfo cs_info;
- cs_info.compression_type = plugin->get_type_name();
- cs_info.orig_size = s->obj_size;
- cs_info.blocks = move(compressor->get_compression_blocks());
- ::encode(cs_info, tmp);
- emplace_attr(RGW_ATTR_COMPRESSION, std::move(tmp));
- }
+ policy.encode(aclbl);
+ emplace_attr(RGW_ATTR_ACL, std::move(aclbl));
+
+ const std::string content_type = get_current_content_type();
+ if (! content_type.empty()) {
+ ceph::bufferlist ct_bl;
+ ct_bl.append(content_type.c_str(), content_type.size() + 1);
+ emplace_attr(RGW_ATTR_CONTENT_TYPE, std::move(ct_bl));
+ }
+
+ if (compressor && compressor->is_compressed()) {
+ ceph::bufferlist tmp;
+ RGWCompressionInfo cs_info;
+ cs_info.compression_type = plugin->get_type_name();
+ cs_info.orig_size = s->obj_size;
+ cs_info.blocks = move(compressor->get_compression_blocks());
+ ::encode(cs_info, tmp);
+ emplace_attr(RGW_ATTR_COMPRESSION, std::move(tmp));
+ }
- op_ret = processor.complete(s->obj_size, etag, NULL, real_time(), attrs,
- (delete_at ? *delete_at : real_time()));
+ op_ret = processor.complete(s->obj_size, etag, nullptr, real_time(),
+ attrs, (delete_at ? *delete_at : real_time()));
+ } while (is_next_file_to_upload());
}
@@ -3538,7 +3728,7 @@ void RGWPutMetadataAccount::execute()
int RGWPutMetadataBucket::verify_permission()
{
- if (!verify_bucket_permission(s, RGW_PERM_WRITE)) {
+ if (!verify_bucket_permission_no_policy(s, RGW_PERM_WRITE)) {
return -EACCES;
}
@@ -3569,6 +3759,12 @@ void RGWPutMetadataBucket::execute()
* the hood. This method will add the new items only if the map doesn't
* contain such keys yet. */
if (has_policy) {
+ if (s->dialect.compare("swift") == 0) {
+ auto old_policy = static_cast<RGWAccessControlPolicy_SWIFT*>(s->bucket_acl);
+ auto new_policy = static_cast<RGWAccessControlPolicy_SWIFT*>(&policy);
+ new_policy->filter_merge(policy_rw_mask, old_policy);
+ policy = *new_policy;
+ }
buffer::list bl;
policy.encode(bl);
emplace_attr(RGW_ATTR_ACL, std::move(bl));
@@ -3611,7 +3807,9 @@ void RGWPutMetadataBucket::execute()
int RGWPutMetadataObject::verify_permission()
{
- if (!verify_object_permission(s, RGW_PERM_WRITE)) {
+ // This looks to be something specific to Swift. We could add
+ // operations like swift:PutMetadataObject to the Policy Engine.
+ if (!verify_object_permission_no_policy(s, RGW_PERM_WRITE)) {
return -EACCES;
}
@@ -3722,7 +3920,19 @@ int RGWDeleteObj::handle_slo_manifest(bufferlist& bl)
int RGWDeleteObj::verify_permission()
{
- if (!verify_bucket_permission(s, RGW_PERM_WRITE)) {
+ if (s->iam_policy) {
+ auto r = s->iam_policy->eval(s->env, *s->auth.identity,
+ s->object.instance.empty() ?
+ rgw::IAM::s3DeleteObject :
+ rgw::IAM::s3DeleteObjectVersion,
+ ARN(s->bucket, s->object.name));
+    if (r == Effect::Allow)
+      return 0;
+    else if (r == Effect::Deny)
+      return -EACCES;
+ }
+
+ if (!verify_bucket_permission_no_policy(s, RGW_PERM_WRITE)) {
return -EACCES;
}
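
This Allow/Deny/Pass evaluation recurs in most of the verify_permission() changes in this diff: an explicit policy verdict is final, and only Effect::Pass falls through to the legacy ACL check. A condensed sketch of the control flow, using simplified stand-ins for the rgw::IAM types:

#include <cerrno>

enum class Effect { Allow, Deny, Pass };

static Effect eval_policy() { return Effect::Pass; }  // placeholder verdict
static bool acl_allows()    { return true; }          // placeholder ACL check

static int verify_permission_sketch()
{
  switch (eval_policy()) {
  case Effect::Allow: return 0;        // explicit allow is final
  case Effect::Deny:  return -EACCES;  // explicit deny is final
  case Effect::Pass:  break;           // no opinion: fall back to the ACL
  }
  return acl_allows() ? 0 : -EACCES;
}
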
@@ -3872,7 +4082,8 @@ bool RGWCopyObj::parse_copy_location(const string& url_src, string& bucket_name,
int RGWCopyObj::verify_permission()
{
- RGWAccessControlPolicy src_policy(s->cct);
+ RGWAccessControlPolicy src_acl(s->cct);
+ optional<Policy> src_policy;
op_ret = get_params();
if (op_ret < 0)
return op_ret;
@@ -3907,17 +4118,32 @@ int RGWCopyObj::verify_permission()
store->set_prefetch_data(s->obj_ctx, src_obj);
/* check source object permissions */
- op_ret = read_obj_policy(store, s, src_bucket_info, src_attrs, &src_policy,
- src_bucket, src_object);
+ op_ret = read_obj_policy(store, s, src_bucket_info, src_attrs, &src_acl,
+ src_policy, src_bucket, src_object);
if (op_ret < 0) {
return op_ret;
}
/* admin request overrides permission checks */
- if (! s->auth.identity->is_admin_of(src_policy.get_owner().get_id()) &&
- ! src_policy.verify_permission(*s->auth.identity, s->perm_mask,
- RGW_PERM_READ)) {
- return -EACCES;
+ if (!s->auth.identity->is_admin_of(src_acl.get_owner().get_id())) {
+ if (src_policy) {
+ auto e = src_policy->eval(s->env, *s->auth.identity,
+ src_object.instance.empty() ?
+ rgw::IAM::s3GetObject :
+ rgw::IAM::s3GetObjectVersion,
+ ARN(src_obj));
+ if (e == Effect::Deny) {
+ return -EACCES;
+ } else if (e == Effect::Pass &&
+ !src_acl.verify_permission(*s->auth.identity, s->perm_mask,
+ RGW_PERM_READ)) {
+ return -EACCES;
+ }
+ } else if (!src_acl.verify_permission(*s->auth.identity,
+ s->perm_mask,
+ RGW_PERM_READ)) {
+ return -EACCES;
+ }
}
}
@@ -4070,7 +4296,6 @@ void RGWCopyObj::execute()
(version_id.empty() ? NULL : &version_id),
&s->req_id, /* use req_id as tag */
&etag,
- &s->err,
copy_obj_progress_cb, (void *)this
);
}
@@ -4079,9 +4304,12 @@ int RGWGetACLs::verify_permission()
{
bool perm;
if (!s->object.empty()) {
- perm = verify_object_permission(s, RGW_PERM_READ_ACP);
+ perm = verify_object_permission(s,
+ s->object.instance.empty() ?
+ rgw::IAM::s3GetObjectAcl :
+ rgw::IAM::s3GetObjectVersionAcl);
} else {
- perm = verify_bucket_permission(s, RGW_PERM_READ_ACP);
+    perm = verify_bucket_permission(s, rgw::IAM::s3GetBucketAcl);
}
if (!perm)
return -EACCES;
@@ -4109,9 +4337,12 @@ int RGWPutACLs::verify_permission()
{
bool perm;
if (!s->object.empty()) {
- perm = verify_object_permission(s, RGW_PERM_WRITE_ACP);
+ perm = verify_object_permission(s,
+ s->object.instance.empty() ?
+ rgw::IAM::s3PutObjectAcl :
+ rgw::IAM::s3PutObjectVersionAcl);
} else {
- perm = verify_bucket_permission(s, RGW_PERM_WRITE_ACP);
+ perm = verify_bucket_permission(s, rgw::IAM::s3PutBucketAcl);
}
if (!perm)
return -EACCES;
@@ -4122,7 +4353,7 @@ int RGWPutACLs::verify_permission()
int RGWGetLC::verify_permission()
{
bool perm;
- perm = verify_bucket_permission(s, RGW_PERM_READ_ACP);
+ perm = verify_bucket_permission(s, rgw::IAM::s3GetLifecycleConfiguration);
if (!perm)
return -EACCES;
@@ -4132,7 +4363,7 @@ int RGWGetLC::verify_permission()
int RGWPutLC::verify_permission()
{
bool perm;
- perm = verify_bucket_permission(s, RGW_PERM_WRITE_ACP);
+ perm = verify_bucket_permission(s, rgw::IAM::s3PutLifecycleConfiguration);
if (!perm)
return -EACCES;
@@ -4142,7 +4373,7 @@ int RGWPutLC::verify_permission()
int RGWDeleteLC::verify_permission()
{
bool perm;
- perm = verify_bucket_permission(s, RGW_PERM_WRITE_ACP);
+ perm = verify_bucket_permission(s, rgw::IAM::s3PutLifecycleConfiguration);
if (!perm)
return -EACCES;
@@ -4608,8 +4839,20 @@ void RGWSetRequestPayment::execute()
int RGWInitMultipart::verify_permission()
{
- if (!verify_bucket_permission(s, RGW_PERM_WRITE))
+ if (s->iam_policy) {
+ auto e = s->iam_policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3PutObject,
+ rgw_obj(s->bucket, s->object));
+ if (e == Effect::Allow) {
+ return 0;
+ } else if (e == Effect::Deny) {
+ return -EACCES;
+ }
+ }
+
+ if (!verify_bucket_permission_no_policy(s, RGW_PERM_WRITE)) {
return -EACCES;
+ }
return 0;
}
@@ -4713,8 +4956,20 @@ static int get_multipart_info(RGWRados *store, struct req_state *s,
int RGWCompleteMultipart::verify_permission()
{
- if (!verify_bucket_permission(s, RGW_PERM_WRITE))
+ if (s->iam_policy) {
+ auto e = s->iam_policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3PutObject,
+ rgw_obj(s->bucket, s->object));
+ if (e == Effect::Allow) {
+ return 0;
+ } else if (e == Effect::Deny) {
+ return -EACCES;
+ }
+ }
+
+ if (!verify_bucket_permission_no_policy(s, RGW_PERM_WRITE)) {
return -EACCES;
+ }
return 0;
}
@@ -4959,8 +5214,20 @@ void RGWCompleteMultipart::execute()
int RGWAbortMultipart::verify_permission()
{
- if (!verify_bucket_permission(s, RGW_PERM_WRITE))
+ if (s->iam_policy) {
+ auto e = s->iam_policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3AbortMultipartUpload,
+ rgw_obj(s->bucket, s->object));
+ if (e == Effect::Allow) {
+ return 0;
+ } else if (e == Effect::Deny) {
+ return -EACCES;
+ }
+ }
+
+ if (!verify_bucket_permission_no_policy(s, RGW_PERM_WRITE)) {
return -EACCES;
+ }
return 0;
}
@@ -4996,7 +5263,7 @@ void RGWAbortMultipart::execute()
int RGWListMultipart::verify_permission()
{
- if (!verify_object_permission(s, RGW_PERM_READ))
+ if (!verify_object_permission(s, rgw::IAM::s3ListMultipartUploadParts))
return -EACCES;
return 0;
@@ -5030,7 +5297,8 @@ void RGWListMultipart::execute()
int RGWListBucketMultiparts::verify_permission()
{
- if (!verify_bucket_permission(s, RGW_PERM_READ))
+ if (!verify_bucket_permission(s,
+ rgw::IAM::s3ListBucketMultiPartUploads))
return -EACCES;
return 0;
@@ -5102,7 +5370,8 @@ void RGWGetHealthCheck::execute()
int RGWDeleteMultiObj::verify_permission()
{
- if (!verify_bucket_permission(s, RGW_PERM_WRITE))
+ acl_allowed = verify_bucket_permission_no_policy(s, RGW_PERM_WRITE);
+ if (!acl_allowed && !s->iam_policy)
return -EACCES;
return 0;
@@ -5159,6 +5428,19 @@ void RGWDeleteMultiObj::execute()
iter != multi_delete->objects.end() && num_processed < max_to_delete;
++iter, num_processed++) {
rgw_obj obj(bucket, *iter);
+ if (s->iam_policy) {
+ auto e = s->iam_policy->eval(s->env,
+ *s->auth.identity,
+ iter->instance.empty() ?
+ rgw::IAM::s3DeleteObject :
+ rgw::IAM::s3DeleteObjectVersion,
+ obj);
+ if ((e == Effect::Deny) ||
+ (e == Effect::Pass && !acl_allowed)) {
+ send_partial_response(*iter, false, "", -EACCES);
+ continue;
+ }
+ }
obj_ctx->obj.set_atomic(obj);
@@ -5205,11 +5487,14 @@ bool RGWBulkDelete::Deleter::verify_permission(RGWBucketInfo& binfo,
return false;
}
+ auto policy = get_iam_policy_from_attr(s->cct, store, battrs, binfo.bucket.tenant);
+
bucket_owner = bacl.get_owner();
/* We can use global user_acl because each BulkDelete request is allowed
* to work on entities from a single account only. */
- return verify_bucket_permission(s, s->user_acl.get(), &bacl, RGW_PERM_WRITE);
+ return verify_bucket_permission(s, binfo.bucket, s->user_acl.get(),
+ &bacl, policy, rgw::IAM::s3DeleteBucket);
}
bool RGWBulkDelete::Deleter::delete_single(const acct_path_t& path)
@@ -5422,7 +5707,28 @@ RGWBulkUploadOp::parse_path(const boost::string_ref& path)
}
}
- return boost::none;
+ return none;
+}
+
+std::pair<std::string, std::string>
+RGWBulkUploadOp::handle_upload_path(struct req_state *s)
+{
+ std::string bucket_path, file_prefix;
+ if (! s->init_state.url_bucket.empty()) {
+ file_prefix = bucket_path = s->init_state.url_bucket + "/";
+ if (! s->object.empty()) {
+ std::string& object_name = s->object.name;
+
+      /* The rgw_obj_key::empty() check above guarantees s->object.name is
+       * non-empty, so we can safely examine its last element. */
+ if (object_name.back() == '/') {
+ file_prefix.append(object_name);
+ } else {
+ file_prefix.append(object_name).append("/");
+ }
+ }
+ }
+ return std::make_pair(bucket_path, file_prefix);
}
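
In isolation, the prefix rules above behave as the sketch below shows (upload_path_sketch is a hypothetical stand-in; bucket and object mirror s->init_state.url_bucket and s->object.name):

#include <string>
#include <utility>

static std::pair<std::string, std::string>
upload_path_sketch(const std::string& bucket, const std::string& object)
{
  std::string bucket_path, file_prefix;
  if (!bucket.empty()) {
    file_prefix = bucket_path = bucket + "/";
    if (!object.empty()) {
      file_prefix += object;
      if (object.back() != '/')
        file_prefix += "/";      // ensure the prefix ends with a slash
    }
  }
  return {bucket_path, file_prefix};
}

// upload_path_sketch("cont", "photos") -> {"cont/", "cont/photos/"}
// upload_path_sketch("cont", "a/b/")   -> {"cont/", "cont/a/b/"}
// upload_path_sketch("", "ignored")    -> {"", ""}
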
int RGWBulkUploadOp::handle_dir_verify_permission()
@@ -5446,6 +5752,20 @@ int RGWBulkUploadOp::handle_dir_verify_permission()
return 0;
}
+static void forward_req_info(CephContext *cct, req_info& info, const std::string& bucket_name)
+{
+  /* Container- and object-level requests already carry the bucket name;
+   * only account-level requests need it appended. */
+  if (info.script_uri.find(bucket_name) != std::string::npos) {
+    return;
+  }
+
+  ldout(cct, 20) << "appending bucket " << bucket_name << " to req_info" << dendl;
+ info.script_uri.append("/").append(bucket_name);
+ info.request_uri_aws4 = info.request_uri = info.script_uri;
+ info.effective_uri = "/" + bucket_name;
+}
+
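
A condensed sketch of the rewriting done by forward_req_info() (the helper below is hypothetical; the values in the trailing comment are illustrative):

#include <string>

static void append_bucket_sketch(std::string& script_uri,
                                 std::string& effective_uri,
                                 const std::string& bucket)
{
  if (script_uri.find(bucket) != std::string::npos)
    return;                      // container/object level: already present
  script_uri += "/" + bucket;
  effective_uri = "/" + bucket;
}

// "/v1/AUTH_test" with bucket "photos" -> script_uri "/v1/AUTH_test/photos",
//                                         effective_uri "/photos"
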
int RGWBulkUploadOp::handle_dir(const boost::string_ref path)
{
ldout(s->cct, 20) << "bulk upload: got directory=" << path << dendl;
@@ -5462,14 +5782,6 @@ int RGWBulkUploadOp::handle_dir(const boost::string_ref path)
rgw_raw_obj obj(store->get_zone_params().domain_root,
rgw_make_bucket_entry_name(s->bucket_tenant, bucket_name));
- /* Swift API doesn't support location constraint. We're just checking here
- * whether creation is taking place in the master zone or not. */
- if (! store->get_zonegroup().is_master) {
- ldout(s->cct, 0) << "creating bucket in a non-master zone." << dendl;
- op_ret = -EINVAL;
- return op_ret;
- }
-
/* we need to make sure we read bucket info, it's not read before for this
* specific request */
RGWBucketInfo binfo;
@@ -5502,7 +5814,9 @@ int RGWBulkUploadOp::handle_dir(const boost::string_ref path)
if (! store->is_meta_master()) {
JSONParser jp;
ceph::bufferlist in_data;
- op_ret = forward_request_to_master(s, nullptr, store, in_data, &jp);
+ req_info info = s->info;
+ forward_req_info(s->cct, info, bucket_name);
+ op_ret = forward_request_to_master(s, nullptr, store, in_data, &jp, &info);
if (op_ret < 0) {
return op_ret;
}
@@ -5535,7 +5849,6 @@ int RGWBulkUploadOp::handle_dir(const boost::string_ref path)
op_ret = store->select_bucket_placement(*(s->user),
store->get_zonegroup().get_id(),
placement_rule,
- bucket,
&selected_placement_rule,
nullptr);
if (selected_placement_rule != binfo.placement_rule) {
@@ -5615,6 +5928,7 @@ int RGWBulkUploadOp::handle_dir(const boost::string_ref path)
bool RGWBulkUploadOp::handle_file_verify_permission(RGWBucketInfo& binfo,
+ const rgw_obj& obj,
std::map<std::string, ceph::bufferlist>& battrs,
ACLOwner& bucket_owner /* out */)
{
@@ -5626,8 +5940,21 @@ bool RGWBulkUploadOp::handle_file_verify_permission(RGWBucketInfo& binfo,
return false;
}
+ auto policy = get_iam_policy_from_attr(s->cct, store, battrs, binfo.bucket.tenant);
+
bucket_owner = bacl.get_owner();
- return verify_bucket_permission(s, s->user_acl.get(), &bacl, RGW_PERM_WRITE);
+ if (policy) {
+ auto e = policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3PutObject, obj);
+ if (e == Effect::Allow) {
+ return true;
+ } else if (e == Effect::Deny) {
+ return false;
+ }
+ }
+
+ return verify_bucket_permission_no_policy(s, s->user_acl.get(),
+ &bacl, RGW_PERM_WRITE);
}
int RGWBulkUploadOp::handle_file(const boost::string_ref path,
@@ -5663,7 +5990,9 @@ int RGWBulkUploadOp::handle_file(const boost::string_ref path,
return op_ret;
}
- if (! handle_file_verify_permission(binfo, battrs, bowner)) {
+ if (! handle_file_verify_permission(binfo,
+ rgw_obj(binfo.bucket, object),
+ battrs, bowner)) {
ldout(s->cct, 20) << "bulk upload: object creation unauthorized" << dendl;
op_ret = -EACCES;
return op_ret;
@@ -5802,6 +6131,11 @@ void RGWBulkUploadOp::execute()
return;
}
+  /* Handle $UPLOAD_PATH according to Swift's Bulk middleware. See:
+   * https://github.com/openstack/swift/blob/2.13.0/swift/common/middleware/bulk.py#L31-L41 */
+ std::string bucket_path, file_prefix;
+ std::tie(bucket_path, file_prefix) = handle_upload_path(s);
+
auto status = rgw::tar::StatusIndicator::create();
do {
op_ret = stream->get_exactly(rgw::tar::BLOCK_SIZE, buffer);
@@ -5826,25 +6160,28 @@ void RGWBulkUploadOp::execute()
case rgw::tar::FileType::NORMAL_FILE: {
ldout(s->cct, 2) << "bulk upload: handling regular file" << dendl;
+        /* Own the storage here: binding a boost::string_ref to the
+         * concatenated temporary below would leave it dangling. */
+        const std::string filename = bucket_path.empty() ?
+                                     header->get_filename().to_string() :
+                                     file_prefix + header->get_filename().to_string();
auto body = AlignedStreamGetter(0, header->get_filesize(),
rgw::tar::BLOCK_SIZE, *stream);
- op_ret = handle_file(header->get_filename(),
+ op_ret = handle_file(filename,
header->get_filesize(),
body);
if (! op_ret) {
/* Only regular files counts. */
num_created++;
} else {
- failures.emplace_back(op_ret, header->get_filename().to_string());
+            failures.emplace_back(op_ret, filename);
}
break;
}
case rgw::tar::FileType::DIRECTORY: {
ldout(s->cct, 2) << "bulk upload: handling regular directory" << dendl;
- op_ret = handle_dir(header->get_filename());
+ boost::string_ref dirname = bucket_path.empty() ? header->get_filename() : bucket_path;
+ op_ret = handle_dir(dirname);
if (op_ret < 0 && op_ret != -ERR_BUCKET_EXISTS) {
- failures.emplace_back(op_ret, header->get_filename().to_string());
+ failures.emplace_back(op_ret, dirname.to_string());
}
break;
}
@@ -5905,11 +6242,13 @@ ssize_t RGWBulkUploadOp::AlignedStreamGetter::get_exactly(const size_t want,
int RGWSetAttrs::verify_permission()
{
+ // This looks to be part of the RGW-NFS machinery and has no S3 or
+ // Swift equivalent.
bool perm;
if (!s->object.empty()) {
- perm = verify_object_permission(s, RGW_PERM_WRITE);
+ perm = verify_object_permission_no_policy(s, RGW_PERM_WRITE);
} else {
- perm = verify_bucket_permission(s, RGW_PERM_WRITE);
+ perm = verify_bucket_permission_no_policy(s, RGW_PERM_WRITE);
}
if (!perm)
return -EACCES;
@@ -5985,6 +6324,7 @@ int RGWHandler::init(RGWRados *_store,
int RGWHandler::do_init_permissions()
{
int ret = rgw_build_bucket_policies(store, s);
+ s->env = rgw_build_iam_environment(store, s);
if (ret < 0) {
ldout(s->cct, 10) << "read_permissions on " << s->bucket << " ret=" << ret << dendl;
@@ -6022,3 +6362,110 @@ int RGWHandler::error_handler(int err_no, string *error_content) {
// This is the do-nothing error handler
return err_no;
}
+
+
+void RGWPutBucketPolicy::send_response()
+{
+ if (op_ret) {
+ set_req_state_err(s, op_ret);
+ }
+ dump_errno(s);
+ end_header(s);
+}
+
+int RGWPutBucketPolicy::verify_permission()
+{
+ if (!verify_bucket_permission(s, rgw::IAM::s3PutBucketPolicy)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+int RGWPutBucketPolicy::get_params()
+{
+ const auto max_size = s->cct->_conf->rgw_max_put_param_size;
+  // TODO: provide a variant of rgw_rest_read_all_input that avoids
+  // malloc and reports failures via exceptions.
+  op_ret = rgw_rest_read_all_input(s, &data, &len, max_size, false);
+  return op_ret;
+}
+
+void RGWPutBucketPolicy::execute()
+{
+ op_ret = get_params();
+ if (op_ret < 0) {
+ return;
+ }
+
+ try {
+ Policy p(s->cct, s->bucket_tenant,
+ bufferlist::static_from_mem(data, len));
+ auto attrs = s->bucket_attrs;
+ attrs[RGW_ATTR_IAM_POLICY].append(p.text);
+ op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs,
+ &s->bucket_info.objv_tracker);
+ if (op_ret == -ECANCELED) {
+ op_ret = 0; /* lost a race, but it's ok because policies are immutable */
+ }
+ } catch (rgw::IAM::PolicyParseException& e) {
+ op_ret = -EINVAL;
+ }
+}
+
+void RGWGetBucketPolicy::send_response()
+{
+ if (op_ret) {
+ set_req_state_err(s, op_ret);
+ }
+ dump_errno(s);
+ end_header(s);
+ dump_start(s);
+ rgw_flush_formatter(s, s->formatter);
+ dump_body(s, policy);
+}
+
+int RGWGetBucketPolicy::verify_permission()
+{
+ if (!verify_bucket_permission(s, rgw::IAM::s3GetBucketPolicy)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+void RGWGetBucketPolicy::execute()
+{
+ auto attrs = s->bucket_attrs;
+ policy = attrs[RGW_ATTR_IAM_POLICY];
+}
+
+void RGWDeleteBucketPolicy::send_response()
+{
+ if (op_ret) {
+ set_req_state_err(s, op_ret);
+ }
+ dump_errno(s);
+ end_header(s);
+}
+
+int RGWDeleteBucketPolicy::verify_permission()
+{
+ if (!verify_bucket_permission(s, rgw::IAM::s3DeleteBucketPolicy)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+void RGWDeleteBucketPolicy::execute()
+{
+ auto attrs = s->bucket_attrs;
+ attrs.erase(RGW_ATTR_IAM_POLICY);
+ op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs,
+ &s->bucket_info.objv_tracker);
+ if (op_ret == -ECANCELED) {
+ op_ret = 0; /* lost a race, but it's ok because policies are immutable */
+ }
+}
diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h
index 11c88a37fa2..1b4f02f1b7f 100644
--- a/src/rgw/rgw_op.h
+++ b/src/rgw/rgw_op.h
@@ -1,4 +1,4 @@
-// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/**
* All operations via the rados gateway are carried out by
@@ -22,6 +22,7 @@
#include <boost/optional.hpp>
#include <boost/utility/in_place_factory.hpp>
+#include <boost/function.hpp>
#include "common/armor.h"
#include "common/mime.h"
@@ -43,7 +44,6 @@
#include "include/assert.h"
-using namespace std;
using ceph::crypto::SHA1;
struct req_state;
@@ -93,6 +93,7 @@ public:
virtual int authorize() = 0;
virtual int postauth_init() = 0;
virtual int error_handler(int err_no, std::string* error_content);
+  virtual void dump(const std::string& code, const std::string& message) const {}
};
@@ -211,6 +212,7 @@ protected:
bool first_data;
uint64_t cur_ofs;
bufferlist waiting;
+ uint64_t action = 0;
int init_common();
public:
@@ -249,11 +251,13 @@ public:
int verify_permission() override;
void pre_exec() override;
void execute() override;
- int read_user_manifest_part(rgw_bucket& bucket,
- const rgw_bucket_dir_entry& ent,
- RGWAccessControlPolicy *bucket_policy,
- off_t start_ofs,
- off_t end_ofs);
+ int read_user_manifest_part(
+ rgw_bucket& bucket,
+ const rgw_bucket_dir_entry& ent,
+ RGWAccessControlPolicy * const bucket_acl,
+ const boost::optional<rgw::IAM::Policy>& bucket_policy,
+ const off_t start_ofs,
+ const off_t end_ofs);
int handle_user_manifest(const char *prefix);
int handle_slo_manifest(bufferlist& bl);
@@ -427,9 +431,13 @@ protected:
boost::optional<std::pair<std::string, rgw_obj_key>>
parse_path(const boost::string_ref& path);
+
+ std::pair<std::string, std::string>
+ handle_upload_path(struct req_state *s);
bool handle_file_verify_permission(RGWBucketInfo& binfo,
- std::map<std::string, ceph::bufferlist>& battrs,
+ const rgw_obj& obj,
+ std::map<std::string, ceph::bufferlist>& battrs,
ACLOwner& bucket_owner /* out */);
int handle_file(boost::string_ref path,
size_t size,
@@ -926,6 +934,7 @@ protected:
string version_id;
bufferlist bl_aux;
map<string, string> crypt_http_responses;
+  std::string user_data;
boost::optional<ceph::real_time> delete_at;
@@ -1013,21 +1022,23 @@ protected:
const char *supplied_md5_b64;
const char *supplied_etag;
string etag;
- string boundary;
- bool data_pending;
- string content_type;
RGWAccessControlPolicy policy;
map<string, bufferlist> attrs;
boost::optional<ceph::real_time> delete_at;
+ /* Must be called after get_data() or the result is undefined. */
+ virtual std::string get_current_filename() const = 0;
+ virtual std::string get_current_content_type() const = 0;
+ virtual bool is_next_file_to_upload() {
+ return false;
+ }
public:
RGWPostObj() : min_len(0),
max_len(LLONG_MAX),
len(0),
ofs(0),
supplied_md5_b64(nullptr),
- supplied_etag(nullptr),
- data_pending(false) {
+ supplied_etag(nullptr) {
}
void emplace_attr(std::string&& key, buffer::list&& bl) {
@@ -1048,9 +1059,9 @@ public:
return 0;
}
virtual int get_params() = 0;
- virtual int get_data(bufferlist& bl) = 0;
+ virtual int get_data(ceph::bufferlist& bl, bool& again) = 0;
void send_response() override = 0;
- const string name() override { return "post_obj"; }
+ const std::string name() override { return "post_obj"; }
RGWOpType get_type() override { return RGW_OP_POST_OBJ; }
uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
};
@@ -1098,6 +1109,7 @@ protected:
map<string, buffer::list> attrs;
set<string> rmattr_names;
bool has_policy, has_cors;
+ uint32_t policy_rw_mask;
RGWAccessControlPolicy policy;
RGWCORSConfiguration cors_config;
string placement_rule;
@@ -1105,7 +1117,7 @@ protected:
public:
RGWPutMetadataBucket()
- : has_policy(false), has_cors(false)
+ : has_policy(false), has_cors(false), policy_rw_mask(0)
{}
void emplace_attr(std::string&& key, buffer::list&& bl) {
@@ -1688,6 +1700,7 @@ protected:
rgw_bucket bucket;
bool quiet;
bool status_dumped;
+ bool acl_allowed = false;
public:
RGWDeleteMultiObj() {
@@ -1725,7 +1738,9 @@ public:
extern int rgw_build_bucket_policies(RGWRados* store, struct req_state* s);
extern int rgw_build_object_policies(RGWRados *store, struct req_state *s,
- bool prefetch_data);
+ bool prefetch_data);
+extern rgw::IAM::Environment rgw_build_iam_environment(RGWRados* store,
+ struct req_state* s);
static inline int put_data_and_throttle(RGWPutObjDataProcessor *processor,
bufferlist& data, off_t ofs,
@@ -1935,6 +1950,66 @@ public:
virtual uint32_t op_mask() { return RGW_OP_TYPE_READ; }
};
+class RGWPutBucketPolicy : public RGWOp {
+ int len;
+ char *data = nullptr;
+public:
+ RGWPutBucketPolicy() = default;
+ ~RGWPutBucketPolicy() {
+ if (data) {
+ free(static_cast<void*>(data));
+ }
+ }
+ void send_response() override;
+ int verify_permission() override;
+ uint32_t op_mask() override {
+ return RGW_OP_TYPE_WRITE;
+ }
+ void execute() override;
+ int get_params();
+ const std::string name() override {
+ return "put_bucket_policy";
+ }
+ RGWOpType get_type() override {
+ return RGW_OP_PUT_BUCKET_POLICY;
+ }
+};
+
+class RGWGetBucketPolicy : public RGWOp {
+ buffer::list policy;
+public:
+ RGWGetBucketPolicy() = default;
+ void send_response() override;
+ int verify_permission() override;
+ uint32_t op_mask() override {
+ return RGW_OP_TYPE_READ;
+ }
+ void execute() override;
+ const std::string name() override {
+ return "get_bucket_policy";
+ }
+ RGWOpType get_type() override {
+ return RGW_OP_GET_BUCKET_POLICY;
+ }
+};
+
+class RGWDeleteBucketPolicy : public RGWOp {
+public:
+ RGWDeleteBucketPolicy() = default;
+ void send_response() override;
+ int verify_permission() override;
+ uint32_t op_mask() override {
+ return RGW_OP_TYPE_WRITE;
+ }
+ void execute() override;
+ int get_params();
+ const std::string name() override {
+ return "delete_bucket_policy";
+ }
+ RGWOpType get_type() override {
+ return RGW_OP_DELETE_BUCKET_POLICY;
+ }
+};
#endif /* CEPH_RGW_OP_H */
diff --git a/src/rgw/rgw_period_history.cc b/src/rgw/rgw_period_history.cc
index eff0e78ad9d..895700f6897 100644
--- a/src/rgw/rgw_period_history.cc
+++ b/src/rgw/rgw_period_history.cc
@@ -69,6 +69,15 @@ bool Cursor::has_next() const
return epoch < history->get_newest_epoch();
}
+bool operator==(const Cursor& lhs, const Cursor& rhs)
+{
+ return lhs.history == rhs.history && lhs.epoch == rhs.epoch;
+}
+
+bool operator!=(const Cursor& lhs, const Cursor& rhs)
+{
+ return !(lhs == rhs);
+}
class RGWPeriodHistory::Impl final {
public:
diff --git a/src/rgw/rgw_period_history.h b/src/rgw/rgw_period_history.h
index 9541493aa14..0796c6116b5 100644
--- a/src/rgw/rgw_period_history.h
+++ b/src/rgw/rgw_period_history.h
@@ -75,6 +75,9 @@ class RGWPeriodHistory final {
void prev() { epoch--; }
void next() { epoch++; }
+ friend bool operator==(const Cursor& lhs, const Cursor& rhs);
+ friend bool operator!=(const Cursor& lhs, const Cursor& rhs);
+
private:
// private constructors for RGWPeriodHistory
friend class RGWPeriodHistory::Impl;
diff --git a/src/rgw/rgw_period_pusher.cc b/src/rgw/rgw_period_pusher.cc
index d6b2eabec39..75b454cf1c6 100644
--- a/src/rgw/rgw_period_pusher.cc
+++ b/src/rgw/rgw_period_pusher.cc
@@ -135,10 +135,11 @@ class RGWPeriodPusher::CRThread {
std::map<std::string, RGWRESTConn>&& conns)
: coroutines(cct, NULL),
http(cct, coroutines.get_completion_mgr()),
- push_all(new PushAllCR(cct, &http, std::move(period), std::move(conns))),
- thread([this] { coroutines.run(push_all.get()); })
+ push_all(new PushAllCR(cct, &http, std::move(period), std::move(conns)))
{
http.set_threaded();
+ // must spawn the CR thread after set_threaded
+ thread = std::thread([this] { coroutines.run(push_all.get()); });
}
~CRThread()
{
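
The reordering above matters because data members are constructed in declaration order: a std::thread started from the member-init list would begin running before the constructor body calls set_threaded(). A minimal sketch of the safe pattern (Manager and CRThreadSketch are illustrative stand-ins):

#include <thread>

struct Manager {
  void set_threaded() {}               // must run before the worker starts
  void run() {}
};

struct CRThreadSketch {
  Manager mgr;
  std::thread worker;                  // default-constructed: not yet running

  CRThreadSketch()
  {
    mgr.set_threaded();                              // configure first...
    worker = std::thread([this] { mgr.run(); });     // ...then start
  }
  ~CRThreadSketch()
  {
    if (worker.joinable())
      worker.join();
  }
};
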
diff --git a/src/rgw/rgw_process.h b/src/rgw/rgw_process.h
index 83c59a4cc37..005f2db6fef 100644
--- a/src/rgw/rgw_process.h
+++ b/src/rgw/rgw_process.h
@@ -17,6 +17,8 @@
#include "common/WorkQueue.h"
#include "common/Throttle.h"
+#include <atomic>
+
#if !defined(dout_subsys)
#define dout_subsys ceph_subsys_rgw
#define def_dout_subsys
@@ -182,7 +184,7 @@ public:
void checkpoint();
void handle_request(RGWRequest* req) override;
void gen_request(const string& method, const string& resource,
- int content_length, atomic_t* fail_flag);
+ int content_length, std::atomic<int64_t>* fail_flag);
void set_access_key(RGWAccessKey& key) { access_key = key; }
};
diff --git a/src/rgw/rgw_quota.cc b/src/rgw/rgw_quota.cc
index 02347ff743d..6a61400a110 100644
--- a/src/rgw/rgw_quota.cc
+++ b/src/rgw/rgw_quota.cc
@@ -26,6 +26,8 @@
#include "rgw_bucket.h"
#include "rgw_user.h"
+#include <atomic>
+
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_rgw
@@ -410,7 +412,7 @@ void UserAsyncRefreshHandler::handle_response(int r)
}
class RGWUserStatsCache : public RGWQuotaCache<rgw_user> {
- atomic_t down_flag;
+ std::atomic<bool> down_flag = { false };
RWLock rwlock;
map<rgw_bucket, rgw_user> modified_buckets;
@@ -569,11 +571,11 @@ public:
}
bool going_down() {
- return (down_flag.read() != 0);
+ return down_flag;
}
void stop() {
- down_flag.set(1);
+ down_flag = true;
rwlock.get_write();
stop_thread(&buckets_sync_thread);
rwlock.unlock();
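
The atomic_t replacements in this file follow one mechanical pattern that recurs throughout the diff; a condensed sketch (class name illustrative):

#include <atomic>

class StopFlagSketch {
  std::atomic<bool> down_flag = { false };       // was: atomic_t down_flag;
public:
  bool going_down() const { return down_flag; }  // was: down_flag.read() != 0
  void stop()             { down_flag = true; }  // was: down_flag.set(1)
};
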
diff --git a/src/rgw/rgw_quota.h b/src/rgw/rgw_quota.h
index d55dcaff952..9291434634c 100644
--- a/src/rgw/rgw_quota.h
+++ b/src/rgw/rgw_quota.h
@@ -15,11 +15,11 @@
#ifndef CEPH_RGW_QUOTA_H
#define CEPH_RGW_QUOTA_H
-
#include "include/utime.h"
-#include "include/atomic.h"
#include "common/lru_map.h"
+#include <atomic>
+
static inline int64_t rgw_rounded_kb(int64_t bytes)
{
return (bytes + 1023) / 1024;
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index eb4fbeda6a2..f3fb6dc2ea3 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -55,6 +55,7 @@ using namespace librados;
#include <string>
#include <iostream>
#include <vector>
+#include <atomic>
#include <list>
#include <map>
#include "auth/Crypto.h" // get_random_bytes()
@@ -71,14 +72,13 @@ using namespace librados;
#include "compressor/Compressor.h"
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_rgw
using namespace std;
-static RGWCache<RGWRados> cached_rados_provider;
-static RGWRados rados_provider;
-
static string notify_oid_prefix = "notify";
static string *notify_oids = NULL;
static string shadow_ns = "shadow";
@@ -1428,33 +1428,62 @@ void RGWPeriod::fork()
realm_epoch++;
}
-int RGWPeriod::update_sync_status()
+static int read_sync_status(RGWRados *store, rgw_meta_sync_status *sync_status)
{
- // must be new period's master zone to write sync status
- if (master_zone != store->get_zone_params().get_id()) {
- ldout(cct, 0) << "my zone " << store->get_zone_params().get_id()
- << " is not period's master zone " << master_zone << dendl;
- return -EINVAL;
+ // initialize a sync status manager to read the status
+ RGWMetaSyncStatusManager mgr(store, store->get_async_rados());
+ int r = mgr.init();
+ if (r < 0) {
+ return r;
}
+ r = mgr.read_sync_status(sync_status);
+ mgr.stop();
+ return r;
+}
- auto mdlog = store->meta_mgr->get_log(get_id());
- const auto num_shards = cct->_conf->rgw_md_log_max_shards;
+int RGWPeriod::update_sync_status(const RGWPeriod &current_period,
+ std::ostream& error_stream,
+ bool force_if_stale)
+{
+ rgw_meta_sync_status status;
+ int r = read_sync_status(store, &status);
+ if (r < 0) {
+ ldout(cct, 0) << "period failed to read sync status: "
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
std::vector<std::string> markers;
- markers.reserve(num_shards);
- // gather the markers for each shard
- // TODO: use coroutines to read them in parallel
- for (int i = 0; i < num_shards; i++) {
- RGWMetadataLogInfo info;
- int r = mdlog->get_info(i, &info);
- if (r < 0) {
- ldout(cct, 0) << "period failed to get metadata log info for shard " << i
- << ": " << cpp_strerror(-r) << dendl;
- return r;
+ const auto current_epoch = current_period.get_realm_epoch();
+ if (current_epoch != status.sync_info.realm_epoch) {
+ // no sync status markers for the current period
+ assert(current_epoch > status.sync_info.realm_epoch);
+ const int behind = current_epoch - status.sync_info.realm_epoch;
+ if (!force_if_stale && current_epoch > 1) {
+ error_stream << "ERROR: This zone is " << behind << " period(s) behind "
+ "the current master zone in metadata sync. If this zone is promoted "
+ "to master, any metadata changes during that time are likely to "
+ "be lost.\n"
+ "Waiting for this zone to catch up on metadata sync (see "
+ "'radosgw-admin sync status') is recommended.\n"
+ "To promote this zone to master anyway, add the flag "
+ "--yes-i-really-mean-it." << std::endl;
+ return -EINVAL;
+ }
+ // empty sync status markers - other zones will skip this period during
+ // incremental metadata sync
+ markers.resize(status.sync_info.num_shards);
+ } else {
+ markers.reserve(status.sync_info.num_shards);
+ for (auto& i : status.sync_markers) {
+ auto& marker = i.second;
+ // filter out markers from other periods
+ if (marker.realm_epoch != current_epoch) {
+ marker.marker.clear();
+ }
+ markers.emplace_back(std::move(marker.marker));
}
- ldout(cct, 15) << "got shard " << i << " marker " << info.marker << dendl;
- markers.emplace_back(std::move(info.marker));
}
std::swap(sync_status, markers);
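
In isolation, the marker filtering above reduces to the sketch below (Marker condenses the rgw sync-marker struct):

#include <map>
#include <string>
#include <utility>
#include <vector>

struct Marker { int realm_epoch; std::string marker; };

// Markers recorded under an older realm epoch are blanked, so other zones
// will skip that period during incremental metadata sync.
static std::vector<std::string>
collect_markers(std::map<int, Marker> shards, int current_epoch)
{
  std::vector<std::string> out;
  out.reserve(shards.size());
  for (auto& i : shards) {
    auto& m = i.second;
    if (m.realm_epoch != current_epoch)
      m.marker.clear();
    out.emplace_back(std::move(m.marker));
  }
  return out;
}
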
@@ -1462,7 +1491,7 @@ int RGWPeriod::update_sync_status()
}
int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period,
- std::ostream& error_stream)
+ std::ostream& error_stream, bool force_if_stale)
{
ldout(cct, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
// gateway must be in the master zone to commit
@@ -1492,7 +1521,7 @@ int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period,
// did the master zone change?
if (master_zone != current_period.get_master_zone()) {
// store the current metadata sync status in the period
- int r = update_sync_status();
+ int r = update_sync_status(current_period, error_stream, force_if_stale);
if (r < 0) {
ldout(cct, 0) << "failed to update metadata sync status: "
<< cpp_strerror(-r) << dendl;
@@ -2356,9 +2385,9 @@ void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
int RGWPutObjProcessor::complete(size_t accounted_size, const string& etag,
real_time *mtime, real_time set_mtime,
map<string, bufferlist>& attrs, real_time delete_at,
- const char *if_match, const char *if_nomatch)
+ const char *if_match, const char *if_nomatch, const string *user_data)
{
- int r = do_complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch);
+ int r = do_complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch, user_data);
if (r < 0)
return r;
@@ -2531,6 +2560,9 @@ int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phan
*pobj = cur_obj;
+ if (!bl.length())
+ return 0;
+
return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
}
@@ -2700,7 +2732,7 @@ int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size, const string&
map<string, bufferlist>& attrs,
real_time delete_at,
const char *if_match,
- const char *if_nomatch) {
+ const char *if_nomatch, const string *user_data) {
int r = complete_writing_data();
if (r < 0)
return r;
@@ -2725,6 +2757,8 @@ int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size, const string&
obj_op.meta.flags = PUT_OBJ_CREATE;
obj_op.meta.olh_epoch = olh_epoch;
obj_op.meta.delete_at = delete_at;
+ obj_op.meta.user_data = user_data;
+
r = obj_op.write_meta(obj_len, accounted_size, attrs);
if (r < 0) {
return r;
@@ -2927,7 +2961,7 @@ protected:
CephContext *cct;
RGWRados *store;
- atomic_t down_flag;
+ std::atomic<bool> down_flag = { false };
string thread_name;
@@ -2943,7 +2977,8 @@ public:
virtual int init() { return 0; }
virtual int process() = 0;
- bool going_down() { return down_flag.read() != 0; }
+ bool going_down() { return down_flag; }
+
void start();
void stop();
};
@@ -2956,7 +2991,7 @@ void RGWRadosThread::start()
void RGWRadosThread::stop()
{
- down_flag.set(1);
+ down_flag = true;
stop_process();
if (worker) {
worker->stop();
@@ -3194,9 +3229,20 @@ public:
return http.set_threaded();
}
int process() override {
- crs.run(create_data_log_trim_cr(store, &http,
- cct->_conf->rgw_data_log_num_shards,
- trim_interval));
+ list<RGWCoroutinesStack*> stacks;
+ auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
+ meta->call(create_meta_log_trim_cr(store, &http,
+ cct->_conf->rgw_md_log_max_shards,
+ trim_interval));
+ stacks.push_back(meta);
+
+ auto data = new RGWCoroutinesStack(store->ctx(), &crs);
+ data->call(create_data_log_trim_cr(store, &http,
+ cct->_conf->rgw_data_log_num_shards,
+ trim_interval));
+ stacks.push_back(data);
+
+ crs.run(stacks);
return 0;
}
};
@@ -4129,7 +4175,8 @@ int RGWRados::init_complete()
/* no point of running sync thread if we don't have a master zone configured
or there is no rest_master_conn */
- if (get_zonegroup().master_zone.empty() || !rest_master_conn) {
+ if (get_zonegroup().master_zone.empty() || !rest_master_conn
+ || current_period.get_id().empty()) {
run_sync_thread = false;
}
@@ -5182,14 +5229,14 @@ int RGWRados::Bucket::List::list_objects(int max, vector<rgw_bucket_dir_entry> *
if (delim_pos >= 0) {
string s = cur_marker.name.substr(0, delim_pos);
s.append(bigger_than_delim);
- cur_marker.set(s);
+ cur_marker = s;
}
}
string skip_after_delim;
while (truncated && count <= max) {
if (skip_after_delim > cur_marker.name) {
- cur_marker.set(skip_after_delim);
+ cur_marker = skip_after_delim;
ldout(cct, 20) << "setting cur_marker=" << cur_marker.name << "[" << cur_marker.instance << "]" << dendl;
}
std::map<string, rgw_bucket_dir_entry> ent_map;
@@ -5370,7 +5417,7 @@ int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
int ret = 0;
- ret = select_bucket_placement(owner, zonegroup_id, placement_rule, bucket,
+ ret = select_bucket_placement(owner, zonegroup_id, placement_rule,
&selected_placement_rule_name, &rule_info);
if (ret < 0)
return ret;
@@ -5414,7 +5461,7 @@ int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
info.quota = *pquota_info;
}
- int r = init_bucket_index(info, bucket_index_max_shards);
+ int r = init_bucket_index(info, info.num_shards);
if (r < 0) {
return r;
}
@@ -5465,7 +5512,7 @@ int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
}
int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& zonegroup_id, const string& request_rule,
- rgw_bucket& bucket, string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
+ string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
{
/* first check that rule exists within the specific zonegroup */
@@ -5506,16 +5553,16 @@ int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& z
if (pselected_rule_name)
*pselected_rule_name = rule;
- return select_bucket_location_by_rule(rule, bucket, rule_info);
+ return select_bucket_location_by_rule(rule, rule_info);
}
-int RGWRados::select_bucket_location_by_rule(const string& location_rule, rgw_bucket& bucket, RGWZonePlacementInfo *rule_info)
+int RGWRados::select_bucket_location_by_rule(const string& location_rule, RGWZonePlacementInfo *rule_info)
{
if (location_rule.empty()) {
/* we can only reach here if we're trying to set a bucket location from a bucket
* created on a different zone, using a legacy / default pool configuration
*/
- return select_legacy_bucket_placement(bucket, rule_info);
+ return select_legacy_bucket_placement(rule_info);
}
/*
@@ -5546,21 +5593,21 @@ int RGWRados::select_bucket_location_by_rule(const string& location_rule, rgw_bu
}
int RGWRados::select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& placement_rule,
- rgw_bucket& bucket, string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
+ string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
{
if (!get_zone_params().placement_pools.empty()) {
return select_new_bucket_location(user_info, zonegroup_id, placement_rule,
- bucket, pselected_rule_name, rule_info);
+ pselected_rule_name, rule_info);
}
if (pselected_rule_name) {
pselected_rule_name->clear();
}
- return select_legacy_bucket_placement(bucket, rule_info);
+ return select_legacy_bucket_placement(rule_info);
}
-int RGWRados::select_legacy_bucket_placement(rgw_bucket& bucket, RGWZonePlacementInfo *rule_info)
+int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info)
{
bufferlist map_bl;
map<string, bufferlist> m;
@@ -6224,7 +6271,6 @@ int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
NULL, /* string *version_id */
NULL, /* string *ptag */
NULL, /* string *petag */
- NULL, /* struct rgw_err *err */
NULL, /* void (*progress_cb)(off_t, void *) */
NULL); /* void *progress_data */
if (r == -ECANCELED || r == -ENOENT) {
@@ -6314,7 +6360,6 @@ int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
nullptr, /* string *version_id */
nullptr, /* string *ptag */
nullptr, /* string *petag */
- nullptr, /* struct rgw_err *err */
nullptr, /* void (*progress_cb)(off_t, void *) */
nullptr); /* void *progress_data */
if (ret == -ECANCELED || ret == -ENOENT) {
@@ -6504,7 +6549,7 @@ int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_si
r = index_op->complete(poolid, epoch, size, accounted_size,
meta.set_mtime, etag, content_type, &acl_bl,
- meta.category, meta.remove_objs);
+ meta.category, meta.remove_objs, meta.user_data);
if (r < 0)
goto done_cancel;
@@ -6674,7 +6719,8 @@ int RGWRados::put_system_obj_impl(rgw_raw_obj& obj, uint64_t size, real_time *mt
}
int RGWRados::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
- off_t ofs, bool exclusive)
+ off_t ofs, bool exclusive,
+ RGWObjVersionTracker *objv_tracker)
{
rgw_rados_ref ref;
rgw_pool pool;
@@ -6688,6 +6734,9 @@ int RGWRados::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
if (exclusive)
op.create(true);
+ if (objv_tracker) {
+ objv_tracker->prepare_op_for_write(&op);
+ }
if (ofs == -1) {
op.write_full(bl);
} else {
@@ -6697,6 +6746,9 @@ int RGWRados::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
if (r < 0)
return r;
+ if (objv_tracker) {
+ objv_tracker->apply_write();
+ }
return 0;
}
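
The tracker protocol added above is two-phase: stamp the op before submission, and record the new version only after the write succeeds. A reduced sketch (Op and VersionTrackerSketch stand in for the librados write op and RGWObjVersionTracker):

struct Op {};

struct VersionTrackerSketch {
  int version = 0;
  void prepare_op_for_write(Op *) {}   // assert/bump the version on the op
  void apply_write() { ++version; }    // commit the new version locally
};

// Returns 0 on success; the tracker advances only after the write lands.
static int write_with_tracker(Op& op, VersionTrackerSketch *tracker)
{
  if (tracker)
    tracker->prepare_op_for_write(&op);
  const int r = 0;                     // stands in for operate(); success
  if (r < 0)
    return r;
  if (tracker)
    tracker->apply_write();
  return 0;
}
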
@@ -6952,7 +7004,7 @@ int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
}
return copy_obj_data(rctx, dest_bucket_info, read_op, obj_size - 1, obj, obj, max_chunk_size, NULL, mtime, attrset,
- RGW_OBJ_CATEGORY_MAIN, 0, real_time(), NULL, NULL, NULL, NULL);
+ RGW_OBJ_CATEGORY_MAIN, 0, real_time(), NULL, NULL, NULL);
}
struct obj_time_weight {
@@ -7171,7 +7223,6 @@ int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
string *version_id,
string *ptag,
ceph::buffer::list *petag,
- struct rgw_err *err,
void (*progress_cb)(off_t, void *),
void *progress_data)
{
@@ -7469,7 +7520,6 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
string *version_id,
string *ptag,
ceph::buffer::list *petag,
- struct rgw_err *err,
void (*progress_cb)(off_t, void *),
void *progress_data)
{
@@ -7499,7 +7549,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
unmod_ptr, high_precision_time,
if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
- olh_epoch, delete_at, version_id, ptag, petag, err, progress_cb, progress_data);
+ olh_epoch, delete_at, version_id, ptag, petag, progress_cb, progress_data);
}
map<string, bufferlist> src_attrs;
@@ -7514,7 +7564,6 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
read_op.params.attrs = &src_attrs;
read_op.params.lastmod = src_mtime;
read_op.params.obj_size = &obj_size;
- read_op.params.perr = err;
ret = read_op.prepare();
if (ret < 0) {
@@ -7594,7 +7643,7 @@ int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
return copy_obj_data(obj_ctx, dest_bucket_info, read_op, obj_size - 1, dest_obj, src_obj,
max_chunk_size, mtime, real_time(), attrs, category, olh_epoch, delete_at,
- version_id, ptag, petag, err);
+ version_id, ptag, petag);
}
RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
@@ -7730,8 +7779,7 @@ int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
real_time delete_at,
string *version_id,
string *ptag,
- ceph::buffer::list *petag,
- struct rgw_err *err)
+ ceph::buffer::list *petag)
{
bufferlist first_chunk;
RGWObjManifest manifest;
@@ -9460,7 +9508,7 @@ int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
const string& content_type,
bufferlist *acl_bl,
RGWObjCategory category,
- list<rgw_obj_index_key> *remove_objs)
+ list<rgw_obj_index_key> *remove_objs, const string *user_data)
{
if (blind) {
return 0;
@@ -9479,6 +9527,9 @@ int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
ent.meta.accounted_size = accounted_size;
ent.meta.mtime = ut;
ent.meta.etag = etag;
+ if (user_data)
+ ent.meta.user_data = *user_data;
+
ACLOwner owner;
if (acl_bl && acl_bl->length()) {
int ret = store->decode_policy(*acl_bl, &owner);
@@ -9760,8 +9811,8 @@ struct get_obj_data : public RefCountedObject {
Mutex data_lock;
list<get_obj_aio_data> aio_data;
RGWGetDataCB *client_cb;
- atomic_t cancelled;
- atomic_t err_code;
+ std::atomic<bool> cancelled = { false };
+ std::atomic<int64_t> err_code = { 0 };
Throttle throttle;
list<bufferlist> read_list;
@@ -9773,16 +9824,16 @@ struct get_obj_data : public RefCountedObject {
throttle(cct, "get_obj_data", cct->_conf->rgw_get_obj_window_size, false) {}
~get_obj_data() override { }
void set_cancelled(int r) {
- cancelled.set(1);
- err_code.set(r);
+ cancelled = true;
+ err_code = r;
}
bool is_cancelled() {
- return cancelled.read() == 1;
+ return cancelled;
}
int get_err_code() {
- return err_code.read();
+ return err_code;
}
int wait_next_io(bool *done) {
@@ -11626,7 +11677,7 @@ int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_
if (filter && !filter->filter(oid, oid))
continue;
- e.key.set(oid);
+ e.key = oid;
objs.push_back(e);
}
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index e4947298e62..e60ccbdf0be 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -1787,7 +1787,8 @@ class RGWPeriod
const string get_period_oid_prefix();
// gather the metadata sync status for each shard; only for use on master zone
- int update_sync_status();
+ int update_sync_status(const RGWPeriod &current_period,
+ std::ostream& error_stream, bool force_if_stale);
public:
RGWPeriod() : epoch(0), cct(NULL), store(NULL) {}
@@ -1860,7 +1861,7 @@ public:
// commit a staging period; only for use on master zone
int commit(RGWRealm& realm, const RGWPeriod &current_period,
- std::ostream& error_stream);
+ std::ostream& error_stream, bool force_if_stale = false);
void encode(bufferlist& bl) const {
ENCODE_START(1, 1, bl);
@@ -2198,7 +2199,7 @@ class RGWRados
void get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result);
- atomic64_t max_req_id;
+ std::atomic<int64_t> max_req_id = { 0 };
Mutex lock;
Mutex watchers_lock;
SafeTimer *timer;
@@ -2297,7 +2298,7 @@ protected:
RGWPeriod current_period;
public:
- RGWRados() : max_req_id(0), lock("rados_timer_lock"), watchers_lock("watchers_lock"), timer(NULL),
+ RGWRados() : lock("rados_timer_lock"), watchers_lock("watchers_lock"), timer(NULL),
gc(NULL), lc(NULL), obj_expirer(NULL), use_gc_thread(false), use_lc_thread(false), quota_threads(false),
run_sync_thread(false), async_rados(nullptr), meta_notifier(NULL),
data_notifier(NULL), meta_sync_processor_thread(NULL),
@@ -2319,7 +2320,7 @@ public:
meta_mgr(NULL), data_log(NULL) {}
uint64_t get_new_req_id() {
- return max_req_id.inc();
+ return ++max_req_id;
}
librados::IoCtx* get_lc_pool_ctx() {
@@ -2528,11 +2529,11 @@ public:
*/
int init_bucket_index(RGWBucketInfo& bucket_info, int num_shards);
int select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& rule,
- rgw_bucket& bucket, string *pselected_rule_name, RGWZonePlacementInfo *rule_info);
- int select_legacy_bucket_placement(rgw_bucket& bucket, RGWZonePlacementInfo *rule_info);
+ string *pselected_rule_name, RGWZonePlacementInfo *rule_info);
+ int select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info);
int select_new_bucket_location(RGWUserInfo& user_info, const string& zonegroup_id, const string& rule,
- rgw_bucket& bucket, string *pselected_rule_name, RGWZonePlacementInfo *rule_info);
- int select_bucket_location_by_rule(const string& location_rule, rgw_bucket& bucket, RGWZonePlacementInfo *rule_info);
+ string *pselected_rule_name, RGWZonePlacementInfo *rule_info);
+ int select_bucket_location_by_rule(const string& location_rule, RGWZonePlacementInfo *rule_info);
void create_bucket_id(string *bucket_id);
bool get_obj_data_pool(const string& placement_rule, const rgw_obj& obj, rgw_pool *pool);
@@ -2594,9 +2595,8 @@ public:
ceph::real_time *lastmod;
uint64_t *obj_size;
map<string, bufferlist> *attrs;
- struct rgw_err *perr;
- StatParams() : lastmod(NULL), obj_size(NULL), attrs(NULL), perr(NULL) {}
+ StatParams() : lastmod(NULL), obj_size(NULL), attrs(NULL) {}
} stat_params;
struct ReadParams {
@@ -2707,9 +2707,8 @@ public:
ceph::real_time *lastmod;
uint64_t *obj_size;
map<string, bufferlist> *attrs;
- struct rgw_err *perr;
- Params() : lastmod(NULL), obj_size(NULL), attrs(NULL), perr(NULL) {}
+ Params() : lastmod(NULL), obj_size(NULL), attrs(NULL) {}
} params;
explicit Read(RGWRados::Object *_source) : source(_source) {}
@@ -2740,10 +2739,11 @@ public:
uint64_t olh_epoch;
ceph::real_time delete_at;
bool canceled;
+ const string *user_data;
MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
remove_objs(NULL), category(RGW_OBJ_CATEGORY_MAIN), flags(0),
- if_match(NULL), if_nomatch(NULL), olh_epoch(0), canceled(false) {}
+ if_match(NULL), if_nomatch(NULL), olh_epoch(0), canceled(false), user_data(nullptr) {}
} meta;
explicit Write(RGWRados::Object *_target) : target(_target) {}
@@ -2876,7 +2876,7 @@ public:
uint64_t accounted_size, ceph::real_time& ut,
const string& etag, const string& content_type,
bufferlist *acl_bl, RGWObjCategory category,
- list<rgw_obj_index_key> *remove_objs);
+ list<rgw_obj_index_key> *remove_objs, const string *user_data = nullptr);
int complete_del(int64_t poolid, uint64_t epoch,
ceph::real_time& removed_mtime, /* mtime of removed object */
list<rgw_obj_index_key> *remove_objs);
@@ -2922,7 +2922,8 @@ public:
ceph::real_time set_mtime /* 0 for don't set */);
virtual int put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
- off_t ofs, bool exclusive);
+ off_t ofs, bool exclusive,
+ RGWObjVersionTracker *objv_tracker = nullptr);
int aio_put_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
off_t ofs, bool exclusive, void **handle);
@@ -3020,7 +3021,6 @@ public:
string *version_id,
string *ptag,
ceph::buffer::list *petag,
- struct rgw_err *err,
void (*progress_cb)(off_t, void *),
void *progress_data);
/**
@@ -3035,7 +3035,6 @@ public:
* parameter, source object attributes are not copied;
* ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
* are overwritten by values contained in attrs parameter.
- * err: stores any errors resulting from the get of the original object
* Returns: 0 on success, -ERR# otherwise.
*/
int copy_obj(RGWObjectCtx& obj_ctx,
@@ -3064,7 +3063,6 @@ public:
string *version_id,
string *ptag,
ceph::buffer::list *petag,
- struct rgw_err *err,
void (*progress_cb)(off_t, void *),
void *progress_data);
@@ -3082,8 +3080,7 @@ public:
ceph::real_time delete_at,
string *version_id,
string *ptag,
- ceph::buffer::list *petag,
- struct rgw_err *err);
+ ceph::buffer::list *petag);
int check_bucket_empty(RGWBucketInfo& bucket_info);
@@ -3655,7 +3652,7 @@ protected:
virtual int do_complete(size_t accounted_size, const string& etag,
ceph::real_time *mtime, ceph::real_time set_mtime,
map<string, bufferlist>& attrs, ceph::real_time delete_at,
- const char *if_match, const char *if_nomatch) = 0;
+ const char *if_match, const char *if_nomatch, const string *user_data) = 0;
public:
RGWPutObjProcessor(RGWObjectCtx& _obj_ctx, RGWBucketInfo& _bi) : store(NULL),
@@ -3672,7 +3669,7 @@ public:
int complete(size_t accounted_size, const string& etag,
ceph::real_time *mtime, ceph::real_time set_mtime,
map<string, bufferlist>& attrs, ceph::real_time delete_at,
- const char *if_match = NULL, const char *if_nomatch = NULL);
+ const char *if_match = NULL, const char *if_nomatch = NULL, const string *user_data = nullptr);
CephContext *ctx();
@@ -3750,7 +3747,7 @@ protected:
int do_complete(size_t accounted_size, const string& etag,
ceph::real_time *mtime, ceph::real_time set_mtime,
map<string, bufferlist>& attrs, ceph::real_time delete_at,
- const char *if_match, const char *if_nomatch) override;
+ const char *if_match, const char *if_nomatch, const string *user_data) override;
int prepare_next_part(off_t ofs);
int complete_parts();
@@ -3865,7 +3862,7 @@ protected:
int do_complete(size_t accounted_size, const string& etag,
ceph::real_time *mtime, ceph::real_time set_mtime,
map<string, bufferlist>& attrs, ceph::real_time delete_at,
- const char *if_match, const char *if_nomatch) override;
+ const char *if_match, const char *if_nomatch, const string *user_data) override;
public:
bool immutable_head() { return true; }
RGWPutObjProcessor_Multipart(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, uint64_t _p, req_state *_s) :
diff --git a/src/rgw/rgw_realm_reloader.cc b/src/rgw/rgw_realm_reloader.cc
index a1d17831798..8bd65b45d9f 100644
--- a/src/rgw/rgw_realm_reloader.cc
+++ b/src/rgw/rgw_realm_reloader.cc
@@ -64,12 +64,10 @@ void RGWRealmReloader::handle_notify(RGWRealmNotify type,
reload_scheduled = new C_Reload(this);
cond.SignalOne(); // wake reload() if it blocked on a bad configuration
- // schedule reload() with a delay so we can batch up changes
- auto delay = cct->_conf->rgw_realm_reconfigure_delay;
- timer.add_event_after(delay, reload_scheduled);
+ // schedule reload() without delay
+ timer.add_event_after(0, reload_scheduled);
- ldout(cct, 4) << "Notification on realm, reconfiguration scheduled in "
- << delay << 's' << dendl;
+ ldout(cct, 4) << "Notification on realm, reconfiguration scheduled" << dendl;
}
void RGWRealmReloader::reload()
diff --git a/src/rgw/rgw_realm_reloader.h b/src/rgw/rgw_realm_reloader.h
index 3de54b1aabc..e4e3a436342 100644
--- a/src/rgw/rgw_realm_reloader.h
+++ b/src/rgw/rgw_realm_reloader.h
@@ -10,8 +10,8 @@
class RGWRados;
/**
- * RGWRealmReloader responds to notifications by recreating RGWRados with the
- * updated realm configuration.
+ * RGWRealmReloader responds to new period notifications by recreating RGWRados
+ * with the updated realm configuration.
*/
class RGWRealmReloader : public RGWRealmWatcher::Watcher {
public:
@@ -20,8 +20,7 @@ class RGWRealmReloader : public RGWRealmWatcher::Watcher {
* is required to ensure that they stop issuing requests on the old
* RGWRados instance, and restart with the updated configuration.
*
- * This abstraction avoids a depency on class RGWFrontend, which is only
- * defined in rgw_main.cc
+ * This abstraction avoids a dependency on class RGWFrontend.
*/
class Pauser {
public:
@@ -50,9 +49,9 @@ class RGWRealmReloader : public RGWRealmWatcher::Watcher {
Pauser *const frontends;
/// reload() takes a significant amount of time, so we don't want to run
- /// it in the handle_notify() thread. we choose a timer thread because we
- /// also want to add a delay (see rgw_realm_reconfigure_delay) so that we
- /// can batch up notifications within that window
+ /// it in the handle_notify() thread. we choose a timer thread instead of a
+ /// Finisher because it allows us to cancel events that were scheduled while
+ /// reload() is still running
SafeTimer timer;
Mutex mutex; //< protects access to timer and reload_scheduled
Cond cond; //< to signal reload() after an invalid realm config
diff --git a/src/rgw/rgw_replica_log.h b/src/rgw/rgw_replica_log.h
index 45cac52d68c..699600c0fe4 100644
--- a/src/rgw/rgw_replica_log.h
+++ b/src/rgw/rgw_replica_log.h
@@ -22,8 +22,6 @@
class RGWRados;
class CephContext;
-using namespace std;
-
#define META_REPLICA_LOG_OBJ_PREFIX "meta.replicalog."
#define DATA_REPLICA_LOG_OBJ_PREFIX "data.replicalog."
diff --git a/src/rgw/rgw_request.h b/src/rgw/rgw_request.h
index d9fc69bee1c..3c835f7b1c9 100644
--- a/src/rgw/rgw_request.h
+++ b/src/rgw/rgw_request.h
@@ -12,8 +12,11 @@
#if defined(WITH_RADOSGW_FCGI_FRONTEND)
#include "rgw_fcgi.h"
#endif
+
#include "common/QueueRing.h"
+#include <atomic>
+
struct RGWRequest
{
uint64_t id;
@@ -56,10 +59,10 @@ struct RGWLoadGenRequest : public RGWRequest {
string method;
string resource;
int content_length;
- atomic_t* fail_flag;
+ std::atomic<int64_t>* fail_flag = nullptr;
RGWLoadGenRequest(uint64_t req_id, const string& _m, const string& _r, int _cl,
- atomic_t *ff)
+ std::atomic<int64_t> *ff)
: RGWRequest(req_id), method(_m), resource(_r), content_length(_cl),
fail_flag(ff) {}
};
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
index 175ceb9606b..140d434e737 100644
--- a/src/rgw/rgw_rest.cc
+++ b/src/rgw/rgw_rest.cc
@@ -1,6 +1,7 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
+
#include <errno.h>
#include <limits.h>
@@ -18,8 +19,6 @@
#include "rgw_rest_s3.h"
#include "rgw_swift_auth.h"
#include "rgw_cors_s3.h"
-#include "rgw_http_errors.h"
-#include "rgw_lib.h"
#include "rgw_client_io.h"
#include "rgw_resolve.h"
@@ -28,6 +27,53 @@
#define dout_subsys ceph_subsys_rgw
+struct rgw_http_status_code {
+ int code;
+ const char *name;
+};
+
+const static struct rgw_http_status_code http_codes[] = {
+ { 100, "Continue" },
+ { 200, "OK" },
+ { 201, "Created" },
+ { 202, "Accepted" },
+ { 204, "No Content" },
+ { 205, "Reset Content" },
+ { 206, "Partial Content" },
+ { 207, "Multi Status" },
+ { 208, "Already Reported" },
+ { 300, "Multiple Choices" },
+ { 301, "Moved Permanently" },
+ { 302, "Found" },
+ { 303, "See Other" },
+ { 304, "Not Modified" },
+  { 305, "Use Proxy" },
+ { 306, "Switch Proxy" },
+ { 307, "Temporary Redirect" },
+ { 308, "Permanent Redirect" },
+ { 400, "Bad Request" },
+ { 401, "Unauthorized" },
+ { 402, "Payment Required" },
+ { 403, "Forbidden" },
+ { 404, "Not Found" },
+ { 405, "Method Not Allowed" },
+ { 406, "Not Acceptable" },
+ { 407, "Proxy Authentication Required" },
+ { 408, "Request Timeout" },
+ { 409, "Conflict" },
+ { 410, "Gone" },
+ { 411, "Length Required" },
+ { 412, "Precondition Failed" },
+ { 413, "Request Entity Too Large" },
+ { 414, "Request-URI Too Long" },
+ { 415, "Unsupported Media Type" },
+ { 416, "Requested Range Not Satisfiable" },
+ { 417, "Expectation Failed" },
+ { 422, "Unprocessable Entity" },
+ { 500, "Internal Server Error" },
+ { 501, "Not Implemented" },
+ { 0, NULL },
+};
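
A plausible lookup over this table, relying on the {0, NULL} sentinel (the helper name is hypothetical; the actual accessor lies outside this excerpt):

static const char *http_status_name_sketch(int code)
{
  for (const struct rgw_http_status_code *e = http_codes; e->code; ++e) {
    if (e->code == code)
      return e->name;
  }
  return "Unknown";   // unmapped codes; callers decide how to react
}
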
struct rgw_http_attr {
const char *rgw_attr;
@@ -309,45 +355,6 @@ void rgw_flush_formatter(struct req_state *s, Formatter *formatter)
}
}
-void set_req_state_err(struct rgw_err& err, /* out */
- int err_no, /* in */
- const int prot_flags) /* in */
-{
- const struct rgw_http_errors *r;
-
- if (err_no < 0)
- err_no = -err_no;
- err.ret = -err_no;
- if (prot_flags & RGW_REST_SWIFT) {
- r = search_err(err_no, RGW_HTTP_SWIFT_ERRORS,
- ARRAY_LEN(RGW_HTTP_SWIFT_ERRORS));
- if (r) {
- err.http_ret = r->http_ret;
- err.s3_code = r->s3_code;
- return;
- }
- }
-
- r = search_err(err_no, RGW_HTTP_ERRORS, ARRAY_LEN(RGW_HTTP_ERRORS));
- if (r) {
- err.http_ret = r->http_ret;
- err.s3_code = r->s3_code;
- return;
- }
- dout(0) << "WARNING: set_req_state_err err_no=" << err_no
- << " resorting to 500" << dendl;
-
- err.http_ret = 500;
- err.s3_code = "UnknownError";
-}
-
-void set_req_state_err(struct req_state * const s, const int err_no)
-{
- if (s) {
- set_req_state_err(s->err, err_no, s->prot_flags);
- }
-}
-
void dump_errno(int http_ret, string& out) {
stringstream ss;
@@ -658,7 +665,7 @@ void end_header(struct req_state* s, RGWOp* op, const char *content_type,
dump_trans_id(s);
- if ((!s->err.is_err()) &&
+ if ((!s->is_err()) &&
(s->bucket_info.owner != s->user->user_id) &&
(s->bucket_info.requester_pays)) {
dump_header(s, "x-amz-request-charged", "requester");
@@ -675,7 +682,7 @@ void end_header(struct req_state* s, RGWOp* op, const char *content_type,
/* do not send content type if content length is zero
and the content type was not set by the user */
if (force_content_type ||
- (!content_type && s->formatter->get_len() != 0) || s->err.is_err()){
+ (!content_type && s->formatter->get_len() != 0) || s->is_err()){
switch (s->format) {
case RGW_FORMAT_XML:
ctype = "application/xml";
@@ -694,24 +701,9 @@ void end_header(struct req_state* s, RGWOp* op, const char *content_type,
ctype.append("; charset=utf-8");
content_type = ctype.c_str();
}
- if (!force_no_error && s->err.is_err()) {
+ if (!force_no_error && s->is_err()) {
dump_start(s);
- if (s->format != RGW_FORMAT_HTML) {
- s->formatter->open_object_section("Error");
- }
- if (!s->err.s3_code.empty())
- s->formatter->dump_string("Code", s->err.s3_code);
- if (!s->err.message.empty())
- s->formatter->dump_string("Message", s->err.message);
- if (!s->bucket_name.empty()) // TODO: connect to expose_bucket
- s->formatter->dump_string("BucketName", s->bucket_name);
- if (!s->trans_id.empty()) // TODO: connect to expose_bucket or another toggle
- s->formatter->dump_string("RequestId", s->trans_id);
- s->formatter->dump_string("HostId", s->host_id);
- if (s->format != RGW_FORMAT_HTML) {
- s->formatter->close_section();
- }
- s->formatter->output_footer();
+ dump(s);
dump_content_length(s, s->formatter->get_len());
} else {
if (proposed_content_length == CHUNKED_TRANSFER_ENCODING) {
@@ -736,8 +728,8 @@ void end_header(struct req_state* s, RGWOp* op, const char *content_type,
rgw_flush_formatter_and_reset(s, s->formatter);
}
-void abort_early(struct req_state *s, RGWOp *op, int err_no,
- RGWHandler* handler)
+void abort_early(struct req_state *s, RGWOp* op, int err_no,
+ RGWHandler* handler)
{
string error_content("");
if (!s->formatter) {
@@ -764,12 +756,10 @@ void abort_early(struct req_state *s, RGWOp *op, int err_no,
// returned 0. If non-zero, we need to continue here.
if (err_no) {
// Watch out, we might have a custom error state already set!
- if (s->err.http_ret && s->err.http_ret != 200) {
- dump_errno(s);
- } else {
+ if (!s->err.http_ret || s->err.http_ret == 200) {
set_req_state_err(s, err_no);
- dump_errno(s);
}
+ dump_errno(s);
dump_bucket_from_state(s);
if (err_no == -ERR_PERMANENT_REDIRECT || err_no == -ERR_WEBSITE_REDIRECT) {
string dest_uri;
@@ -1236,6 +1226,308 @@ int RGWPutObj_ObjStore::get_data(bufferlist& bl)
return len;
}
+
+/*
+ * parses params in the format: 'first; param1=foo; param2=bar'
+ */
+void RGWPostObj_ObjStore::parse_boundary_params(const std::string& params_str,
+ std::string& first,
+ std::map<std::string,
+ std::string>& params)
+{
+ size_t pos = params_str.find(';');
+ if (std::string::npos == pos) {
+ first = rgw_trim_whitespace(params_str);
+ return;
+ }
+
+ first = rgw_trim_whitespace(params_str.substr(0, pos));
+ pos++;
+
+ while (pos < params_str.size()) {
+ size_t end = params_str.find(';', pos);
+ if (std::string::npos == end) {
+ end = params_str.size();
+ }
+
+ std::string param = params_str.substr(pos, end - pos);
+ size_t eqpos = param.find('=');
+
+ if (std::string::npos != eqpos) {
+ std::string param_name = rgw_trim_whitespace(param.substr(0, eqpos));
+ std::string val = rgw_trim_quotes(param.substr(eqpos + 1));
+ params[std::move(param_name)] = std::move(val);
+ } else {
+ params[rgw_trim_whitespace(param)] = "";
+ }
+
+ pos = end + 1;
+ }
+}
+
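For reference, a self-contained sketch of the 'first; param1=foo; param2=bar' split that parse_boundary_params() performs. The trim() helper is a hypothetical stand-in for rgw_trim_whitespace/rgw_trim_quotes:

#include <iostream>
#include <map>
#include <string>

static std::string trim(std::string s) {
  const char* ws = " \t";
  s.erase(0, s.find_first_not_of(ws));
  s.erase(s.find_last_not_of(ws) + 1);
  return s;
}

int main() {
  const std::string input = "multipart/form-data; boundary=xyz; charset=utf-8";
  std::map<std::string, std::string> params;

  size_t pos = input.find(';');
  std::string first = trim(input.substr(0, pos));
  while (pos != std::string::npos) {
    size_t end = input.find(';', ++pos);
    std::string param = input.substr(pos, end - pos);
    size_t eq = param.find('=');
    if (eq != std::string::npos)
      params[trim(param.substr(0, eq))] = trim(param.substr(eq + 1));
    else
      params[trim(param)] = "";
    pos = end;
  }
  std::cout << first << " boundary=" << params["boundary"] << "\n";
}
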
+int RGWPostObj_ObjStore::parse_part_field(const std::string& line,
+ std::string& field_name, /* out */
+ post_part_field& field) /* out */
+{
+ size_t pos = line.find(':');
+ if (pos == string::npos)
+ return -EINVAL;
+
+ field_name = line.substr(0, pos);
+ if (pos >= line.size() - 1)
+ return 0;
+
+ parse_boundary_params(line.substr(pos + 1), field.val, field.params);
+
+ return 0;
+}
+
+static bool is_crlf(const char *s)
+{
+ return (*s == '\r' && *(s + 1) == '\n');
+}
+
+/*
+ * find the index of the boundary, if exists, or optionally the next end of line
+ * also returns how many bytes to skip
+ */
+static int index_of(ceph::bufferlist& bl,
+ uint64_t max_len,
+ const std::string& str,
+ const bool check_crlf,
+ bool& reached_boundary,
+ int& skip)
+{
+ reached_boundary = false;
+ skip = 0;
+
+ if (str.size() < 2) // we assume boundary is at least 2 chars (makes it easier with crlf checks)
+ return -EINVAL;
+
+ if (bl.length() < str.size())
+ return -1;
+
+ const char *buf = bl.c_str();
+ const char *s = str.c_str();
+
+ if (max_len > bl.length())
+ max_len = bl.length();
+
+ for (uint64_t i = 0; i < max_len; i++, buf++) {
+ if (check_crlf &&
+ i >= 1 &&
+ is_crlf(buf - 1)) {
+ return i + 1; // skip the crlf
+ }
+ if ((i < max_len - str.size() + 1) &&
+ (buf[0] == s[0] && buf[1] == s[1]) &&
+ (strncmp(buf, s, str.size()) == 0)) {
+ reached_boundary = true;
+ skip = str.size();
+
+ /* oh, great, now we need to swallow the preceding crlf
+       * if it exists
+ */
+ if ((i >= 2) &&
+ is_crlf(buf - 2)) {
+ i -= 2;
+ skip += 2;
+ }
+ return i;
+ }
+ }
+
+ return -1;
+}
+
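To make the contract above concrete: on success the returned index excludes a CRLF that immediately precedes the boundary, and skip covers both. An illustrative (non-Ceph) check of that arithmetic using std::string::find:

#include <cassert>
#include <string>

int main() {
  const std::string boundary = "--xyz";
  const std::string buf = "field data\r\n--xyz\r\nnext";

  size_t idx = buf.find(boundary);
  size_t skip = boundary.size();
  if (idx >= 2 && buf.compare(idx - 2, 2, "\r\n") == 0) {
    idx -= 2;      // hand back only "field data"
    skip += 2;     // and swallow the CRLF together with the boundary
  }
  assert(buf.substr(0, idx) == "field data");
  assert(idx + skip == buf.find("\r\nnext"));
  return 0;
}
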
+int RGWPostObj_ObjStore::read_with_boundary(ceph::bufferlist& bl,
+ uint64_t max,
+ const bool check_crlf,
+ bool& reached_boundary,
+ bool& done)
+{
+ uint64_t cl = max + 2 + boundary.size();
+
+ if (max > in_data.length()) {
+ uint64_t need_to_read = cl - in_data.length();
+
+ bufferptr bp(need_to_read);
+
+ const auto read_len = recv_body(s, bp.c_str(), need_to_read);
+ if (read_len < 0) {
+ return read_len;
+ }
+ in_data.append(bp, 0, read_len);
+ }
+
+ done = false;
+ int skip;
+ const int index = index_of(in_data, cl, boundary, check_crlf,
+ reached_boundary, skip);
+ if (index >= 0) {
+ max = index;
+ }
+
+ if (max > in_data.length()) {
+ max = in_data.length();
+ }
+
+ bl.substr_of(in_data, 0, max);
+
+ ceph::bufferlist new_read_data;
+
+ /*
+ * now we need to skip boundary for next time, also skip any crlf, or
+   * check to see if it's the final boundary (marked with "--" at the end)
+ */
+ if (reached_boundary) {
+ int left = in_data.length() - max;
+ if (left < skip + 2) {
+ int need = skip + 2 - left;
+ bufferptr boundary_bp(need);
+ const int r = recv_body(s, boundary_bp.c_str(), need);
+ if (r < 0) {
+ return r;
+ }
+ in_data.append(boundary_bp);
+ }
+ max += skip; // skip boundary for next time
+ if (in_data.length() >= max + 2) {
+ const char *data = in_data.c_str();
+ if (is_crlf(data + max)) {
+ max += 2;
+ } else {
+ if (*(data + max) == '-' &&
+ *(data + max + 1) == '-') {
+ done = true;
+ max += 2;
+ }
+ }
+ }
+ }
+
+ new_read_data.substr_of(in_data, max, in_data.length() - max);
+ in_data = new_read_data;
+
+ return 0;
+}
+
+int RGWPostObj_ObjStore::read_line(ceph::bufferlist& bl,
+ const uint64_t max,
+ bool& reached_boundary,
+ bool& done)
+{
+ return read_with_boundary(bl, max, true, reached_boundary, done);
+}
+
+int RGWPostObj_ObjStore::read_data(ceph::bufferlist& bl,
+ const uint64_t max,
+ bool& reached_boundary,
+ bool& done)
+{
+ return read_with_boundary(bl, max, false, reached_boundary, done);
+}
+
+
+int RGWPostObj_ObjStore::read_form_part_header(struct post_form_part* const part,
+ bool& done)
+{
+ bufferlist bl;
+ bool reached_boundary;
+ uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size;
+ int r = read_line(bl, chunk_size, reached_boundary, done);
+ if (r < 0) {
+ return r;
+ }
+
+ if (done) {
+ return 0;
+ }
+
+ if (reached_boundary) { // skip the first boundary
+ r = read_line(bl, chunk_size, reached_boundary, done);
+ if (r < 0) {
+ return r;
+ } else if (done) {
+ return 0;
+ }
+ }
+
+ while (true) {
+ /*
+ * iterate through fields
+ */
+ std::string line = rgw_trim_whitespace(string(bl.c_str(), bl.length()));
+
+ if (line.empty()) {
+ break;
+ }
+
+ struct post_part_field field;
+
+ string field_name;
+ r = parse_part_field(line, field_name, field);
+ if (r < 0) {
+ return r;
+ }
+
+ part->fields[field_name] = field;
+
+ if (stringcasecmp(field_name, "Content-Disposition") == 0) {
+ part->name = field.params["name"];
+ }
+
+ if (reached_boundary) {
+ break;
+ }
+
+    r = read_line(bl, chunk_size, reached_boundary, done);
+    if (r < 0) {
+      return r;
+    }
+ }
+
+ return 0;
+}
+
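A dependency-free sketch of the shape of the loop above: part headers are consumed line by line and the first empty line ends them, which is what the break on line.empty() implements. std::istringstream stands in for the boundary-aware reader:

#include <iostream>
#include <sstream>
#include <string>

int main() {
  // one form part, as the header loop would see it line by line
  std::istringstream body(
      "Content-Disposition: form-data; name=\"file\"; filename=\"a.txt\"\n"
      "Content-Type: text/plain\n"
      "\n"
      "payload starts here\n");

  std::string line;
  while (std::getline(body, line) && !line.empty()) {
    size_t pos = line.find(':');                  // parse_part_field's split
    std::cout << "header: " << line.substr(0, pos) << "\n";
  }
  // the empty line marks the end of the part's headers
  return 0;
}
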
+bool RGWPostObj_ObjStore::part_str(parts_collection_t& parts,
+ const std::string& name,
+ std::string* val)
+{
+ const auto iter = parts.find(name);
+ if (std::end(parts) == iter) {
+ return false;
+ }
+
+ ceph::bufferlist& data = iter->second.data;
+ std::string str = string(data.c_str(), data.length());
+ *val = rgw_trim_whitespace(str);
+ return true;
+}
+
+std::string RGWPostObj_ObjStore::get_part_str(parts_collection_t& parts,
+ const std::string& name,
+ const std::string& def_val)
+{
+ std::string val;
+
+ if (part_str(parts, name, &val)) {
+ return val;
+ } else {
+ return rgw_trim_whitespace(def_val);
+ }
+}
+
+bool RGWPostObj_ObjStore::part_bl(parts_collection_t& parts,
+ const std::string& name,
+ ceph::bufferlist* pbl)
+{
+ const auto iter = parts.find(name);
+ if (std::end(parts) == iter) {
+ return false;
+ }
+
+ *pbl = iter->second.data;
+ return true;
+}
+
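Usage-shaped sketch of the part_str()/get_part_str() pattern above: look a named part up, fall back to a default when it is absent. A plain std::map<string, string> stands in for parts_collection_t here:

#include <iostream>
#include <map>
#include <string>

static std::string get_part_str(const std::map<std::string, std::string>& parts,
                                const std::string& name,
                                const std::string& def_val = std::string()) {
  const auto iter = parts.find(name);
  return iter == std::end(parts) ? def_val : iter->second;
}

int main() {
  const std::map<std::string, std::string> parts{{"key", "uploads/${filename}"}};
  std::cout << get_part_str(parts, "key") << "\n";           // found
  std::cout << get_part_str(parts, "expires", "0") << "\n";  // falls back to "0"
  return 0;
}
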
int RGWPostObj_ObjStore::verify_params()
{
/* check that we have enough memory to store the object
@@ -1252,6 +1544,51 @@ int RGWPostObj_ObjStore::verify_params()
return 0;
}
+int RGWPostObj_ObjStore::get_params()
+{
+ if (s->expect_cont) {
+ /* OK, here it really gets ugly. With POST, the params are embedded in the
+ * request body, so we need to continue before being able to actually look
+     * at them. This diverges from the usual request flow. */
+ dump_continue(s);
+ s->expect_cont = false;
+ }
+
+ std::string req_content_type_str = s->info.env->get("CONTENT_TYPE", "");
+ std::string req_content_type;
+ std::map<std::string, std::string> params;
+ parse_boundary_params(req_content_type_str, req_content_type, params);
+
+ if (req_content_type.compare("multipart/form-data") != 0) {
+ err_msg = "Request Content-Type is not multipart/form-data";
+ return -EINVAL;
+ }
+
+ if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
+ ldout(s->cct, 20) << "request content_type_str="
+ << req_content_type_str << dendl;
+ ldout(s->cct, 20) << "request content_type params:" << dendl;
+
+ for (const auto& pair : params) {
+ ldout(s->cct, 20) << " " << pair.first << " -> " << pair.second
+ << dendl;
+ }
+ }
+
+ const auto iter = params.find("boundary");
+ if (std::end(params) == iter) {
+ err_msg = "Missing multipart boundary specification";
+ return -EINVAL;
+ }
+
+ /* Create the boundary. */
+ boundary = "--";
+ boundary.append(iter->second);
+
+ return 0;
+}
+
+
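The boundary handling above follows RFC 2046: the delimiter that actually appears in the body is the Content-Type boundary parameter prefixed with two hyphens, and the closing delimiter carries two more. A tiny sketch under that assumption:

#include <cassert>
#include <string>

int main() {
  // as if parsed from: Content-Type: multipart/form-data; boundary=9f3e
  const std::string boundary_param = "9f3e";
  std::string boundary = "--";
  boundary.append(boundary_param);
  assert(boundary == "--9f3e");
  // the trailing "--" is what read_with_boundary() checks to set done
  const std::string final_boundary = boundary + "--";
  assert(final_boundary == "--9f3e--");
  return 0;
}
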
int RGWPutACLs_ObjStore::get_params()
{
const auto max_size = s->cct->_conf->rgw_max_put_param_size;
diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h
index c618f532f3c..41597a2d476 100644
--- a/src/rgw/rgw_rest.h
+++ b/src/rgw/rgw_rest.h
@@ -218,6 +218,67 @@ public:
class RGWPostObj_ObjStore : public RGWPostObj
{
+ std::string boundary;
+
+public:
+ struct post_part_field {
+ std::string val;
+ std::map<std::string, std::string> params;
+ };
+
+ struct post_form_part {
+ std::string name;
+ std::map<std::string, post_part_field, ltstr_nocase> fields;
+ ceph::bufferlist data;
+ };
+
+protected:
+ using parts_collection_t = \
+ std::map<std::string, post_form_part, const ltstr_nocase>;
+
+ std::string err_msg;
+ ceph::bufferlist in_data;
+
+ int read_with_boundary(ceph::bufferlist& bl,
+ uint64_t max,
+ bool check_eol,
+ bool& reached_boundary,
+ bool& done);
+
+ int read_line(ceph::bufferlist& bl,
+ uint64_t max,
+ bool& reached_boundary,
+ bool& done);
+
+ int read_data(ceph::bufferlist& bl,
+ uint64_t max,
+ bool& reached_boundary,
+ bool& done);
+
+ int read_form_part_header(struct post_form_part *part, bool& done);
+
+ int get_params() override;
+
+ static int parse_part_field(const std::string& line,
+ std::string& field_name, /* out */
+ post_part_field& field); /* out */
+
+ static void parse_boundary_params(const std::string& params_str,
+ std::string& first,
+ std::map<std::string, std::string>& params);
+
+ static bool part_str(parts_collection_t& parts,
+ const std::string& name,
+ std::string *val);
+
+ static std::string get_part_str(parts_collection_t& parts,
+ const std::string& name,
+ const std::string& def_val = std::string());
+
+ static bool part_bl(parts_collection_t& parts,
+ const std::string& name,
+ ceph::bufferlist *pbl);
+
public:
RGWPostObj_ObjStore() {}
~RGWPostObj_ObjStore() override {}
@@ -225,6 +286,7 @@ public:
int verify_params() override;
};
+
class RGWPutMetadataAccount_ObjStore : public RGWPutMetadataAccount
{
public:
@@ -564,8 +626,6 @@ public:
static constexpr int64_t NO_CONTENT_LENGTH = -1;
static constexpr int64_t CHUNKED_TRANSFER_ENCODING = -2;
-extern void set_req_state_err(struct rgw_err &err, int err_no, int prot_flags);
-extern void set_req_state_err(struct req_state *s, int err_no);
extern void dump_errno(int http_ret, string& out);
extern void dump_errno(const struct rgw_err &err, string& out);
extern void dump_errno(struct req_state *s);
diff --git a/src/rgw/rgw_rest_conn.cc b/src/rgw/rgw_rest_conn.cc
index 8699624002b..71a11554d2c 100644
--- a/src/rgw/rgw_rest_conn.cc
+++ b/src/rgw/rgw_rest_conn.cc
@@ -19,6 +19,27 @@ RGWRESTConn::RGWRESTConn(CephContext *_cct, RGWRados *store,
}
}
+RGWRESTConn::RGWRESTConn(RGWRESTConn&& other)
+ : cct(other.cct),
+ endpoints(std::move(other.endpoints)),
+ key(std::move(other.key)),
+ self_zone_group(std::move(other.self_zone_group)),
+ remote_id(std::move(other.remote_id)),
+ counter(other.counter.load())
+{
+}
+
+RGWRESTConn& RGWRESTConn::operator=(RGWRESTConn&& other)
+{
+ cct = other.cct;
+ endpoints = std::move(other.endpoints);
+ key = std::move(other.key);
+ self_zone_group = std::move(other.self_zone_group);
+ remote_id = std::move(other.remote_id);
+ counter = other.counter.load();
+ return *this;
+}
+
int RGWRESTConn::get_url(string& endpoint)
{
if (endpoints.empty()) {
@@ -26,7 +47,7 @@ int RGWRESTConn::get_url(string& endpoint)
return -EIO;
}
- int i = counter.inc();
+ int i = ++counter;
endpoint = endpoints[i % endpoints.size()];
return 0;
@@ -40,7 +61,7 @@ string RGWRESTConn::get_url()
return endpoint;
}
- int i = counter.inc();
+ int i = ++counter;
endpoint = endpoints[i % endpoints.size()];
return endpoint;
@@ -182,7 +203,13 @@ int RGWRESTConn::get_obj(const rgw_user& uid, req_info *info /* optional */, rgw
set_header(mod_pg_ver, extra_headers, "HTTP_DEST_PG_VER");
}
- return (*req)->get_obj(key, extra_headers, obj);
+ int r = (*req)->get_obj(key, extra_headers, obj);
+ if (r < 0) {
+ delete *req;
+ *req = nullptr;
+ }
+
+ return r;
}
int RGWRESTConn::complete_request(RGWRESTStreamRWRequest *req, string& etag, real_time *mtime,
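
Two things happen in this file: counter becomes a std::atomic<int64_t>, and because std::atomic is neither copyable nor movable, RGWRESTConn gains hand-written move operations that transfer the value via load(). A sketch (hypothetical Conn type, not Ceph's) of both the move and the ++counter round-robin:

#include <atomic>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct Conn {
  std::vector<std::string> endpoints;
  std::atomic<int64_t> counter{0};

  explicit Conn(std::vector<std::string> eps) : endpoints(std::move(eps)) {}
  Conn(Conn&& o)
    : endpoints(std::move(o.endpoints)), counter(o.counter.load()) {}

  std::string next_url() {
    int64_t i = ++counter;                  // the old spelling was counter.inc()
    return endpoints[i % endpoints.size()];
  }
};

int main() {
  Conn c({"http://a:8000", "http://b:8000"});
  Conn moved(std::move(c));
  std::cout << moved.next_url() << "\n";    // "b" first: ++ pre-increments from 0
  std::cout << moved.next_url() << "\n";
  return 0;
}
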
diff --git a/src/rgw/rgw_rest_conn.h b/src/rgw/rgw_rest_conn.h
index 674387ffa05..bec829d6939 100644
--- a/src/rgw/rgw_rest_conn.h
+++ b/src/rgw/rgw_rest_conn.h
@@ -9,6 +9,7 @@
#include "common/ceph_json.h"
#include "common/RefCountedObj.h"
+#include <atomic>
class CephContext;
class RGWRados;
@@ -55,11 +56,15 @@ class RGWRESTConn
RGWAccessKey key;
string self_zone_group;
string remote_id;
- atomic_t counter;
+ std::atomic<int64_t> counter = { 0 };
public:
RGWRESTConn(CephContext *_cct, RGWRados *store, const string& _remote_id, const list<string>& endpoints);
+ // custom move needed for atomic
+ RGWRESTConn(RGWRESTConn&& other);
+ RGWRESTConn& operator=(RGWRESTConn&& other);
+
int get_url(string& endpoint);
string get_url();
const string& get_self_zonegroup() {
diff --git a/src/rgw/rgw_rest_log.cc b/src/rgw/rgw_rest_log.cc
index 973af3c8891..9220917c99a 100644
--- a/src/rgw/rgw_rest_log.cc
+++ b/src/rgw/rgw_rest_log.cc
@@ -20,6 +20,7 @@
#include "rgw_client_io.h"
#include "rgw_sync.h"
#include "rgw_data_sync.h"
+#include "rgw_common.h"
#include "common/errno.h"
#include "include/assert.h"
@@ -863,8 +864,7 @@ void RGWOp_MDLog_Status::execute()
http_ret = -ENOENT;
return;
}
- http_ret = sync->read_sync_status();
- status = sync->get_sync_status();
+ http_ret = sync->read_sync_status(&status);
}
void RGWOp_MDLog_Status::send_response()
diff --git a/src/rgw/rgw_rest_realm.cc b/src/rgw/rgw_rest_realm.cc
index d5ef848b805..0f5abcdfe05 100644
--- a/src/rgw/rgw_rest_realm.cc
+++ b/src/rgw/rgw_rest_realm.cc
@@ -26,9 +26,7 @@ class RGWOp_Period_Base : public RGWRESTOp {
// reply with the period object on success
void RGWOp_Period_Base::send_response()
{
- s->err.message = error_stream.str();
-
- set_req_state_err(s, http_ret);
+ set_req_state_err(s, http_ret, error_stream.str());
dump_errno(s);
if (http_ret < 0) {
diff --git a/src/rgw/rgw_rest_role.cc b/src/rgw/rgw_rest_role.cc
index 386504398fe..40473fba67f 100644
--- a/src/rgw/rgw_rest_role.cc
+++ b/src/rgw/rgw_rest_role.cc
@@ -1,3 +1,5 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
#include <errno.h>
#include "common/errno.h"
@@ -15,8 +17,6 @@
#define dout_subsys ceph_subsys_rgw
-using namespace std;
-
void RGWRestRole::send_response()
{
if (op_ret) {
@@ -77,9 +77,7 @@ void RGWCreateRole::execute()
if (op_ret < 0) {
return;
}
- string uid;
- s->user->user_id.to_str(uid);
- RGWRole role(s->cct, store, role_name, role_path, trust_policy, uid);
+ RGWRole role(s->cct, store, role_name, role_path, trust_policy, s->user->user_id.tenant);
op_ret = role.create(true);
if (op_ret == -EEXIST) {
@@ -111,7 +109,7 @@ void RGWDeleteRole::execute()
if (op_ret < 0) {
return;
}
- RGWRole role(s->cct, store, role_name);
+ RGWRole role(s->cct, store, role_name, s->user->user_id.tenant);
op_ret = role.delete_obj();
if (op_ret == -ENOENT) {
@@ -137,7 +135,7 @@ void RGWGetRole::execute()
if (op_ret < 0) {
return;
}
- RGWRole role(s->cct, store, role_name);
+ RGWRole role(s->cct, store, role_name, s->user->user_id.tenant);
op_ret = role.get();
if (op_ret == -ENOENT) {
@@ -175,7 +173,7 @@ void RGWModifyRole::execute()
if (op_ret < 0) {
return;
}
- RGWRole role(s->cct, store, role_name);
+ RGWRole role(s->cct, store, role_name, s->user->user_id.tenant);
op_ret = role.get();
if (op_ret == -ENOENT) {
op_ret = -ERR_NO_ROLE_FOUND;
@@ -201,7 +199,7 @@ void RGWListRoles::execute()
return;
}
vector<RGWRole> result;
- op_ret = RGWRole::get_roles_by_path_prefix(store, s->cct, path_prefix, result);
+ op_ret = RGWRole::get_roles_by_path_prefix(store, s->cct, path_prefix, s->user->user_id.tenant, result);
if (op_ret == 0) {
s->formatter->open_array_section("Roles");
@@ -240,7 +238,7 @@ void RGWPutRolePolicy::execute()
return;
}
- RGWRole role(s->cct, store, role_name);
+ RGWRole role(s->cct, store, role_name, s->user->user_id.tenant);
op_ret = role.get();
if (op_ret == 0) {
role.set_perm_policy(policy_name, perm_policy);
@@ -267,7 +265,7 @@ void RGWGetRolePolicy::execute()
return;
}
- RGWRole role(g_ceph_context, store, role_name);
+ RGWRole role(g_ceph_context, store, role_name, s->user->user_id.tenant);
op_ret = role.get();
if (op_ret == -ENOENT) {
@@ -306,7 +304,7 @@ void RGWListRolePolicies::execute()
return;
}
- RGWRole role(g_ceph_context, store, role_name);
+ RGWRole role(g_ceph_context, store, role_name, s->user->user_id.tenant);
op_ret = role.get();
if (op_ret == -ENOENT) {
@@ -342,7 +340,7 @@ void RGWDeleteRolePolicy::execute()
return;
}
- RGWRole role(g_ceph_context, store, role_name);
+ RGWRole role(g_ceph_context, store, role_name, s->user->user_id.tenant);
op_ret = role.get();
if (op_ret == -ENOENT) {
diff --git a/src/rgw/rgw_rest_role.h b/src/rgw/rgw_rest_role.h
index de3ec97a637..7a99dbe45c6 100644
--- a/src/rgw/rgw_rest_role.h
+++ b/src/rgw/rgw_rest_role.h
@@ -1,3 +1,5 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
#ifndef CEPH_RGW_REST_ROLE_H
#define CEPH_RGW_REST_ROLE_H
diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc
index 1c9e4d751b0..96285e258ff 100644
--- a/src/rgw/rgw_rest_s3.cc
+++ b/src/rgw/rgw_rest_s3.cc
@@ -65,14 +65,13 @@ void dump_bucket(struct req_state *s, RGWBucketEnt& obj)
s->formatter->close_section();
}
-void rgw_get_errno_s3(rgw_http_errors *e , int err_no)
+void rgw_get_errno_s3(rgw_http_error *e, int err_no)
{
- const struct rgw_http_errors *r;
- r = search_err(err_no, RGW_HTTP_ERRORS, ARRAY_LEN(RGW_HTTP_ERRORS));
+ rgw_http_errors::const_iterator r = rgw_http_s3_errors.find(err_no);
- if (r) {
- e->http_ret = r->http_ret;
- e->s3_code = r->s3_code;
+ if (r != rgw_http_s3_errors.end()) {
+ e->http_ret = r->second.first;
+ e->s3_code = r->second.second;
} else {
e->http_ret = 500;
e->s3_code = "UnknownError";
@@ -173,7 +172,7 @@ int RGWGetObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t bl_ofs,
dump_errno(s, custom_http_ret);
} else {
set_req_state_err(s, (partial_content && !op_ret) ? STATUS_PARTIAL_CONTENT
- : op_ret);
+ : op_ret);
dump_errno(s);
}
@@ -1406,7 +1405,6 @@ static inline int get_obj_attrs(RGWRados *store, struct req_state *s, rgw_obj& o
RGWRados::Object::Read read_op(&op_target);
read_op.params.attrs = &attrs;
- read_op.params.perr = &s->err;
return read_op.prepare();
}
@@ -1494,265 +1492,6 @@ int RGWPutObj_ObjStore_S3::get_encrypt_filter(
}
return res;
}
-/*
- * parses params in the format: 'first; param1=foo; param2=bar'
- */
-static void parse_params(const string& params_str, string& first,
- map<string, string>& params)
-{
- size_t pos = params_str.find(';');
- if (pos == string::npos) {
- first = rgw_trim_whitespace(params_str);
- return;
- }
-
- first = rgw_trim_whitespace(params_str.substr(0, pos));
-
- pos++;
-
- while (pos < params_str.size()) {
- size_t end = params_str.find(';', pos);
- if (end == string::npos)
- end = params_str.size();
-
- string param = params_str.substr(pos, end - pos);
-
- size_t eqpos = param.find('=');
- if (eqpos != string::npos) {
- params[rgw_trim_whitespace(param.substr(0, eqpos))] =
- rgw_trim_quotes(param.substr(eqpos + 1));
- } else {
- params[rgw_trim_whitespace(param)] = "";
- }
-
- pos = end + 1;
- }
-}
-
-static int parse_part_field(const string& line, string& field_name,
- struct post_part_field& field)
-{
- size_t pos = line.find(':');
- if (pos == string::npos)
- return -EINVAL;
-
- field_name = line.substr(0, pos);
- if (pos >= line.size() - 1)
- return 0;
-
- parse_params(line.substr(pos + 1), field.val, field.params);
-
- return 0;
-}
-
-bool is_crlf(const char *s)
-{
- return (*s == '\r' && *(s + 1) == '\n');
-}
-
-/*
- * find the index of the boundary, if exists, or optionally the next end of line
- * also returns how many bytes to skip
- */
-static int index_of(bufferlist& bl, int max_len, const string& str,
- bool check_crlf,
- bool *reached_boundary, int *skip)
-{
- *reached_boundary = false;
- *skip = 0;
-
- if (str.size() < 2) // we assume boundary is at least 2 chars (makes it easier with crlf checks)
- return -EINVAL;
-
- if (bl.length() < str.size())
- return -1;
-
- const char *buf = bl.c_str();
- const char *s = str.c_str();
-
- if (max_len > (int)bl.length())
- max_len = bl.length();
-
- int i;
- for (i = 0; i < max_len; i++, buf++) {
- if (check_crlf &&
- i >= 1 &&
- is_crlf(buf - 1)) {
- return i + 1; // skip the crlf
- }
- if ((i < max_len - (int)str.size() + 1) &&
- (buf[0] == s[0] && buf[1] == s[1]) &&
- (strncmp(buf, s, str.size()) == 0)) {
- *reached_boundary = true;
- *skip = str.size();
-
- /* oh, great, now we need to swallow the preceding crlf
- * if exists
- */
- if ((i >= 2) &&
- is_crlf(buf - 2)) {
- i -= 2;
- *skip += 2;
- }
- return i;
- }
- }
-
- return -1;
-}
-
-int RGWPostObj_ObjStore_S3::read_with_boundary(bufferlist& bl, uint64_t max,
- bool check_crlf,
- bool *reached_boundary,
- bool *done)
-{
- uint64_t cl = max + 2 + boundary.size();
-
- if (max > in_data.length()) {
- uint64_t need_to_read = cl - in_data.length();
-
- bufferptr bp(need_to_read);
-
- const auto read_len = recv_body(s, bp.c_str(), need_to_read);
- in_data.append(bp, 0, read_len);
- }
-
- *done = false;
- int skip;
- int index = index_of(in_data, cl, boundary, check_crlf, reached_boundary,
- &skip);
- if (index >= 0)
- max = index;
-
- if (max > in_data.length())
- max = in_data.length();
-
- bl.substr_of(in_data, 0, max);
-
- bufferlist new_read_data;
-
- /*
- * now we need to skip boundary for next time, also skip any crlf, or
- * check to see if it's the last final boundary (marked with "--" at the end
- */
- if (*reached_boundary) {
- int left = in_data.length() - max;
- if (left < skip + 2) {
- int need = skip + 2 - left;
- bufferptr boundary_bp(need);
- recv_body(s, boundary_bp.c_str(), need);
- in_data.append(boundary_bp);
- }
- max += skip; // skip boundary for next time
- if (in_data.length() >= max + 2) {
- const char *data = in_data.c_str();
- if (is_crlf(data + max)) {
- max += 2;
- } else {
- if (*(data + max) == '-' &&
- *(data + max + 1) == '-') {
- *done = true;
- max += 2;
- }
- }
- }
- }
-
- new_read_data.substr_of(in_data, max, in_data.length() - max);
- in_data = new_read_data;
-
- return 0;
-}
-
-int RGWPostObj_ObjStore_S3::read_line(bufferlist& bl, uint64_t max,
- bool *reached_boundary, bool *done)
-{
- return read_with_boundary(bl, max, true, reached_boundary, done);
-}
-
-int RGWPostObj_ObjStore_S3::read_data(bufferlist& bl, uint64_t max,
- bool *reached_boundary, bool *done)
-{
- return read_with_boundary(bl, max, false, reached_boundary, done);
-}
-
-
-int RGWPostObj_ObjStore_S3::read_form_part_header(struct post_form_part *part,
- bool *done)
-{
- bufferlist bl;
- bool reached_boundary;
- uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size;
- int r = read_line(bl, chunk_size, &reached_boundary, done);
- if (r < 0)
- return r;
-
- if (*done) {
- return 0;
- }
-
- if (reached_boundary) { // skip the first boundary
- r = read_line(bl, chunk_size, &reached_boundary, done);
- if (r < 0)
- return r;
- if (*done)
- return 0;
- }
-
- while (true) {
- /*
- * iterate through fields
- */
- string line = rgw_trim_whitespace(string(bl.c_str(), bl.length()));
-
- if (line.empty())
- break;
-
- struct post_part_field field;
-
- string field_name;
- r = parse_part_field(line, field_name, field);
- if (r < 0)
- return r;
-
- part->fields[field_name] = field;
-
- if (stringcasecmp(field_name, "Content-Disposition") == 0) {
- part->name = field.params["name"];
- }
-
- if (reached_boundary)
- break;
-
- r = read_line(bl, chunk_size, &reached_boundary, done);
- }
-
- return 0;
-}
-
-bool RGWPostObj_ObjStore_S3::part_str(const string& name, string *val)
-{
- map<string, struct post_form_part, ltstr_nocase>::iterator iter
- = parts.find(name);
- if (iter == parts.end())
- return false;
-
- bufferlist& data = iter->second.data;
- string str = string(data.c_str(), data.length());
- *val = rgw_trim_whitespace(str);
- return true;
-}
-
-bool RGWPostObj_ObjStore_S3::part_bl(const string& name, bufferlist *pbl)
-{
- map<string, struct post_form_part, ltstr_nocase>::iterator iter =
- parts.find(name);
- if (iter == parts.end())
- return false;
-
- *pbl = iter->second.data;
- return true;
-}
void RGWPostObj_ObjStore_S3::rebuild_key(string& key)
{
@@ -1768,74 +1507,47 @@ void RGWPostObj_ObjStore_S3::rebuild_key(string& key)
key = new_key;
}
-int RGWPostObj_ObjStore_S3::get_params()
+std::string RGWPostObj_ObjStore_S3::get_current_filename() const
{
- // get the part boundary
- string req_content_type_str = s->info.env->get("CONTENT_TYPE", "");
- string req_content_type;
- map<string, string> params;
-
- if (s->expect_cont) {
- /* ok, here it really gets ugly. With POST, the params are embedded in the
- * request body, so we need to continue before being able to actually look
- * at them. This diverts from the usual request flow.
- */
- dump_continue(s);
- s->expect_cont = false;
- }
-
- parse_params(req_content_type_str, req_content_type, params);
+ return s->object.name;
+}
- if (req_content_type.compare("multipart/form-data") != 0) {
- err_msg = "Request Content-Type is not multipart/form-data";
- return -EINVAL;
- }
+std::string RGWPostObj_ObjStore_S3::get_current_content_type() const
+{
+ return content_type;
+}
- if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
- ldout(s->cct, 20) << "request content_type_str="
- << req_content_type_str << dendl;
- ldout(s->cct, 20) << "request content_type params:" << dendl;
- map<string, string>::iterator iter;
- for (iter = params.begin(); iter != params.end(); ++iter) {
- ldout(s->cct, 20) << " " << iter->first << " -> " << iter->second
- << dendl;
- }
+int RGWPostObj_ObjStore_S3::get_params()
+{
+ op_ret = RGWPostObj_ObjStore::get_params();
+ if (op_ret < 0) {
+ return op_ret;
}
ldout(s->cct, 20) << "adding bucket to policy env: " << s->bucket.name
<< dendl;
env.add_var("bucket", s->bucket.name);
- map<string, string>::iterator iter = params.find("boundary");
- if (iter == params.end()) {
- err_msg = "Missing multipart boundary specification";
- return -EINVAL;
- }
-
- // create the boundary
- boundary = "--";
- boundary.append(iter->second);
-
bool done;
do {
struct post_form_part part;
- int r = read_form_part_header(&part, &done);
+ int r = read_form_part_header(&part, done);
if (r < 0)
return r;
if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
- map<string, struct post_part_field, ltstr_nocase>::iterator piter;
- for (piter = part.fields.begin(); piter != part.fields.end(); ++piter) {
- ldout(s->cct, 20) << "read part header: name=" << part.name
- << " content_type=" << part.content_type << dendl;
- ldout(s->cct, 20) << "name=" << piter->first << dendl;
- ldout(s->cct, 20) << "val=" << piter->second.val << dendl;
- ldout(s->cct, 20) << "params:" << dendl;
- map<string, string>& params = piter->second.params;
- for (iter = params.begin(); iter != params.end(); ++iter) {
- ldout(s->cct, 20) << " " << iter->first << " -> " << iter->second
- << dendl;
- }
+ ldout(s->cct, 20) << "read part header -- part.name="
+ << part.name << dendl;
+
+ for (const auto& pair : part.fields) {
+ ldout(s->cct, 20) << "field.name=" << pair.first << dendl;
+ ldout(s->cct, 20) << "field.val=" << pair.second.val << dendl;
+ ldout(s->cct, 20) << "field.params:" << dendl;
+
+ for (const auto& param_pair : pair.second.params) {
+ ldout(s->cct, 20) << " " << param_pair.first
+ << " -> " << param_pair.second << dendl;
+ }
}
}
@@ -1851,13 +1563,12 @@ int RGWPostObj_ObjStore_S3::get_params()
filename = iter->second;
}
parts[part.name] = part;
- data_pending = true;
break;
}
bool boundary;
uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size;
- r = read_data(part.data, chunk_size, &boundary, &done);
+ r = read_data(part.data, chunk_size, boundary, done);
if (!boundary) {
err_msg = "Couldn't find boundary";
return -EINVAL;
@@ -1868,7 +1579,7 @@ int RGWPostObj_ObjStore_S3::get_params()
} while (!done);
string object_str;
- if (!part_str("key", &object_str)) {
+ if (!part_str(parts, "key", &object_str)) {
err_msg = "Key not specified";
return -EINVAL;
}
@@ -1884,7 +1595,7 @@ int RGWPostObj_ObjStore_S3::get_params()
env.add_var("key", s->object.name);
- part_str("Content-Type", &content_type);
+ part_str(parts, "Content-Type", &content_type);
env.add_var("Content-Type", content_type);
map<string, struct post_form_part, ltstr_nocase>::iterator piter =
@@ -1935,16 +1646,16 @@ int RGWPostObj_ObjStore_S3::get_params()
int RGWPostObj_ObjStore_S3::get_policy()
{
- if (part_bl("policy", &s->auth.s3_postobj_creds.encoded_policy)) {
-
+ if (part_bl(parts, "policy", &s->auth.s3_postobj_creds.encoded_policy)) {
// check that the signature matches the encoded policy
- if (! part_str("AWSAccessKeyId", &s->auth.s3_postobj_creds.access_key)) {
+ if (!part_str(parts, "AWSAccessKeyId",
+ &s->auth.s3_postobj_creds.access_key)) {
ldout(s->cct, 0) << "No S3 access key found!" << dendl;
err_msg = "Missing access key";
return -EINVAL;
}
- string received_signature_str;
- if (! part_str("signature", &s->auth.s3_postobj_creds.signature)) {
+
+ if (!part_str(parts, "signature", &s->auth.s3_postobj_creds.signature)) {
ldout(s->cct, 0) << "No signature found!" << dendl;
err_msg = "Missing signature";
return -EINVAL;
@@ -2020,7 +1731,7 @@ int RGWPostObj_ObjStore_S3::get_policy()
}
string canned_acl;
- part_str("acl", &canned_acl);
+ part_str(parts, "acl", &canned_acl);
RGWAccessControlPolicy_S3 s3policy(s->cct);
ldout(s->cct, 20) << "canned_acl=" << canned_acl << dendl;
@@ -2039,44 +1750,47 @@ int RGWPostObj_ObjStore_S3::complete_get_params()
bool done;
do {
struct post_form_part part;
- int r = read_form_part_header(&part, &done);
- if (r < 0)
+ int r = read_form_part_header(&part, done);
+ if (r < 0) {
return r;
+ }
- bufferlist part_data;
+ ceph::bufferlist part_data;
bool boundary;
uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size;
- r = read_data(part.data, chunk_size, &boundary, &done);
+ r = read_data(part.data, chunk_size, boundary, done);
if (!boundary) {
return -EINVAL;
}
- parts[part.name] = part;
+ /* Just reading the data but not storing any results of that. */
} while (!done);
return 0;
}
-int RGWPostObj_ObjStore_S3::get_data(bufferlist& bl)
+int RGWPostObj_ObjStore_S3::get_data(ceph::bufferlist& bl, bool& again)
{
bool boundary;
bool done;
- uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size;
- int r = read_data(bl, chunk_size, &boundary, &done);
- if (r < 0)
+ const uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size;
+ int r = read_data(bl, chunk_size, boundary, done);
+ if (r < 0) {
return r;
+ }
if (boundary) {
- data_pending = false;
-
- if (!done) { /* reached end of data, let's drain the rest of the params */
+ if (!done) {
+ /* Reached end of data, let's drain the rest of the params */
r = complete_get_params();
- if (r < 0)
- return r;
+ if (r < 0) {
+ return r;
+ }
}
}
+ again = !boundary;
return bl.length();
}
@@ -2085,7 +1799,7 @@ void RGWPostObj_ObjStore_S3::send_response()
if (op_ret == 0 && parts.count("success_action_redirect")) {
string redirect;
- part_str("success_action_redirect", &redirect);
+ part_str(parts, "success_action_redirect", &redirect);
string tenant;
string bucket;
@@ -2135,7 +1849,7 @@ void RGWPostObj_ObjStore_S3::send_response()
string status_string;
uint32_t status_int;
- part_str("success_action_status", &status_string);
+ part_str(parts, "success_action_status", &status_string);
int r = stringtoul(status_string, &status_int);
if (r < 0) {
@@ -2938,7 +2652,7 @@ void RGWDeleteMultiObj_ObjStore_S3::send_partial_response(rgw_obj_key& key,
}
s->formatter->close_section();
} else if (op_ret < 0) {
- struct rgw_http_errors r;
+ struct rgw_http_error r;
int err_no;
s->formatter->open_object_section("Error");
@@ -3073,6 +2787,8 @@ RGWOp *RGWHandler_REST_Bucket_S3::op_get()
return new RGWListBucketMultiparts_ObjStore_S3;
} else if(is_lc_op()) {
return new RGWGetLC_ObjStore_S3;
+ } else if(is_policy_op()) {
+ return new RGWGetBucketPolicy;
}
return get_obj_op(true);
}
@@ -3107,6 +2823,8 @@ RGWOp *RGWHandler_REST_Bucket_S3::op_put()
return new RGWSetRequestPayment_ObjStore_S3;
} else if(is_lc_op()) {
return new RGWPutLC_ObjStore_S3;
+ } else if(is_policy_op()) {
+ return new RGWPutBucketPolicy;
}
return new RGWCreateBucket_ObjStore_S3;
}
@@ -3117,6 +2835,8 @@ RGWOp *RGWHandler_REST_Bucket_S3::op_delete()
return new RGWDeleteCORS_ObjStore_S3;
} else if(is_lc_op()) {
return new RGWDeleteLC_ObjStore_S3;
+ } else if(is_policy_op()) {
+ return new RGWDeleteBucketPolicy;
}
if (s->info.args.sub_resource_exists("website")) {
@@ -3936,7 +3656,8 @@ int RGW_Auth_S3::authorize_v4(RGWRados *store, struct req_state *s, bool force_b
case RGW_OP_DELETE_MULTI_OBJ:
case RGW_OP_ADMIN_SET_METADATA:
case RGW_OP_SET_BUCKET_WEBSITE:
- break;
+ case RGW_OP_PUT_BUCKET_POLICY:
+ break;
default:
dout(10) << "ERROR: AWS4 completion for this operation NOT IMPLEMENTED" << dendl;
return -ERR_NOT_IMPLEMENTED;
@@ -4253,11 +3974,11 @@ int RGWHandler_REST_S3Website::serve_errordoc(int http_ret, const string& errord
int RGWHandler_REST_S3Website::error_handler(int err_no,
string* error_content) {
int new_err_no = -1;
- const struct rgw_http_errors* r;
+ rgw_http_errors::const_iterator r = rgw_http_s3_errors.find(err_no > 0 ? err_no : -err_no);
int http_error_code = -1;
- r = search_err(err_no > 0 ? err_no : -err_no, RGW_HTTP_ERRORS, ARRAY_LEN(RGW_HTTP_ERRORS));
- if (r) {
- http_error_code = r->http_ret;
+
+ if (r != rgw_http_s3_errors.end()) {
+ http_error_code = r->second.first;
}
ldout(s->cct, 10) << "RGWHandler_REST_S3Website::error_handler err_no=" << err_no << " http_ret=" << http_error_code << dendl;
diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h
index 291b04b9281..8dcf242b283 100644
--- a/src/rgw/rgw_rest_s3.h
+++ b/src/rgw/rgw_rest_s3.h
@@ -27,7 +27,12 @@
#define RGW_AUTH_GRACE_MINS 15
-void rgw_get_errno_s3(struct rgw_http_errors *e, int err_no);
+struct rgw_http_error {
+ int http_ret;
+ const char *s3_code;
+};
+
+void rgw_get_errno_s3(struct rgw_http_error *e, int err_no);
class RGWGetObj_ObjStore_S3 : public RGWGetObj_ObjStore
{
@@ -195,46 +200,22 @@ public:
bufferlist* manifest_bl) override;
};
-struct post_part_field {
- string val;
- map<string, string> params;
-};
-
-struct post_form_part {
- string name;
- string content_type;
- map<string, struct post_part_field, ltstr_nocase> fields;
- bufferlist data;
-};
-
class RGWPostObj_ObjStore_S3 : public RGWPostObj_ObjStore {
- string boundary;
- string filename;
- bufferlist in_data;
- map<string, post_form_part, const ltstr_nocase> parts;
+ parts_collection_t parts;
+ std::string filename;
+ std::string content_type;
RGWPolicyEnv env;
RGWPolicy post_policy;
- string err_msg;
map<string, string> crypt_http_responses;
const rgw::auth::StrategyRegistry* auth_registry_ptr = nullptr;
- int read_with_boundary(bufferlist& bl, uint64_t max, bool check_eol,
- bool *reached_boundary,
- bool *done);
-
- int read_line(bufferlist& bl, uint64_t max,
- bool *reached_boundary, bool *done);
-
- int read_data(bufferlist& bl, uint64_t max, bool *reached_boundary, bool *done);
-
- int read_form_part_header(struct post_form_part *part,
- bool *done);
- bool part_str(const string& name, string *val);
- bool part_bl(const string& name, bufferlist *pbl);
-
int get_policy();
void rebuild_key(string& key);
+
+ std::string get_current_filename() const override;
+ std::string get_current_content_type() const override;
+
public:
RGWPostObj_ObjStore_S3() {}
~RGWPostObj_ObjStore_S3() override {}
@@ -248,7 +229,7 @@ public:
int complete_get_params();
void send_response() override;
- int get_data(bufferlist& bl) override;
+ int get_data(ceph::bufferlist& bl, bool& again) override;
int get_encrypt_filter(std::unique_ptr<RGWPutObjDataProcessor>* filter,
RGWPutObjDataProcessor* cb) override;
};
@@ -530,6 +511,9 @@ protected:
bool is_request_payment_op() {
return s->info.args.exists("requestPayment");
}
+ bool is_policy_op() {
+ return s->info.args.exists("policy");
+ }
RGWOp *get_obj_op(bool get_data);
RGWOp *op_get() override;
diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc
index 3d4a4d5bda8..6cca4f2efc2 100644
--- a/src/rgw/rgw_rest_swift.cc
+++ b/src/rgw/rgw_rest_swift.cc
@@ -306,6 +306,8 @@ void RGWListBucket_ObjStore_SWIFT::send_response()
s->formatter->dump_string("name", key.name);
s->formatter->dump_string("hash", iter->meta.etag);
s->formatter->dump_int("bytes", iter->meta.accounted_size);
+ if (!iter->meta.user_data.empty())
+ s->formatter->dump_string("user_custom_data", iter->meta.user_data);
string single_content_type = iter->meta.content_type;
if (iter->meta.content_type.size()) {
// content type might hold multiple values, just dump the last one
@@ -502,6 +504,7 @@ static int get_swift_container_settings(req_state * const s,
RGWRados * const store,
RGWAccessControlPolicy * const policy,
bool * const has_policy,
+ uint32_t * rw_mask,
RGWCORSConfiguration * const cors_config,
bool * const has_cors)
{
@@ -524,7 +527,8 @@ static int get_swift_container_settings(req_state * const s,
s->user->user_id,
s->user->display_name,
read_list,
- write_list);
+ write_list,
+ *rw_mask);
if (r < 0) {
return r;
}
@@ -622,8 +626,10 @@ static int get_swift_versioning_settings(
int RGWCreateBucket_ObjStore_SWIFT::get_params()
{
bool has_policy;
+ uint32_t policy_rw_mask = 0;
- int r = get_swift_container_settings(s, store, &policy, &has_policy, &cors_config, &has_cors);
+ int r = get_swift_container_settings(s, store, &policy, &has_policy,
+ &policy_rw_mask, &cors_config, &has_cors);
if (r < 0) {
return r;
}
@@ -758,6 +764,13 @@ int RGWPutObj_ObjStore_SWIFT::get_params()
return r;
}
+ if (!s->cct->_conf->rgw_swift_custom_header.empty()) {
+ string custom_header = s->cct->_conf->rgw_swift_custom_header;
+ if (s->info.env->exists(custom_header.c_str())) {
+ user_data = s->info.env->get(custom_header.c_str());
+ }
+ }
+
dlo_manifest = s->info.env->get("HTTP_X_OBJECT_MANIFEST");
bool exists;
string multipart_manifest = s->info.args.get("multipart-manifest", &exists);
@@ -895,7 +908,7 @@ int RGWPutMetadataBucket_ObjStore_SWIFT::get_params()
}
int r = get_swift_container_settings(s, store, &policy, &has_policy,
- &cors_config, &has_cors);
+ &policy_rw_mask, &cors_config, &has_cors);
if (r < 0) {
return r;
}
@@ -943,7 +956,7 @@ void RGWPutMetadataObject_ObjStore_SWIFT::send_response()
op_ret = STATUS_ACCEPTED;
}
set_req_state_err(s, op_ret);
- if (!s->err.is_err()) {
+ if (!s->is_err()) {
dump_content_length(s, 0);
}
dump_errno(s);
@@ -969,7 +982,6 @@ static void bulkdelete_respond(const unsigned num_deleted,
reason = fail_desc.err;
}
}
-
rgw_err err;
set_req_state_err(err, reason, prot_flags);
dump_errno(err, resp_status);
@@ -1283,7 +1295,7 @@ int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl,
: op_ret);
dump_errno(s);
- if (s->err.is_err()) {
+ if (s->is_err()) {
end_header(s, NULL);
return 0;
}
@@ -1293,7 +1305,7 @@ int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl,
dump_range(s, ofs, end, s->obj_size);
}
- if (s->err.is_err()) {
+ if (s->is_err()) {
end_header(s, NULL);
return 0;
}
@@ -1713,6 +1725,278 @@ bool RGWInfo_ObjStore_SWIFT::is_expired(const std::string& expires, CephContext*
return false;
}
+
+void RGWFormPost::init(RGWRados* const store,
+ req_state* const s,
+ RGWHandler* const dialect_handler)
+{
+ prefix = std::move(s->object.name);
+ s->object = rgw_obj_key();
+
+ return RGWPostObj_ObjStore::init(store, s, dialect_handler);
+}
+
+std::size_t RGWFormPost::get_max_file_size() /*const*/
+{
+ std::string max_str = get_part_str(ctrl_parts, "max_file_size", "0");
+
+ std::string err;
+ const std::size_t max_file_size =
+ static_cast<uint64_t>(strict_strtoll(max_str.c_str(), 10, &err));
+
+ if (! err.empty()) {
+ ldout(s->cct, 5) << "failed to parse FormPost's max_file_size: " << err
+ << dendl;
+ return 0;
+ }
+
+ return max_file_size;
+}
+
+bool RGWFormPost::is_non_expired()
+{
+ std::string expires = get_part_str(ctrl_parts, "expires", "0");
+
+ std::string err;
+ const uint64_t expires_timestamp =
+ static_cast<uint64_t>(strict_strtoll(expires.c_str(), 10, &err));
+
+ if (! err.empty()) {
+ dout(5) << "failed to parse FormPost's expires: " << err << dendl;
+ return false;
+ }
+
+ const utime_t now = ceph_clock_now();
+ if (expires_timestamp <= static_cast<uint64_t>(now.sec())) {
+ dout(5) << "FormPost form expired: "
+ << expires_timestamp << " <= " << now.sec() << dendl;
+ return false;
+ }
+
+ return true;
+}
+
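Both get_max_file_size() and is_non_expired() lean on strict parsing: a numeric form field with trailing garbage is rejected outright. A sketch of that contract using plain strtoll with an end-pointer check, standing in for Ceph's strict_strtoll:

#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <string>

static bool strict_parse_u64(const std::string& s, uint64_t& out) {
  char* end = nullptr;
  const long long v = std::strtoll(s.c_str(), &end, 10);
  if (end == s.c_str() || *end != '\0' || v < 0) return false;  // reject garbage
  out = static_cast<uint64_t>(v);
  return true;
}

int main() {
  uint64_t expires = 0;
  if (!strict_parse_u64("1700000000", expires)) return 1;
  const uint64_t now = static_cast<uint64_t>(std::time(nullptr));
  std::cout << (expires > now ? "form still valid" : "form expired") << "\n";
  return 0;
}
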
+bool RGWFormPost::is_integral()
+{
+ const std::string form_signature = get_part_str(ctrl_parts, "signature");
+
+ for (const auto& kv : s->user->temp_url_keys) {
+ const int temp_url_key_num = kv.first;
+ const string& temp_url_key = kv.second;
+
+ if (temp_url_key.empty()) {
+ continue;
+ }
+
+ SignatureHelper sig_helper;
+ sig_helper.calc(temp_url_key,
+ s->info.request_uri,
+ get_part_str(ctrl_parts, "redirect"),
+ get_part_str(ctrl_parts, "max_file_size", "0"),
+ get_part_str(ctrl_parts, "max_file_count", "0"),
+ get_part_str(ctrl_parts, "expires", "0"));
+
+ const auto local_sig = sig_helper.get_signature();
+
+ ldout(s->cct, 20) << "FormPost signature [" << temp_url_key_num << "]"
+ << " (calculated): " << local_sig << dendl;
+
+ if (sig_helper.is_equal_to(form_signature)) {
+ return true;
+ } else {
+ ldout(s->cct, 5) << "FormPost's signature mismatch: "
+ << local_sig << " != " << form_signature << dendl;
+ }
+ }
+
+ return false;
+}
+
+int RGWFormPost::get_params()
+{
+  /* The parent class extracts boundary info from the Content-Type. */
+ int ret = RGWPostObj_ObjStore::get_params();
+ if (ret < 0) {
+ return ret;
+ }
+
+ policy.create_default(s->user->user_id, s->user->display_name);
+
+ /* Let's start parsing the HTTP body by parsing each form part step-
+   * by-step until encountering the first part with file data. */
+ do {
+ struct post_form_part part;
+ ret = read_form_part_header(&part, stream_done);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
+ ldout(s->cct, 20) << "read part header -- part.name="
+ << part.name << dendl;
+
+ for (const auto& pair : part.fields) {
+ ldout(s->cct, 20) << "field.name=" << pair.first << dendl;
+ ldout(s->cct, 20) << "field.val=" << pair.second.val << dendl;
+ ldout(s->cct, 20) << "field.params:" << dendl;
+
+ for (const auto& param_pair : pair.second.params) {
+ ldout(s->cct, 20) << " " << param_pair.first
+ << " -> " << param_pair.second << dendl;
+ }
+ }
+ }
+
+ if (stream_done) {
+ /* Unexpected here. */
+ err_msg = "Malformed request";
+ return -EINVAL;
+ }
+
+ const auto field_iter = part.fields.find("Content-Disposition");
+ if (std::end(part.fields) != field_iter &&
+ std::end(field_iter->second.params) != field_iter->second.params.find("filename")) {
+ /* First data part ahead. */
+ current_data_part = std::move(part);
+
+ /* Stop the iteration. We can assume that all control parts have been
+ * already parsed. The rest of HTTP body should contain data parts
+ * only. They will be picked up by ::get_data(). */
+ break;
+ } else {
+ /* Control part ahead. Receive, parse and store for later usage. */
+ bool boundary;
+ ret = read_data(part.data, s->cct->_conf->rgw_max_chunk_size,
+ boundary, stream_done);
+ if (ret < 0) {
+ return ret;
+ } else if (! boundary) {
+ err_msg = "Couldn't find boundary";
+ return -EINVAL;
+ }
+
+ ctrl_parts[part.name] = std::move(part);
+ }
+ } while (! stream_done);
+
+ min_len = 0;
+ max_len = get_max_file_size();
+
+ if (! current_data_part) {
+ err_msg = "FormPost: no files to process";
+ return -EINVAL;
+ }
+
+ if (! is_non_expired()) {
+ err_msg = "FormPost: Form Expired";
+ return -EPERM;
+ }
+
+ if (! is_integral()) {
+ err_msg = "FormPost: Invalid Signature";
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+std::string RGWFormPost::get_current_filename() const
+{
+ try {
+ const auto& field = current_data_part->fields.at("Content-Disposition");
+ const auto iter = field.params.find("filename");
+
+ if (std::end(field.params) != iter) {
+ return prefix + iter->second;
+ }
+ } catch (std::out_of_range&) {
+ /* NOP */;
+ }
+
+ return prefix;
+}
+
+std::string RGWFormPost::get_current_content_type() const
+{
+ try {
+ const auto& field = current_data_part->fields.at("Content-Type");
+ return field.val;
+ } catch (std::out_of_range&) {
+ /* NOP */;
+ }
+
+ return std::string();
+}
+
+bool RGWFormPost::is_next_file_to_upload()
+{
+ if (! stream_done) {
+ /* We have at least one additional part in the body. */
+ struct post_form_part part;
+ int r = read_form_part_header(&part, stream_done);
+ if (r < 0) {
+ return false;
+ }
+
+ const auto field_iter = part.fields.find("Content-Disposition");
+ if (std::end(part.fields) != field_iter) {
+ const auto& params = field_iter->second.params;
+
+ if (std::end(params) != params.find("filename")) {
+ current_data_part = std::move(part);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+int RGWFormPost::get_data(ceph::bufferlist& bl, bool& again)
+{
+ bool boundary;
+
+ int r = read_data(bl, s->cct->_conf->rgw_max_chunk_size,
+ boundary, stream_done);
+ if (r < 0) {
+ return r;
+ }
+
+ /* Tell RGWPostObj::execute() that it has some data to put. */
+ again = !boundary;
+
+ return bl.length();
+}
+
+void RGWFormPost::send_response()
+{
+ std::string redirect = get_part_str(ctrl_parts, "redirect");
+ if (! redirect.empty()) {
+ op_ret = STATUS_REDIRECT;
+ }
+
+ set_req_state_err(s, op_ret);
+ s->err.s3_code = err_msg;
+ dump_errno(s);
+ if (! redirect.empty()) {
+ dump_redirect(s, redirect);
+ }
+ end_header(s, this);
+}
+
+bool RGWFormPost::is_formpost_req(req_state* const s)
+{
+ std::string content_type;
+ std::map<std::string, std::string> params;
+
+ parse_boundary_params(s->info.env->get("CONTENT_TYPE", ""),
+ content_type, params);
+
+ return boost::algorithm::iequals(content_type, "multipart/form-data") &&
+ params.count("boundary") > 0;
+}
+
+
RGWOp *RGWHandler_REST_Service_SWIFT::op_get()
{
return new RGWListBuckets_ObjStore_SWIFT;
@@ -1788,9 +2072,8 @@ int RGWSwiftWebsiteHandler::error_handler(const int err_no,
const auto& ws_conf = s->bucket_info.website_conf;
if (can_be_website_req() && ! ws_conf.error_doc.empty()) {
- struct rgw_err err;
- set_req_state_err(err, err_no, s->prot_flags);
- return serve_errordoc(err.http_ret, ws_conf.error_doc);
+ set_req_state_err(s, err_no);
+ return serve_errordoc(s->err.http_ret, ws_conf.error_doc);
}
/* Let's go to the default, no-op handler. */
@@ -2103,6 +2386,9 @@ RGWOp *RGWHandler_REST_Bucket_SWIFT::op_put()
if (is_acl_op()) {
return new RGWPutACLs_ObjStore_SWIFT;
}
+ if(s->info.args.exists("extract-archive")) {
+ return new RGWBulkUploadOp_ObjStore_SWIFT;
+ }
return new RGWCreateBucket_ObjStore_SWIFT;
}
@@ -2113,7 +2399,11 @@ RGWOp *RGWHandler_REST_Bucket_SWIFT::op_delete()
RGWOp *RGWHandler_REST_Bucket_SWIFT::op_post()
{
- return new RGWPutMetadataBucket_ObjStore_SWIFT;
+ if (RGWFormPost::is_formpost_req(s)) {
+ return new RGWFormPost;
+ } else {
+ return new RGWPutMetadataBucket_ObjStore_SWIFT;
+ }
}
RGWOp *RGWHandler_REST_Bucket_SWIFT::op_options()
@@ -2148,6 +2438,9 @@ RGWOp *RGWHandler_REST_Obj_SWIFT::op_put()
if (is_acl_op()) {
return new RGWPutACLs_ObjStore_SWIFT;
}
+ if(s->info.args.exists("extract-archive")) {
+ return new RGWBulkUploadOp_ObjStore_SWIFT;
+ }
if (s->init_state.src_bucket.empty())
return new RGWPutObj_ObjStore_SWIFT;
else
@@ -2161,7 +2454,11 @@ RGWOp *RGWHandler_REST_Obj_SWIFT::op_delete()
RGWOp *RGWHandler_REST_Obj_SWIFT::op_post()
{
- return new RGWPutMetadataObject_ObjStore_SWIFT;
+ if (RGWFormPost::is_formpost_req(s)) {
+ return new RGWFormPost;
+ } else {
+ return new RGWPutMetadataObject_ObjStore_SWIFT;
+ }
}
RGWOp *RGWHandler_REST_Obj_SWIFT::op_copy()
diff --git a/src/rgw/rgw_rest_swift.h b/src/rgw/rgw_rest_swift.h
index 8478b1bad7b..0b1293e286f 100644
--- a/src/rgw/rgw_rest_swift.h
+++ b/src/rgw/rgw_rest_swift.h
@@ -11,6 +11,9 @@
#include "rgw_op.h"
#include "rgw_rest.h"
#include "rgw_swift_auth.h"
+#include "rgw_http_errors.h"
+
+#include <boost/utility/string_ref.hpp>
class RGWGetObj_ObjStore_SWIFT : public RGWGetObj_ObjStore {
int custom_http_ret = 0;
@@ -238,6 +241,94 @@ public:
};
+class RGWFormPost : public RGWPostObj_ObjStore {
+ std::string get_current_filename() const override;
+ std::string get_current_content_type() const override;
+ std::size_t get_max_file_size() /*const*/;
+ bool is_next_file_to_upload() override;
+ bool is_integral();
+ bool is_non_expired();
+
+ parts_collection_t ctrl_parts;
+ boost::optional<post_form_part> current_data_part;
+ std::string prefix;
+ bool stream_done = false;
+
+ class SignatureHelper;
+public:
+ RGWFormPost() = default;
+ ~RGWFormPost() = default;
+
+ void init(RGWRados* store,
+ req_state* s,
+ RGWHandler* dialect_handler) override;
+
+ int get_params() override;
+ int get_data(ceph::bufferlist& bl, bool& again) override;
+ void send_response() override;
+
+ static bool is_formpost_req(req_state* const s);
+};
+
+class RGWFormPost::SignatureHelper
+{
+private:
+ static constexpr uint32_t output_size =
+ CEPH_CRYPTO_HMACSHA1_DIGESTSIZE * 2 + 1;
+
+ unsigned char dest[CEPH_CRYPTO_HMACSHA1_DIGESTSIZE]; // 20
+ char dest_str[output_size];
+
+public:
+ SignatureHelper() = default;
+
+ const char* calc(const std::string& key,
+ const boost::string_ref& path_info,
+ const boost::string_ref& redirect,
+ const boost::string_ref& max_file_size,
+ const boost::string_ref& max_file_count,
+ const boost::string_ref& expires) {
+ using ceph::crypto::HMACSHA1;
+ using UCHARPTR = const unsigned char*;
+
+ HMACSHA1 hmac((UCHARPTR) key.data(), key.size());
+
+ hmac.Update((UCHARPTR) path_info.data(), path_info.size());
+ hmac.Update((UCHARPTR) "\n", 1);
+
+ hmac.Update((UCHARPTR) redirect.data(), redirect.size());
+ hmac.Update((UCHARPTR) "\n", 1);
+
+ hmac.Update((UCHARPTR) max_file_size.data(), max_file_size.size());
+ hmac.Update((UCHARPTR) "\n", 1);
+
+ hmac.Update((UCHARPTR) max_file_count.data(), max_file_count.size());
+ hmac.Update((UCHARPTR) "\n", 1);
+
+ hmac.Update((UCHARPTR) expires.data(), expires.size());
+
+ hmac.Final(dest);
+
+ buf_to_hex((UCHARPTR) dest, sizeof(dest), dest_str);
+
+ return dest_str;
+ }
+
+ const char* get_signature() const {
+ return dest_str;
+ }
+
+ bool is_equal_to(const std::string& rhs) const {
+ /* never allow out-of-range exception */
+ if (rhs.size() < (output_size - 1)) {
+ return false;
+ }
+ return rhs.compare(0 /* pos */, output_size, dest_str) == 0;
+ }
+
+}; /* RGWFormPost::SignatureHelper */
+
+
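The signature is HMAC-SHA1 over five newline-joined fields, hex-encoded, and compared against the form's signature field for every temp_url_key the user has. A dependency-free sketch of the string-to-sign construction only; the HMAC itself (ceph::crypto::HMACSHA1) is elided, and the values are illustrative:

#include <iostream>
#include <string>

int main() {
  const std::string path_info      = "/v1/AUTH_test/container/prefix";
  const std::string redirect       = "";            // may legitimately be empty
  const std::string max_file_size  = "1048576";
  const std::string max_file_count = "10";
  const std::string expires        = "1700000000";

  const std::string to_sign = path_info + "\n" + redirect + "\n" +
                              max_file_size + "\n" + max_file_count + "\n" +
                              expires;
  // hex(HMAC-SHA1(temp_url_key, to_sign)) is what is_integral() compares
  // against the form's 'signature' field
  std::cout << to_sign << "\n";
  return 0;
}
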
class RGWSwiftWebsiteHandler {
RGWRados* const store;
req_state* const s;
diff --git a/src/rgw/rgw_role.cc b/src/rgw/rgw_role.cc
index 9ccc20e70e3..236fe6e6aa4 100644
--- a/src/rgw/rgw_role.cc
+++ b/src/rgw/rgw_role.cc
@@ -1,5 +1,6 @@
#include <errno.h>
#include <ctime>
+#include <regex>
#include "common/errno.h"
#include "common/Formatter.h"
@@ -38,7 +39,7 @@ int RGWRole::store_name(bool exclusive)
RGWNameToId nameToId;
nameToId.obj_id = id;
- string oid = get_names_oid_prefix() + name;
+ string oid = tenant + get_names_oid_prefix() + name;
bufferlist bl;
::encode(nameToId, bl);
@@ -48,7 +49,7 @@ int RGWRole::store_name(bool exclusive)
int RGWRole::store_path(bool exclusive)
{
- string oid = get_path_oid_prefix() + path + get_info_oid_prefix() + id;
+ string oid = tenant + get_path_oid_prefix() + path + get_info_oid_prefix() + id;
return rgw_put_system_obj(store, store->get_zone_params().roles_pool, oid,
NULL, 0, exclusive, NULL, real_time(), NULL);
@@ -58,8 +59,12 @@ int RGWRole::create(bool exclusive)
{
int ret;
+ if (! validate_input()) {
+ return -EINVAL;
+ }
+
/* check to see the name is not used */
- ret = read_id(name, id);
+ ret = read_id(name, tenant, id);
if (exclusive && ret == 0) {
ldout(cct, 0) << "ERROR: name " << name << " already in use for role id "
<< id << dendl;
@@ -78,7 +83,7 @@ int RGWRole::create(bool exclusive)
id = uuid_str;
//arn
- arn = role_arn_prefix + uid + ":role" + path + name;
+ arn = role_arn_prefix + tenant + ":role" + path + name;
// Creation time
real_clock::time_point t = real_clock::now();
@@ -128,7 +133,7 @@ int RGWRole::create(bool exclusive)
<< id << ": " << cpp_strerror(-info_ret) << dendl;
}
//Delete role name that was stored in previous call
- oid = get_names_oid_prefix() + name;
+ oid = tenant + get_names_oid_prefix() + name;
int name_ret = rgw_delete_system_obj(store, pool, oid, NULL);
if (name_ret < 0) {
ldout(cct, 0) << "ERROR: cleanup of role name from pool: " << pool.name << ": "
@@ -166,7 +171,7 @@ int RGWRole::delete_obj()
}
// Delete name
- oid = get_names_oid_prefix() + name;
+ oid = tenant + get_names_oid_prefix() + name;
ret = rgw_delete_system_obj(store, pool, oid, NULL);
if (ret < 0) {
ldout(cct, 0) << "ERROR: deleting role name from pool: " << pool.name << ": "
@@ -174,7 +179,7 @@ int RGWRole::delete_obj()
}
// Delete path
- oid = get_path_oid_prefix() + path + get_info_oid_prefix() + id;
+ oid = tenant + get_path_oid_prefix() + path + get_info_oid_prefix() + id;
ret = rgw_delete_system_obj(store, pool, oid, NULL);
if (ret < 0) {
ldout(cct, 0) << "ERROR: deleting role path from pool: " << pool.name << ": "
@@ -198,6 +203,16 @@ int RGWRole::get()
return 0;
}
+int RGWRole::get_by_id()
+{
+ int ret = read_info();
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
int RGWRole::update()
{
auto& pool = store->get_zone_params().roles_pool;
@@ -272,10 +287,10 @@ void RGWRole::decode_json(JSONObj *obj)
JSONDecoder::decode_json("assume_role_policy_document", trust_policy, obj);
}
-int RGWRole::read_id(const string& role_name, string& role_id)
+int RGWRole::read_id(const string& role_name, const string& tenant, string& role_id)
{
auto& pool = store->get_zone_params().roles_pool;
- string oid = get_names_oid_prefix() + role_name;
+ string oid = tenant + get_names_oid_prefix() + role_name;
bufferlist bl;
RGWObjectCtx obj_ctx(store);
@@ -326,7 +341,7 @@ int RGWRole::read_info()
int RGWRole::read_name()
{
auto& pool = store->get_zone_params().roles_pool;
- string oid = get_names_oid_prefix() + name;
+ string oid = tenant + get_names_oid_prefix() + name;
bufferlist bl;
RGWObjectCtx obj_ctx(store);
@@ -350,21 +365,61 @@ int RGWRole::read_name()
return 0;
}
+bool RGWRole::validate_input()
+{
+ if (name.length() > MAX_ROLE_NAME_LEN) {
+ ldout(cct, 0) << "ERROR: Invalid name length " << dendl;
+ return false;
+ }
+
+ if (path.length() > MAX_PATH_NAME_LEN) {
+ ldout(cct, 0) << "ERROR: Invalid path length " << dendl;
+ return false;
+ }
+
+ std::regex regex_name("[A-Za-z0-9:=,.@-]+");
+ if (! std::regex_match(name, regex_name)) {
+ ldout(cct, 0) << "ERROR: Invalid chars in name " << dendl;
+ return false;
+ }
+
+ std::regex regex_path("(/[!-~]+/)|(/)");
+ if (! std::regex_match(path, regex_path)) {
+ ldout(cct, 0) << "ERROR: Invalid chars in path " << dendl;
+ return false;
+ }
+
+ return true;
+}
+
+void RGWRole::extract_name_tenant(const std::string& str)
+{
+ size_t pos = str.find('$');
+ if (pos != std::string::npos) {
+ tenant = str.substr(0, pos);
+ name = str.substr(pos + 1);
+ }
+}
+
void RGWRole::update_trust_policy(string& trust_policy)
{
this->trust_policy = trust_policy;
}
-int RGWRole::get_roles_by_path_prefix(RGWRados *store, CephContext *cct, const string& path_prefix, vector<RGWRole>& roles)
+int RGWRole::get_roles_by_path_prefix(RGWRados *store,
+ CephContext *cct,
+ const string& path_prefix,
+ const string& tenant,
+ vector<RGWRole>& roles)
{
auto pool = store->get_zone_params().roles_pool;
string prefix;
// List all roles if path prefix is empty
if (! path_prefix.empty()) {
- prefix = role_path_oid_prefix + path_prefix;
+ prefix = tenant + role_path_oid_prefix + path_prefix;
} else {
- prefix = role_path_oid_prefix;
+ prefix = tenant + role_path_oid_prefix;
}
//Get the filtered objects
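
The rgw_role.cc changes above scope every role object by tenant: the name, path, and id objects all gain a tenant prefix, the ARN carries the tenant instead of the uid, and a combined identifier of the form tenant$name is split by extract_name_tenant(). A minimal standalone sketch of that split and of how the prefixed OID is assembled (RoleId and the literal prefix string are illustrative stand-ins, not the real types):

    #include <iostream>
    #include <string>

    // Illustrative stand-in for RGWRole::extract_name_tenant(): input of
    // the form "tenant$name" is divided at the first '$'; a bare name
    // (no '$') keeps an empty tenant.
    struct RoleId {
      std::string tenant;
      std::string name;
    };

    RoleId split_role(const std::string& str)
    {
      RoleId out;
      size_t pos = str.find('$');
      if (pos != std::string::npos) {
        out.tenant = str.substr(0, pos);
        out.name = str.substr(pos + 1);
      } else {
        out.name = str;
      }
      return out;
    }

    int main()
    {
      RoleId r = split_role("acme$deploy");
      // the role-name object is then keyed tenant-first, as in store_name():
      //   oid = tenant + get_names_oid_prefix() + name
      std::string oid = r.tenant + "role_names." + r.name; // prefix assumed
      std::cout << "tenant=" << r.tenant << " name=" << r.name
                << " oid=" << oid << std::endl;
    }
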
diff --git a/src/rgw/rgw_role.h b/src/rgw/rgw_role.h
index f8a60a4c568..bccb576ed2d 100644
--- a/src/rgw/rgw_role.h
+++ b/src/rgw/rgw_role.h
@@ -7,6 +7,8 @@ class RGWRole
static const string role_oid_prefix;
static const string role_path_oid_prefix;
static const string role_arn_prefix;
+ static constexpr int MAX_ROLE_NAME_LEN = 64;
+ static constexpr int MAX_PATH_NAME_LEN = 512;
CephContext *cct;
RGWRados *store;
@@ -17,15 +19,17 @@ class RGWRole
string creation_date;
string trust_policy;
map<string, string> perm_policy_map;
- string uid;
+ string tenant;
int store_info(bool exclusive);
int store_name(bool exclusive);
int store_path(bool exclusive);
- int read_id(const string& role_name, string& role_id);
+ int read_id(const string& role_name, const string& tenant, string& role_id);
int read_name();
int read_info();
void set_id(const string& id) { this->id = id; }
+ bool validate_input();
+ void extract_name_tenant(const std::string& str);
public:
RGWRole(CephContext *cct,
@@ -33,23 +37,35 @@ public:
string name,
string path,
string trust_policy,
- string uid)
+ string tenant)
: cct(cct),
store(store),
name(std::move(name)),
path(std::move(path)),
trust_policy(std::move(trust_policy)),
- uid(std::move(uid)) {
+ tenant(std::move(tenant)) {
if (this->path.empty())
this->path = "/";
+ extract_name_tenant(this->name);
}
RGWRole(CephContext *cct,
RGWRados *store,
- string name)
+ string name,
+ string tenant)
+ : cct(cct),
+ store(store),
+ name(std::move(name)),
+ tenant(std::move(tenant)) {
+ extract_name_tenant(this->name);
+ }
+
+ RGWRole(CephContext *cct,
+ RGWRados *store,
+ string id)
: cct(cct),
store(store),
- name(std::move(name)) {}
+ id(std::move(id)) {}
RGWRole(CephContext *cct,
RGWRados *store)
@@ -61,7 +77,7 @@ public:
~RGWRole() = default;
void encode(bufferlist& bl) const {
- ENCODE_START(1, 1, bl);
+ ENCODE_START(2, 1, bl);
::encode(id, bl);
::encode(name, bl);
::encode(path, bl);
@@ -69,11 +85,12 @@ public:
::encode(creation_date, bl);
::encode(trust_policy, bl);
::encode(perm_policy_map, bl);
+ ::encode(tenant, bl);
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
- DECODE_START(1, bl);
+ DECODE_START(2, bl);
::decode(id, bl);
::decode(name, bl);
::decode(path, bl);
@@ -81,6 +98,9 @@ public:
::decode(creation_date, bl);
::decode(trust_policy, bl);
::decode(perm_policy_map, bl);
+ if (struct_v >= 2) {
+ ::decode(tenant, bl);
+ }
DECODE_FINISH(bl);
}
@@ -88,10 +108,12 @@ public:
const string& get_name() const { return name; }
const string& get_path() const { return path; }
const string& get_create_date() const { return creation_date; }
+ const string& get_assume_role_policy() const { return trust_policy;}
int create(bool exclusive);
int delete_obj();
int get();
+ int get_by_id();
int update();
void update_trust_policy(string& trust_policy);
void set_perm_policy(const string& policy_name, const string& perm_policy);
@@ -104,7 +126,11 @@ public:
static const string& get_names_oid_prefix();
static const string& get_info_oid_prefix();
static const string& get_path_oid_prefix();
- static int get_roles_by_path_prefix(RGWRados *store, CephContext *cct, const string& path_prefix, vector<RGWRole>& roles);
+ static int get_roles_by_path_prefix(RGWRados *store,
+ CephContext *cct,
+ const string& path_prefix,
+ const string& tenant,
+ vector<RGWRole>& roles);
};
WRITE_CLASS_ENCODER(RGWRole)
#endif /* CEPH_RGW_ROLE_H */
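
The header bumps the encoder from ENCODE_START(1, 1, bl) to ENCODE_START(2, 1, bl) and appends the tenant field, decoding it only when struct_v >= 2, so encodings written by older daemons still decode cleanly. A minimal sketch of that compat pattern with the bufferlist machinery replaced by a hand-rolled byte vector (the wire format below is invented purely for illustration):

    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    // Toy length-prefixed string codec standing in for ::encode/::decode.
    static void put_string(std::vector<uint8_t>& bl, const std::string& s) {
      bl.push_back(static_cast<uint8_t>(s.size()));
      bl.insert(bl.end(), s.begin(), s.end());
    }
    static std::string get_string(const std::vector<uint8_t>& bl, size_t& off) {
      uint8_t len = bl[off++];
      std::string s(bl.begin() + off, bl.begin() + off + len);
      off += len;
      return s;
    }

    struct Role {
      std::string name;
      std::string tenant; // new in v2

      void encode(std::vector<uint8_t>& bl) const {
        bl.push_back(2);          // struct_v, like ENCODE_START(2, 1, bl)
        put_string(bl, name);
        put_string(bl, tenant);   // v2 field appended at the end
      }
      void decode(const std::vector<uint8_t>& bl) {
        size_t off = 0;
        uint8_t struct_v = bl[off++];
        name = get_string(bl, off);
        if (struct_v >= 2) {      // tolerate v1 encodings without tenant
          tenant = get_string(bl, off);
        }
      }
    };

    int main() {
      Role in{"deploy", "acme"}, out;
      std::vector<uint8_t> bl;
      in.encode(bl);
      out.decode(bl);
      std::cout << out.tenant << "$" << out.name << std::endl;
    }
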
diff --git a/src/rgw/rgw_sync.cc b/src/rgw/rgw_sync.cc
index be1fa2fdd88..9d8d250132b 100644
--- a/src/rgw/rgw_sync.cc
+++ b/src/rgw/rgw_sync.cc
@@ -52,7 +52,7 @@ RGWCoroutine *RGWSyncErrorLogger::log_error_cr(const string& source_zone, const
::encode(info, bl);
store->time_log_prepare_entry(entry, real_clock::now(), section, name, bl);
- uint32_t shard_id = counter.inc() % num_shards;
+ uint32_t shard_id = ++counter % num_shards;
return new RGWRadosTimelogAddCR(store, oids[shard_id], entry);
@@ -83,8 +83,8 @@ void RGWSyncBackoff::backoff(RGWCoroutine *op)
}
int RGWBackoffControlCR::operate() {
- RGWCoroutine *finisher_cr;
reenter(this) {
+ // retry the operation until it succeeds
while (true) {
yield {
Mutex::Locker l(lock);
@@ -97,7 +97,10 @@ int RGWBackoffControlCR::operate() {
cr->put();
cr = NULL;
}
- if (retcode < 0 && retcode != -EBUSY && retcode != -EAGAIN) {
+ if (retcode >= 0) {
+ break;
+ }
+ if (retcode != -EBUSY && retcode != -EAGAIN) {
ldout(cct, 0) << "ERROR: RGWBackoffControlCR called coroutine returned " << retcode << dendl;
if (exit_on_error) {
return set_cr_error(retcode);
@@ -107,17 +110,15 @@ int RGWBackoffControlCR::operate() {
backoff.reset();
}
yield backoff.backoff(this);
- finisher_cr = alloc_finisher_cr();
- if (finisher_cr) {
- yield call(finisher_cr);
- if (retcode < 0) {
- ldout(cct, 0) << "ERROR: call to finisher_cr() failed: retcode=" << retcode << dendl;
- if (exit_on_error) {
- return set_cr_error(retcode);
- }
- }
- }
}
+
+ // run an optional finisher
+ yield call(alloc_finisher_cr());
+ if (retcode < 0) {
+ ldout(cct, 0) << "ERROR: call to finisher_cr() failed: retcode=" << retcode << dendl;
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
}
return 0;
}
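
The rewritten RGWBackoffControlCR::operate() makes the loop's contract explicit: break out on success, treat -EBUSY/-EAGAIN as transient, back off and retry, and only then run the optional finisher. The same control flow in plain synchronous C++ (a sketch; the coroutine framework is replaced by callables, and the backoff cap is an assumption):

    #include <algorithm>
    #include <cerrno>
    #include <chrono>
    #include <functional>
    #include <thread>

    // Sketch of the retry-with-backoff shape above: transient errors
    // (-EBUSY/-EAGAIN) always retry; other errors only abort when
    // exit_on_error is set; success falls through to the finisher.
    int run_with_backoff(const std::function<int()>& op,
                         const std::function<int()>& finisher,
                         bool exit_on_error)
    {
      std::chrono::seconds delay{1};
      const std::chrono::seconds max_delay{30}; // cap assumed for the sketch
      for (;;) {
        int retcode = op();
        if (retcode >= 0) {
          break;                      // success: run the finisher
        }
        if (retcode != -EBUSY && retcode != -EAGAIN && exit_on_error) {
          return retcode;             // permanent failure
        }
        std::this_thread::sleep_for(delay);
        delay = std::min(delay * 2, max_delay);
      }
      return finisher ? finisher() : 0;
    }

    int main() {
      int attempts = 0;
      return run_with_backoff(
          [&] { return ++attempts < 3 ? -EAGAIN : 0; }, // succeeds 3rd try
          [] { return 0; },                             // finisher
          false);
    }
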
@@ -289,7 +290,7 @@ int RGWRemoteMetaLog::init()
void RGWRemoteMetaLog::finish()
{
- going_down.set(1);
+ going_down = true;
stop();
}
@@ -320,13 +321,14 @@ int RGWMetaSyncStatusManager::init()
RGWMetaSyncEnv& sync_env = master_log.get_sync_env();
- r = read_sync_status();
+ rgw_meta_sync_status sync_status;
+ r = read_sync_status(&sync_status);
if (r < 0 && r != -ENOENT) {
lderr(store->ctx()) << "ERROR: failed to read sync status, r=" << r << dendl;
return r;
}
- int num_shards = master_log.get_sync_status().sync_info.num_shards;
+ int num_shards = sync_status.sync_info.num_shards;
for (int i = 0; i < num_shards; i++) {
shard_objs[i] = rgw_raw_obj(store->get_zone_params().log_pool, sync_env.shard_obj_name(i));
@@ -691,36 +693,75 @@ public:
}
};
-class RGWReadSyncStatusCoroutine : public RGWSimpleRadosReadCR<rgw_meta_sync_info> {
- RGWMetaSyncEnv *sync_env;
+class RGWReadSyncStatusMarkersCR : public RGWShardCollectCR {
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
- rgw_meta_sync_status *sync_status;
+ RGWMetaSyncEnv *env;
+ const int num_shards;
+ int shard_id{0};
+ map<uint32_t, rgw_meta_sync_marker>& markers;
-public:
- RGWReadSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
- rgw_meta_sync_status *_status) : RGWSimpleRadosReadCR(_sync_env->async_rados, _sync_env->store,
- rgw_raw_obj(_sync_env->store->get_zone_params().log_pool, _sync_env->status_oid()),
- &_status->sync_info),
- sync_env(_sync_env),
- sync_status(_status) {
+ public:
+ RGWReadSyncStatusMarkersCR(RGWMetaSyncEnv *env, int num_shards,
+ map<uint32_t, rgw_meta_sync_marker>& markers)
+ : RGWShardCollectCR(env->cct, MAX_CONCURRENT_SHARDS),
+ env(env), num_shards(num_shards), markers(markers)
+ {}
+ bool spawn_next() override;
+};
+bool RGWReadSyncStatusMarkersCR::spawn_next()
+{
+ if (shard_id >= num_shards) {
+ return false;
}
+ using CR = RGWSimpleRadosReadCR<rgw_meta_sync_marker>;
+ rgw_raw_obj obj{env->store->get_zone_params().log_pool,
+ env->shard_obj_name(shard_id)};
+ spawn(new CR(env->async_rados, env->store, obj, &markers[shard_id]), false);
+ shard_id++;
+ return true;
+}
+
+class RGWReadSyncStatusCoroutine : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+ rgw_meta_sync_status *sync_status;
- int handle_data(rgw_meta_sync_info& data) override;
+public:
+ RGWReadSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
+ rgw_meta_sync_status *_status)
+ : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), sync_status(_status)
+ {}
+ int operate() override;
};
-int RGWReadSyncStatusCoroutine::handle_data(rgw_meta_sync_info& data)
+int RGWReadSyncStatusCoroutine::operate()
{
- if (retcode == -ENOENT) {
- return 0;
- }
-
- RGWRados *store = sync_env->store;
- map<uint32_t, rgw_meta_sync_marker>& markers = sync_status->sync_markers;
- for (int i = 0; i < (int)data.num_shards; i++) {
- spawn(new RGWSimpleRadosReadCR<rgw_meta_sync_marker>(sync_env->async_rados, store,
- rgw_raw_obj(store->get_zone_params().log_pool, sync_env->shard_obj_name(i)),
- &markers[i]), true);
+ reenter(this) {
+ // read sync info
+ using ReadInfoCR = RGWSimpleRadosReadCR<rgw_meta_sync_info>;
+ yield {
+ bool empty_on_enoent = false; // fail on ENOENT
+ rgw_raw_obj obj{sync_env->store->get_zone_params().log_pool,
+ sync_env->status_oid()};
+ call(new ReadInfoCR(sync_env->async_rados, sync_env->store, obj,
+ &sync_status->sync_info, empty_on_enoent));
+ }
+ if (retcode < 0) {
+ ldout(sync_env->cct, 4) << "failed to read sync status info with "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ // read shard markers
+ using ReadMarkersCR = RGWReadSyncStatusMarkersCR;
+ yield call(new ReadMarkersCR(sync_env, sync_status->sync_info.num_shards,
+ sync_status->sync_markers));
+ if (retcode < 0) {
+ ldout(sync_env->cct, 4) << "failed to read sync status markers with "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ return set_cr_done();
}
return 0;
}
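
RGWReadSyncStatusCoroutine now reads the per-shard markers through RGWShardCollectCR, which caps the fan-out at MAX_CONCURRENT_SHARDS = 16 via repeated spawn_next() calls. A sketch of that bounded fan-out with std::async standing in for the spawned read coroutines (the shard count and the fake read are illustrative):

    #include <algorithm>
    #include <cstdint>
    #include <future>
    #include <iostream>
    #include <map>
    #include <utility>
    #include <vector>

    // Placeholder for RGWSimpleRadosReadCR reading one shard's marker.
    int read_marker_for_shard(int shard_id) { return shard_id * 100; }

    int main() {
      constexpr int MAX_CONCURRENT_SHARDS = 16; // as in the collect CR
      const int num_shards = 64;
      std::map<uint32_t, int> markers;

      // Walk the shards in windows, keeping at most 16 reads in flight,
      // like repeated spawn_next() calls under RGWShardCollectCR.
      for (int base = 0; base < num_shards; base += MAX_CONCURRENT_SHARDS) {
        std::vector<std::pair<int, std::future<int>>> window;
        const int end = std::min(base + MAX_CONCURRENT_SHARDS, num_shards);
        for (int id = base; id < end; ++id) {
          window.emplace_back(id, std::async(std::launch::async,
                                             read_marker_for_shard, id));
        }
        for (auto& [id, fut] : window) {
          markers[id] = fut.get(); // collect results before the next window
        }
      }
      std::cout << "read " << markers.size() << " shard markers\n";
    }
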
@@ -1103,7 +1144,7 @@ public:
sync_marker.timestamp = timestamp;
}
- ldout(sync_env->cct, 20) << __func__ << "(): updating marker marker_oid=" << marker_oid << " marker=" << new_marker << dendl;
+ ldout(sync_env->cct, 20) << __func__ << "(): updating marker marker_oid=" << marker_oid << " marker=" << new_marker << " realm_epoch=" << sync_marker.realm_epoch << dendl;
RGWRados *store = sync_env->store;
return new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(sync_env->async_rados,
store,
@@ -1244,6 +1285,7 @@ class RGWMetaSyncShardCR : public RGWCoroutine {
const rgw_pool& pool;
const std::string& period; //< currently syncing period id
+ const epoch_t realm_epoch; //< realm_epoch of period
RGWMetadataLog* mdlog; //< log of syncing period
uint32_t shard_id;
rgw_meta_sync_marker& sync_marker;
@@ -1291,11 +1333,13 @@ class RGWMetaSyncShardCR : public RGWCoroutine {
public:
RGWMetaSyncShardCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
- const std::string& period, RGWMetadataLog* mdlog,
- uint32_t _shard_id, rgw_meta_sync_marker& _marker,
+ const std::string& period, epoch_t realm_epoch,
+ RGWMetadataLog* mdlog, uint32_t _shard_id,
+ rgw_meta_sync_marker& _marker,
const std::string& period_marker, bool *_reset_backoff)
: RGWCoroutine(_sync_env->cct), sync_env(_sync_env), pool(_pool),
- period(period), mdlog(mdlog), shard_id(_shard_id), sync_marker(_marker),
+ period(period), realm_epoch(realm_epoch), mdlog(mdlog),
+ shard_id(_shard_id), sync_marker(_marker),
period_marker(period_marker), inc_lock("RGWMetaSyncShardCR::inc_lock"),
reset_backoff(_reset_backoff) {
*reset_backoff = false;
@@ -1478,7 +1522,8 @@ public:
temp_marker->state = rgw_meta_sync_marker::IncrementalSync;
temp_marker->marker = std::move(temp_marker->next_step_marker);
temp_marker->next_step_marker.clear();
- ldout(sync_env->cct, 0) << *this << ": saving marker pos=" << temp_marker->marker << dendl;
+ temp_marker->realm_epoch = realm_epoch;
+ ldout(sync_env->cct, 0) << *this << ": saving marker pos=" << temp_marker->marker << " realm_epoch=" << realm_epoch << dendl;
using WriteMarkerCR = RGWSimpleRadosWriteCR<rgw_meta_sync_marker>;
yield call(new WriteMarkerCR(sync_env->async_rados, sync_env->store,
@@ -1549,6 +1594,14 @@ public:
yield;
}
}
+ // if the period has advanced, we can't use the existing marker
+ if (sync_marker.realm_epoch < realm_epoch) {
+ ldout(sync_env->cct, 0) << "clearing marker=" << sync_marker.marker
+ << " from old realm_epoch=" << sync_marker.realm_epoch
+ << " (now " << realm_epoch << ')' << dendl;
+ sync_marker.realm_epoch = realm_epoch;
+ sync_marker.marker.clear();
+ }
mdlog_marker = sync_marker.marker;
set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env,
sync_env->shard_obj_name(shard_id),
@@ -1570,7 +1623,8 @@ public:
}
#define INCREMENTAL_MAX_ENTRIES 100
ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << dendl;
- if (!period_marker.empty() && period_marker <= marker) {
+ if (!period_marker.empty() && period_marker <= mdlog_marker) {
+ ldout(cct, 10) << "mdlog_marker past period_marker=" << period_marker << dendl;
done_with_period = true;
break;
}
@@ -1602,10 +1656,16 @@ public:
*reset_backoff = false; // back off and try again later
return retcode;
}
- for (log_iter = log_entries.begin(); log_iter != log_entries.end(); ++log_iter) {
- if (!period_marker.empty() && period_marker < log_iter->id) {
+ for (log_iter = log_entries.begin(); log_iter != log_entries.end() && !done_with_period; ++log_iter) {
+ if (!period_marker.empty() && period_marker <= log_iter->id) {
done_with_period = true;
- break;
+ if (period_marker < log_iter->id) {
+ ldout(cct, 10) << "found key=" << log_iter->id
+ << " past period_marker=" << period_marker << dendl;
+ break;
+ }
+ ldout(cct, 10) << "found key at period_marker=" << period_marker << dendl;
+ // sync this entry, then return control to RGWMetaSyncCR
}
if (!mdlog_entry.convert_from(*log_iter)) {
ldout(sync_env->cct, 0) << __func__ << ":" << __LINE__ << ": ERROR: failed to convert mdlog entry, shard_id=" << shard_id << " log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp << " ... skipping entry" << dendl;
@@ -1631,6 +1691,7 @@ public:
ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " max_marker=" << max_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << dendl;
if (done_with_period) {
// return control to RGWMetaSyncCR and advance to the next period
+ ldout(sync_env->cct, 10) << *this << ": done with period" << dendl;
break;
}
if (mdlog_marker == max_marker && can_adjust_marker) {
@@ -1655,6 +1716,8 @@ public:
if (!can_adjust_marker) {
return -EAGAIN;
}
+
+ return set_cr_done();
}
/* TODO */
return 0;
@@ -1667,6 +1730,7 @@ class RGWMetaSyncShardControlCR : public RGWBackoffControlCR
const rgw_pool& pool;
const std::string& period;
+ epoch_t realm_epoch;
RGWMetadataLog* mdlog;
uint32_t shard_id;
rgw_meta_sync_marker sync_marker;
@@ -1675,16 +1739,18 @@ class RGWMetaSyncShardControlCR : public RGWBackoffControlCR
static constexpr bool exit_on_error = false; // retry on all errors
public:
RGWMetaSyncShardControlCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
- const std::string& period, RGWMetadataLog* mdlog,
- uint32_t _shard_id, const rgw_meta_sync_marker& _marker,
+ const std::string& period, epoch_t realm_epoch,
+ RGWMetadataLog* mdlog, uint32_t _shard_id,
+ const rgw_meta_sync_marker& _marker,
std::string&& period_marker)
: RGWBackoffControlCR(_sync_env->cct, exit_on_error), sync_env(_sync_env),
- pool(_pool), period(period), mdlog(mdlog), shard_id(_shard_id),
- sync_marker(_marker), period_marker(std::move(period_marker)) {}
+ pool(_pool), period(period), realm_epoch(realm_epoch), mdlog(mdlog),
+ shard_id(_shard_id), sync_marker(_marker),
+ period_marker(std::move(period_marker)) {}
RGWCoroutine *alloc_cr() override {
- return new RGWMetaSyncShardCR(sync_env, pool, period, mdlog, shard_id,
- sync_marker, period_marker, backoff_ptr());
+ return new RGWMetaSyncShardCR(sync_env, pool, period, realm_epoch, mdlog,
+ shard_id, sync_marker, period_marker, backoff_ptr());
}
RGWCoroutine *alloc_finisher_cr() override {
@@ -1743,6 +1809,7 @@ public:
yield {
// get the mdlog for the current period (may be empty)
auto& period_id = sync_status.sync_info.period;
+ auto realm_epoch = sync_status.sync_info.realm_epoch;
auto mdlog = sync_env->store->meta_mgr->get_log(period_id);
// prevent wakeup() from accessing shard_crs while we're spawning them
@@ -1765,9 +1832,10 @@ public:
}
}
- auto cr = new RGWMetaSyncShardControlCR(sync_env, pool, period_id,
- mdlog, shard_id, marker,
- std::move(period_marker));
+ using ShardCR = RGWMetaSyncShardControlCR;
+ auto cr = new ShardCR(sync_env, pool, period_id, realm_epoch,
+ mdlog, shard_id, marker,
+ std::move(period_marker));
auto stack = spawn(cr, false);
shard_crs[shard_id] = RefPair{cr, stack};
}
@@ -1821,13 +1889,24 @@ void RGWRemoteMetaLog::init_sync_env(RGWMetaSyncEnv *env) {
env->error_logger = error_logger;
}
-int RGWRemoteMetaLog::read_sync_status()
+int RGWRemoteMetaLog::read_sync_status(rgw_meta_sync_status *sync_status)
{
if (store->is_meta_master()) {
return 0;
}
-
- return run(new RGWReadSyncStatusCoroutine(&sync_env, &sync_status));
+ // cannot run concurrently with run_sync(), so run in a separate manager
+ RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry());
+ RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr());
+ int ret = http_manager.set_threaded();
+ if (ret < 0) {
+ ldout(store->ctx(), 0) << "failed in http_manager.set_threaded() ret=" << ret << dendl;
+ return ret;
+ }
+ RGWMetaSyncEnv sync_env_local = sync_env;
+ sync_env_local.http_manager = &http_manager;
+ ret = crs.run(new RGWReadSyncStatusCoroutine(&sync_env_local, sync_status));
+ http_manager.stop();
+ return ret;
}
int RGWRemoteMetaLog::init_sync_status()
@@ -1836,30 +1915,29 @@ int RGWRemoteMetaLog::init_sync_status()
return 0;
}
- auto& sync_info = sync_status.sync_info;
- if (!sync_info.num_shards) {
- rgw_mdlog_info mdlog_info;
- int r = read_log_info(&mdlog_info);
- if (r < 0) {
- lderr(store->ctx()) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
- return r;
- }
- sync_info.num_shards = mdlog_info.num_shards;
- auto cursor = store->period_history->get_current();
- if (cursor) {
- sync_info.period = cursor.get_period().get_id();
- sync_info.realm_epoch = cursor.get_epoch();
- }
+ rgw_mdlog_info mdlog_info;
+ int r = read_log_info(&mdlog_info);
+ if (r < 0) {
+ lderr(store->ctx()) << "ERROR: failed to fetch master log info (r=" << r << ")" << dendl;
+ return r;
+ }
+
+ rgw_meta_sync_info sync_info;
+ sync_info.num_shards = mdlog_info.num_shards;
+ auto cursor = store->period_history->get_current();
+ if (cursor) {
+ sync_info.period = cursor.get_period().get_id();
+ sync_info.realm_epoch = cursor.get_epoch();
}
return run(new RGWInitSyncStatusCoroutine(&sync_env, sync_info));
}
-int RGWRemoteMetaLog::store_sync_info()
+int RGWRemoteMetaLog::store_sync_info(const rgw_meta_sync_info& sync_info)
{
return run(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(async_rados, store,
rgw_raw_obj(store->get_zone_params().log_pool, sync_env.status_oid()),
- sync_status.sync_info));
+ sync_info));
}
// return a cursor to the period at our sync position
@@ -1914,7 +1992,7 @@ int RGWRemoteMetaLog::run_sync()
// get shard count and oldest log period from master
rgw_mdlog_info mdlog_info;
for (;;) {
- if (going_down.read()) {
+ if (going_down) {
ldout(store->ctx(), 1) << __func__ << "(): going down" << dendl;
return 0;
}
@@ -1933,8 +2011,9 @@ int RGWRemoteMetaLog::run_sync()
break;
}
+ rgw_meta_sync_status sync_status;
do {
- if (going_down.read()) {
+ if (going_down) {
ldout(store->ctx(), 1) << __func__ << "(): going down" << dendl;
return 0;
}
@@ -2008,7 +2087,7 @@ int RGWRemoteMetaLog::run_sync()
}
sync_status.sync_info.state = rgw_meta_sync_info::StateSync;
- r = store_sync_info();
+ r = store_sync_info(sync_status.sync_info);
if (r < 0) {
ldout(store->ctx(), 0) << "ERROR: failed to update sync status" << dendl;
return r;
@@ -2033,7 +2112,7 @@ int RGWRemoteMetaLog::run_sync()
ldout(store->ctx(), 0) << "ERROR: bad sync state!" << dendl;
return -EIO;
}
- } while (!going_down.read());
+ } while (!going_down);
return 0;
}
@@ -2234,3 +2313,690 @@ int RGWCloneMetaLogCoroutine::state_store_mdlog_entries_complete()
}
+// TODO: move into rgw_sync_trim.cc
+#undef dout_prefix
+#define dout_prefix (*_dout << "meta trim: ")
+
+/// purge all log shards for the given mdlog
+class PurgeLogShardsCR : public RGWShardCollectCR {
+ RGWRados *const store;
+ const RGWMetadataLog* mdlog;
+ const int num_shards;
+ rgw_raw_obj obj;
+ int i{0};
+
+ static constexpr int max_concurrent = 16;
+
+ public:
+ PurgeLogShardsCR(RGWRados *store, const RGWMetadataLog* mdlog,
+ const rgw_pool& pool, int num_shards)
+ : RGWShardCollectCR(store->ctx(), max_concurrent),
+ store(store), mdlog(mdlog), num_shards(num_shards), obj(pool, "")
+ {}
+
+ bool spawn_next() override {
+ if (i == num_shards) {
+ return false;
+ }
+ mdlog->get_shard_oid(i++, obj.oid);
+ spawn(new RGWRadosRemoveCR(store, obj), false);
+ return true;
+ }
+};
+
+using Cursor = RGWPeriodHistory::Cursor;
+
+/// purge mdlogs from the oldest up to (but not including) the given realm_epoch
+class PurgePeriodLogsCR : public RGWCoroutine {
+ RGWRados *const store;
+ RGWMetadataManager *const metadata;
+ RGWObjVersionTracker objv;
+ Cursor cursor;
+ epoch_t realm_epoch;
+ epoch_t *last_trim_epoch; //< update last trim on success
+
+ public:
+ PurgePeriodLogsCR(RGWRados *store, epoch_t realm_epoch, epoch_t *last_trim)
+ : RGWCoroutine(store->ctx()), store(store), metadata(store->meta_mgr),
+ realm_epoch(realm_epoch), last_trim_epoch(last_trim)
+ {}
+
+ int operate();
+};
+
+int PurgePeriodLogsCR::operate()
+{
+ reenter(this) {
+ // read our current oldest log period
+ yield call(metadata->read_oldest_log_period_cr(&cursor, &objv));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ assert(cursor);
+ ldout(cct, 20) << "oldest log realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+
+ // trim -up to- the given realm_epoch
+ while (cursor.get_epoch() < realm_epoch) {
+ ldout(cct, 4) << "purging log shards for realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+ yield {
+ const auto mdlog = metadata->get_log(cursor.get_period().get_id());
+ const auto& pool = store->get_zone_params().log_pool;
+ auto num_shards = cct->_conf->rgw_md_log_max_shards;
+ call(new PurgeLogShardsCR(store, mdlog, pool, num_shards));
+ }
+ if (retcode < 0) {
+ ldout(cct, 1) << "failed to remove log shards: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ ldout(cct, 10) << "removed log shards for realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+
+ // update our mdlog history
+ yield call(metadata->trim_log_period_cr(cursor, &objv));
+ if (retcode == -ENOENT) {
+ // must have raced to update mdlog history. return success and allow the
+ // winner to continue purging
+ ldout(cct, 10) << "already removed log shards for realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+ return set_cr_done();
+ } else if (retcode < 0) {
+ ldout(cct, 1) << "failed to remove log shards for realm_epoch="
+ << cursor.get_epoch() << " period=" << cursor.get_period().get_id()
+ << " with: " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+
+ if (*last_trim_epoch < cursor.get_epoch()) {
+ *last_trim_epoch = cursor.get_epoch();
+ }
+
+ assert(cursor.has_next()); // get_current() should always come after
+ cursor.next();
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
+namespace {
+
+using connection_map = std::map<std::string, std::unique_ptr<RGWRESTConn>>;
+
+/// construct a RGWRESTConn for each zone in the realm
+template <typename Zonegroups>
+connection_map make_peer_connections(RGWRados *store,
+ const Zonegroups& zonegroups)
+{
+ connection_map connections;
+ for (auto& g : zonegroups) {
+ for (auto& z : g.second.zones) {
+ std::unique_ptr<RGWRESTConn> conn{
+ new RGWRESTConn(store->ctx(), store, z.first, z.second.endpoints)};
+ connections.emplace(z.first, std::move(conn));
+ }
+ }
+ return connections;
+}
+
+/// return the marker that it's safe to trim up to
+const std::string& get_stable_marker(const rgw_meta_sync_marker& m)
+{
+ return m.state == m.FullSync ? m.next_step_marker : m.marker;
+}
+
+/// comparison operator for take_min_status()
+bool operator<(const rgw_meta_sync_marker& lhs, const rgw_meta_sync_marker& rhs)
+{
+ // sort by stable marker
+ return get_stable_marker(lhs) < get_stable_marker(rhs);
+}
+
+/// populate the status with the minimum stable marker of each shard for any
+/// peer whose realm_epoch matches the minimum realm_epoch in the input
+template <typename Iter>
+int take_min_status(CephContext *cct, Iter first, Iter last,
+ rgw_meta_sync_status *status)
+{
+ if (first == last) {
+ return -EINVAL;
+ }
+ const size_t num_shards = cct->_conf->rgw_md_log_max_shards;
+
+ status->sync_info.realm_epoch = std::numeric_limits<epoch_t>::max();
+ for (auto p = first; p != last; ++p) {
+ // validate peer's shard count
+ if (p->sync_markers.size() != num_shards) {
+ ldout(cct, 1) << "take_min_status got peer status with "
+ << p->sync_markers.size() << " shards, expected "
+ << num_shards << dendl;
+ return -EINVAL;
+ }
+ if (p->sync_info.realm_epoch < status->sync_info.realm_epoch) {
+ // earlier epoch, take its entire status
+ *status = std::move(*p);
+ } else if (p->sync_info.realm_epoch == status->sync_info.realm_epoch) {
+ // same epoch, take any earlier markers
+ auto m = status->sync_markers.begin();
+ for (auto& shard : p->sync_markers) {
+ if (shard.second < m->second) {
+ m->second = std::move(shard.second);
+ }
+ ++m;
+ }
+ }
+ }
+ return 0;
+}
+
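take_min_status() above first picks the smallest realm_epoch among the peers and then, for peers at that epoch, keeps the per-shard minimum stable marker, since the master may only trim what every peer has already synced. A sketch of just the same-epoch marker merge, with plain strings standing in for rgw_meta_sync_marker (assumes at least one peer, like the -EINVAL guard above):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Each peer reports one stable marker per shard; the master may only
    // trim up to the minimum position any peer has reached on each shard.
    using ShardMarkers = std::map<uint32_t, std::string>;

    ShardMarkers take_min_markers(const std::vector<ShardMarkers>& peers)
    {
      ShardMarkers min_markers = peers.front();
      for (size_t i = 1; i < peers.size(); ++i) {
        for (auto& [shard, marker] : peers[i]) {
          auto& cur = min_markers[shard];
          cur = std::min(cur, marker); // lexicographic, like the markers
        }
      }
      return min_markers;
    }

    int main()
    {
      std::vector<ShardMarkers> peers = {
        {{0, "1_0005"}, {1, "1_0009"}},
        {{0, "1_0007"}, {1, "1_0003"}},
      };
      for (auto& [shard, marker] : take_min_markers(peers)) {
        std::cout << "shard " << shard << " safe to trim to " << marker << "\n";
      }
    }
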
+struct TrimEnv {
+ RGWRados *const store;
+ RGWHTTPManager *const http;
+ int num_shards;
+ const std::string& zone;
+ Cursor current; //< cursor to current period
+ epoch_t last_trim_epoch{0}; //< epoch of last mdlog that was purged
+
+ TrimEnv(RGWRados *store, RGWHTTPManager *http, int num_shards)
+ : store(store), http(http), num_shards(num_shards),
+ zone(store->get_zone_params().get_id()),
+ current(store->period_history->get_current())
+ {}
+};
+
+struct MasterTrimEnv : public TrimEnv {
+ connection_map connections; //< peer connections
+ std::vector<rgw_meta_sync_status> peer_status; //< sync status for each peer
+ /// last trim marker for each shard, only applies to current period's mdlog
+ std::vector<std::string> last_trim_markers;
+
+ MasterTrimEnv(RGWRados *store, RGWHTTPManager *http, int num_shards)
+ : TrimEnv(store, http, num_shards),
+ last_trim_markers(num_shards)
+ {
+ auto& period = current.get_period();
+ connections = make_peer_connections(store, period.get_map().zonegroups);
+ connections.erase(zone);
+ peer_status.resize(connections.size());
+ }
+};
+
+struct PeerTrimEnv : public TrimEnv {
+ /// last trim timestamp for each shard, only applies to current period's mdlog
+ std::vector<ceph::real_time> last_trim_timestamps;
+
+ PeerTrimEnv(RGWRados *store, RGWHTTPManager *http, int num_shards)
+ : TrimEnv(store, http, num_shards),
+ last_trim_timestamps(num_shards)
+ {}
+
+ void set_num_shards(int num_shards) {
+ this->num_shards = num_shards;
+ last_trim_timestamps.resize(num_shards);
+ }
+};
+
+} // anonymous namespace
+
+
+/// spawn a trim cr for each shard that needs it, while limiting the number
+/// of concurrent shards
+class MetaMasterTrimShardCollectCR : public RGWShardCollectCR {
+ private:
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ MasterTrimEnv& env;
+ RGWMetadataLog *mdlog;
+ int shard_id{0};
+ std::string oid;
+ const rgw_meta_sync_status& sync_status;
+
+ public:
+ MetaMasterTrimShardCollectCR(MasterTrimEnv& env, RGWMetadataLog *mdlog,
+ const rgw_meta_sync_status& sync_status)
+ : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
+ env(env), mdlog(mdlog), sync_status(sync_status)
+ {}
+
+ bool spawn_next() override;
+};
+
+bool MetaMasterTrimShardCollectCR::spawn_next()
+{
+ while (shard_id < env.num_shards) {
+ auto m = sync_status.sync_markers.find(shard_id);
+ if (m == sync_status.sync_markers.end()) {
+ shard_id++;
+ continue;
+ }
+ auto& stable = get_stable_marker(m->second);
+ auto& last_trim = env.last_trim_markers[shard_id];
+
+ if (stable <= last_trim) {
+ // already trimmed
+ ldout(cct, 20) << "skipping log shard " << shard_id
+ << " at marker=" << stable
+ << " last_trim=" << last_trim
+ << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
+ shard_id++;
+ continue;
+ }
+
+ mdlog->get_shard_oid(shard_id, oid);
+
+ ldout(cct, 10) << "trimming log shard " << shard_id
+ << " at marker=" << stable
+ << " last_trim=" << last_trim
+ << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
+ spawn(new RGWSyncLogTrimCR(env.store, oid, stable, &last_trim), false);
+ shard_id++;
+ return true;
+ }
+ return false;
+}
+
+/// spawn rest requests to read each peer's sync status
+class MetaMasterStatusCollectCR : public RGWShardCollectCR {
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ MasterTrimEnv& env;
+ connection_map::iterator c;
+ std::vector<rgw_meta_sync_status>::iterator s;
+ public:
+ MetaMasterStatusCollectCR(MasterTrimEnv& env)
+ : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
+ env(env), c(env.connections.begin()), s(env.peer_status.begin())
+ {}
+
+ bool spawn_next() override {
+ if (c == env.connections.end()) {
+ return false;
+ }
+ static rgw_http_param_pair params[] = {
+ { "type", "metadata" },
+ { "status", nullptr },
+ { nullptr, nullptr }
+ };
+
+ ldout(cct, 20) << "query sync status from " << c->first << dendl;
+ auto conn = c->second.get();
+ using StatusCR = RGWReadRESTResourceCR<rgw_meta_sync_status>;
+ spawn(new StatusCR(cct, conn, env.http, "/admin/log/", params, &*s),
+ false);
+ ++c;
+ ++s;
+ return true;
+ }
+};
+
+class MetaMasterTrimCR : public RGWCoroutine {
+ MasterTrimEnv& env;
+ rgw_meta_sync_status min_status; //< minimum sync status of all peers
+ int ret{0};
+
+ public:
+ MetaMasterTrimCR(MasterTrimEnv& env)
+ : RGWCoroutine(env.store->ctx()), env(env)
+ {}
+
+ int operate();
+};
+
+int MetaMasterTrimCR::operate()
+{
+ reenter(this) {
+ // TODO: detect this and fail before we spawn the trim thread?
+ if (env.connections.empty()) {
+ ldout(cct, 4) << "no peers, exiting" << dendl;
+ return set_cr_done();
+ }
+
+ ldout(cct, 10) << "fetching sync status for zone " << env.zone << dendl;
+ // query mdlog sync status from peers
+ yield call(new MetaMasterStatusCollectCR(env));
+
+ // must get a successful reply from all peers to consider trimming
+ if (ret < 0) {
+ ldout(cct, 4) << "failed to fetch sync status from all peers" << dendl;
+ return set_cr_error(ret);
+ }
+
+ // determine the minimum epoch and markers
+ ret = take_min_status(env.store->ctx(), env.peer_status.begin(),
+ env.peer_status.end(), &min_status);
+ if (ret < 0) {
+ ldout(cct, 4) << "failed to calculate min sync status from peers" << dendl;
+ return set_cr_error(ret);
+ }
+ yield {
+ auto store = env.store;
+ auto epoch = min_status.sync_info.realm_epoch;
+ ldout(cct, 4) << "realm epoch min=" << epoch
+ << " current=" << env.current.get_epoch()<< dendl;
+ if (epoch > env.last_trim_epoch + 1) {
+ // delete any prior mdlog periods
+ spawn(new PurgePeriodLogsCR(store, epoch, &env.last_trim_epoch), true);
+ } else {
+ ldout(cct, 10) << "mdlogs already purged up to realm_epoch "
+ << env.last_trim_epoch << dendl;
+ }
+
+ // if realm_epoch == current, trim mdlog based on markers
+ if (epoch == env.current.get_epoch()) {
+ auto mdlog = store->meta_mgr->get_log(env.current.get_period().get_id());
+ spawn(new MetaMasterTrimShardCollectCR(env, mdlog, min_status), true);
+ }
+ }
+ // ignore any errors during purge/trim because we want to hold the lock open
+ return set_cr_done();
+ }
+ return 0;
+}
+
+
+/// read the first entry of the master's mdlog shard and trim to that position
+class MetaPeerTrimShardCR : public RGWCoroutine {
+ RGWMetaSyncEnv& env;
+ RGWMetadataLog *mdlog;
+ const std::string& period_id;
+ const int shard_id;
+ RGWMetadataLogInfo info;
+ ceph::real_time stable; //< safe timestamp to trim, according to master
+ ceph::real_time *last_trim; //< last trimmed timestamp, updated on trim
+ rgw_mdlog_shard_data result; //< result from master's mdlog listing
+
+ public:
+ MetaPeerTrimShardCR(RGWMetaSyncEnv& env, RGWMetadataLog *mdlog,
+ const std::string& period_id, int shard_id,
+ ceph::real_time *last_trim)
+ : RGWCoroutine(env.store->ctx()), env(env), mdlog(mdlog),
+ period_id(period_id), shard_id(shard_id), last_trim(last_trim)
+ {}
+
+ int operate() override;
+};
+
+int MetaPeerTrimShardCR::operate()
+{
+ reenter(this) {
+ // query master's first mdlog entry for this shard
+ yield call(new RGWListRemoteMDLogShardCR(&env, period_id, shard_id,
+ "", 1, &result));
+ if (retcode < 0) {
+ ldout(cct, 5) << "failed to read first entry from master's mdlog shard "
+ << shard_id << " for period " << period_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ if (result.entries.empty()) {
+ // if there are no mdlog entries, we don't have a timestamp to compare. we
+ // can't just trim everything, because there could be racing updates since
+ // this empty reply. query the mdlog shard info to read its max timestamp,
+ // then retry the listing to make sure it's still empty before trimming to
+ // that
+ ldout(cct, 10) << "empty master mdlog shard " << shard_id
+ << ", reading last timestamp from shard info" << dendl;
+ // read the mdlog shard info for the last timestamp
+ using ShardInfoCR = RGWReadRemoteMDLogShardInfoCR;
+ yield call(new ShardInfoCR(&env, period_id, shard_id, &info));
+ if (retcode < 0) {
+ ldout(cct, 5) << "failed to read info from master's mdlog shard "
+ << shard_id << " for period " << period_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ if (ceph::real_clock::is_zero(info.last_update)) {
+ return set_cr_done(); // nothing to trim
+ }
+ ldout(cct, 10) << "got mdlog shard info with last update="
+ << info.last_update << dendl;
+ // re-read the master's first mdlog entry to make sure it hasn't changed
+ yield call(new RGWListRemoteMDLogShardCR(&env, period_id, shard_id,
+ "", 1, &result));
+ if (retcode < 0) {
+ ldout(cct, 5) << "failed to read first entry from master's mdlog shard "
+ << shard_id << " for period " << period_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ // if the mdlog is still empty, trim to max marker
+ if (result.entries.empty()) {
+ stable = info.last_update;
+ } else {
+ stable = result.entries.front().timestamp;
+
+ // can only trim -up to- master's first timestamp, so subtract a second.
+ // (this is why we use timestamps instead of markers for the peers)
+ stable -= std::chrono::seconds(1);
+ }
+ } else {
+ stable = result.entries.front().timestamp;
+ stable -= std::chrono::seconds(1);
+ }
+
+ if (stable <= *last_trim) {
+ ldout(cct, 10) << "skipping log shard " << shard_id
+ << " at timestamp=" << stable
+ << " last_trim=" << *last_trim << dendl;
+ return set_cr_done();
+ }
+
+ ldout(cct, 10) << "trimming log shard " << shard_id
+ << " at timestamp=" << stable
+ << " last_trim=" << *last_trim << dendl;
+ yield {
+ std::string oid;
+ mdlog->get_shard_oid(shard_id, oid);
+ call(new RGWRadosTimelogTrimCR(env.store, oid, real_time{}, stable, "", ""));
+ }
+ if (retcode < 0 && retcode != -ENODATA) {
+ ldout(cct, 1) << "failed to trim mdlog shard " << shard_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ *last_trim = stable;
+ return set_cr_done();
+ }
+ return 0;
+}
+
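MetaPeerTrimShardCR trims by timestamp rather than by marker: it reads the master's first entry, and when the shard lists empty it re-checks after fetching last_update so a racing write can't be trimmed away; a populated shard is trimmed to one second before its first entry. A condensed sketch of that decision, with the two remote calls stubbed out (their shapes are assumptions):

    #include <chrono>
    #include <iostream>
    #include <optional>

    using Time = std::chrono::system_clock::time_point;

    // Stubs for the two remote queries used above:
    // RGWListRemoteMDLogShardCR -> timestamp of the first entry, if any;
    // RGWReadRemoteMDLogShardInfoCR -> the shard's last_update.
    std::optional<Time> list_first_entry() { return std::nullopt; }
    Time shard_last_update() { return std::chrono::system_clock::now(); }

    std::optional<Time> pick_stable_trim_point()
    {
      auto first = list_first_entry();
      if (!first) {
        const Time last = shard_last_update();
        if (last == Time{}) {
          return std::nullopt;        // zero last_update: nothing to trim
        }
        first = list_first_entry();   // re-list: is it still empty?
        if (!first) {
          return last;                // safe to trim to the max timestamp
        }
      }
      // can only trim up to (not including) the master's first entry
      return *first - std::chrono::seconds(1);
    }

    int main()
    {
      if (auto stable = pick_stable_trim_point()) {
        std::cout << "trim up to "
                  << std::chrono::system_clock::to_time_t(*stable) << "\n";
      }
    }
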
+class MetaPeerTrimShardCollectCR : public RGWShardCollectCR {
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ PeerTrimEnv& env;
+ RGWMetadataLog *mdlog;
+ const std::string& period_id;
+ RGWMetaSyncEnv meta_env; //< for RGWListRemoteMDLogShardCR
+ int shard_id{0};
+
+ public:
+ MetaPeerTrimShardCollectCR(PeerTrimEnv& env, RGWMetadataLog *mdlog)
+ : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
+ env(env), mdlog(mdlog), period_id(env.current.get_period().get_id())
+ {
+ meta_env.init(cct, env.store, env.store->rest_master_conn,
+ env.store->get_async_rados(), env.http, nullptr);
+ }
+
+ bool spawn_next() override;
+};
+
+bool MetaPeerTrimShardCollectCR::spawn_next()
+{
+ if (shard_id >= env.num_shards) {
+ return false;
+ }
+ auto& last_trim = env.last_trim_timestamps[shard_id];
+ spawn(new MetaPeerTrimShardCR(meta_env, mdlog, period_id, shard_id, &last_trim),
+ false);
+ shard_id++;
+ return true;
+}
+
+class MetaPeerTrimCR : public RGWCoroutine {
+ PeerTrimEnv& env;
+ rgw_mdlog_info mdlog_info; //< master's mdlog info
+
+ public:
+ MetaPeerTrimCR(PeerTrimEnv& env) : RGWCoroutine(env.store->ctx()), env(env) {}
+
+ int operate();
+};
+
+int MetaPeerTrimCR::operate()
+{
+ reenter(this) {
+ ldout(cct, 10) << "fetching master mdlog info" << dendl;
+ yield {
+ // query mdlog_info from master for oldest_log_period
+ rgw_http_param_pair params[] = {
+ { "type", "metadata" },
+ { nullptr, nullptr }
+ };
+
+ using LogInfoCR = RGWReadRESTResourceCR<rgw_mdlog_info>;
+ call(new LogInfoCR(cct, env.store->rest_master_conn, env.http,
+ "/admin/log/", params, &mdlog_info));
+ }
+ if (retcode < 0) {
+ ldout(cct, 4) << "failed to read mdlog info from master" << dendl;
+ return set_cr_error(retcode);
+ }
+ // use master's shard count instead
+ env.set_num_shards(mdlog_info.num_shards);
+
+ if (mdlog_info.realm_epoch > env.last_trim_epoch + 1) {
+ // delete any prior mdlog periods
+ yield call(new PurgePeriodLogsCR(env.store, mdlog_info.realm_epoch,
+ &env.last_trim_epoch));
+ } else {
+ ldout(cct, 10) << "mdlogs already purged through realm_epoch "
+ << env.last_trim_epoch << dendl;
+ }
+
+ // if realm_epoch == current, trim mdlog based on master's markers
+ if (mdlog_info.realm_epoch == env.current.get_epoch()) {
+ yield {
+ auto meta_mgr = env.store->meta_mgr;
+ auto mdlog = meta_mgr->get_log(env.current.get_period().get_id());
+ call(new MetaPeerTrimShardCollectCR(env, mdlog));
+ // ignore any errors during purge/trim because we want to hold the lock open
+ }
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
+class MetaTrimPollCR : public RGWCoroutine {
+ RGWRados *const store;
+ const utime_t interval; //< polling interval
+ const rgw_raw_obj obj;
+ const std::string name{"meta_trim"}; //< lock name
+ const std::string cookie;
+
+ protected:
+ /// allocate the coroutine to run within the lease
+ virtual RGWCoroutine* alloc_cr() = 0;
+
+ public:
+ MetaTrimPollCR(RGWRados *store, utime_t interval)
+ : RGWCoroutine(store->ctx()), store(store), interval(interval),
+ obj(store->get_zone_params().log_pool, RGWMetadataLogHistory::oid),
+ cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct))
+ {}
+
+ int operate();
+};
+
+int MetaTrimPollCR::operate()
+{
+ reenter(this) {
+ for (;;) {
+ set_status("sleeping");
+ wait(interval);
+
+ // prevent others from trimming for our entire wait interval
+ set_status("acquiring trim lock");
+ yield call(new RGWSimpleRadosLockCR(store->get_async_rados(), store,
+ obj, name, cookie, interval.sec()));
+ if (retcode < 0) {
+ ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl;
+ continue;
+ }
+
+ set_status("trimming");
+ yield call(alloc_cr());
+
+ if (retcode < 0) {
+ // on errors, unlock so other gateways can try
+ set_status("unlocking");
+ yield call(new RGWSimpleRadosUnlockCR(store->get_async_rados(), store,
+ obj, name, cookie));
+ }
+ }
+ }
+ return 0;
+}
+
+class MetaMasterTrimPollCR : public MetaTrimPollCR {
+ MasterTrimEnv env; //< trim state to share between calls
+ RGWCoroutine* alloc_cr() override {
+ return new MetaMasterTrimCR(env);
+ }
+ public:
+ MetaMasterTrimPollCR(RGWRados *store, RGWHTTPManager *http,
+ int num_shards, utime_t interval)
+ : MetaTrimPollCR(store, interval),
+ env(store, http, num_shards)
+ {}
+};
+
+class MetaPeerTrimPollCR : public MetaTrimPollCR {
+ PeerTrimEnv env; //< trim state to share between calls
+ RGWCoroutine* alloc_cr() override {
+ return new MetaPeerTrimCR(env);
+ }
+ public:
+ MetaPeerTrimPollCR(RGWRados *store, RGWHTTPManager *http,
+ int num_shards, utime_t interval)
+ : MetaTrimPollCR(store, interval),
+ env(store, http, num_shards)
+ {}
+};
+
+RGWCoroutine* create_meta_log_trim_cr(RGWRados *store, RGWHTTPManager *http,
+ int num_shards, utime_t interval)
+{
+ if (store->is_meta_master()) {
+ return new MetaMasterTrimPollCR(store, http, num_shards, interval);
+ }
+ return new MetaPeerTrimPollCR(store, http, num_shards, interval);
+}
+
+
+struct MetaMasterAdminTrimCR : private MasterTrimEnv, public MetaMasterTrimCR {
+ MetaMasterAdminTrimCR(RGWRados *store, RGWHTTPManager *http, int num_shards)
+ : MasterTrimEnv(store, http, num_shards),
+ MetaMasterTrimCR(*static_cast<MasterTrimEnv*>(this))
+ {}
+};
+
+struct MetaPeerAdminTrimCR : private PeerTrimEnv, public MetaPeerTrimCR {
+ MetaPeerAdminTrimCR(RGWRados *store, RGWHTTPManager *http, int num_shards)
+ : PeerTrimEnv(store, http, num_shards),
+ MetaPeerTrimCR(*static_cast<PeerTrimEnv*>(this))
+ {}
+};
+
+RGWCoroutine* create_admin_meta_log_trim_cr(RGWRados *store,
+ RGWHTTPManager *http,
+ int num_shards)
+{
+ if (store->is_meta_master()) {
+ return new MetaMasterAdminTrimCR(store, http, num_shards);
+ }
+ return new MetaPeerAdminTrimCR(store, http, num_shards);
+}
diff --git a/src/rgw/rgw_sync.h b/src/rgw/rgw_sync.h
index 878c5d8e18b..c651f7a9ad1 100644
--- a/src/rgw/rgw_sync.h
+++ b/src/rgw/rgw_sync.h
@@ -8,6 +8,8 @@
#include "include/stringify.h"
#include "common/RWLock.h"
+#include <atomic>
+
#define ERROR_LOGGER_SHARDS 32
#define RGW_SYNC_ERROR_LOG_SHARD_PREFIX "sync.error-log"
@@ -65,7 +67,7 @@ class RGWSyncErrorLogger {
vector<string> oids;
int num_shards;
- atomic_t counter;
+ std::atomic<int64_t> counter = { 0 };
public:
RGWSyncErrorLogger(RGWRados *_store, const string &oid_prefix, int _num_shards);
RGWCoroutine *log_error_cr(const string& source_zone, const string& section, const string& name, uint32_t error_code, const string& message);
@@ -191,12 +193,11 @@ class RGWRemoteMetaLog : public RGWCoroutinesManager {
RGWSyncBackoff backoff;
RGWMetaSyncEnv sync_env;
- rgw_meta_sync_status sync_status;
void init_sync_env(RGWMetaSyncEnv *env);
- int store_sync_info();
+ int store_sync_info(const rgw_meta_sync_info& sync_info);
- atomic_t going_down;
+ std::atomic<bool> going_down = { false };
public:
RGWRemoteMetaLog(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
@@ -214,7 +215,7 @@ public:
int read_log_info(rgw_mdlog_info *log_info);
int read_master_log_shards_info(const string& master_period, map<int, RGWMetadataLogInfo> *shards_info);
int read_master_log_shards_next(const string& period, map<int, string> shard_markers, map<int, rgw_mdlog_shard_data> *result);
- int read_sync_status();
+ int read_sync_status(rgw_meta_sync_status *sync_status);
int init_sync_status();
int run_sync();
@@ -223,7 +224,6 @@ public:
RGWMetaSyncEnv& get_sync_env() {
return sync_env;
}
- const rgw_meta_sync_status& get_sync_status() const { return sync_status; }
};
class RGWMetaSyncStatusManager {
@@ -257,13 +257,10 @@ public:
: store(_store), master_log(store, async_rados, this),
ts_to_shard_lock("ts_to_shard_lock") {}
int init();
- void finish();
- const rgw_meta_sync_status& get_sync_status() const {
- return master_log.get_sync_status();
+ int read_sync_status(rgw_meta_sync_status *sync_status) {
+ return master_log.read_sync_status(sync_status);
}
-
- int read_sync_status() { return master_log.read_sync_status(); }
int init_sync_status() { return master_log.init_sync_status(); }
int read_log_info(rgw_mdlog_info *log_info) {
return master_log.read_log_info(log_info);
@@ -455,5 +452,13 @@ public:
int operate() override;
};
+// MetaLogTrimCR factory function
+RGWCoroutine* create_meta_log_trim_cr(RGWRados *store, RGWHTTPManager *http,
+ int num_shards, utime_t interval);
+
+// factory function for mdlog trim via radosgw-admin
+RGWCoroutine* create_admin_meta_log_trim_cr(RGWRados *store,
+ RGWHTTPManager *http,
+ int num_shards);
#endif
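
rgw_sync.h swaps Ceph's legacy atomic_t for std::atomic: counter.inc() becomes ++counter, and going_down.set(1)/going_down.read() become plain assignment and contextual reads, matching the rgw_sync.cc changes above. A tiny sketch of the one-to-one mapping:

    #include <atomic>
    #include <cstdint>
    #include <iostream>

    int main()
    {
      std::atomic<int64_t> counter{0};
      std::atomic<bool> going_down{false};

      const int num_shards = 32;
      uint32_t shard_id = ++counter % num_shards; // was: counter.inc() % num_shards

      going_down = true;                          // was: going_down.set(1)
      if (going_down) {                           // was: going_down.read()
        std::cout << "shard " << shard_id << ", going down\n";
      }
    }
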
diff --git a/src/rgw/rgw_tools.cc b/src/rgw/rgw_tools.cc
index 79bffb2018f..b79fecb6a85 100644
--- a/src/rgw/rgw_tools.cc
+++ b/src/rgw/rgw_tools.cc
@@ -42,7 +42,6 @@ int rgw_get_system_obj(RGWRados *rgwstore, RGWObjectCtx& obj_ctx, const rgw_pool
RGWObjVersionTracker *objv_tracker, real_time *pmtime, map<string, bufferlist> *pattrs,
rgw_cache_entry_info *cache_info)
{
- struct rgw_err err;
bufferlist::iterator iter;
int request_len = READ_CHUNK_LEN;
rgw_raw_obj obj(pool, key);
@@ -58,7 +57,6 @@ int rgw_get_system_obj(RGWRados *rgwstore, RGWObjectCtx& obj_ctx, const rgw_pool
rop.stat_params.attrs = pattrs;
rop.stat_params.lastmod = pmtime;
- rop.stat_params.perr = &err;
int ret = rop.stat(objv_tracker);
if (ret < 0)
diff --git a/src/rgw/rgw_torrent.h b/src/rgw/rgw_torrent.h
index 6d2b28cd91a..ab8e89bbd79 100644
--- a/src/rgw/rgw_torrent.h
+++ b/src/rgw/rgw_torrent.h
@@ -11,7 +11,6 @@
#include "rgw_rados.h"
#include "rgw_common.h"
-using namespace std;
using ceph::crypto::SHA1;
struct req_state;
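
rgw_torrent.h, rgw_user.h, and rgw_xml.h all drop header-level using-directives: a `using namespace std;` in a header leaks into every translation unit that includes it and can change overload resolution at a distance, so headers should qualify names instead. A before/after sketch (the function is hypothetical):

    #include <iostream>
    #include <string>

    // before (what the patch removes): every includer of the header gets
    // all of namespace std injected into its global namespace.
    //   using namespace std;
    //   string make_torrent_name();

    // after: the header qualifies the name itself.
    std::string make_torrent_name() { return "example.torrent"; }

    int main() { std::cout << make_torrent_name() << "\n"; }
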
diff --git a/src/rgw/rgw_user.h b/src/rgw/rgw_user.h
index 50cc1b62909..7a27d0e7007 100644
--- a/src/rgw/rgw_user.h
+++ b/src/rgw/rgw_user.h
@@ -19,8 +19,6 @@
#include "common/Formatter.h"
#include "rgw_formats.h"
-using namespace std;
-
#define RGW_USER_ANON_ID "anonymous"
#define SECRET_KEY_LEN 40
diff --git a/src/rgw/rgw_xml.h b/src/rgw/rgw_xml.h
index 2517c7b78e8..1f37a6a49e2 100644
--- a/src/rgw/rgw_xml.h
+++ b/src/rgw/rgw_xml.h
@@ -10,9 +10,6 @@
#include <include/types.h>
#include <common/Formatter.h>
-using namespace std;
-
-
class XMLObj;
class XMLObjIter {
diff --git a/src/rocksdb b/src/rocksdb
-Subproject 9f2cc59ec5065c828dabbab483506bde681c955
+Subproject e15382c09c87a65eaeca9bda233bab503f1e577
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index 4da3ef05922..ef2e1f8ef97 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -12,6 +12,7 @@ add_subdirectory(cls_hello)
add_subdirectory(cls_lock)
add_subdirectory(cls_log)
add_subdirectory(cls_numops)
+add_subdirectory(cls_sdk)
if(WITH_RBD)
add_subdirectory(cls_rbd)
endif(WITH_RBD)
diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc
index eb259544b45..af3999e55a0 100644
--- a/src/test/bufferlist.cc
+++ b/src/test/bufferlist.cc
@@ -26,6 +26,7 @@
#include "include/buffer.h"
#include "include/utime.h"
+#include "include/coredumpctl.h"
#include "include/encoding.h"
#include "common/environment.h"
#include "common/Clock.h"
@@ -484,6 +485,7 @@ TEST(BufferPtr, constructors) {
EXPECT_EQ(original.get_raw(), ptr.get_raw());
EXPECT_EQ(2, ptr.raw_nref());
EXPECT_EQ(0, ::memcmp(original.c_str(), ptr.c_str(), len));
+ PrCtl unset_dumpable;
EXPECT_DEATH(bufferptr(original, 0, original.length() + 1), "");
EXPECT_DEATH(bufferptr(bufferptr(), 0, 0), "");
}
@@ -678,12 +680,14 @@ TEST(BufferPtr, accessors) {
EXPECT_EQ('X', ptr.c_str()[0]);
{
bufferptr ptr;
+ PrCtl unset_dumpable;
EXPECT_DEATH(ptr.c_str(), "");
EXPECT_DEATH(ptr[0], "");
}
EXPECT_EQ('X', const_ptr.c_str()[0]);
{
const bufferptr const_ptr;
+ PrCtl unset_dumpable;
EXPECT_DEATH(const_ptr.c_str(), "");
EXPECT_DEATH(const_ptr[0], "");
}
@@ -702,10 +706,14 @@ TEST(BufferPtr, accessors) {
bufferptr ptr;
EXPECT_EQ((unsigned)0, ptr.unused_tail_length());
}
- EXPECT_DEATH(ptr[len], "");
- EXPECT_DEATH(const_ptr[len], "");
+ {
+ PrCtl unset_dumpable;
+ EXPECT_DEATH(ptr[len], "");
+ EXPECT_DEATH(const_ptr[len], "");
+ }
{
const bufferptr const_ptr;
+ PrCtl unset_dumpable;
EXPECT_DEATH(const_ptr.raw_c_str(), "");
EXPECT_DEATH(const_ptr.raw_length(), "");
EXPECT_DEATH(const_ptr.raw_nref(), "");
@@ -754,6 +762,7 @@ TEST(BufferPtr, is_zero) {
TEST(BufferPtr, copy_out) {
{
const bufferptr ptr;
+ PrCtl unset_dumpable;
EXPECT_DEATH(ptr.copy_out((unsigned)0, (unsigned)0, NULL), "");
}
{
@@ -789,13 +798,17 @@ TEST(BufferPtr, copy_out_bench) {
TEST(BufferPtr, copy_in) {
{
bufferptr ptr;
+ PrCtl unset_dumpable;
EXPECT_DEATH(ptr.copy_in((unsigned)0, (unsigned)0, NULL), "");
}
{
char in[] = "ABCD";
bufferptr ptr(2);
- EXPECT_DEATH(ptr.copy_in((unsigned)0, strlen(in) + 1, NULL), "");
- EXPECT_DEATH(ptr.copy_in(strlen(in) + 1, (unsigned)0, NULL), "");
+ {
+ PrCtl unset_dumpable;
+ EXPECT_DEATH(ptr.copy_in((unsigned)0, strlen(in) + 1, NULL), "");
+ EXPECT_DEATH(ptr.copy_in(strlen(in) + 1, (unsigned)0, NULL), "");
+ }
ptr.copy_in((unsigned)0, (unsigned)2, in);
EXPECT_EQ(in[0], ptr[0]);
EXPECT_EQ(in[1], ptr[1]);
@@ -823,13 +836,17 @@ TEST(BufferPtr, copy_in_bench) {
TEST(BufferPtr, append) {
{
bufferptr ptr;
+ PrCtl unset_dumpable;
EXPECT_DEATH(ptr.append('A'), "");
EXPECT_DEATH(ptr.append("B", (unsigned)1), "");
}
{
bufferptr ptr(2);
- EXPECT_DEATH(ptr.append('A'), "");
- EXPECT_DEATH(ptr.append("B", (unsigned)1), "");
+ {
+ PrCtl unset_dumpable;
+ EXPECT_DEATH(ptr.append('A'), "");
+ EXPECT_DEATH(ptr.append("B", (unsigned)1), "");
+ }
ptr.set_length(0);
ptr.append('A');
EXPECT_EQ((unsigned)1, ptr.length());
@@ -864,7 +881,10 @@ TEST(BufferPtr, append_bench) {
TEST(BufferPtr, zero) {
char str[] = "XXXX";
bufferptr ptr(buffer::create_static(strlen(str), str));
- EXPECT_DEATH(ptr.zero(ptr.length() + 1, 0), "");
+ {
+ PrCtl unset_dumpable;
+ EXPECT_DEATH(ptr.zero(ptr.length() + 1, 0), "");
+ }
ptr.zero(1, 1);
EXPECT_EQ('X', ptr[0]);
EXPECT_EQ('\0', ptr[1]);
@@ -2207,7 +2227,10 @@ TEST(BufferList, append) {
bufferptr in(back);
EXPECT_EQ((unsigned)1, bl.get_num_buffers());
EXPECT_EQ((unsigned)1, bl.length());
- EXPECT_DEATH(bl.append(in, (unsigned)100, (unsigned)100), "");
+ {
+ PrCtl unset_dumpable;
+ EXPECT_DEATH(bl.append(in, (unsigned)100, (unsigned)100), "");
+ }
EXPECT_LT((unsigned)0, in.unused_tail_length());
in.append('B');
bl.append(in, back.end(), 1);
@@ -2764,7 +2787,10 @@ TEST(BufferList, zero) {
bufferptr ptr(s[i], strlen(s[i]));
bl.push_back(ptr);
}
- EXPECT_DEATH(bl.zero((unsigned)0, (unsigned)2000), "");
+ {
+ PrCtl unset_dumpable;
+ EXPECT_DEATH(bl.zero((unsigned)0, (unsigned)2000), "");
+ }
bl.zero((unsigned)2, (unsigned)5);
EXPECT_EQ(0, ::memcmp("AB\0\0\0\0\0HIKLM", bl.c_str(), 9));
}
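
Each EXPECT_DEATH in bufferlist.cc now sits inside a PrCtl guard from the new include/coredumpctl.h, so the aborting child of a death test doesn't leave a core dump behind. A sketch of the RAII idea on Linux (the real guard also handles prctl errors; this is the shape, not the implementation):

    #include <sys/prctl.h>

    // Clear PR_SET_DUMPABLE for the scope of a death test and restore
    // the previous value afterwards, so the expected abort writes no core.
    struct PrCtlSketch {
      int old_dumpable;
      PrCtlSketch() {
        old_dumpable = prctl(PR_GET_DUMPABLE);
        prctl(PR_SET_DUMPABLE, 0);
      }
      ~PrCtlSketch() {
        prctl(PR_SET_DUMPABLE, old_dumpable);
      }
    };

    int main() {
      {
        PrCtlSketch unset_dumpable; // scoped exactly like the test blocks
        // code expected to abort (the EXPECT_DEATH body) would run here
      }
      return 0;
    }
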
diff --git a/src/test/ceph_compatset.cc b/src/test/ceph_compatset.cc
index 5cd7ef2a7c9..3010b559988 100644
--- a/src/test/ceph_compatset.cc
+++ b/src/test/ceph_compatset.cc
@@ -23,6 +23,7 @@
#include "include/types.h"
#include "include/compat.h"
+#include "include/coredumpctl.h"
//#undef assert
//#define assert(foo) if (!(foo)) abort();
@@ -37,8 +38,11 @@ TEST(CephCompatSet, AllSet) {
CompatSet::FeatureSet ro;
CompatSet::FeatureSet incompat;
- EXPECT_DEATH(compat.insert(CompatSet::Feature(0, "test")), "");
- EXPECT_DEATH(compat.insert(CompatSet::Feature(64, "test")), "");
+ {
+ PrCtl unset_dumpable;
+ EXPECT_DEATH(compat.insert(CompatSet::Feature(0, "test")), "");
+ EXPECT_DEATH(compat.insert(CompatSet::Feature(64, "test")), "");
+ }
for (int i = 1; i < 64; i++) {
stringstream cname;
diff --git a/src/test/ceph_crypto.cc b/src/test/ceph_crypto.cc
index 2a35aeb08ba..86b5c62cfc1 100644
--- a/src/test/ceph_crypto.cc
+++ b/src/test/ceph_crypto.cc
@@ -112,7 +112,10 @@ class ForkDeathTest : public ::testing::Test {
protected:
void SetUp() override {
// shutdown NSS so it can be reinitialized after the fork
- ceph::crypto::shutdown();
+ // some data structures used by NSPR are initialized only once; a full
+ // ceph::crypto::shutdown(false) would tear them down, so shut down with
+ // shutdown(true) to keep them alive across the fork.
+ ceph::crypto::shutdown(true);
}
void TearDown() override {
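
The ForkDeathTest change swaps ceph::crypto::shutdown() for shutdown(true) so NSPR's once-only state survives the fork, as the comment above explains. The general shape of that fork-then-reinitialize pattern, with init()/soft_shutdown() as hypothetical stand-ins for the NSS wrappers:

    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>
    #include <iostream>

    // A library with process-global state must be torn down "softly" in
    // the parent so the child can re-initialize it after fork().
    void init() {}          // stand-in for crypto init
    void soft_shutdown() {} // stand-in for shutdown(true): keeps
                            // once-only state alive across the fork

    int main()
    {
      init();
      soft_shutdown();          // before fork: shared state kept around
      pid_t pid = fork();
      if (pid == 0) {
        init();                 // child re-initializes and proceeds
        _exit(0);
      }
      int status;
      waitpid(pid, &status, 0);
      std::cout << "child exited " << WEXITSTATUS(status) << "\n";
    }
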
diff --git a/src/test/cli/crushtool/arg-order-checks.t b/src/test/cli/crushtool/arg-order-checks.t
index 8ee193431cd..5c8b6ab0278 100644
--- a/src/test/cli/crushtool/arg-order-checks.t
+++ b/src/test/cli/crushtool/arg-order-checks.t
@@ -50,6 +50,7 @@
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable straw_calc_version 1
+ tunable allowed_bucket_algs 54
# devices
device 0 osd.0
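
The `tunable allowed_bucket_algs 54` lines appearing in these expected outputs encode a bitmask over CRUSH bucket algorithm ids: 54 = 2^1 + 2^2 + 2^4 + 2^5, i.e. uniform, list, straw, and straw2 (tree excluded), which pairs with the straw -> straw2 switch in the osdmaptool outputs below. A quick check of that arithmetic (ids as defined in crush/crush.h):

    #include <iostream>

    // CRUSH bucket algorithm ids.
    enum { UNIFORM = 1, LIST = 2, TREE = 3, STRAW = 4, STRAW2 = 5 };

    int main() {
      // allowed_bucket_algs is a bitmask indexed by algorithm id.
      const int mask = (1 << UNIFORM) | (1 << LIST)
                     | (1 << STRAW) | (1 << STRAW2);
      std::cout << mask << "\n"; // prints 54; TREE (bit 3) is not set
    }
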
diff --git a/src/test/cli/crushtool/build.t b/src/test/cli/crushtool/build.t
index 5fa4b504eb4..0a779f0f4f9 100644
--- a/src/test/cli/crushtool/build.t
+++ b/src/test/cli/crushtool/build.t
@@ -32,6 +32,7 @@
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
+ tunable allowed_bucket_algs 54
# devices
device 0 osd.0
diff --git a/src/test/cli/osdmaptool/clobber.t b/src/test/cli/osdmaptool/clobber.t
index dd7e1756104..5ca08e8dcd2 100644
--- a/src/test/cli/osdmaptool/clobber.t
+++ b/src/test/cli/osdmaptool/clobber.t
@@ -22,6 +22,7 @@
full_ratio 0
backfillfull_ratio 0
nearfull_ratio 0
+ min_compat_client hammer 0.94
pool 0 'rbd' replicated size 3 min_size 2 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 flags hashpspool stripe_width 0
@@ -46,6 +47,7 @@
full_ratio 0
backfillfull_ratio 0
nearfull_ratio 0
+ min_compat_client hammer 0.94
pool 0 'rbd' replicated size 3 min_size 2 crush_ruleset 0 object_hash rjenkins pg_num 64 pgp_num 64 last_change 0 flags hashpspool stripe_width 0
diff --git a/src/test/cli/osdmaptool/create-print.t b/src/test/cli/osdmaptool/create-print.t
index 32468a4a6fa..81520af9ce5 100644
--- a/src/test/cli/osdmaptool/create-print.t
+++ b/src/test/cli/osdmaptool/create-print.t
@@ -13,6 +13,7 @@
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable straw_calc_version 1
+ tunable allowed_bucket_algs 54
# devices
device 0 osd.0
@@ -36,7 +37,7 @@
host localhost {
\tid -2\t\t# do not change unnecessarily (esc)
\t# weight 3.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.0 weight 1.000 (esc)
\titem osd.1 weight 1.000 (esc)
@@ -45,14 +46,14 @@
rack localrack {
\tid -3\t\t# do not change unnecessarily (esc)
\t# weight 3.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem localhost weight 3.000 (esc)
}
root default {
\tid -1\t\t# do not change unnecessarily (esc)
\t# weight 3.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem localrack weight 3.000 (esc)
}
@@ -79,6 +80,7 @@
full_ratio 0
backfillfull_ratio 0
nearfull_ratio 0
+ min_compat_client hammer 0.94
pool 0 'rbd' replicated size 3 min_size 2 crush_ruleset 0 object_hash rjenkins pg_num 192 pgp_num 192 last_change 0 flags hashpspool stripe_width 0
diff --git a/src/test/cli/osdmaptool/create-racks.t b/src/test/cli/osdmaptool/create-racks.t
index 0759698127d..a0443ffbb66 100644
--- a/src/test/cli/osdmaptool/create-racks.t
+++ b/src/test/cli/osdmaptool/create-racks.t
@@ -12,6 +12,7 @@
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable straw_calc_version 1
+ tunable allowed_bucket_algs 54
# devices
device 0 device0
@@ -271,7 +272,7 @@
host cephstore5522 {
\tid -2\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.1 weight 1.000 (esc)
\titem osd.2 weight 1.000 (esc)
@@ -284,7 +285,7 @@
host cephstore5523 {
\tid -4\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.10 weight 1.000 (esc)
\titem osd.11 weight 1.000 (esc)
@@ -297,7 +298,7 @@
host cephstore6238 {
\tid -8\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.113 weight 1.000 (esc)
\titem osd.114 weight 1.000 (esc)
@@ -310,7 +311,7 @@
host cephstore6240 {
\tid -10\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.127 weight 1.000 (esc)
\titem osd.128 weight 1.000 (esc)
@@ -323,7 +324,7 @@
host cephstore6242 {
\tid -12\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.141 weight 1.000 (esc)
\titem osd.142 weight 1.000 (esc)
@@ -336,7 +337,7 @@
host cephstore5524 {
\tid -14\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.15 weight 1.000 (esc)
\titem osd.16 weight 1.000 (esc)
@@ -349,7 +350,7 @@
host cephstore6244 {
\tid -15\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.155 weight 1.000 (esc)
\titem osd.156 weight 1.000 (esc)
@@ -362,7 +363,7 @@
host cephstore6246 {
\tid -17\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.169 weight 1.000 (esc)
\titem osd.170 weight 1.000 (esc)
@@ -375,7 +376,7 @@
host cephstore6337 {
\tid -19\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.183 weight 1.000 (esc)
\titem osd.184 weight 1.000 (esc)
@@ -388,7 +389,7 @@
host cephstore6341 {
\tid -23\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.211 weight 1.000 (esc)
\titem osd.212 weight 1.000 (esc)
@@ -401,7 +402,7 @@
host cephstore6342 {
\tid -24\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.218 weight 1.000 (esc)
\titem osd.219 weight 1.000 (esc)
@@ -414,7 +415,7 @@
host cephstore5525 {
\tid -25\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.22 weight 1.000 (esc)
\titem osd.23 weight 1.000 (esc)
@@ -427,7 +428,7 @@
host cephstore6345 {
\tid -27\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.232 weight 1.000 (esc)
\titem osd.233 weight 1.000 (esc)
@@ -440,7 +441,7 @@
host cephstore5526 {
\tid -28\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.29 weight 1.000 (esc)
\titem osd.30 weight 1.000 (esc)
@@ -453,7 +454,7 @@
host cephstore5527 {
\tid -29\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.36 weight 1.000 (esc)
\titem osd.37 weight 1.000 (esc)
@@ -466,7 +467,7 @@
host cephstore5529 {
\tid -30\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.43 weight 1.000 (esc)
\titem osd.44 weight 1.000 (esc)
@@ -479,7 +480,7 @@
host cephstore5530 {
\tid -31\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.50 weight 1.000 (esc)
\titem osd.51 weight 1.000 (esc)
@@ -492,7 +493,7 @@
rack irv-n2 {
\tid -3\t\t# do not change unnecessarily (esc)
\t# weight 119.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem cephstore5522 weight 7.000 (esc)
\titem cephstore5523 weight 7.000 (esc)
@@ -515,7 +516,7 @@
host cephstore6236 {
\tid -5\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.100 weight 1.000 (esc)
\titem osd.101 weight 1.000 (esc)
@@ -528,7 +529,7 @@
host cephstore6237 {
\tid -7\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.106 weight 1.000 (esc)
\titem osd.107 weight 1.000 (esc)
@@ -541,7 +542,7 @@
host cephstore6239 {
\tid -9\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.120 weight 1.000 (esc)
\titem osd.121 weight 1.000 (esc)
@@ -554,7 +555,7 @@
host cephstore6241 {
\tid -11\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.134 weight 1.000 (esc)
\titem osd.135 weight 1.000 (esc)
@@ -567,7 +568,7 @@
host cephstore6243 {
\tid -13\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.148 weight 1.000 (esc)
\titem osd.149 weight 1.000 (esc)
@@ -580,7 +581,7 @@
host cephstore6245 {
\tid -16\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.162 weight 1.000 (esc)
\titem osd.163 weight 1.000 (esc)
@@ -593,7 +594,7 @@
host cephstore6336 {
\tid -18\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.176 weight 1.000 (esc)
\titem osd.177 weight 1.000 (esc)
@@ -606,7 +607,7 @@
host cephstore6338 {
\tid -20\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.190 weight 1.000 (esc)
\titem osd.191 weight 1.000 (esc)
@@ -619,7 +620,7 @@
host cephstore6339 {
\tid -21\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.197 weight 1.000 (esc)
\titem osd.198 weight 1.000 (esc)
@@ -632,7 +633,7 @@
host cephstore6340 {
\tid -22\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.204 weight 1.000 (esc)
\titem osd.205 weight 1.000 (esc)
@@ -645,7 +646,7 @@
host cephstore6343 {
\tid -26\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.225 weight 1.000 (esc)
\titem osd.226 weight 1.000 (esc)
@@ -658,7 +659,7 @@
host cephstore6230 {
\tid -32\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.57 weight 1.000 (esc)
\titem osd.58 weight 1.000 (esc)
@@ -671,7 +672,7 @@
host cephstore6231 {
\tid -33\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.64 weight 1.000 (esc)
\titem osd.65 weight 1.000 (esc)
@@ -684,7 +685,7 @@
host cephstore6232 {
\tid -34\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.71 weight 1.000 (esc)
\titem osd.72 weight 1.000 (esc)
@@ -697,7 +698,7 @@
host cephstore6233 {
\tid -35\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.78 weight 1.000 (esc)
\titem osd.79 weight 1.000 (esc)
@@ -710,7 +711,7 @@
host cephstore6234 {
\tid -36\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.85 weight 1.000 (esc)
\titem osd.86 weight 1.000 (esc)
@@ -723,7 +724,7 @@
host cephstore6235 {
\tid -37\t\t# do not change unnecessarily (esc)
\t# weight 7.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem osd.92 weight 1.000 (esc)
\titem osd.93 weight 1.000 (esc)
@@ -736,7 +737,7 @@
rack irv-n1 {
\tid -6\t\t# do not change unnecessarily (esc)
\t# weight 119.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem cephstore6236 weight 7.000 (esc)
\titem cephstore6237 weight 7.000 (esc)
@@ -759,7 +760,7 @@
root default {
\tid -1\t\t# do not change unnecessarily (esc)
\t# weight 238.000 (esc)
- \talg straw (esc)
+ \talg straw2 (esc)
\thash 0\t# rjenkins1 (esc)
\titem irv-n2 weight 119.000 (esc)
\titem irv-n1 weight 119.000 (esc)
@@ -792,6 +793,7 @@
full_ratio 0
backfillfull_ratio 0
nearfull_ratio 0
+ min_compat_client hammer 0.94
pool 0 'rbd' replicated size 3 min_size 2 crush_ruleset 0 object_hash rjenkins pg_num 15296 pgp_num 15296 last_change 0 flags hashpspool stripe_width 0
diff --git a/src/test/cli/osdmaptool/crush.t b/src/test/cli/osdmaptool/crush.t
index 0cd9ae61968..d2f27ef736d 100644
--- a/src/test/cli/osdmaptool/crush.t
+++ b/src/test/cli/osdmaptool/crush.t
@@ -6,5 +6,5 @@
osdmaptool: exported crush map to oc
$ osdmaptool --import-crush oc myosdmap
osdmaptool: osdmap file 'myosdmap'
- osdmaptool: imported 512 byte crush map from oc
+ osdmaptool: imported 492 byte crush map from oc
osdmaptool: writing epoch 3 to myosdmap
diff --git a/src/test/cli/rbd/help.t b/src/test/cli/rbd/help.t
index b775c37ab89..e4349e7d7c3 100644
--- a/src/test/cli/rbd/help.t
+++ b/src/test/cli/rbd/help.t
@@ -874,7 +874,7 @@
rbd help map
usage: rbd map [--pool <pool>] [--image <image>] [--snap <snap>]
- [--options <options>] [--read-only]
+ [--options <options>] [--read-only] [--exclusive]
<image-or-snap-spec>
Map image to a block device using the kernel.
@@ -889,6 +889,7 @@
--snap arg snapshot name
-o [ --options ] arg map options
--read-only map read-only
+ --exclusive disable automatic exclusive lock transitions
rbd help merge-diff
usage: rbd merge-diff [--path <path>] [--no-progress]
diff --git a/src/test/cls_refcount/test_cls_refcount.cc b/src/test/cls_refcount/test_cls_refcount.cc
index a585b9a9d12..3ac1cb892fb 100644
--- a/src/test/cls_refcount/test_cls_refcount.cc
+++ b/src/test/cls_refcount/test_cls_refcount.cc
@@ -133,7 +133,6 @@ TEST(cls_rgw, test_put_snap) {
ASSERT_EQ(0, ioctx.snap_create("snapbar"));
librados::ObjectWriteOperation *op = new_op();
- op->create(false);
cls_refcount_put(*op, "notag", true);
ASSERT_EQ(-ENOENT, ioctx.operate("foo", op));
diff --git a/src/test/cls_sdk/CMakeLists.txt b/src/test/cls_sdk/CMakeLists.txt
new file mode 100644
index 00000000000..2579b9c76b0
--- /dev/null
+++ b/src/test/cls_sdk/CMakeLists.txt
@@ -0,0 +1,17 @@
+add_executable(ceph_test_cls_sdk
+ test_cls_sdk.cc
+ )
+set_target_properties(ceph_test_cls_sdk PROPERTIES COMPILE_FLAGS
+ ${UNITTEST_CXX_FLAGS})
+target_link_libraries(ceph_test_cls_sdk
+ librados
+ global
+ ${EXTRALIBS}
+ ${BLKID_LIBRARIES}
+ ${CMAKE_DL_LIBS}
+ radostest
+ ${UNITTEST_LIBS}
+ )
+install(TARGETS
+ ceph_test_cls_sdk
+ DESTINATION ${CMAKE_INSTALL_BINDIR})
diff --git a/src/test/cls_sdk/test_cls_sdk.cc b/src/test/cls_sdk/test_cls_sdk.cc
new file mode 100644
index 00000000000..af3452bb5c6
--- /dev/null
+++ b/src/test/cls_sdk/test_cls_sdk.cc
@@ -0,0 +1,35 @@
+#include <iostream>
+#include <errno.h>
+
+#include "test/librados/test.h"
+#include "gtest/gtest.h"
+
+using namespace librados;
+
+TEST(ClsSDK, TestSDKCoverageWrite) {
+ Rados cluster;
+ std::string pool_name = get_temp_pool_name();
+ ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+ IoCtx ioctx;
+ cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+ bufferlist in, out;
+ ASSERT_EQ(0, ioctx.exec("myobject", "sdk", "test_coverage_write", in, out));
+
+ ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
+TEST(ClsSDK, TestSDKCoverageReplay) {
+ Rados cluster;
+ std::string pool_name = get_temp_pool_name();
+ ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+ IoCtx ioctx;
+ cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+ bufferlist in, out;
+ ASSERT_EQ(0, ioctx.exec("myobject", "sdk", "test_coverage_write", in, out));
+ ASSERT_EQ(0, ioctx.exec("myobject", "sdk", "test_coverage_replay", in, out));
+
+ ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}
+
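The two tests above drive a server-side "sdk" object class through ioctx.exec(). For orientation only, here is a minimal sketch of what the registration side of such a class could look like, assuming the objclass SDK macros (CLS_VER, CLS_NAME, CLS_INIT) and cls_register_cxx_method keep their usual shape; the method body is hypothetical:

    #include "objclass/objclass.h"

    CLS_VER(1,0)
    CLS_NAME(sdk)

    cls_handle_t h_class;
    cls_method_handle_t h_test_coverage_write;

    // hypothetical body: create the object so the write path is exercised
    static int test_coverage_write(cls_method_context_t hctx,
                                   bufferlist *in, bufferlist *out)
    {
      return cls_cxx_create(hctx, false /* non-exclusive */);
    }

    CLS_INIT(sdk)
    {
      cls_register("sdk", &h_class);
      cls_register_cxx_method(h_class, "test_coverage_write",
                              CLS_METHOD_RD | CLS_METHOD_WR,
                              test_coverage_write, &h_test_coverage_write);
    }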
diff --git a/src/test/common/CMakeLists.txt b/src/test/common/CMakeLists.txt
index 3a34fd05c1b..45f8a43a9a3 100644
--- a/src/test/common/CMakeLists.txt
+++ b/src/test/common/CMakeLists.txt
@@ -259,3 +259,9 @@ add_executable(unittest_hostname
target_link_libraries(unittest_hostname ceph-common)
add_ceph_unittest(unittest_hostname
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/unittest_hostname)
+
+add_executable(unittest_iso_8601
+ test_iso_8601.cc)
+target_link_libraries(unittest_iso_8601 ceph-common)
+add_ceph_unittest(unittest_iso_8601
+  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/unittest_iso_8601)
diff --git a/src/test/common/test_blkdev.cc b/src/test/common/test_blkdev.cc
index d54c5a0aac7..1dcfd915194 100644
--- a/src/test/common/test_blkdev.cc
+++ b/src/test/common/test_blkdev.cc
@@ -94,7 +94,9 @@ TEST(blkdev, device_model)
set_block_device_sandbox_dir(root.c_str());
char model[1000] = {0};
- block_device_model("sda", model, sizeof(model));
+ int rc = block_device_model("sda", model, sizeof(model));
+ ASSERT_EQ(0, rc);
+
printf("model '%s'\n", model);
ASSERT_EQ(strcmp(model, "myfancymodel"), 0);
}
diff --git a/src/test/common/test_context.cc b/src/test/common/test_context.cc
index d976758c6b7..ee0f6460666 100644
--- a/src/test/common/test_context.cc
+++ b/src/test/common/test_context.cc
@@ -52,6 +52,13 @@ TEST(CephContext, do_command)
EXPECT_EQ("{\n \"key\": \"value\"\n}\n", s);
}
+ {
+ bufferlist out;
+ cct->do_command("config diff get", cmdmap, "xml", &out);
+ string s(out.c_str(), out.length());
+ EXPECT_EQ("<config_diff_get><diff><current><key>" + value +
+ "</key></current><defaults><key></key></defaults></diff></config_diff_get>", s);
+ }
cct->put();
}
diff --git a/src/test/common/test_iso_8601.cc b/src/test/common/test_iso_8601.cc
new file mode 100644
index 00000000000..dbb3aa2a366
--- /dev/null
+++ b/src/test/common/test_iso_8601.cc
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat <contact@redhat.com>
+ *
+ * LGPL2.1 (see COPYING-LGPL2.1) or later
+ */
+
+#include <chrono>
+
+#include <gtest/gtest.h>
+
+#include "common/ceph_time.h"
+#include "common/iso_8601.h"
+
+using std::chrono::minutes;
+using std::chrono::seconds;
+using std::chrono::time_point_cast;
+
+using ceph::from_iso_8601;
+using ceph::iso_8601_format;
+using ceph::real_clock;
+using ceph::real_time;
+using ceph::to_iso_8601;
+
+TEST(iso_8601, epoch) {
+ const auto epoch = real_clock::from_time_t(0);
+
+ ASSERT_EQ("1970", to_iso_8601(epoch, iso_8601_format::Y));
+ ASSERT_EQ("1970-01", to_iso_8601(epoch, iso_8601_format::YM));
+ ASSERT_EQ("1970-01-01", to_iso_8601(epoch, iso_8601_format::YMD));
+ ASSERT_EQ("1970-01-01T00Z", to_iso_8601(epoch, iso_8601_format::YMDh));
+ ASSERT_EQ("1970-01-01T00:00Z", to_iso_8601(epoch, iso_8601_format::YMDhm));
+ ASSERT_EQ("1970-01-01T00:00:00Z",
+ to_iso_8601(epoch, iso_8601_format::YMDhms));
+ ASSERT_EQ("1970-01-01T00:00:00.000000000Z",
+ to_iso_8601(epoch, iso_8601_format::YMDhmsn));
+
+ ASSERT_EQ(epoch, *from_iso_8601("1970"));
+ ASSERT_EQ(epoch, *from_iso_8601("1970-01"));
+ ASSERT_EQ(epoch, *from_iso_8601("1970-01-01"));
+ ASSERT_EQ(epoch, *from_iso_8601("1970-01-01T00:00Z"));
+ ASSERT_EQ(epoch, *from_iso_8601("1970-01-01T00:00:00Z"));
+ ASSERT_EQ(epoch, *from_iso_8601("1970-01-01T00:00:00.000000000Z"));
+}
+
+TEST(iso_8601, now) {
+ const auto now = real_clock::now();
+
+ ASSERT_EQ(real_time(time_point_cast<minutes>(now)),
+ *from_iso_8601(to_iso_8601(now, iso_8601_format::YMDhm)));
+ ASSERT_EQ(real_time(time_point_cast<seconds>(now)),
+ *from_iso_8601(
+ to_iso_8601(now, iso_8601_format::YMDhms)));
+ ASSERT_EQ(now,
+ *from_iso_8601(
+ to_iso_8601(now, iso_8601_format::YMDhmsn)));
+}
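One detail worth noting from the dereferences above: from_iso_8601 evidently returns an optional (hence the leading '*'), so a caller outside a test should check it before use. A minimal sketch, where use_time is a hypothetical consumer:

    #include <string>
    #include "common/iso_8601.h"

    void use_time(ceph::real_time t);  // hypothetical consumer

    void handle(const std::string& s) {
      // from_iso_8601 returns an optional (hence the '*' in the tests)
      if (auto t = ceph::from_iso_8601(s)) {
        use_time(*t);
      }
      // else: malformed timestamp; reject rather than dereference blindly
    }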
diff --git a/src/test/common/test_mutex.cc b/src/test/common/test_mutex.cc
index eea86df204f..b62341e7142 100644
--- a/src/test/common/test_mutex.cc
+++ b/src/test/common/test_mutex.cc
@@ -1,5 +1,5 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 &smarttab
+// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
@@ -9,6 +9,7 @@
#include "gtest/gtest.h"
#include "common/ceph_context.h"
#include "common/config.h"
+#include "include/coredumpctl.h"
/*
* Override normal ceph assert.
@@ -62,5 +63,6 @@ TEST(Mutex, RecursiveWithoutLockdep) {
TEST(Mutex, DeleteLocked) {
Mutex* m = new Mutex("Recursive3",false);
m->Lock();
+ PrCtl unset_dumpable;
EXPECT_DEATH(delete m,".*");
}
diff --git a/src/test/common/test_sloppy_crc_map.cc b/src/test/common/test_sloppy_crc_map.cc
index 1cdb6e459de..3eb855130aa 100644
--- a/src/test/common/test_sloppy_crc_map.cc
+++ b/src/test/common/test_sloppy_crc_map.cc
@@ -1,10 +1,14 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
+#include <iostream>
+
#include "common/SloppyCRCMap.h"
#include "common/Formatter.h"
#include <gtest/gtest.h>
+using namespace std;
+
void dump(const SloppyCRCMap& scm)
{
Formatter *f = Formatter::create("json-pretty");
diff --git a/src/test/compressor/test_compression.cc b/src/test/compressor/test_compression.cc
index 11678e024e8..495af076699 100644
--- a/src/test/compressor/test_compression.cc
+++ b/src/test/compressor/test_compression.cc
@@ -322,16 +322,24 @@ INSTANTIATE_TEST_CASE_P(
Compressor,
CompressorTest,
::testing::Values(
+#ifdef __x86_64__
"zlib/isal",
+#endif
"zlib/noisal",
"snappy",
"zstd"));
+#ifdef __x86_64__
+
TEST(ZlibCompressor, zlib_isal_compatibility)
{
g_conf->set_val("compressor_zlib_isal", "true");
g_ceph_context->_conf->apply_changes(NULL);
CompressorRef isal = Compressor::create(g_ceph_context, "zlib");
+ if (!isal) {
+ // skip the test if the plugin is not ready
+ return;
+ }
g_conf->set_val("compressor_zlib_isal", "false");
g_ceph_context->_conf->apply_changes(NULL);
CompressorRef zlib = Compressor::create(g_ceph_context, "zlib");
@@ -363,6 +371,7 @@ TEST(ZlibCompressor, zlib_isal_compatibility)
exp.append(test);
EXPECT_TRUE(exp.contents_equal(after));
}
+#endif
TEST(CompressionPlugin, all)
{
@@ -386,11 +395,17 @@ TEST(CompressionPlugin, all)
}
}
+#ifdef __x86_64__
+
TEST(ZlibCompressor, isal_compress_zlib_decompress_random)
{
g_conf->set_val("compressor_zlib_isal", "true");
g_ceph_context->_conf->apply_changes(NULL);
CompressorRef isal = Compressor::create(g_ceph_context, "zlib");
+ if (!isal) {
+ // skip the test if the plugin is not ready
+ return;
+ }
g_conf->set_val("compressor_zlib_isal", "false");
g_ceph_context->_conf->apply_changes(NULL);
CompressorRef zlib = Compressor::create(g_ceph_context, "zlib");
@@ -423,6 +438,10 @@ TEST(ZlibCompressor, isal_compress_zlib_decompress_walk)
g_conf->set_val("compressor_zlib_isal", "true");
g_ceph_context->_conf->apply_changes(NULL);
CompressorRef isal = Compressor::create(g_ceph_context, "zlib");
+ if (!isal) {
+ // skip the test if the plugin is not ready
+ return;
+ }
g_conf->set_val("compressor_zlib_isal", "false");
g_ceph_context->_conf->apply_changes(NULL);
CompressorRef zlib = Compressor::create(g_ceph_context, "zlib");
@@ -452,3 +471,5 @@ TEST(ZlibCompressor, isal_compress_zlib_decompress_walk)
EXPECT_TRUE(exp.contents_equal(after));
}
}
+
+#endif // __x86_64__
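The early return when Compressor::create() yields nothing is an implicit test skip. With a newer googletest (1.10 or later) the skip could be reported explicitly; a sketch, with have_isal standing in for the plugin probe:

    #include <gtest/gtest.h>

    TEST(ZlibCompressor, IsalPluginPresent) {
      bool have_isal = false;  // stand-in for Compressor::create(...) != nullptr
      if (!have_isal)
        GTEST_SKIP() << "ISA-L zlib plugin not available on this build";
      // exercise the plugin here
    }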
diff --git a/src/test/crush/CrushWrapper.cc b/src/test/crush/CrushWrapper.cc
index 6e6fb05e5e7..816460d6796 100644
--- a/src/test/crush/CrushWrapper.cc
+++ b/src/test/crush/CrushWrapper.cc
@@ -128,6 +128,69 @@ TEST(CrushWrapper, move_bucket) {
delete c;
}
+TEST(CrushWrapper, swap_bucket) {
+ CrushWrapper *c = new CrushWrapper;
+
+ const int ROOT_TYPE = 2;
+ c->set_type_name(ROOT_TYPE, "root");
+ const int HOST_TYPE = 1;
+ c->set_type_name(HOST_TYPE, "host");
+ const int OSD_TYPE = 0;
+ c->set_type_name(OSD_TYPE, "osd");
+
+ int root;
+ EXPECT_EQ(0, c->add_bucket(0, CRUSH_BUCKET_STRAW2, CRUSH_HASH_RJENKINS1,
+ ROOT_TYPE, 0, NULL, NULL, &root));
+ EXPECT_EQ(0, c->set_item_name(root, "root"));
+
+ int a, b;
+ EXPECT_EQ(0, c->add_bucket(0, CRUSH_BUCKET_STRAW2, CRUSH_HASH_RJENKINS1,
+ HOST_TYPE, 0, NULL, NULL, &a));
+ EXPECT_EQ(0, c->set_item_name(a, "a"));
+ EXPECT_EQ(0, c->add_bucket(0, CRUSH_BUCKET_STRAW2, CRUSH_HASH_RJENKINS1,
+ HOST_TYPE, 0, NULL, NULL, &b));
+ EXPECT_EQ(0, c->set_item_name(b, "b"));
+
+ {
+ map<string,string> loc;
+ loc["root"] = "root";
+ EXPECT_EQ(0, c->move_bucket(g_ceph_context, a, loc));
+ }
+ {
+ map<string,string> loc;
+ loc["root"] = "root";
+ loc["host"] = "a";
+ EXPECT_EQ(0, c->insert_item(g_ceph_context, 0, 1.0, "osd.0", loc));
+ EXPECT_EQ(0, c->insert_item(g_ceph_context, 1, 1.0, "osd.1", loc));
+ EXPECT_EQ(0, c->insert_item(g_ceph_context, 2, 1.0, "osd.2", loc));
+ }
+ {
+ map<string,string> loc;
+ loc["host"] = "b";
+ EXPECT_EQ(0, c->insert_item(g_ceph_context, 3, 1.0, "osd.3", loc));
+ }
+ ASSERT_EQ(0x30000, c->get_item_weight(a));
+ ASSERT_EQ(string("a"), c->get_item_name(a));
+ ASSERT_EQ(0x10000, c->get_item_weight(b));
+ ASSERT_EQ(string("b"), c->get_item_name(b));
+ ASSERT_EQ(a, c->get_bucket_item(root, 0));
+ ASSERT_EQ(0, c->get_bucket_item(a, 0));
+ ASSERT_EQ(1, c->get_bucket_item(a, 1));
+ ASSERT_EQ(2, c->get_bucket_item(a, 2));
+ ASSERT_EQ(3, c->get_bucket_item(b, 0));
+
+ c->swap_bucket(g_ceph_context, a, b);
+ ASSERT_EQ(0x30000, c->get_item_weight(b));
+ ASSERT_EQ(string("a"), c->get_item_name(b));
+ ASSERT_EQ(0x10000, c->get_item_weight(a));
+ ASSERT_EQ(string("b"), c->get_item_name(a));
+ ASSERT_EQ(a, c->get_bucket_item(root, 0));
+ ASSERT_EQ(0, c->get_bucket_item(b, 0));
+ ASSERT_EQ(1, c->get_bucket_item(b, 1));
+ ASSERT_EQ(2, c->get_bucket_item(b, 2));
+ ASSERT_EQ(3, c->get_bucket_item(a, 0));
+}
+
TEST(CrushWrapper, rename_bucket_or_item) {
CrushWrapper *c = new CrushWrapper;
@@ -961,6 +1024,77 @@ TEST(CrushWrapper, distance) {
ASSERT_EQ(1, c.get_common_ancestor_distance(g_ceph_context, 3, p));
}
+TEST(CrushWrapper, choose_args_compat) {
+ CrushWrapper c;
+ c.create();
+ c.set_type_name(1, "host");
+ c.set_type_name(2, "rack");
+ c.set_type_name(3, "root");
+
+ int weight = 12;
+
+ map<string,string> loc;
+ loc["host"] = "b1";
+ loc["rack"] = "r11";
+ loc["root"] = "default";
+ int item = 1;
+ c.insert_item(g_ceph_context, item, weight, "osd.1", loc);
+
+ loc["host"] = "b2";
+ loc["rack"] = "r12";
+ loc["root"] = "default";
+ item = 2;
+ c.insert_item(g_ceph_context, item, weight, "osd.2", loc);
+
+ assert(c.add_simple_ruleset("rule1", "r11", "host", "firstn", pg_pool_t::TYPE_ERASURE) >= 0);
+
+ int id = c.get_item_id("b1");
+
+ __u32 weights = 666 * 0x10000;
+ crush_weight_set weight_set;
+ weight_set.size = 1;
+ weight_set.weights = &weights;
+ int maxbuckets = c.get_max_buckets();
+ assert(maxbuckets > 0);
+ crush_choose_arg choose_args[maxbuckets];
+ memset(choose_args, '\0', sizeof(crush_choose_arg) * maxbuckets);
+ choose_args[-1-id].ids_size = 0;
+ choose_args[-1-id].weight_set_size = 1;
+ choose_args[-1-id].weight_set = &weight_set;
+ crush_choose_arg_map arg_map;
+ arg_map.size = c.get_max_buckets();
+ arg_map.args = choose_args;
+
+ uint64_t features = CEPH_FEATURE_CRUSH_TUNABLES5|CEPH_FEATURE_INCARNATION_2;
+
+ // if the client is capable, encode choose_args
+ {
+ c.choose_args[0] = arg_map;
+ bufferlist bl;
+ c.encode(bl, features|CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
+ bufferlist::iterator i(bl.begin());
+ CrushWrapper c_new;
+ c_new.decode(i);
+ ASSERT_EQ(1u, c_new.choose_args.size());
+ ASSERT_EQ(1u, c_new.choose_args[0].args[-1-id].weight_set_size);
+ ASSERT_EQ(weights, c_new.choose_args[0].args[-1-id].weight_set[0].weights[0]);
+ ASSERT_EQ(weight, c_new.get_bucket_item_weightf(id, 0));
+ }
+
+ // if the client is not compatible, copy choose_arg in the weights
+ {
+ c.choose_args[0] = arg_map;
+ bufferlist bl;
+ c.encode(bl, features);
+ c.choose_args.clear();
+ bufferlist::iterator i(bl.begin());
+ CrushWrapper c_new;
+ c_new.decode(i);
+ ASSERT_EQ(0u, c_new.choose_args.size());
+ ASSERT_EQ((int)weights, c_new.get_bucket_item_weight(id, 0));
+ }
+}
+
TEST(CrushWrapper, remove_unused_root) {
CrushWrapper c;
c.create();
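The constants asserted in the CrushWrapper tests above (0x30000, 666 * 0x10000) are CRUSH's 16.16 fixed-point weight encoding, in which 0x10000 represents 1.0. Sketch converters:

    #include <cstdint>

    // CRUSH bucket item weights are 16.16 fixed point: 0x10000 == 1.0,
    // so 0x30000 == 3.0 and 666 * 0x10000 encodes a weight of 666.
    inline uint32_t crush_weight_encode(double w) {
      return static_cast<uint32_t>(w * 0x10000);
    }
    inline double crush_weight_decode(uint32_t w) {
      return static_cast<double>(w) / 0x10000;
    }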
diff --git a/src/test/encoding.cc b/src/test/encoding.cc
index 29c3b99fddc..af3346e1a09 100644
--- a/src/test/encoding.cc
+++ b/src/test/encoding.cc
@@ -1,6 +1,5 @@
#include "include/buffer.h"
#include "include/encoding.h"
-#include "include/small_encoding.h"
#include "gtest/gtest.h"
@@ -292,18 +291,15 @@ TEST(EncodingRoundTrip, Integers) {
}
const char* expected_what[] = {
- "buffer::malformed_input: void lame_decoder(int) unknown encoding version > 100",
- "buffer::malformed_input: void lame_decoder(int) no longer understand old encoding version < 100",
+ "buffer::malformed_input: void lame_decoder(int) no longer understand old encoding version 100 < 200",
"buffer::malformed_input: void lame_decoder(int) decode past end of struct encoding",
};
void lame_decoder(int which) {
switch (which) {
case 0:
- throw buffer::malformed_input(DECODE_ERR_VERSION(__PRETTY_FUNCTION__, 100));
+ throw buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, 100, 200));
case 1:
- throw buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, 100));
- case 2:
throw buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__));
}
}
@@ -350,39 +346,48 @@ TEST(small_encoding, varint) {
for (unsigned i=0; v[i][1]; ++i) {
{
bufferlist bl;
- small_encode_varint(v[i][0], bl);
+ {
+ auto app = bl.get_contiguous_appender(16, true);
+ denc_varint(v[i][0], app);
+ }
cout << std::hex << v[i][0] << "\t" << v[i][1] << "\t";
bl.hexdump(cout, false);
cout << std::endl;
ASSERT_EQ(bl.length(), v[i][1]);
uint32_t u;
- auto p = bl.begin();
- small_decode_varint(u, p);
+ auto p = bl.begin().get_current_ptr().begin();
+ denc_varint(u, p);
ASSERT_EQ(v[i][0], u);
}
{
bufferlist bl;
- small_encode_signed_varint(v[i][0], bl);
+ {
+ auto app = bl.get_contiguous_appender(16, true);
+ denc_signed_varint(v[i][0], app);
+ }
cout << std::hex << v[i][0] << "\t" << v[i][2] << "\t";
bl.hexdump(cout, false);
cout << std::endl;
ASSERT_EQ(bl.length(), v[i][2]);
int32_t u;
- auto p = bl.begin();
- small_decode_signed_varint(u, p);
+ auto p = bl.begin().get_current_ptr().begin();
+ denc_signed_varint(u, p);
ASSERT_EQ((int32_t)v[i][0], u);
}
{
bufferlist bl;
int64_t x = -(int64_t)v[i][0];
- small_encode_signed_varint(x, bl);
+ {
+ auto app = bl.get_contiguous_appender(16, true);
+ denc_signed_varint(x, app);
+ }
cout << std::dec << x << std::hex << "\t" << v[i][3] << "\t";
bl.hexdump(cout, false);
cout << std::endl;
ASSERT_EQ(bl.length(), v[i][3]);
int64_t u;
- auto p = bl.begin();
- small_decode_signed_varint(u, p);
+ auto p = bl.begin().get_current_ptr().begin();
+ denc_signed_varint(u, p);
ASSERT_EQ(x, u);
}
}
@@ -419,40 +424,49 @@ TEST(small_encoding, varint_lowz) {
for (unsigned i=0; v[i][1]; ++i) {
{
bufferlist bl;
- small_encode_varint_lowz(v[i][0], bl);
+ {
+ auto app = bl.get_contiguous_appender(16, true);
+ denc_varint_lowz(v[i][0], app);
+ }
cout << std::hex << v[i][0] << "\t" << v[i][1] << "\t";
bl.hexdump(cout, false);
cout << std::endl;
ASSERT_EQ(bl.length(), v[i][1]);
uint32_t u;
- auto p = bl.begin();
- small_decode_varint_lowz(u, p);
+ auto p = bl.begin().get_current_ptr().begin();
+ denc_varint_lowz(u, p);
ASSERT_EQ(v[i][0], u);
}
{
bufferlist bl;
int64_t x = v[i][0];
- small_encode_signed_varint_lowz(x, bl);
+ {
+ auto app = bl.get_contiguous_appender(16, true);
+ denc_signed_varint_lowz(x, app);
+ }
cout << std::hex << x << "\t" << v[i][1] << "\t";
bl.hexdump(cout, false);
cout << std::endl;
ASSERT_EQ(bl.length(), v[i][2]);
int64_t u;
- auto p = bl.begin();
- small_decode_signed_varint_lowz(u, p);
+ auto p = bl.begin().get_current_ptr().begin();
+ denc_signed_varint_lowz(u, p);
ASSERT_EQ(x, u);
}
{
bufferlist bl;
int64_t x = -(int64_t)v[i][0];
- small_encode_signed_varint_lowz(x, bl);
+ {
+ auto app = bl.get_contiguous_appender(16, true);
+ denc_signed_varint_lowz(x, app);
+ }
cout << std::dec << x << "\t" << v[i][1] << "\t";
bl.hexdump(cout, false);
cout << std::endl;
ASSERT_EQ(bl.length(), v[i][3]);
int64_t u;
- auto p = bl.begin();
- small_decode_signed_varint_lowz(u, p);
+ auto p = bl.begin().get_current_ptr().begin();
+ denc_signed_varint_lowz(u, p);
ASSERT_EQ(x, u);
}
}
@@ -481,14 +495,17 @@ TEST(small_encoding, lba) {
};
for (unsigned i=0; v[i][1]; ++i) {
bufferlist bl;
- small_encode_lba(v[i][0], bl);
+ {
+ auto app = bl.get_contiguous_appender(16, true);
+ denc_lba(v[i][0], app);
+ }
cout << std::hex << v[i][0] << "\t" << v[i][1] << "\t";
bl.hexdump(cout, false);
cout << std::endl;
ASSERT_EQ(bl.length(), v[i][1]);
uint64_t u;
- auto p = bl.begin();
- small_decode_lba(u, p);
+ auto p = bl.begin().get_current_ptr().begin();
+ denc_lba(u, p);
ASSERT_EQ(v[i][0], u);
}
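Every conversion in this file follows the same pattern: the old small_encode_*/small_decode_* helpers took a bufferlist directly, while their denc_* replacements write through a contiguous appender and decode through a raw ptr iterator. The pattern factored out as a sketch (the (16, true) appender arguments simply mirror the tests above):

    #include "include/buffer.h"
    #include "include/denc.h"

    template <typename T>
    ceph::bufferlist denc_varint_to_bl(T v) {
      ceph::bufferlist bl;
      {
        auto app = bl.get_contiguous_appender(16, true);
        denc_varint(v, app);
      } // appender flushes back into bl when it goes out of scope
      return bl;
    }

    template <typename T>
    void denc_varint_from_bl(ceph::bufferlist& bl, T& v) {
      auto p = bl.begin().get_current_ptr().begin();
      denc_varint(v, p);
    }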
diff --git a/src/test/encoding/generate-corpus-objects.sh b/src/test/encoding/generate-corpus-objects.sh
new file mode 100755
index 00000000000..6f84181ecf4
--- /dev/null
+++ b/src/test/encoding/generate-corpus-objects.sh
@@ -0,0 +1,58 @@
+#!/bin/bash -ex
+
+BDIR=`pwd`
+
+p=$1
+echo path $p
+test ! -d $p
+mkdir $p
+strings bin/ceph-osd | grep "^$p/%s__%d.%x"
+
+v=`git describe | cut -c 2-`
+echo version $v
+
+echo 'binaries look ok, vstarting'
+echo
+
+MON=3 MDS=3 OSD=5 MGR=2 RGW=1 ../src/vstart.sh -x -n -l --bluestore -e
+
+export PATH=bin:$PATH
+
+# do some work to generate a hopefully broad set of object instances
+
+echo 'starting some background work'
+../qa/workunits/rados/test.sh &
+../qa/workunits/rbd/test_librbd.sh &
+../qa/workunits/libcephfs/test.sh &
+../qa/workunits/rgw/run-s3tests.sh &
+ceph-syn --syn makedirs 3 3 3 &
+
+echo 'waiting a bit'
+
+sleep 10
+echo 'triggering some recovery'
+
+kill -9 `cat out/osd.0.pid`
+sleep 10
+ceph osd out 0
+sleep 10
+init-ceph start osd.0
+ceph osd in 0
+
+sleep 5
+echo 'triggering mds work'
+bin/ceph mds fail 0
+
+echo 'waiting for workers to join (and ignoring errors)'
+wait || true
+
+echo 'importing'
+../src/test/encoding/import.sh $p $v ../ceph-object-corpus/archive
+
+for d in ../ceph-object-corpus/archive/$v/objects/*
+do
+ echo prune $d
+ ../ceph-object-corpus/bin/prune.sh $d 25
+done
+
+echo 'done'
diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h
index 874b8250cbb..d4f4413080a 100644
--- a/src/test/encoding/types.h
+++ b/src/test/encoding/types.h
@@ -67,7 +67,6 @@ TYPE(pg_stat_t)
TYPE_FEATUREFUL(pool_stat_t)
TYPE(pg_history_t)
TYPE(pg_info_t)
-TYPE(pg_interval_t)
TYPE_FEATUREFUL(pg_query_t)
TYPE(pg_log_entry_t)
TYPE(pg_log_t)
diff --git a/src/test/fio/README.md b/src/test/fio/README.md
index 9864db5149e..05c54442cc1 100644
--- a/src/test/fio/README.md
+++ b/src/test/fio/README.md
@@ -49,11 +49,11 @@ Because the ObjectStore is not a public-facing interface, we build it inside
of the ceph tree and load libfio_ceph_objectstore.so into fio as an external
engine.
-To build ceph with fio_ceph_objectstore:
+To build fio_ceph_objectstore:
```
mkdir build && cd build
cmake -DWITH_FIO=ON -DFIO_INCLUDE_DIR=/path/to/fio -DCMAKE_BUILD_TYPE=Release /path/to/ceph
- make install
+ make -C src/test/fio install
```
If you install the ceph libraries to a location that isn't in your
LD_LIBRARY_PATH, be sure to add it:
diff --git a/src/test/fio/fio_ceph_objectstore.cc b/src/test/fio/fio_ceph_objectstore.cc
index e453fc01e0c..2123671a7fb 100644
--- a/src/test/fio/fio_ceph_objectstore.cc
+++ b/src/test/fio/fio_ceph_objectstore.cc
@@ -209,7 +209,7 @@ Job::Job(Engine* engine, const thread_data* td)
for (uint32_t i = 0; i < td->o.nr_files; i++) {
auto f = td->files[i];
f->real_file_size = file_size;
- f->engine_data = i;
+ f->engine_pos = i;
// associate each object with a collection in a round-robin fashion
auto& coll = collections[i % collections.size()];
@@ -329,7 +329,7 @@ int fio_ceph_os_queue(thread_data* td, io_u* u)
fio_ro_check(td, u);
auto job = static_cast<Job*>(td->io_ops_data);
- auto& object = job->objects[u->file->engine_data];
+ auto& object = job->objects[u->file->engine_pos];
auto& coll = object.coll;
auto& os = job->engine->os;
diff --git a/src/test/formatter.cc b/src/test/formatter.cc
index 35ec1d652b2..da8cc937e71 100644
--- a/src/test/formatter.cc
+++ b/src/test/formatter.cc
@@ -19,6 +19,7 @@
#include <sstream>
#include <string>
+using namespace ceph;
using std::ostringstream;
TEST(JsonFormatter, Simple1) {
diff --git a/src/test/libcephd/CMakeLists.txt b/src/test/libcephd/CMakeLists.txt
index 8aa253562c4..a12e8ea40c8 100644
--- a/src/test/libcephd/CMakeLists.txt
+++ b/src/test/libcephd/CMakeLists.txt
@@ -16,7 +16,7 @@ add_executable(ceph_test_cephd_api_misc
set_target_properties(ceph_test_cephd_api_misc PROPERTIES COMPILE_FLAGS
${UNITTEST_CXX_FLAGS})
target_link_libraries(ceph_test_cephd_api_misc
- cephd global ${UNITTEST_LIBS} cephdtest z snappy ceph_zstd)
+ cephd global ${UNITTEST_LIBS} cephdtest z snappy ceph_zstd)
install(TARGETS
ceph_test_cephd_api_misc
diff --git a/src/test/librados/TestCase.cc b/src/test/librados/TestCase.cc
index b9ae9b79e0e..9df0cce9f53 100644
--- a/src/test/librados/TestCase.cc
+++ b/src/test/librados/TestCase.cc
@@ -47,7 +47,8 @@ void RadosTestNS::SetUp()
void RadosTestNS::TearDown()
{
- cleanup_all_objects(ioctx);
+ if (cleanup)
+ cleanup_all_objects(ioctx);
rados_ioctx_destroy(ioctx);
}
@@ -97,7 +98,8 @@ void RadosTestPPNS::SetUp()
void RadosTestPPNS::TearDown()
{
- cleanup_all_objects(ioctx);
+ if (cleanup)
+ cleanup_all_objects(ioctx);
ioctx.close();
}
@@ -179,7 +181,8 @@ void RadosTestParamPPNS::SetUp()
void RadosTestParamPPNS::TearDown()
{
- cleanup_all_objects(ioctx);
+ if (cleanup)
+ cleanup_all_objects(ioctx);
ioctx.close();
}
@@ -223,7 +226,8 @@ void RadosTestECNS::SetUp()
void RadosTestECNS::TearDown()
{
- cleanup_all_objects(ioctx);
+ if (cleanup)
+ cleanup_all_objects(ioctx);
rados_ioctx_destroy(ioctx);
}
@@ -253,7 +257,8 @@ void RadosTestECPPNS::SetUp()
void RadosTestECPPNS::TearDown()
{
- cleanup_all_objects(ioctx);
+ if (cleanup)
+ cleanup_all_objects(ioctx);
ioctx.close();
}
@@ -284,8 +289,10 @@ void RadosTest::SetUp()
void RadosTest::TearDown()
{
- cleanup_default_namespace(ioctx);
- cleanup_namespace(ioctx, nspace);
+ if (cleanup) {
+ cleanup_default_namespace(ioctx);
+ cleanup_namespace(ioctx, nspace);
+ }
rados_ioctx_destroy(ioctx);
}
@@ -343,8 +350,10 @@ void RadosTestPP::SetUp()
void RadosTestPP::TearDown()
{
- cleanup_default_namespace(ioctx);
- cleanup_namespace(ioctx, nspace);
+ if (cleanup) {
+ cleanup_default_namespace(ioctx);
+ cleanup_namespace(ioctx, nspace);
+ }
ioctx.close();
}
@@ -442,8 +451,10 @@ void RadosTestParamPP::SetUp()
void RadosTestParamPP::TearDown()
{
- cleanup_default_namespace(ioctx);
- cleanup_namespace(ioctx, nspace);
+ if (cleanup) {
+ cleanup_default_namespace(ioctx);
+ cleanup_namespace(ioctx, nspace);
+ }
ioctx.close();
}
@@ -494,8 +505,10 @@ void RadosTestEC::SetUp()
void RadosTestEC::TearDown()
{
- cleanup_default_namespace(ioctx);
- cleanup_namespace(ioctx, nspace);
+ if (cleanup) {
+ cleanup_default_namespace(ioctx);
+ cleanup_namespace(ioctx, nspace);
+ }
rados_ioctx_destroy(ioctx);
}
@@ -527,8 +540,10 @@ void RadosTestECPP::SetUp()
void RadosTestECPP::TearDown()
{
- cleanup_default_namespace(ioctx);
- cleanup_namespace(ioctx, nspace);
+ if (cleanup) {
+ cleanup_default_namespace(ioctx);
+ cleanup_namespace(ioctx, nspace);
+ }
ioctx.close();
}
diff --git a/src/test/librados/TestCase.h b/src/test/librados/TestCase.h
index 2bb1f1f4aa9..ac84bba0864 100644
--- a/src/test/librados/TestCase.h
+++ b/src/test/librados/TestCase.h
@@ -21,7 +21,7 @@
*/
class RadosTestNS : public ::testing::Test {
public:
- RadosTestNS() {}
+ RadosTestNS(bool c=false) : cleanup(c) {}
~RadosTestNS() override {}
protected:
static void SetUpTestCase();
@@ -34,11 +34,16 @@ protected:
void TearDown() override;
rados_t cluster;
rados_ioctx_t ioctx;
+ bool cleanup;
+};
+
+struct RadosTestNSCleanup : public RadosTestNS {
+ RadosTestNSCleanup() : RadosTestNS(true) {}
};
class RadosTestPPNS : public ::testing::Test {
public:
- RadosTestPPNS() : cluster(s_cluster) {}
+ RadosTestPPNS(bool c=false) : cluster(s_cluster), cleanup(c) {}
~RadosTestPPNS() override {}
protected:
static void SetUpTestCase();
@@ -51,11 +56,16 @@ protected:
void TearDown() override;
librados::Rados &cluster;
librados::IoCtx ioctx;
+ bool cleanup;
+};
+
+struct RadosTestPPNSCleanup : public RadosTestPPNS {
+ RadosTestPPNSCleanup() : RadosTestPPNS(true) {}
};
class RadosTestParamPPNS : public ::testing::TestWithParam<const char*> {
public:
- RadosTestParamPPNS() : cluster(s_cluster) {}
+ RadosTestParamPPNS(bool c=false) : cluster(s_cluster), cleanup(c) {}
~RadosTestParamPPNS() override {}
static void SetUpTestCase();
static void TearDownTestCase();
@@ -69,11 +79,12 @@ protected:
void TearDown() override;
librados::Rados &cluster;
librados::IoCtx ioctx;
+ bool cleanup;
};
class RadosTestECNS : public RadosTestNS {
public:
- RadosTestECNS() {}
+ RadosTestECNS(bool c=false) : cleanup(c) {}
~RadosTestECNS() override {}
protected:
static void SetUpTestCase();
@@ -86,11 +97,16 @@ protected:
rados_t cluster;
rados_ioctx_t ioctx;
uint64_t alignment;
+ bool cleanup;
+};
+
+struct RadosTestECNSCleanup : public RadosTestECNS {
+ RadosTestECNSCleanup() : RadosTestECNS(true) {}
};
class RadosTestECPPNS : public RadosTestPPNS {
public:
- RadosTestECPPNS() : cluster(s_cluster) {}
+ RadosTestECPPNS(bool c=false) : cluster(s_cluster), cleanup(c) {}
~RadosTestECPPNS() override {}
protected:
static void SetUpTestCase();
@@ -103,6 +119,11 @@ protected:
librados::Rados &cluster;
librados::IoCtx ioctx;
uint64_t alignment;
+ bool cleanup;
+};
+
+struct RadosTestECPPNSCleanup : public RadosTestECPPNS {
+ RadosTestECPPNSCleanup() : RadosTestECPPNS(true) {}
};
/**
@@ -115,7 +136,7 @@ protected:
*/
class RadosTest : public ::testing::Test {
public:
- RadosTest() {}
+ RadosTest(bool c=false) : cleanup(c) {}
~RadosTest() override {}
protected:
static void SetUpTestCase();
@@ -130,11 +151,12 @@ protected:
rados_t cluster;
rados_ioctx_t ioctx;
std::string nspace;
+ bool cleanup;
};
class RadosTestPP : public ::testing::Test {
public:
- RadosTestPP() : cluster(s_cluster) {}
+ RadosTestPP(bool c=false) : cluster(s_cluster), cleanup(c) {}
~RadosTestPP() override {}
protected:
static void SetUpTestCase();
@@ -148,12 +170,13 @@ protected:
void TearDown() override;
librados::Rados &cluster;
librados::IoCtx ioctx;
+ bool cleanup;
std::string nspace;
};
class RadosTestParamPP : public ::testing::TestWithParam<const char*> {
public:
- RadosTestParamPP() : cluster(s_cluster) {}
+ RadosTestParamPP(bool c=false) : cluster(s_cluster), cleanup(c) {}
~RadosTestParamPP() override {}
static void SetUpTestCase();
static void TearDownTestCase();
@@ -168,12 +191,13 @@ protected:
void TearDown() override;
librados::Rados &cluster;
librados::IoCtx ioctx;
+ bool cleanup;
std::string nspace;
};
class RadosTestEC : public RadosTest {
public:
- RadosTestEC() {}
+ RadosTestEC(bool c=false) : cleanup(c) {}
~RadosTestEC() override {}
protected:
static void SetUpTestCase();
@@ -185,13 +209,14 @@ protected:
void TearDown() override;
rados_t cluster;
rados_ioctx_t ioctx;
+ bool cleanup;
std::string nspace;
uint64_t alignment;
};
class RadosTestECPP : public RadosTestPP {
public:
- RadosTestECPP() : cluster(s_cluster) {}
+ RadosTestECPP(bool c=false) : cluster(s_cluster), cleanup(c) {}
~RadosTestECPP() override {}
protected:
static void SetUpTestCase();
@@ -203,6 +228,7 @@ protected:
void TearDown() override;
librados::Rados &cluster;
librados::IoCtx ioctx;
+ bool cleanup;
std::string nspace;
uint64_t alignment;
};
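Reduced to its essentials, the change to these fixtures is one pattern: a cleanup flag that defaults to off, plus a thin subclass that turns it on, so a test file opts into teardown-time cleanup with a typedef (list.cc below switches to the *Cleanup variants). A generic sketch:

    #include <gtest/gtest.h>

    struct FixtureBase : ::testing::Test {
      explicit FixtureBase(bool c = false) : cleanup(c) {}
      void TearDown() override {
        if (cleanup) {
          // remove objects created by the test (elided)
        }
      }
      bool cleanup;
    };

    struct FixtureWithCleanup : FixtureBase {
      FixtureWithCleanup() : FixtureBase(true) {}
    };

    // a test file picks behavior with a typedef, as list.cc does below
    typedef FixtureWithCleanup MyListTest;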
diff --git a/src/test/librados/aio.cc b/src/test/librados/aio.cc
index a82d168e9ff..22ed976995e 100644
--- a/src/test/librados/aio.cc
+++ b/src/test/librados/aio.cc
@@ -1,4 +1,5 @@
#include "common/errno.h"
+#include "include/err.h"
#include "include/rados/librados.h"
#include "test/librados/test.h"
#include "include/types.h"
@@ -4066,3 +4067,152 @@ TEST(LibRadosAio, RacingRemovePP) {
delete my_completion;
delete my_completion2;
}
+
+TEST(LibRadosAio, RoundTripCmpExtPP) {
+ AioTestDataPP test_data;
+ ASSERT_EQ("", test_data.init());
+ AioCompletion *my_completion = test_data.m_cluster.aio_create_completion(
+ (void*)&test_data, set_completion_complete, set_completion_safe);
+ AioCompletion *my_completion_null = NULL;
+ ASSERT_NE(my_completion, my_completion_null);
+ char full[128];
+ memset(full, 0xcc, sizeof(full));
+ bufferlist bl1;
+ bl1.append(full, sizeof(full));
+ ASSERT_EQ(0, test_data.m_ioctx.aio_write("foo", my_completion,
+ bl1, sizeof(full), 0));
+ {
+ TestAlarm alarm;
+ ASSERT_EQ(0, my_completion->wait_for_complete());
+ }
+ ASSERT_EQ(0, my_completion->get_return_value());
+
+ /* compare with match */
+ bufferlist cbl;
+ cbl.append(full, sizeof(full));
+ AioCompletion *my_completion2 = test_data.m_cluster.aio_create_completion(
+ (void*)&test_data, set_completion_complete, set_completion_safe);
+ ASSERT_EQ(0, test_data.m_ioctx.aio_cmpext("foo", my_completion2, 0, cbl));
+
+ {
+ TestAlarm alarm;
+ ASSERT_EQ(0, my_completion2->wait_for_complete());
+ }
+ ASSERT_EQ(0, my_completion2->get_return_value());
+
+ /* compare with mismatch */
+ memset(full, 0xdd, sizeof(full));
+ cbl.clear();
+ cbl.append(full, sizeof(full));
+ AioCompletion *my_completion3 = test_data.m_cluster.aio_create_completion(
+ (void*)&test_data, set_completion_complete, set_completion_safe);
+ ASSERT_EQ(0, test_data.m_ioctx.aio_cmpext("foo", my_completion3, 0, cbl));
+
+ {
+ TestAlarm alarm;
+ ASSERT_EQ(0, my_completion3->wait_for_complete());
+ }
+ ASSERT_EQ(-MAX_ERRNO, my_completion3->get_return_value());
+
+ delete my_completion;
+ delete my_completion2;
+ delete my_completion3;
+}
+
+TEST(LibRadosAio, RoundTripCmpExtPP2)
+{
+ int ret;
+ char buf[128];
+ char miscmp_buf[128];
+ bufferlist cbl;
+ Rados cluster;
+ std::string pool_name = get_temp_pool_name();
+ ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+ IoCtx ioctx;
+ cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+ boost::scoped_ptr<AioCompletion>
+ wr_cmpl(cluster.aio_create_completion(0, 0, 0));
+ ObjectWriteOperation wr_op;
+ memset(buf, 0xcc, sizeof(buf));
+ memset(miscmp_buf, 0xdd, sizeof(miscmp_buf));
+ bufferlist bl;
+ bl.append(buf, sizeof(buf));
+
+ wr_op.write_full(bl);
+ wr_op.set_op_flags2(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
+ ioctx.aio_operate("test_obj", wr_cmpl.get(), &wr_op);
+ {
+ TestAlarm alarm;
+ ASSERT_EQ(0, wr_cmpl->wait_for_complete());
+ }
+ EXPECT_EQ(0, wr_cmpl->get_return_value());
+
+ /* cmpext as write op. first match then mismatch */
+ boost::scoped_ptr<AioCompletion>
+ wr_cmpext_cmpl(cluster.aio_create_completion(0, 0, 0));
+ cbl.append(buf, sizeof(buf));
+ ret = 0;
+
+ wr_op.cmpext(0, cbl, &ret);
+ wr_op.set_op_flags2(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
+ ioctx.aio_operate("test_obj", wr_cmpext_cmpl.get(), &wr_op);
+ {
+ TestAlarm alarm;
+ ASSERT_EQ(0, wr_cmpext_cmpl->wait_for_complete());
+ }
+ EXPECT_EQ(0, wr_cmpext_cmpl->get_return_value());
+ EXPECT_EQ(0, ret);
+
+ boost::scoped_ptr<AioCompletion>
+ wr_cmpext_cmpl2(cluster.aio_create_completion(0, 0, 0));
+ cbl.clear();
+ cbl.append(miscmp_buf, sizeof(miscmp_buf));
+ ret = 0;
+
+ wr_op.cmpext(0, cbl, &ret);
+ wr_op.set_op_flags2(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
+ ioctx.aio_operate("test_obj", wr_cmpext_cmpl2.get(), &wr_op);
+ {
+ TestAlarm alarm;
+ ASSERT_EQ(0, wr_cmpext_cmpl2->wait_for_complete());
+ }
+ EXPECT_EQ(-MAX_ERRNO, wr_cmpext_cmpl2->get_return_value());
+ EXPECT_EQ(-MAX_ERRNO, ret);
+
+ /* cmpext as read op */
+ boost::scoped_ptr<AioCompletion>
+ rd_cmpext_cmpl(cluster.aio_create_completion(0, 0, 0));
+ ObjectReadOperation rd_op;
+ cbl.clear();
+ cbl.append(buf, sizeof(buf));
+ ret = 0;
+ rd_op.cmpext(0, cbl, &ret);
+ rd_op.set_op_flags2(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
+ ioctx.aio_operate("test_obj", rd_cmpext_cmpl.get(), &rd_op, 0);
+ {
+ TestAlarm alarm;
+ ASSERT_EQ(0, rd_cmpext_cmpl->wait_for_complete());
+ }
+ EXPECT_EQ(0, rd_cmpext_cmpl->get_return_value());
+ EXPECT_EQ(0, ret);
+
+ boost::scoped_ptr<AioCompletion>
+ rd_cmpext_cmpl2(cluster.aio_create_completion(0, 0, 0));
+ cbl.clear();
+ cbl.append(miscmp_buf, sizeof(miscmp_buf));
+ ret = 0;
+
+ rd_op.cmpext(0, cbl, &ret);
+ rd_op.set_op_flags2(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
+ ioctx.aio_operate("test_obj", rd_cmpext_cmpl2.get(), &rd_op, 0);
+ {
+ TestAlarm alarm;
+ ASSERT_EQ(0, rd_cmpext_cmpl2->wait_for_complete());
+ }
+ EXPECT_EQ(-MAX_ERRNO, rd_cmpext_cmpl2->get_return_value());
+ EXPECT_EQ(-MAX_ERRNO, ret);
+
+ ioctx.remove("test_obj");
+ destroy_one_pool_pp(pool_name, cluster);
+}
diff --git a/src/test/librados/c_read_operations.cc b/src/test/librados/c_read_operations.cc
index 5378e60b1fa..41714219c31 100644
--- a/src/test/librados/c_read_operations.cc
+++ b/src/test/librados/c_read_operations.cc
@@ -4,13 +4,14 @@
#include <errno.h>
#include <string>
+#include "include/err.h"
#include "include/rados/librados.h"
#include "test/librados/test.h"
#include "test/librados/TestCase.h"
const char *data = "testdata";
const char *obj = "testobj";
-const int len = strlen(data);
+const size_t len = strlen(data);
class CReadOpsTest : public RadosTest {
protected:
@@ -282,7 +283,7 @@ TEST_F(CReadOpsTest, Read) {
size_t bytes_read = 0;
rados_read_op_read(op, 0, len, buf, &bytes_read, NULL);
ASSERT_EQ(0, rados_read_op_operate(op, ioctx, obj, 0));
- ASSERT_EQ(len, (int)bytes_read);
+ ASSERT_EQ(len, bytes_read);
ASSERT_EQ(0, memcmp(data, buf, len));
rados_release_read_op(op);
}
@@ -293,7 +294,7 @@ TEST_F(CReadOpsTest, Read) {
int rval;
rados_read_op_read(op, 0, len, buf, &bytes_read, &rval);
ASSERT_EQ(0, rados_read_op_operate(op, ioctx, obj, 0));
- ASSERT_EQ(len, (int)bytes_read);
+ ASSERT_EQ(len, bytes_read);
ASSERT_EQ(0, rval);
ASSERT_EQ(0, memcmp(data, buf, len));
rados_release_read_op(op);
@@ -306,7 +307,7 @@ TEST_F(CReadOpsTest, Read) {
rados_read_op_read(op, 0, len, buf, &bytes_read, &rval);
rados_read_op_set_flags(op, LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
ASSERT_EQ(0, rados_read_op_operate(op, ioctx, obj, 0));
- ASSERT_EQ(len, (int)bytes_read);
+ ASSERT_EQ(len, bytes_read);
ASSERT_EQ(0, rval);
ASSERT_EQ(0, memcmp(data, buf, len));
rados_release_read_op(op);
@@ -393,7 +394,7 @@ TEST_F(CReadOpsTest, RWOrderedRead) {
rados_read_op_set_flags(op, LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
ASSERT_EQ(0, rados_read_op_operate(op, ioctx, obj,
LIBRADOS_OPERATION_ORDER_READS_WRITES));
- ASSERT_EQ(len, (int)bytes_read);
+ ASSERT_EQ(len, bytes_read);
ASSERT_EQ(0, rval);
ASSERT_EQ(0, memcmp(data, buf, len));
rados_release_read_op(op);
@@ -430,7 +431,7 @@ TEST_F(CReadOpsTest, ShortRead) {
size_t bytes_read = 0;
rados_read_op_read(op, 0, len * 2, buf, &bytes_read, NULL);
ASSERT_EQ(0, rados_read_op_operate(op, ioctx, obj, 0));
- ASSERT_EQ(len, (int)bytes_read);
+ ASSERT_EQ(len, bytes_read);
ASSERT_EQ(0, memcmp(data, buf, len));
rados_release_read_op(op);
}
@@ -441,7 +442,7 @@ TEST_F(CReadOpsTest, ShortRead) {
int rval;
rados_read_op_read(op, 0, len * 2, buf, &bytes_read, &rval);
ASSERT_EQ(0, rados_read_op_operate(op, ioctx, obj, 0));
- ASSERT_EQ(len, (int)bytes_read);
+ ASSERT_EQ(len, bytes_read);
ASSERT_EQ(0, rval);
ASSERT_EQ(0, memcmp(data, buf, len));
rados_release_read_op(op);
@@ -526,7 +527,7 @@ TEST_F(CReadOpsTest, Stat) {
rados_read_op_stat(op, &size, NULL, &rval);
EXPECT_EQ(0, rados_read_op_operate(op, ioctx, obj, 0));
EXPECT_EQ(0, rval);
- EXPECT_EQ(len, (int)size);
+ EXPECT_EQ(len, size);
rados_release_read_op(op);
op = rados_create_read_op();
@@ -665,3 +666,41 @@ TEST_F(CReadOpsTest, GetXattrs) {
remove_object();
}
+
+TEST_F(CReadOpsTest, CmpExt) {
+ char buf[len];
+ size_t bytes_read = 0;
+ int cmpext_val = 0;
+ int read_val = 0;
+
+ write_object();
+
+ // cmpext with match should ensure that the following read is successful
+ rados_read_op_t op = rados_create_read_op();
+ ASSERT_TRUE(op);
+  // @obj, @data and @len correspond to the object initialised by write_object()
+ rados_read_op_cmpext(op, data, len, 0, &cmpext_val);
+ rados_read_op_read(op, 0, len, buf, &bytes_read, &read_val);
+ ASSERT_EQ(0, rados_read_op_operate(op, ioctx, obj, 0));
+ ASSERT_EQ(len, bytes_read);
+ ASSERT_EQ(0, memcmp(data, buf, len));
+ ASSERT_EQ(cmpext_val, 0);
+ rados_release_read_op(op);
+
+  // cmpext with mismatch should fail and set cmpext_val accordingly
+ memset(buf, 0, sizeof(buf));
+ bytes_read = 0;
+ cmpext_val = 0;
+ read_val = 0;
+ op = rados_create_read_op();
+ ASSERT_TRUE(op);
+  // @obj, @data and @len correspond to the object initialised by write_object()
+ rados_read_op_cmpext(op, "mismatch", strlen("mismatch"), 0, &cmpext_val);
+ rados_read_op_read(op, 0, len, buf, &bytes_read, &read_val);
+ ASSERT_EQ(-MAX_ERRNO, rados_read_op_operate(op, ioctx, obj, 0));
+ rados_release_read_op(op);
+
+ ASSERT_EQ(-MAX_ERRNO, cmpext_val);
+
+ remove_object();
+}
diff --git a/src/test/librados/c_write_operations.cc b/src/test/librados/c_write_operations.cc
index 496c31d2f3f..325a2f0a3e0 100644
--- a/src/test/librados/c_write_operations.cc
+++ b/src/test/librados/c_write_operations.cc
@@ -1,6 +1,7 @@
// Tests for the C API coverage of atomic write operations
#include <errno.h>
+#include "include/err.h"
#include "include/rados/librados.h"
#include "test/librados/test.h"
#include "gtest/gtest.h"
@@ -226,3 +227,53 @@ TEST(LibRadosCWriteOps, WriteSame) {
rados_ioctx_destroy(ioctx);
ASSERT_EQ(0, destroy_one_pool(pool_name, &cluster));
}
+
+TEST(LibRadosCWriteOps, CmpExt) {
+ rados_t cluster;
+ rados_ioctx_t ioctx;
+ std::string pool_name = get_temp_pool_name();
+ ASSERT_EQ("", create_one_pool(pool_name, &cluster));
+ rados_ioctx_create(cluster, pool_name.c_str(), &ioctx);
+
+ // create an object, write to it using writesame
+ rados_write_op_t op = rados_create_write_op();
+ ASSERT_TRUE(op);
+ rados_write_op_create(op, LIBRADOS_CREATE_EXCLUSIVE, NULL);
+ rados_write_op_write(op, "four", 4, 0);
+ ASSERT_EQ(0, rados_write_op_operate(op, ioctx, "test", NULL, 0));
+ rados_release_write_op(op);
+ char hi[4];
+ ASSERT_EQ(sizeof(hi), static_cast<std::size_t>(rados_read(ioctx, "test", hi, sizeof(hi), 0)));
+ ASSERT_EQ(0, memcmp("four", hi, sizeof(hi)));
+
+ // compare and overwrite on (expected) match
+ int val = 0;
+ op = rados_create_write_op();
+ ASSERT_TRUE(op);
+ rados_write_op_cmpext(op, "four", 4, 0, &val);
+ rados_write_op_write(op, "five", 4, 0);
+ ASSERT_EQ(0, rados_write_op_operate(op, ioctx, "test", NULL, 0));
+ ASSERT_EQ(0, val);
+ rados_release_write_op(op);
+ ASSERT_EQ(sizeof(hi), static_cast<std::size_t>(rados_read(ioctx, "test", hi, sizeof(hi), 0)));
+ ASSERT_EQ(0, memcmp("five", hi, sizeof(hi)));
+
+ // compare and bail before write due to mismatch
+ val = 0;
+ op = rados_create_write_op();
+ ASSERT_TRUE(op);
+ rados_write_op_cmpext(op, "four", 4, 0, &val);
+ rados_write_op_write(op, "six ", 4, 0);
+ ASSERT_EQ(-MAX_ERRNO - 1, rados_write_op_operate(op, ioctx, "test", NULL, 0));
+
+ ASSERT_EQ(-MAX_ERRNO - 1, val);
+
+ // cleanup
+ op = rados_create_write_op();
+ ASSERT_TRUE(op);
+ rados_write_op_remove(op);
+ ASSERT_EQ(0, rados_write_op_operate(op, ioctx, "test", NULL, 0));
+
+ rados_ioctx_destroy(ioctx);
+ ASSERT_EQ(0, destroy_one_pool(pool_name, &cluster));
+}
diff --git a/src/test/librados/list.cc b/src/test/librados/list.cc
index 50c61a43889..f3ca8107d9d 100644
--- a/src/test/librados/list.cc
+++ b/src/test/librados/list.cc
@@ -16,10 +16,10 @@
using namespace librados;
-typedef RadosTestNS LibRadosList;
-typedef RadosTestPPNS LibRadosListPP;
-typedef RadosTestECNS LibRadosListEC;
-typedef RadosTestECPPNS LibRadosListECPP;
+typedef RadosTestNSCleanup LibRadosList;
+typedef RadosTestPPNSCleanup LibRadosListPP;
+typedef RadosTestECNSCleanup LibRadosListEC;
+typedef RadosTestECPPNSCleanup LibRadosListECPP;
typedef RadosTestNP LibRadosListNP;
@@ -138,16 +138,14 @@ TEST_F(LibRadosListPP, ListObjectsEndIter) {
ASSERT_TRUE(iter2 == iter_end2);
}
-static void check_list(std::set<std::string>& myset, rados_list_ctx_t& ctx, std::string check_nspace)
+static void check_list(
+ std::set<std::string>& myset,
+ rados_list_ctx_t& ctx,
+ std::string check_nspace)
{
const char *entry, *nspace;
- std::set<std::string> orig_set(myset);
- /**
- * During splitting, we might see duplicate items.
- * We assert that every object returned is in myset and that
- * we don't hit ENOENT until we have hit every item in myset
- * at least once.
- */
+ cout << "myset " << myset << std::endl;
+ // we should see every item exactly once.
int ret;
while ((ret = rados_nobjects_list_next(ctx, &entry, NULL, &nspace)) == 0) {
std::string test_name;
@@ -157,8 +155,9 @@ static void check_list(std::set<std::string>& myset, rados_list_ctx_t& ctx, std:
ASSERT_TRUE(std::string(nspace) == check_nspace);
test_name = std::string(entry);
}
+ cout << test_name << std::endl;
- ASSERT_TRUE(orig_set.end() != orig_set.find(test_name));
+ ASSERT_TRUE(myset.end() != myset.find(test_name));
myset.erase(test_name);
}
ASSERT_EQ(-ENOENT, ret);
@@ -965,7 +964,18 @@ TEST_F(LibRadosListNP, ListObjectsError) {
memset(buf, 0xcc, sizeof(buf));
rados_ioctx_set_namespace(ioctx, "");
ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
- ASSERT_EQ(0, rados_pool_delete(cluster, pool_name.c_str()));
+
+ //ASSERT_EQ(0, rados_pool_delete(cluster, pool_name.c_str()));
+ {
+ char *buf, *st;
+ size_t buflen, stlen;
+ string c = "{\"prefix\":\"osd pool rm\",\"pool\": \"" + pool_name +
+ "\",\"pool2\":\"" + pool_name +
+ "\",\"sure\": \"--yes-i-really-really-mean-it-not-faking\"}";
+ const char *cmd[2] = { c.c_str(), 0 };
+ ASSERT_EQ(0, rados_mon_command(cluster, (const char **)cmd, 1, "", 0, &buf, &buflen, &st, &stlen));
+ ASSERT_EQ(0, rados_wait_for_latest_osdmap(cluster));
+ }
rados_list_ctx_t ctx;
ASSERT_EQ(0, rados_nobjects_list_open(ioctx, &ctx));
diff --git a/src/test/librados/misc.cc b/src/test/librados/misc.cc
index a7d44ace6f8..bdc94cd7238 100644
--- a/src/test/librados/misc.cc
+++ b/src/test/librados/misc.cc
@@ -3,6 +3,7 @@
#include "gtest/gtest.h"
#include "mds/mdstypes.h"
+#include "include/err.h"
#include "include/buffer.h"
#include "include/rbd_types.h"
#include "include/rados/librados.h"
@@ -1198,3 +1199,32 @@ TYPED_TEST(LibRadosChecksum, Chunked) {
ASSERT_EQ(expected_values[i], value);
}
}
+
+TEST_F(LibRadosMiscPP, CmpExtPP) {
+ bufferlist cmp_bl, bad_cmp_bl, write_bl;
+ char stored_str[] = "1234567891";
+ char mismatch_str[] = "1234577777";
+
+ write_bl.append(stored_str);
+ ioctx.write("cmpextpp", write_bl, write_bl.length(), 0);
+ cmp_bl.append(stored_str);
+ ASSERT_EQ(0, ioctx.cmpext("cmpextpp", 0, cmp_bl));
+
+ bad_cmp_bl.append(mismatch_str);
+ ASSERT_EQ(-MAX_ERRNO - 5, ioctx.cmpext("cmpextpp", 0, bad_cmp_bl));
+}
+
+TEST_F(LibRadosMisc, CmpExt) {
+ bufferlist cmp_bl, bad_cmp_bl, write_bl;
+ char stored_str[] = "1234567891";
+ char mismatch_str[] = "1234577777";
+
+ ASSERT_EQ(0,
+ rados_write(ioctx, "cmpextpp", stored_str, sizeof(stored_str), 0));
+
+ ASSERT_EQ(0,
+ rados_cmpext(ioctx, "cmpextpp", stored_str, sizeof(stored_str), 0));
+
+ ASSERT_EQ(-MAX_ERRNO - 5,
+ rados_cmpext(ioctx, "cmpextpp", mismatch_str, sizeof(mismatch_str), 0));
+}
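The expected failure values in these cmpext tests encode where the comparison diverged: -MAX_ERRNO - 5 when "1234567891" and "1234577777" first differ at byte 5, -MAX_ERRNO - 1 when "five" and "four" differ at byte 1, and plain -MAX_ERRNO for a mismatch at byte 0. A sketch of decoding that convention:

    #include <cstdint>
    #include "include/err.h"  // MAX_ERRNO

    // Returns true and sets *offset when r is a cmpext mismatch encoded
    // as -MAX_ERRNO - first_mismatch_offset (the pattern in these tests);
    // returns false for success (0) or an ordinary negative errno.
    inline bool cmpext_mismatch(int r, uint64_t *offset) {
      if (r > -MAX_ERRNO)
        return false;
      *offset = static_cast<uint64_t>(-MAX_ERRNO - r);
      return true;
    }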
diff --git a/src/test/librados/pool.cc b/src/test/librados/pool.cc
index 2c01ee91b03..5764fa87588 100644
--- a/src/test/librados/pool.cc
+++ b/src/test/librados/pool.cc
@@ -15,6 +15,9 @@ TEST(LibRadosPools, PoolList) {
ASSERT_EQ("", create_one_pool(pool_name, &cluster));
ASSERT_LT(rados_pool_list(cluster, buf, POOL_LIST_BUF_SZ), POOL_LIST_BUF_SZ);
+ // we can pass a null buffer too.
+ ASSERT_LT(rados_pool_list(cluster, NULL, POOL_LIST_BUF_SZ), POOL_LIST_BUF_SZ);
+
bool found_pool = false;
while (buf[0] != '\0') {
if ((found_pool == false) && (strcmp(buf, pool_name.c_str()) == 0)) {
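The new assertion checks that a NULL buffer is accepted, which would support the usual two-call sizing idiom. A sketch, on the assumption (implied but not proven by the test) that rados_pool_list returns the byte count required regardless of the buffer passed:

    #include <vector>
    #include "include/rados/librados.h"

    // assumes rados_pool_list reports the bytes needed even with no buffer,
    // as the NULL-buffer assertion above implies
    std::vector<char> list_pools(rados_t cluster) {
      int needed = rados_pool_list(cluster, NULL, 0);
      std::vector<char> buf(needed > 0 ? needed : 0);
      if (!buf.empty())
        rados_pool_list(cluster, buf.data(), buf.size());
      return buf;  // NUL-separated pool names, terminated by an empty entry
    }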
diff --git a/src/test/librados/test.cc b/src/test/librados/test.cc
index afb855f3eed..af34c3ac284 100644
--- a/src/test/librados/test.cc
+++ b/src/test/librados/test.cc
@@ -9,6 +9,8 @@
#include "common/Formatter.h"
#include "json_spirit/json_spirit.h"
#include "common/errno.h"
+#include "common/ceph_context.h"
+#include "common/config.h"
#include <errno.h>
#include <sstream>
@@ -447,11 +449,14 @@ int destroy_one_ec_pool(const std::string &pool_name, rados_t *cluster)
return ret;
}
- std::ostringstream oss;
- ret = destroy_ec_profile_and_ruleset(cluster, pool_name, oss);
- if (ret) {
- rados_shutdown(*cluster);
- return ret;
+ CephContext *cct = static_cast<CephContext*>(rados_cct(*cluster));
+ if (!cct->_conf->mon_fake_pool_delete) { // hope this is in [global]
+ std::ostringstream oss;
+ ret = destroy_ec_profile_and_ruleset(cluster, pool_name, oss);
+ if (ret) {
+ rados_shutdown(*cluster);
+ return ret;
+ }
}
rados_wait_for_latest_osdmap(*cluster);
@@ -478,11 +483,14 @@ int destroy_one_ec_pool_pp(const std::string &pool_name, Rados &cluster)
return ret;
}
- std::ostringstream oss;
- ret = destroy_ec_profile_and_ruleset_pp(cluster, pool_name, oss);
- if (ret) {
- cluster.shutdown();
- return ret;
+ CephContext *cct = static_cast<CephContext*>(cluster.cct());
+ if (!cct->_conf->mon_fake_pool_delete) { // assumes the option is set in [global]
+ std::ostringstream oss;
+ ret = destroy_ec_profile_and_ruleset_pp(cluster, pool_name, oss);
+ if (ret) {
+ cluster.shutdown();
+ return ret;
+ }
}
cluster.wait_for_latest_osdmap();
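
Both hunks above skip the EC profile/ruleset teardown when mon_fake_pool_delete is set, since a faked delete leaves the pool and its ruleset references alive. A minimal sketch of the same probe as a standalone helper, assuming the option is declared in [global] so the client-side CephContext actually sees it:

    #include "common/ceph_context.h"
    #include "common/config.h"
    #include "include/rados/librados.h"

    // Mirrors the rados_cct() cast used in destroy_one_ec_pool() above.
    static bool fake_pool_delete_enabled(rados_t cluster) {
      CephContext *cct = static_cast<CephContext*>(rados_cct(cluster));
      return cct->_conf->mon_fake_pool_delete;
    }
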
diff --git a/src/test/librados/tier.cc b/src/test/librados/tier.cc
index ab7130ce5a4..73d4d1eccc9 100755
--- a/src/test/librados/tier.cc
+++ b/src/test/librados/tier.cc
@@ -530,11 +530,18 @@ TEST_F(LibRadosTwoPoolsPP, PromoteSnapScrub) {
IoCtx cache_ioctx;
ASSERT_EQ(0, cluster.ioctx_create(cache_pool_name.c_str(), cache_ioctx));
for (int i=0; i<10; ++i) {
- ostringstream ss;
- ss << "{\"prefix\": \"pg scrub\", \"pgid\": \""
- << cache_ioctx.get_id() << "." << i
- << "\"}";
- cluster.mon_command(ss.str(), inbl, NULL, NULL);
+ while (true) {
+ ostringstream ss;
+ ss << "{\"prefix\": \"pg scrub\", \"pgid\": \""
+ << cache_ioctx.get_id() << "." << i
+ << "\"}";
+ int r = cluster.mon_command(ss.str(), inbl, NULL, NULL);
+ if (r == -ENOENT || // in case mgr osdmap is stale
+ r == -EAGAIN) {
+ sleep(5);
+ continue;
+ }
+ break;
+ }
}
// give it a few seconds to go. this is sloppy but is usually enough time
@@ -3072,8 +3079,11 @@ TEST_F(LibRadosTwoPoolsECPP, PromoteSnap) {
<< hash
<< "\"}";
int r = cluster.mon_command(ss.str(), inbl, NULL, NULL);
- if (r == -EAGAIN)
+ if (r == -EAGAIN ||
+ r == -ENOENT) { // in case mgr osdmap is a bit stale
+ sleep(5);
continue;
+ }
ASSERT_EQ(0, r);
break;
}
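
Both tier.cc hunks treat -ENOENT like -EAGAIN because the mgr's osdmap can lag behind freshly created pools. The retry shape generalizes; a sketch under the assumption that a bounded attempt count is acceptable (the loops above are bounded only by the test timeout):

    #include <unistd.h>
    #include "include/rados/librados.hpp"

    // Resend a mon command while the mgr may still be catching up;
    // -ENOENT and -EAGAIN are the transient codes retried above.
    int mon_command_retry(librados::Rados &cluster, const std::string &cmd,
                          librados::bufferlist &inbl, int max_tries = 20) {
      int r = -EAGAIN;
      while (max_tries-- > 0) {
        r = cluster.mon_command(cmd, inbl, nullptr, nullptr);
        if (r != -EAGAIN && r != -ENOENT)
          break;
        sleep(5);
      }
      return r;
    }
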
diff --git a/src/test/librados/watch_notify.cc b/src/test/librados/watch_notify.cc
index 953df293f9c..e28fd527119 100644
--- a/src/test/librados/watch_notify.cc
+++ b/src/test/librados/watch_notify.cc
@@ -795,7 +795,9 @@ TEST_F(LibRadosWatchNotify, Watch3Timeout) {
ASSERT_LT(age, age_bound * 1000);
ASSERT_GT(age, 0);
rados_conf_set(cluster, "objecter_inject_no_watch_ping", "true");
- int left = 2 * timeout;
+ // allow a long time here since an osd peering event will renew our
+ // watch.
+ int left = 16 * timeout;
std::cout << "waiting up to " << left << " for osd to time us out ..."
<< std::endl;
while (notify_err == 0 && --left) {
diff --git a/src/test/librados_test_stub/LibradosTestStub.cc b/src/test/librados_test_stub/LibradosTestStub.cc
index 2c747e76440..059c19114a6 100644
--- a/src/test/librados_test_stub/LibradosTestStub.cc
+++ b/src/test/librados_test_stub/LibradosTestStub.cc
@@ -411,6 +411,12 @@ int IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
}
int IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, int flags,
+ bufferlist *pbl, const blkin_trace_info *trace_info) {
+ return aio_operate(oid, c, op, flags, pbl);
+}
+
+int IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
ObjectWriteOperation *op) {
TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
TestObjectOperationImpl *ops = reinterpret_cast<TestObjectOperationImpl*>(op->impl);
@@ -432,6 +438,13 @@ int IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
return ctx->aio_operate(oid, *ops, c->pc, &snapc, 0);
}
+int IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectWriteOperation *op, snap_t seq,
+ std::vector<snap_t>& snaps,
+ const blkin_trace_info *trace_info) {
+ return aio_operate(oid, c, op, seq, snaps);
+}
+
int IoCtx::aio_remove(const std::string& oid, AioCompletion *c) {
TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
return ctx->aio_remove(oid, c->pc);
diff --git a/src/test/librados_test_stub/MockTestMemIoCtxImpl.h b/src/test/librados_test_stub/MockTestMemIoCtxImpl.h
index e1d703dceff..03c6c326de5 100644
--- a/src/test/librados_test_stub/MockTestMemIoCtxImpl.h
+++ b/src/test/librados_test_stub/MockTestMemIoCtxImpl.h
@@ -99,7 +99,7 @@ public:
}
MOCK_METHOD5(sparse_read, int(const std::string& oid,
uint64_t off,
- size_t len,
+ uint64_t len,
std::map<uint64_t, uint64_t> *m,
bufferlist *bl));
int do_sparse_read(const std::string& oid, uint64_t off, size_t len,
diff --git a/src/test/librbd/fsx.cc b/src/test/librbd/fsx.cc
index 7c7f2d1235e..444fdbefd59 100644
--- a/src/test/librbd/fsx.cc
+++ b/src/test/librbd/fsx.cc
@@ -172,23 +172,12 @@ int randomize_parent_overlap = 1;
int mapped_reads = 0; /* -R flag disables it */
int fsxgoodfd = 0;
int o_direct = 0; /* -Z flag */
-int aio = 0;
int num_clones = 0;
int page_size;
int page_mask;
int mmap_mask;
-#ifdef AIO
-int aio_rw(int rw, int fd, char *buf, unsigned len, unsigned offset);
-#define READ 0
-#define WRITE 1
-#define fsxread(a,b,c,d) aio_rw(READ, a,b,c,d)
-#define fsxwrite(a,b,c,d) aio_rw(WRITE, a,b,c,d)
-#else
-#define fsxread(a,b,c,d) read(a,b,c)
-#define fsxwrite(a,b,c,d) write(a,b,c)
-#endif
FILE * fsxlogf = NULL;
int badoff = -1;
@@ -2397,9 +2386,6 @@ usage(void)
-x: preallocate file space before starting, XFS only (default 0)\n\
-y: synchronize changes to a file\n"
-#ifdef AIO
-" -A: Use the AIO system calls\n"
-#endif
" -C: do not use clone calls\n\
-D startingop: debug output starting at specified operation\n"
#ifdef FALLOCATE
@@ -2456,108 +2442,6 @@ getnum(char *s, char **e)
return (ret);
}
-#ifdef AIO
-
-#define QSZ 1024
-io_context_t io_ctx;
-struct iocb iocb;
-
-int aio_setup()
-{
- int ret;
- ret = io_queue_init(QSZ, &io_ctx);
- if (ret != 0) {
- fprintf(stderr, "aio_setup: io_queue_init failed: %s\n",
- strerror(ret));
- return(-1);
- }
- return(0);
-}
-
-int
-__aio_rw(int rw, int fd, char *buf, unsigned len, unsigned offset)
-{
- struct io_event event;
- static struct timespec ts;
- struct iocb *iocbs[] = { &iocb };
- int ret;
- long res;
-
- if (rw == READ) {
- io_prep_pread(&iocb, fd, buf, len, offset);
- } else {
- io_prep_pwrite(&iocb, fd, buf, len, offset);
- }
-
- ts.tv_sec = 30;
- ts.tv_nsec = 0;
- ret = io_submit(io_ctx, 1, iocbs);
- if (ret != 1) {
- fprintf(stderr, "errcode=%d\n", ret);
- fprintf(stderr, "aio_rw: io_submit failed: %s\n",
- strerror(ret));
- goto out_error;
- }
-
- ret = io_getevents(io_ctx, 1, 1, &event, &ts);
- if (ret != 1) {
- if (ret == 0)
- fprintf(stderr, "aio_rw: no events available\n");
- else {
- fprintf(stderr, "errcode=%d\n", -ret);
- fprintf(stderr, "aio_rw: io_getevents failed: %s\n",
- strerror(-ret));
- }
- goto out_error;
- }
- if (len != event.res) {
- /*
- * The b0rked libaio defines event.res as unsigned.
- * However the kernel strucuture has it signed,
- * and it's used to pass negated error value.
- * Till the library is fixed use the temp var.
- */
- res = (long)event.res;
- if (res >= 0)
- fprintf(stderr, "bad io length: %lu instead of %u\n",
- res, len);
- else {
- fprintf(stderr, "errcode=%ld\n", -res);
- fprintf(stderr, "aio_rw: async io failed: %s\n",
- strerror(-res));
- ret = res;
- goto out_error;
- }
-
- }
- return event.res;
-
-out_error:
- /*
- * The caller expects error return in traditional libc
- * convention, i.e. -1 and the errno set to error.
- */
- errno = -ret;
- return -1;
-}
-
-int aio_rw(int rw, int fd, char *buf, unsigned len, unsigned offset)
-{
- int ret;
-
- if (aio) {
- ret = __aio_rw(rw, fd, buf, len, offset);
- } else {
- if (rw == READ)
- ret = read(fd, buf, len);
- else
- ret = write(fd, buf, len);
- }
- return ret;
-}
-
-#endif
-
void
test_fallocate()
{
@@ -2637,7 +2521,7 @@ main(int argc, char **argv)
setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
- while ((ch = getopt(argc, argv, "b:c:dfh:jkl:m:no:p:qr:s:t:w:xyACD:FHKMLN:OP:RS:UWZ"))
+ while ((ch = getopt(argc, argv, "b:c:dfh:jkl:m:no:p:qr:s:t:w:xyCD:FHKMLN:OP:RS:UWZ"))
!= EOF)
switch (ch) {
case 'b':
@@ -2738,9 +2622,6 @@ main(int argc, char **argv)
case 'y':
do_fsync = 1;
break;
- case 'A':
- aio = 1;
- break;
case 'C':
clone_calls = 0;
break;
@@ -2871,11 +2752,6 @@ main(int argc, char **argv)
exit(93);
}
-#ifdef AIO
- if (aio)
- aio_setup();
-#endif
-
original_buf = (char *) malloc(maxfilelen);
for (i = 0; i < (int)maxfilelen; i++)
original_buf[i] = get_random() % 256;
diff --git a/src/test/librbd/image/test_mock_RemoveRequest.cc b/src/test/librbd/image/test_mock_RemoveRequest.cc
index 4b271a454c9..77c536d03a8 100644
--- a/src/test/librbd/image/test_mock_RemoveRequest.cc
+++ b/src/test/librbd/image/test_mock_RemoveRequest.cc
@@ -154,6 +154,9 @@ public:
.WillOnce(Invoke([r](bool open_parent, Context *on_ready) {
on_ready->complete(r);
}));
+ if (r < 0) {
+ EXPECT_CALL(mock_image_ctx, destroy());
+ }
}
void expect_state_close(MockImageCtx &mock_image_ctx) {
@@ -161,6 +164,7 @@ public:
.WillOnce(Invoke([](Context *on_ready) {
on_ready->complete(0);
}));
+ EXPECT_CALL(mock_image_ctx, destroy());
}
void expect_wq_queue(ContextWQ &wq, int r) {
diff --git a/src/test/librbd/io/test_mock_ImageRequest.cc b/src/test/librbd/io/test_mock_ImageRequest.cc
index 3a5d2fee373..7a11c2c6ab1 100644
--- a/src/test/librbd/io/test_mock_ImageRequest.cc
+++ b/src/test/librbd/io/test_mock_ImageRequest.cc
@@ -38,6 +38,7 @@ struct ObjectRequest<librbd::MockTestImageCtx> : public ObjectRequestHandle {
const std::string &oid,
uint64_t object_no,
const ::SnapContext &snapc,
+ const ZTracer::Trace &parent_trace,
Context *completion) {
assert(s_instance != nullptr);
s_instance->on_finish = completion;
@@ -49,6 +50,7 @@ struct ObjectRequest<librbd::MockTestImageCtx> : public ObjectRequestHandle {
uint64_t object_no,
uint64_t object_off,
const ::SnapContext &snapc,
+ const ZTracer::Trace &parent_trace,
Context *completion) {
assert(s_instance != nullptr);
s_instance->on_finish = completion;
@@ -60,8 +62,9 @@ struct ObjectRequest<librbd::MockTestImageCtx> : public ObjectRequestHandle {
uint64_t object_no,
uint64_t object_off,
const ceph::bufferlist &data,
- const ::SnapContext &snapc,
- Context *completion, int op_flags) {
+ const ::SnapContext &snapc, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion) {
assert(s_instance != nullptr);
s_instance->on_finish = completion;
return s_instance;
@@ -72,6 +75,7 @@ struct ObjectRequest<librbd::MockTestImageCtx> : public ObjectRequestHandle {
uint64_t object_no, uint64_t object_off,
uint64_t object_len,
const ::SnapContext &snapc,
+ const ZTracer::Trace &parent_trace,
Context *completion) {
assert(s_instance != nullptr);
s_instance->on_finish = completion;
@@ -85,7 +89,9 @@ struct ObjectRequest<librbd::MockTestImageCtx> : public ObjectRequestHandle {
uint64_t object_len,
const ceph::bufferlist &data,
const ::SnapContext &snapc,
- Context *completion, int op_flags) {
+ int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion) {
assert(s_instance != nullptr);
s_instance->on_finish = completion;
return s_instance;
@@ -115,7 +121,9 @@ struct ObjectReadRequest<librbd::MockTestImageCtx> : public ObjectRequest<librbd
uint64_t objectno, uint64_t offset,
uint64_t len, Extents &buffer_extents,
librados::snap_t snap_id, bool sparse,
- Context *completion, int op_flags) {
+ int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *completion) {
assert(s_instance != nullptr);
s_instance->on_finish = completion;
return s_instance;
@@ -173,7 +181,7 @@ struct TestMockIoImageRequest : public TestMockFixture {
uint64_t offset, uint64_t length,
uint64_t journal_tid, int r) {
EXPECT_CALL(mock_image_ctx, write_to_cache(object, _, length, offset, _, _,
- journal_tid))
+ journal_tid, _))
.WillOnce(WithArg<4>(CompleteContext(r, mock_image_ctx.image_ctx->op_work_queue)));
}
@@ -210,8 +218,12 @@ TEST_F(TestMockIoImageRequest, AioWriteJournalAppendDisabled) {
InSequence seq;
expect_is_journal_appending(mock_journal, false);
- expect_write_to_cache(mock_image_ctx, ictx->get_object_name(0),
- 0, 1, 0, 0);
+ if (mock_image_ctx.image_ctx->cache) {
+ expect_write_to_cache(mock_image_ctx, ictx->get_object_name(0),
+ 0, 1, 0, 0);
+ } else {
+ expect_object_request_send(mock_image_ctx, mock_aio_object_request, 0);
+ }
C_SaferCond aio_comp_ctx;
AioCompletion *aio_comp = AioCompletion::create_and_start(
@@ -220,7 +232,7 @@ TEST_F(TestMockIoImageRequest, AioWriteJournalAppendDisabled) {
bufferlist bl;
bl.append("1");
MockImageWriteRequest mock_aio_image_write(mock_image_ctx, aio_comp,
- {{0, 1}}, std::move(bl), 0);
+ {{0, 1}}, std::move(bl), 0, {});
{
RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
mock_aio_image_write.send();
@@ -249,7 +261,9 @@ TEST_F(TestMockIoImageRequest, AioDiscardJournalAppendDisabled) {
AioCompletion *aio_comp = AioCompletion::create_and_start(
&aio_comp_ctx, ictx, AIO_TYPE_DISCARD);
MockImageDiscardRequest mock_aio_image_discard(mock_image_ctx, aio_comp,
- 0, 1, ictx->skip_partial_discard);
+ 0, 1,
+ ictx->skip_partial_discard,
+ {});
{
RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
mock_aio_image_discard.send();
@@ -275,7 +289,7 @@ TEST_F(TestMockIoImageRequest, AioFlushJournalAppendDisabled) {
C_SaferCond aio_comp_ctx;
AioCompletion *aio_comp = AioCompletion::create_and_start(
&aio_comp_ctx, ictx, AIO_TYPE_FLUSH);
- MockImageFlushRequest mock_aio_image_flush(mock_image_ctx, aio_comp);
+ MockImageFlushRequest mock_aio_image_flush(mock_image_ctx, aio_comp, {});
{
RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
mock_aio_image_flush.send();
@@ -296,8 +310,13 @@ TEST_F(TestMockIoImageRequest, AioWriteSameJournalAppendDisabled) {
InSequence seq;
expect_is_journal_appending(mock_journal, false);
- expect_write_to_cache(mock_image_ctx, ictx->get_object_name(0),
- 0, 1, 0, 0);
+ if (mock_image_ctx.image_ctx->cache) {
+ expect_write_to_cache(mock_image_ctx, ictx->get_object_name(0),
+ 0, 1, 0, 0);
+ } else {
+ expect_object_request_send(mock_image_ctx, mock_aio_object_request, 0);
+ }
+
C_SaferCond aio_comp_ctx;
AioCompletion *aio_comp = AioCompletion::create_and_start(
@@ -306,7 +325,8 @@ TEST_F(TestMockIoImageRequest, AioWriteSameJournalAppendDisabled) {
bufferlist bl;
bl.append("1");
MockImageWriteSameRequest mock_aio_image_writesame(mock_image_ctx, aio_comp,
- 0, 1, std::move(bl), 0);
+ 0, 1, std::move(bl), 0,
+ {});
{
RWLock::RLocker owner_locker(mock_image_ctx.owner_lock);
mock_aio_image_writesame.send();
diff --git a/src/test/librbd/journal/test_mock_PromoteRequest.cc b/src/test/librbd/journal/test_mock_PromoteRequest.cc
index 805e76aaed4..209249955b1 100644
--- a/src/test/librbd/journal/test_mock_PromoteRequest.cc
+++ b/src/test/librbd/journal/test_mock_PromoteRequest.cc
@@ -24,6 +24,7 @@ namespace journal {
template <>
struct TypeTraits<MockTestImageCtx> {
typedef ::journal::MockJournalerProxy Journaler;
+ typedef ::journal::MockFutureProxy Future;
};
template <>
@@ -63,7 +64,9 @@ namespace librbd {
namespace journal {
using ::testing::_;
+using ::testing::A;
using ::testing::InSequence;
+using ::testing::Return;
using ::testing::WithArg;
class TestMockJournalPromoteRequest : public TestMockFixture {
@@ -95,6 +98,35 @@ public:
.WillOnce(WithArg<3>(CompleteContext(r, static_cast<ContextWQ*>(NULL))));
}
+ void expect_append_journaler(::journal::MockJournaler &mock_journaler) {
+ EXPECT_CALL(mock_journaler, append(_, _))
+ .WillOnce(Return(::journal::MockFutureProxy()));
+ }
+
+ void expect_future_flush(::journal::MockFuture &mock_future, int r) {
+ EXPECT_CALL(mock_future, flush(_))
+ .WillOnce(CompleteContext(r, static_cast<ContextWQ*>(NULL)));
+ }
+
+ void expect_future_committed(::journal::MockJournaler &mock_journaler) {
+ EXPECT_CALL(mock_journaler, committed(A<const ::journal::MockFutureProxy &>()));
+ }
+
+ void expect_flush_commit_position(::journal::MockJournaler &mock_journaler,
+ int r) {
+ EXPECT_CALL(mock_journaler, flush_commit_position(_))
+ .WillOnce(CompleteContext(r, static_cast<ContextWQ*>(NULL)));
+ }
+
+ void expect_start_append(::journal::MockJournaler &mock_journaler) {
+ EXPECT_CALL(mock_journaler, start_append(_, _, _));
+ }
+
+ void expect_stop_append(::journal::MockJournaler &mock_journaler, int r) {
+ EXPECT_CALL(mock_journaler, stop_append(_))
+ .WillOnce(CompleteContext(r, static_cast<ContextWQ*>(NULL)));
+ }
+
void expect_shut_down_journaler(::journal::MockJournaler &mock_journaler,
int r) {
EXPECT_CALL(mock_journaler, shut_down(_))
@@ -120,6 +152,15 @@ TEST_F(TestMockJournalPromoteRequest, SuccessOrderly) {
expect_open_journaler(mock_image_ctx, mock_open_request, 0);
expect_allocate_tag(mock_journaler,
{Journal<>::ORPHAN_MIRROR_UUID, true, 567, 1}, 0);
+
+ ::journal::MockFuture mock_future;
+ expect_start_append(mock_journaler);
+ expect_append_journaler(mock_journaler);
+ expect_future_flush(mock_future, 0);
+ expect_future_committed(mock_journaler);
+ expect_flush_commit_position(mock_journaler, 0);
+ expect_stop_append(mock_journaler, 0);
+
expect_shut_down_journaler(mock_journaler, 0);
C_SaferCond ctx;
@@ -145,6 +186,15 @@ TEST_F(TestMockJournalPromoteRequest, SuccessForced) {
expect_open_journaler(mock_image_ctx, mock_open_request, 0);
expect_allocate_tag(mock_journaler,
{Journal<>::LOCAL_MIRROR_UUID, true, 567, 0}, 0);
+
+ ::journal::MockFuture mock_future;
+ expect_start_append(mock_journaler);
+ expect_append_journaler(mock_journaler);
+ expect_future_flush(mock_future, 0);
+ expect_future_committed(mock_journaler);
+ expect_flush_commit_position(mock_journaler, 0);
+ expect_stop_append(mock_journaler, 0);
+
expect_shut_down_journaler(mock_journaler, 0);
C_SaferCond ctx;
@@ -201,6 +251,72 @@ TEST_F(TestMockJournalPromoteRequest, AllocateTagError) {
ASSERT_EQ(-EBADMSG, ctx.wait());
}
+TEST_F(TestMockJournalPromoteRequest, AppendEventError) {
+ REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ MockTestImageCtx mock_image_ctx(*ictx);
+ ::journal::MockJournaler mock_journaler;
+ MockOpenRequest mock_open_request;
+
+ expect_op_work_queue(mock_image_ctx);
+
+ InSequence seq;
+ expect_construct_journaler(mock_journaler);
+ expect_open_journaler(mock_image_ctx, mock_open_request, 0);
+ expect_allocate_tag(mock_journaler,
+ {Journal<>::ORPHAN_MIRROR_UUID, true, 567, 1}, 0);
+
+ ::journal::MockFuture mock_future;
+ expect_start_append(mock_journaler);
+ expect_append_journaler(mock_journaler);
+ expect_future_flush(mock_future, -EPERM);
+ expect_stop_append(mock_journaler, 0);
+
+ expect_shut_down_journaler(mock_journaler, 0);
+
+ C_SaferCond ctx;
+ auto req = MockPromoteRequest::create(&mock_image_ctx, false, &ctx);
+ req->send();
+ ASSERT_EQ(-EPERM, ctx.wait());
+}
+
+TEST_F(TestMockJournalPromoteRequest, CommitEventError) {
+ REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ MockTestImageCtx mock_image_ctx(*ictx);
+ ::journal::MockJournaler mock_journaler;
+ MockOpenRequest mock_open_request;
+
+ expect_op_work_queue(mock_image_ctx);
+
+ InSequence seq;
+ expect_construct_journaler(mock_journaler);
+ expect_open_journaler(mock_image_ctx, mock_open_request, 0);
+ expect_allocate_tag(mock_journaler,
+ {Journal<>::ORPHAN_MIRROR_UUID, true, 567, 1}, 0);
+
+ ::journal::MockFuture mock_future;
+ expect_start_append(mock_journaler);
+ expect_append_journaler(mock_journaler);
+ expect_future_flush(mock_future, 0);
+ expect_future_committed(mock_journaler);
+ expect_flush_commit_position(mock_journaler, -EINVAL);
+ expect_stop_append(mock_journaler, 0);
+
+ expect_shut_down_journaler(mock_journaler, 0);
+
+ C_SaferCond ctx;
+ auto req = MockPromoteRequest::create(&mock_image_ctx, false, &ctx);
+ req->send();
+ ASSERT_EQ(-EINVAL, ctx.wait());
+}
+
TEST_F(TestMockJournalPromoteRequest, ShutDownError) {
REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
@@ -218,6 +334,15 @@ TEST_F(TestMockJournalPromoteRequest, ShutDownError) {
expect_open_journaler(mock_image_ctx, mock_open_request, 0);
expect_allocate_tag(mock_journaler,
{Journal<>::LOCAL_MIRROR_UUID, true, 567, 0}, 0);
+
+ ::journal::MockFuture mock_future;
+ expect_start_append(mock_journaler);
+ expect_append_journaler(mock_journaler);
+ expect_future_flush(mock_future, 0);
+ expect_future_committed(mock_journaler);
+ expect_flush_commit_position(mock_journaler, 0);
+ expect_stop_append(mock_journaler, 0);
+
expect_shut_down_journaler(mock_journaler, -EINVAL);
C_SaferCond ctx;
diff --git a/src/test/librbd/journal/test_mock_Replay.cc b/src/test/librbd/journal/test_mock_Replay.cc
index aade74c1205..e13a156ee05 100644
--- a/src/test/librbd/journal/test_mock_Replay.cc
+++ b/src/test/librbd/journal/test_mock_Replay.cc
@@ -32,7 +32,7 @@ struct ImageRequest<MockReplayImageCtx> {
const bufferlist &bl, int op_flags));
static void aio_write(MockReplayImageCtx *ictx, AioCompletion *c,
Extents &&image_extents, bufferlist &&bl,
- int op_flags) {
+ int op_flags, const ZTracer::Trace &parent_trace) {
assert(s_instance != nullptr);
s_instance->aio_write(c, image_extents, bl, op_flags);
}
@@ -40,13 +40,16 @@ struct ImageRequest<MockReplayImageCtx> {
MOCK_METHOD4(aio_discard, void(AioCompletion *c, uint64_t off, uint64_t len,
bool skip_partial_discard));
static void aio_discard(MockReplayImageCtx *ictx, AioCompletion *c,
- uint64_t off, uint64_t len, bool skip_partial_discard) {
+ uint64_t off, uint64_t len,
+ bool skip_partial_discard,
+ const ZTracer::Trace &parent_trace) {
assert(s_instance != nullptr);
s_instance->aio_discard(c, off, len, skip_partial_discard);
}
MOCK_METHOD1(aio_flush, void(AioCompletion *c));
- static void aio_flush(MockReplayImageCtx *ictx, AioCompletion *c) {
+ static void aio_flush(MockReplayImageCtx *ictx, AioCompletion *c,
+ const ZTracer::Trace &parent_trace) {
assert(s_instance != nullptr);
s_instance->aio_flush(c);
}
@@ -55,7 +58,7 @@ struct ImageRequest<MockReplayImageCtx> {
const bufferlist &bl, int op_flags));
static void aio_writesame(MockReplayImageCtx *ictx, AioCompletion *c,
uint64_t off, uint64_t len, bufferlist &&bl,
- int op_flags) {
+ int op_flags, const ZTracer::Trace &parent_trace) {
assert(s_instance != nullptr);
s_instance->aio_writesame(c, off, len, bl, op_flags);
}
diff --git a/src/test/librbd/mock/MockImageCtx.h b/src/test/librbd/mock/MockImageCtx.h
index e7147a2cfe6..84383f91300 100644
--- a/src/test/librbd/mock/MockImageCtx.h
+++ b/src/test/librbd/mock/MockImageCtx.h
@@ -16,6 +16,7 @@
#include "test/librbd/mock/io/MockImageRequestWQ.h"
#include "common/RWLock.h"
#include "common/WorkQueue.h"
+#include "common/zipkin_trace.h"
#include "librbd/ImageCtx.h"
#include "gmock/gmock.h"
#include <string>
@@ -86,6 +87,7 @@ struct MockImageCtx {
state(new MockImageState()),
image_watcher(NULL), object_map(NULL),
exclusive_lock(NULL), journal(NULL),
+ trace_endpoint(image_ctx.trace_endpoint),
concurrent_management_ops(image_ctx.concurrent_management_ops),
blacklist_on_break_lock(image_ctx.blacklist_on_break_lock),
blacklist_expire_seconds(image_ctx.blacklist_expire_seconds),
@@ -192,10 +194,10 @@ struct MockImageCtx {
MOCK_CONST_METHOD0(get_journal_policy, journal::Policy*());
MOCK_CONST_METHOD1(set_journal_policy, void(journal::Policy*));
- MOCK_METHOD7(aio_read_from_cache, void(object_t, uint64_t, bufferlist *,
- size_t, uint64_t, Context *, int));
- MOCK_METHOD7(write_to_cache, void(object_t, const bufferlist&, size_t,
- uint64_t, Context *, int, uint64_t));
+ MOCK_METHOD8(aio_read_from_cache, void(object_t, uint64_t, bufferlist *,
+ size_t, uint64_t, Context *, int, ZTracer::Trace *));
+ MOCK_METHOD8(write_to_cache, void(object_t, const bufferlist&, size_t,
+ uint64_t, Context *, int, uint64_t, ZTracer::Trace *));
ImageCtx *image_ctx;
CephContext *cct;
@@ -272,6 +274,8 @@ struct MockImageCtx {
MockExclusiveLock *exclusive_lock;
MockJournal *journal;
+ ZTracer::Endpoint trace_endpoint;
+
int concurrent_management_ops;
bool blacklist_on_break_lock;
uint32_t blacklist_expire_seconds;
diff --git a/src/test/librbd/mock/MockJournal.h b/src/test/librbd/mock/MockJournal.h
index a31a80386e3..9f0a985cb7b 100644
--- a/src/test/librbd/mock/MockJournal.h
+++ b/src/test/librbd/mock/MockJournal.h
@@ -9,6 +9,10 @@
#include "librbd/journal/Types.h"
#include <list>
+struct Context;
+struct ContextWQ;
+namespace librados { class IoCtx; }
+
namespace librbd {
struct ImageCtx;
@@ -28,6 +32,14 @@ struct MockJournal {
return get_instance()->is_tag_owner(is_tag_owner);
}
+ static void get_tag_owner(librados::IoCtx &,
+ const std::string &global_image_id,
+ std::string *tag_owner, ContextWQ *work_queue,
+ Context *on_finish) {
+ get_instance()->get_tag_owner(global_image_id, tag_owner,
+ work_queue, on_finish);
+ }
+
MockJournal() {
s_instance = this;
}
@@ -38,6 +50,10 @@ struct MockJournal {
MOCK_METHOD1(wait_for_journal_ready, void(Context *));
+ MOCK_METHOD4(get_tag_owner, void(const std::string &,
+ std::string *, ContextWQ *,
+ Context *));
+
MOCK_CONST_METHOD0(is_tag_owner, bool());
MOCK_CONST_METHOD1(is_tag_owner, int(bool *));
MOCK_METHOD3(allocate_tag, void(const std::string &mirror_uuid,
diff --git a/src/test/librbd/mock/MockObjectMap.h b/src/test/librbd/mock/MockObjectMap.h
index 057364d53e7..9ace5e37480 100644
--- a/src/test/librbd/mock/MockObjectMap.h
+++ b/src/test/librbd/mock/MockObjectMap.h
@@ -22,23 +22,25 @@ struct MockObjectMap {
template <typename T, void(T::*MF)(int)>
bool aio_update(uint64_t snap_id, uint64_t start_object_no, uint8_t new_state,
const boost::optional<uint8_t> &current_state,
- T *callback_object) {
+ const ZTracer::Trace &parent_trace, T *callback_object) {
return aio_update<T, MF>(snap_id, start_object_no, start_object_no + 1,
- new_state, current_state, callback_object);
+ new_state, current_state, parent_trace,
+ callback_object);
}
template <typename T, void(T::*MF)(int)>
bool aio_update(uint64_t snap_id, uint64_t start_object_no,
uint64_t end_object_no, uint8_t new_state,
const boost::optional<uint8_t> &current_state,
- T *callback_object) {
+ const ZTracer::Trace &parent_trace, T *callback_object) {
return aio_update(snap_id, start_object_no, end_object_no, new_state,
- current_state,
+ current_state, parent_trace,
util::create_context_callback<T, MF>(callback_object));
}
- MOCK_METHOD6(aio_update, bool(uint64_t snap_id, uint64_t start_object_no,
+ MOCK_METHOD7(aio_update, bool(uint64_t snap_id, uint64_t start_object_no,
uint64_t end_object_no, uint8_t new_state,
const boost::optional<uint8_t> &current_state,
+ const ZTracer::Trace &parent_trace,
Context *on_finish));
MOCK_METHOD2(snapshot_add, void(uint64_t snap_id, Context *on_finish));
MOCK_METHOD2(snapshot_remove, void(uint64_t snap_id, Context *on_finish));
diff --git a/src/test/librbd/object_map/test_mock_UpdateRequest.cc b/src/test/librbd/object_map/test_mock_UpdateRequest.cc
index ea40404d39b..7f47be28176 100644
--- a/src/test/librbd/object_map/test_mock_UpdateRequest.cc
+++ b/src/test/librbd/object_map/test_mock_UpdateRequest.cc
@@ -68,7 +68,7 @@ TEST_F(TestMockObjectMapUpdateRequest, UpdateInMemory) {
C_SaferCond cond_ctx;
AsyncRequest<> *req = new UpdateRequest<>(
*ictx, &object_map, CEPH_NOSNAP, 0, object_map.size(), OBJECT_NONEXISTENT,
- OBJECT_EXISTS, &cond_ctx);
+ OBJECT_EXISTS, {}, &cond_ctx);
{
RWLock::RLocker snap_locker(ictx->snap_lock);
RWLock::WLocker object_map_locker(ictx->object_map_lock);
@@ -100,7 +100,7 @@ TEST_F(TestMockObjectMapUpdateRequest, UpdateHeadOnDisk) {
C_SaferCond cond_ctx;
AsyncRequest<> *req = new UpdateRequest<>(
*ictx, &object_map, CEPH_NOSNAP, 0, object_map.size(), OBJECT_NONEXISTENT,
- OBJECT_EXISTS, &cond_ctx);
+ OBJECT_EXISTS, {}, &cond_ctx);
{
RWLock::RLocker snap_locker(ictx->snap_lock);
RWLock::WLocker object_map_locker(ictx->object_map_lock);
@@ -130,7 +130,7 @@ TEST_F(TestMockObjectMapUpdateRequest, UpdateSnapOnDisk) {
C_SaferCond cond_ctx;
AsyncRequest<> *req = new UpdateRequest<>(
*ictx, &object_map, snap_id, 0, object_map.size(), OBJECT_NONEXISTENT,
- OBJECT_EXISTS, &cond_ctx);
+ OBJECT_EXISTS, {}, &cond_ctx);
{
RWLock::RLocker snap_locker(ictx->snap_lock);
RWLock::WLocker object_map_locker(ictx->object_map_lock);
@@ -157,7 +157,7 @@ TEST_F(TestMockObjectMapUpdateRequest, UpdateOnDiskError) {
C_SaferCond cond_ctx;
AsyncRequest<> *req = new UpdateRequest<>(
*ictx, &object_map, CEPH_NOSNAP, 0, object_map.size(), OBJECT_NONEXISTENT,
- OBJECT_EXISTS, &cond_ctx);
+ OBJECT_EXISTS, {}, &cond_ctx);
{
RWLock::RLocker snap_locker(ictx->snap_lock);
RWLock::WLocker object_map_locker(ictx->object_map_lock);
@@ -187,7 +187,7 @@ TEST_F(TestMockObjectMapUpdateRequest, RebuildSnapOnDisk) {
C_SaferCond cond_ctx;
AsyncRequest<> *req = new UpdateRequest<>(
*ictx, &object_map, snap_id, 0, object_map.size(), OBJECT_EXISTS_CLEAN,
- boost::optional<uint8_t>(), &cond_ctx);
+ boost::optional<uint8_t>(), {}, &cond_ctx);
{
RWLock::RLocker snap_locker(ictx->snap_lock);
RWLock::WLocker object_map_locker(ictx->object_map_lock);
diff --git a/src/test/librbd/operation/test_mock_ResizeRequest.cc b/src/test/librbd/operation/test_mock_ResizeRequest.cc
index 851f34a108f..e67f8a2a832 100644
--- a/src/test/librbd/operation/test_mock_ResizeRequest.cc
+++ b/src/test/librbd/operation/test_mock_ResizeRequest.cc
@@ -114,12 +114,18 @@ public:
}
void expect_flush_cache(MockImageCtx &mock_image_ctx, int r) {
+ if (!mock_image_ctx.image_ctx->cache) {
+ return;
+ }
EXPECT_CALL(mock_image_ctx, flush_cache(_))
.WillOnce(CompleteContext(r, static_cast<ContextWQ*>(NULL)));
expect_op_work_queue(mock_image_ctx);
}
void expect_invalidate_cache(MockImageCtx &mock_image_ctx, int r) {
+ if (!mock_image_ctx.image_ctx->cache) {
+ return;
+ }
EXPECT_CALL(mock_image_ctx, invalidate_cache(false, _))
.WillOnce(WithArg<1>(CompleteContext(r, static_cast<ContextWQ*>(NULL))));
expect_op_work_queue(mock_image_ctx);
@@ -278,6 +284,7 @@ TEST_F(TestMockOperationResizeRequest, TrimError) {
TEST_F(TestMockOperationResizeRequest, FlushCacheError) {
librbd::ImageCtx *ictx;
ASSERT_EQ(0, open_image(m_image_name, &ictx));
+ REQUIRE(ictx->cache);
MockImageCtx mock_image_ctx(*ictx);
MockExclusiveLock mock_exclusive_lock;
@@ -300,6 +307,7 @@ TEST_F(TestMockOperationResizeRequest, FlushCacheError) {
TEST_F(TestMockOperationResizeRequest, InvalidateCacheError) {
librbd::ImageCtx *ictx;
ASSERT_EQ(0, open_image(m_image_name, &ictx));
+ REQUIRE(ictx->cache);
MockImageCtx mock_image_ctx(*ictx);
MockExclusiveLock mock_exclusive_lock;
diff --git a/src/test/librbd/operation/test_mock_SnapshotRollbackRequest.cc b/src/test/librbd/operation/test_mock_SnapshotRollbackRequest.cc
index 1f8b4addf05..97cc5f153dc 100644
--- a/src/test/librbd/operation/test_mock_SnapshotRollbackRequest.cc
+++ b/src/test/librbd/operation/test_mock_SnapshotRollbackRequest.cc
@@ -70,10 +70,6 @@ struct AsyncRequest<MockOperationImageCtx> : public AsyncRequest<MockImageCtx> {
#include "librbd/operation/Request.cc"
#include "librbd/operation/SnapshotRollbackRequest.cc"
-template class librbd::AsyncRequest<librbd::MockImageCtx>;
-template class librbd::AsyncObjectThrottle<librbd::MockImageCtx>;
-template class librbd::operation::Request<librbd::MockImageCtx>;
-
namespace librbd {
namespace operation {
@@ -306,6 +302,7 @@ TEST_F(TestMockOperationSnapshotRollbackRequest, RollbackObjectsError) {
TEST_F(TestMockOperationSnapshotRollbackRequest, InvalidateCacheError) {
librbd::ImageCtx *ictx;
ASSERT_EQ(0, open_image(m_image_name, &ictx));
+ REQUIRE(ictx->cache);
MockOperationImageCtx mock_image_ctx(*ictx);
MockExclusiveLock mock_exclusive_lock;
diff --git a/src/test/librbd/test_internal.cc b/src/test/librbd/test_internal.cc
index 5b4f1d68162..130a699a96b 100644
--- a/src/test/librbd/test_internal.cc
+++ b/src/test/librbd/test_internal.cc
@@ -664,8 +664,9 @@ TEST_F(TestInternal, ResizeCopyup)
bufferlist bl;
bl.append(std::string(4096, '1'));
for (size_t i = 0; i < m_image_size; i += bl.length()) {
- ASSERT_EQ(bl.length(), ictx->io_work_queue->write(i, bl.length(),
- bufferlist{bl}, 0));
+ ASSERT_EQ((ssize_t)bl.length(),
+ ictx->io_work_queue->write(i, bl.length(),
+ bufferlist{bl}, 0));
}
ASSERT_EQ(0, snap_create(*ictx, "snap1"));
@@ -703,7 +704,7 @@ TEST_F(TestInternal, ResizeCopyup)
librbd::io::ReadResult read_result{&read_bl};
for (size_t i = 2 << order; i < m_image_size; i += bl.length()) {
- ASSERT_EQ(bl.length(),
+ ASSERT_EQ((ssize_t)bl.length(),
ictx2->io_work_queue->read(i, bl.length(),
librbd::io::ReadResult{read_result},
0));
@@ -733,8 +734,9 @@ TEST_F(TestInternal, DiscardCopyup)
bufferlist bl;
bl.append(std::string(4096, '1'));
for (size_t i = 0; i < m_image_size; i += bl.length()) {
- ASSERT_EQ(bl.length(), ictx->io_work_queue->write(i, bl.length(),
- bufferlist{bl}, 0));
+ ASSERT_EQ((ssize_t)bl.length(),
+ ictx->io_work_queue->write(i, bl.length(),
+ bufferlist{bl}, 0));
}
ASSERT_EQ(0, snap_create(*ictx, "snap1"));
@@ -769,7 +771,7 @@ TEST_F(TestInternal, DiscardCopyup)
librbd::io::ReadResult read_result{&read_bl};
for (size_t i = 0; i < m_image_size; i += bl.length()) {
- ASSERT_EQ(bl.length(),
+ ASSERT_EQ((ssize_t)bl.length(),
ictx2->io_work_queue->read(i, bl.length(),
librbd::io::ReadResult{read_result},
0));
@@ -880,7 +882,7 @@ TEST_F(TestInternal, WriteFullCopyup) {
bufferlist bl;
bl.append(std::string(1 << ictx->order, '1'));
- ASSERT_EQ(bl.length(),
+ ASSERT_EQ((ssize_t)bl.length(),
ictx->io_work_queue->write(0, bl.length(), bufferlist{bl}, 0));
ASSERT_EQ(0, librbd::flush(ictx));
@@ -910,7 +912,7 @@ TEST_F(TestInternal, WriteFullCopyup) {
bufferlist write_full_bl;
write_full_bl.append(std::string(1 << ictx2->order, '2'));
- ASSERT_EQ(write_full_bl.length(),
+ ASSERT_EQ((ssize_t)write_full_bl.length(),
ictx2->io_work_queue->write(0, write_full_bl.length(),
bufferlist{write_full_bl}, 0));
@@ -921,7 +923,7 @@ TEST_F(TestInternal, WriteFullCopyup) {
read_bl.push_back(read_ptr);
librbd::io::ReadResult read_result{&read_bl};
- ASSERT_EQ(read_bl.length(),
+ ASSERT_EQ((ssize_t)read_bl.length(),
ictx2->io_work_queue->read(0, read_bl.length(),
librbd::io::ReadResult{read_result}, 0));
ASSERT_TRUE(write_full_bl.contents_equal(read_bl));
@@ -929,7 +931,7 @@ TEST_F(TestInternal, WriteFullCopyup) {
ASSERT_EQ(0, librbd::snap_set(ictx2,
cls::rbd::UserSnapshotNamespace(),
"snap1"));
- ASSERT_EQ(read_bl.length(),
+ ASSERT_EQ((ssize_t)read_bl.length(),
ictx2->io_work_queue->read(0, read_bl.length(),
librbd::io::ReadResult{read_result}, 0));
ASSERT_TRUE(bl.contents_equal(read_bl));
@@ -1030,7 +1032,7 @@ TEST_F(TestInternal, TestCoR)
ASSERT_EQ(0, image.stat(info, sizeof(info)));
const int object_num = info.size / info.obj_size;
- printf("made parent image \"%s\": %ldK (%d * %ldK)\n", m_image_name.c_str(),
+ printf("made parent image \"%s\": %ldK (%d * %" PRIu64 "K)\n", m_image_name.c_str(),
(unsigned long)m_image_size, object_num, info.obj_size/1024);
// write something into parent
@@ -1167,8 +1169,8 @@ TEST_F(TestInternal, FlattenNoEmptyObjects)
ASSERT_EQ(0, image.stat(info, sizeof(info)));
const int object_num = info.size / info.obj_size;
- printf("made parent image \"%s\": %ldK (%d * %ldK)\n", m_image_name.c_str(),
- (unsigned long)m_image_size, object_num, info.obj_size/1024);
+ printf("made parent image \"%s\": %" PRIu64 "K (%d * %" PRIu64 "K)\n",
+ m_image_name.c_str(), m_image_size, object_num, info.obj_size/1024);
// write something into parent
char test_data[TEST_IO_SIZE + 1];
diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc
index b5a4765d6b5..f16fbcca2a9 100644
--- a/src/test/librbd/test_librbd.cc
+++ b/src/test/librbd/test_librbd.cc
@@ -84,6 +84,7 @@ static int get_features(bool *old_format, uint64_t *features)
cout << "using new format!" << std::endl;
} else {
*old_format = true;
+ *features = 0;
cout << "using old format" << std::endl;
}
@@ -109,7 +110,8 @@ static int create_image_full(rados_ioctx_t ioctx, const char *name,
stripe_unit = (1ull << (*order-1));
}
- printf("creating image with stripe unit: %ld, stripe count: %ld\n",
+ printf("creating image with stripe unit: %" PRIu64 ", "
+ "stripe count: %" PRIu64 "\n",
stripe_unit, IMAGE_STRIPE_COUNT);
return rbd_create3(ioctx, name, size, features, order,
stripe_unit, IMAGE_STRIPE_COUNT);
@@ -1845,7 +1847,7 @@ TEST_F(TestLibRBD, TestScatterGatherIO)
sizeof(read_iovs) / sizeof(struct iovec),
1<<order, comp));
ASSERT_EQ(0, rbd_aio_wait_for_complete(comp));
- ASSERT_EQ(10U, rbd_aio_get_return_value(comp));
+ ASSERT_EQ(10, rbd_aio_get_return_value(comp));
rbd_aio_release(comp);
ASSERT_EQ("This1111 is a ", read_buffer);
@@ -1858,7 +1860,7 @@ TEST_F(TestLibRBD, TestScatterGatherIO)
sizeof(linear_iovs) / sizeof(struct iovec),
1<<order, comp));
ASSERT_EQ(0, rbd_aio_wait_for_complete(comp));
- ASSERT_EQ(4U, rbd_aio_get_return_value(comp));
+ ASSERT_EQ(4, rbd_aio_get_return_value(comp));
rbd_aio_release(comp);
ASSERT_EQ("1111This111111", linear_buffer);
@@ -3666,7 +3668,7 @@ TEST_F(TestLibRBD, Flatten)
bufferlist bl;
bl.append(std::string(4096, '1'));
- ASSERT_EQ(bl.length(), parent_image.write(0, bl.length(), bl));
+ ASSERT_EQ((ssize_t)bl.length(), parent_image.write(0, bl.length(), bl));
ASSERT_EQ(0, parent_image.snap_create("snap1"));
ASSERT_EQ(0, parent_image.snap_protect("snap1"));
@@ -3687,7 +3689,7 @@ TEST_F(TestLibRBD, Flatten)
bufferlist read_bl;
clone_image.aio_read(0, bl.length(), read_bl, read_comp);
ASSERT_EQ(0, read_comp->wait_for_complete());
- ASSERT_EQ(bl.length(), read_comp->get_return_value());
+ ASSERT_EQ((ssize_t)bl.length(), read_comp->get_return_value());
read_comp->release();
ASSERT_TRUE(bl.contents_equal(read_bl));
@@ -3862,7 +3864,7 @@ TEST_F(TestLibRBD, SnapCreateViaLockOwner)
bufferlist bl;
bl.append(std::string(4096, '1'));
- ASSERT_EQ(bl.length(), image1.write(0, bl.length(), bl));
+ ASSERT_EQ((ssize_t)bl.length(), image1.write(0, bl.length(), bl));
bool lock_owner;
ASSERT_EQ(0, image1.is_exclusive_lock_owner(&lock_owner));
@@ -4646,10 +4648,10 @@ TEST_F(TestLibRBD, RebuildObjectMap)
return;
}
- ASSERT_EQ(bl.length(), image.write(0, bl.length(), bl));
+ ASSERT_EQ((ssize_t)bl.length(), image.write(0, bl.length(), bl));
ASSERT_EQ(0, image.snap_create("snap1"));
- ASSERT_EQ(bl.length(), image.write(1<<order, bl.length(), bl));
+ ASSERT_EQ((ssize_t)bl.length(), image.write(1<<order, bl.length(), bl));
std::string image_id;
ASSERT_EQ(0, get_image_id(image, &image_id));
@@ -4678,11 +4680,11 @@ TEST_F(TestLibRBD, RebuildObjectMap)
ASSERT_EQ(0, rbd.open(ioctx, image2, name.c_str(), NULL));
bufferlist read_bl;
- ASSERT_EQ(bl.length(), image2.read(0, bl.length(), read_bl));
+ ASSERT_EQ((ssize_t)bl.length(), image2.read(0, bl.length(), read_bl));
ASSERT_TRUE(bl.contents_equal(read_bl));
read_bl.clear();
- ASSERT_EQ(bl.length(), image2.read(1<<order, bl.length(), read_bl));
+ ASSERT_EQ((ssize_t)bl.length(), image2.read(1<<order, bl.length(), read_bl));
ASSERT_TRUE(bl.contents_equal(read_bl));
ASSERT_PASSED(validate_object_map, image1);
@@ -4738,10 +4740,10 @@ TEST_F(TestLibRBD, CheckObjectMap)
uint64_t features;
ASSERT_EQ(0, image.features(&features));
- ASSERT_EQ(bl1.length(), image.write(0, bl1.length(), bl1));
+ ASSERT_EQ((ssize_t)bl1.length(), image.write(0, bl1.length(), bl1));
ASSERT_EQ(0, image.snap_create("snap1"));
- ASSERT_EQ(bl1.length(), image.write(1<<order, bl1.length(), bl1));
+ ASSERT_EQ((ssize_t)bl1.length(), image.write(1<<order, bl1.length(), bl1));
}
librbd::Image image1;
@@ -4755,7 +4757,7 @@ TEST_F(TestLibRBD, CheckObjectMap)
ASSERT_LT(0, ioctx.read(object_map_oid, bl2, 1024, 0));
bool lock_owner;
- ASSERT_EQ(bl1.length(), image1.write(3 * (1 << 18), bl1.length(), bl1));
+ ASSERT_EQ((ssize_t)bl1.length(), image1.write(3 * (1 << 18), bl1.length(), bl1));
ASSERT_EQ(0, image1.is_exclusive_lock_owner(&lock_owner));
ASSERT_TRUE(lock_owner);
@@ -4833,7 +4835,7 @@ TEST_F(TestLibRBD, BlockingAIO)
bufferlist read_bl;
image.aio_read(0, bl.length(), read_bl, read_comp);
ASSERT_EQ(0, read_comp->wait_for_complete());
- ASSERT_EQ(bl.length(), read_comp->get_return_value());
+ ASSERT_EQ((ssize_t)bl.length(), read_comp->get_return_value());
read_comp->release();
bufferlist expected_bl;
@@ -4888,8 +4890,8 @@ TEST_F(TestLibRBD, ExclusiveLockTransition)
ASSERT_EQ(0, rbd.open(ioctx, image3, name.c_str(), NULL));
for (size_t object_no = 0; object_no < (size >> 12); ++object_no) {
bufferlist read_bl;
- ASSERT_EQ(bl.length(), image3.read(object_no << order, bl.length(),
- read_bl));
+ ASSERT_EQ((ssize_t)bl.length(), image3.read(object_no << order, bl.length(),
+ read_bl));
ASSERT_TRUE(bl.contents_equal(read_bl));
}
@@ -5316,20 +5318,23 @@ TEST_F(TestLibRBD, Mirror) {
// Add some images to the pool
int order = 0;
- ASSERT_EQ(0, create_image_pp(rbd, ioctx, "parent", 2 << 20, &order));
+ std::string parent_name = get_temp_image_name();
+ std::string child_name = get_temp_image_name();
+ ASSERT_EQ(0, create_image_pp(rbd, ioctx, parent_name.c_str(), 2 << 20,
+ &order));
bool old_format;
uint64_t features;
ASSERT_EQ(0, get_features(&old_format, &features));
if ((features & RBD_FEATURE_LAYERING) != 0) {
librbd::Image parent;
- ASSERT_EQ(0, rbd.open(ioctx, parent, "parent", NULL));
+ ASSERT_EQ(0, rbd.open(ioctx, parent, parent_name.c_str(), NULL));
ASSERT_EQ(0, parent.snap_create("parent_snap"));
ASSERT_EQ(0, parent.close());
- ASSERT_EQ(0, rbd.open(ioctx, parent, "parent", "parent_snap"));
+ ASSERT_EQ(0, rbd.open(ioctx, parent, parent_name.c_str(), "parent_snap"));
ASSERT_EQ(0, parent.snap_protect("parent_snap"));
ASSERT_EQ(0, parent.close());
- ASSERT_EQ(0, rbd.clone(ioctx, "parent", "parent_snap", ioctx, "child",
- features, &order));
+ ASSERT_EQ(0, rbd.clone(ioctx, parent_name.c_str(), "parent_snap", ioctx,
+ child_name.c_str(), features, &order));
}
ASSERT_EQ(RBD_MIRROR_MODE_IMAGE, mirror_mode);
@@ -5735,6 +5740,11 @@ TEST_F(TestLibRBD, TestTrashMoveAndPurge) {
ASSERT_TRUE(image != name);
}
+ librbd::trash_image_info_t info;
+ ASSERT_EQ(-ENOENT, rbd.trash_get(ioctx, "dummy image id", &info));
+ ASSERT_EQ(0, rbd.trash_get(ioctx, image_id.c_str(), &info));
+ ASSERT_EQ(image_id, info.id);
+
std::vector<librbd::trash_image_info_t> entries;
ASSERT_EQ(0, rbd.trash_list(ioctx, entries));
ASSERT_FALSE(entries.empty());
diff --git a/src/test/librbd/test_mock_ExclusiveLock.cc b/src/test/librbd/test_mock_ExclusiveLock.cc
index f593f834803..eeb62f63ea5 100644
--- a/src/test/librbd/test_mock_ExclusiveLock.cc
+++ b/src/test/librbd/test_mock_ExclusiveLock.cc
@@ -52,6 +52,7 @@ struct ManagedLock<MockExclusiveLockImageCtx> {
virtual void post_acquire_lock_handler(int, Context *) = 0;
virtual void pre_release_lock_handler(bool, Context *) = 0;
virtual void post_release_lock_handler(bool, int, Context *) = 0;
+ virtual void post_reacquire_lock_handler(int, Context *) = 0;
MOCK_CONST_METHOD0(is_lock_owner, bool());
@@ -322,6 +323,12 @@ public:
return ctx.wait();
}
+ int when_post_reacquire_lock_handler(MockManagedLock &managed_lock, int r) {
+ C_SaferCond ctx;
+ managed_lock.post_reacquire_lock_handler(r, &ctx);
+ return ctx.wait();
+ }
+
int when_shut_down(MockExclusiveLockImageCtx &mock_image_ctx,
MockExclusiveLock &exclusive_lock) {
C_SaferCond ctx;
@@ -599,6 +606,52 @@ TEST_F(TestMockExclusiveLock, PreReleaseLockError) {
-EINVAL));
}
+TEST_F(TestMockExclusiveLock, ReacquireLock) {
+ REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+ MockExclusiveLockImageCtx mock_image_ctx(*ictx);
+ MockExclusiveLock exclusive_lock(mock_image_ctx);
+ expect_op_work_queue(mock_image_ctx);
+
+ InSequence seq;
+ expect_set_state_initializing(exclusive_lock);
+ expect_block_writes(mock_image_ctx);
+ expect_set_state_unlocked(exclusive_lock);
+ ASSERT_EQ(0, when_init(mock_image_ctx, exclusive_lock));
+
+ // (try) acquire lock
+ MockPreAcquireRequest try_lock_pre_acquire;
+ expect_pre_acquire_request(try_lock_pre_acquire, 0);
+ ASSERT_EQ(0, when_pre_acquire_lock_handler(exclusive_lock));
+
+ MockPostAcquireRequest try_lock_post_acquire;
+ expect_post_acquire_request(try_lock_post_acquire, 0);
+ expect_is_state_acquiring(exclusive_lock, true);
+ expect_notify_acquired_lock(mock_image_ctx);
+ expect_unblock_writes(mock_image_ctx);
+ ASSERT_EQ(0, when_post_acquire_lock_handler(exclusive_lock, 0));
+
+ // reacquire lock
+ expect_notify_acquired_lock(mock_image_ctx);
+ ASSERT_EQ(0, when_post_reacquire_lock_handler(exclusive_lock, 0));
+
+ // shut down (and release)
+ expect_shut_down(exclusive_lock);
+ expect_is_state_waiting_for_lock(exclusive_lock, false);
+ ASSERT_EQ(0, when_shut_down(mock_image_ctx, exclusive_lock));
+
+ MockPreReleaseRequest shutdown_pre_release;
+ expect_pre_release_request(shutdown_pre_release, 0);
+ ASSERT_EQ(0, when_pre_release_lock_handler(exclusive_lock, true));
+
+ expect_unblock_writes(mock_image_ctx);
+ expect_notify_released_lock(mock_image_ctx);
+ ASSERT_EQ(0, when_post_release_lock_handler(exclusive_lock, true, 0));
+}
+
TEST_F(TestMockExclusiveLock, BlockRequests) {
REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
diff --git a/src/test/librbd/test_mock_Journal.cc b/src/test/librbd/test_mock_Journal.cc
index c7a6ea30417..50b59aac645 100644
--- a/src/test/librbd/test_mock_Journal.cc
+++ b/src/test/librbd/test_mock_Journal.cc
@@ -1019,7 +1019,7 @@ TEST_F(TestMockJournal, EventCommitError) {
C_SaferCond object_request_ctx;
auto object_request = new io::ObjectRemoveRequest(
- ictx, "oid", 0, {}, &object_request_ctx);
+ ictx, "oid", 0, {}, {}, &object_request_ctx);
::journal::MockFuture mock_future;
Context *on_journal_safe;
@@ -1060,7 +1060,7 @@ TEST_F(TestMockJournal, EventCommitErrorWithPendingWriteback) {
C_SaferCond object_request_ctx;
auto object_request = new io::ObjectRemoveRequest(
- ictx, "oid", 0, {}, &object_request_ctx);
+ ictx, "oid", 0, {}, {}, &object_request_ctx);
::journal::MockFuture mock_future;
Context *on_journal_safe;
diff --git a/src/test/librbd/test_mock_ObjectMap.cc b/src/test/librbd/test_mock_ObjectMap.cc
index f14c6b9a5b2..7deaef03bb8 100644
--- a/src/test/librbd/test_mock_ObjectMap.cc
+++ b/src/test/librbd/test_mock_ObjectMap.cc
@@ -70,6 +70,7 @@ struct UpdateRequest<MockTestImageCtx> {
uint64_t start_object_no, uint64_t end_object_no,
uint8_t new_state,
const boost::optional<uint8_t> &current_state,
+ const ZTracer::Trace &parent_trace,
Context *on_finish) {
assert(s_instance != nullptr);
s_instance->on_finish = on_finish;
@@ -180,8 +181,8 @@ TEST_F(TestMockObjectMap, NonDetainedUpdate) {
{
RWLock::RLocker snap_locker(mock_image_ctx.snap_lock);
RWLock::WLocker object_map_locker(mock_image_ctx.object_map_lock);
- mock_object_map.aio_update(CEPH_NOSNAP, 0, 1, {}, &update_ctx1);
- mock_object_map.aio_update(CEPH_NOSNAP, 1, 1, {}, &update_ctx2);
+ mock_object_map.aio_update(CEPH_NOSNAP, 0, 1, {}, {}, &update_ctx1);
+ mock_object_map.aio_update(CEPH_NOSNAP, 1, 1, {}, {}, &update_ctx2);
}
finish_update_2->complete(0);
@@ -238,10 +239,10 @@ TEST_F(TestMockObjectMap, DetainedUpdate) {
{
RWLock::RLocker snap_locker(mock_image_ctx.snap_lock);
RWLock::WLocker object_map_locker(mock_image_ctx.object_map_lock);
- mock_object_map.aio_update(CEPH_NOSNAP, 1, 4, 1, {}, &update_ctx1);
- mock_object_map.aio_update(CEPH_NOSNAP, 1, 3, 1, {}, &update_ctx2);
- mock_object_map.aio_update(CEPH_NOSNAP, 2, 3, 1, {}, &update_ctx3);
- mock_object_map.aio_update(CEPH_NOSNAP, 0, 2, 1, {}, &update_ctx4);
+ mock_object_map.aio_update(CEPH_NOSNAP, 1, 4, 1, {}, {}, &update_ctx1);
+ mock_object_map.aio_update(CEPH_NOSNAP, 1, 3, 1, {}, {}, &update_ctx2);
+ mock_object_map.aio_update(CEPH_NOSNAP, 2, 3, 1, {}, {}, &update_ctx3);
+ mock_object_map.aio_update(CEPH_NOSNAP, 0, 2, 1, {}, {}, &update_ctx4);
}
// updates 2, 3, and 4 are blocked on update 1
diff --git a/src/test/librbd/test_notify.py b/src/test/librbd/test_notify.py
index 6571935f961..b44ef0cab30 100755
--- a/src/test/librbd/test_notify.py
+++ b/src/test/librbd/test_notify.py
@@ -64,6 +64,7 @@ def master(ioctx):
image.create_snap('snap1')
image.protect_snap('snap1')
+ features = features & ~(RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_FAST_DIFF)
RBD().clone(ioctx, PARENT_IMG_NAME, 'snap1', ioctx, CLONE_IMG_NAME,
features=features)
with Image(ioctx, CLONE_IMG_NAME) as image:
@@ -141,10 +142,7 @@ def slave(ioctx):
assert(list(image.list_snaps()) == [])
print("rebuild object map")
- features = image.features() & (
- RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_FAST_DIFF)
- if features:
- image.update_features(features, False)
+ assert((image.features() & RBD_FEATURE_OBJECT_MAP) == 0)
image.update_features(RBD_FEATURE_OBJECT_MAP, True)
assert((image.flags() & RBD_FLAG_OBJECT_MAP_INVALID) != 0)
image.rebuild_object_map()
diff --git a/src/test/librbd/test_support.cc b/src/test/librbd/test_support.cc
index db16131df05..fa8e25194a7 100644
--- a/src/test/librbd/test_support.cc
+++ b/src/test/librbd/test_support.cc
@@ -40,7 +40,7 @@ int create_image_full_pp(librbd::RBD &rbd, librados::IoCtx &ioctx,
stripe_unit = (1ull << (*order-1));
}
- printf("creating image with stripe unit: %ld, stripe count: %ld\n",
+ printf("creating image with stripe unit: %" PRIu64 ", stripe count: %" PRIu64 "\n",
stripe_unit, IMAGE_STRIPE_COUNT);
return rbd.create3(ioctx, name.c_str(), size, features, order, stripe_unit,
IMAGE_STRIPE_COUNT);
diff --git a/src/test/messenger/CMakeLists.txt b/src/test/messenger/CMakeLists.txt
index ebdd00f2081..7981fa6d5cb 100644
--- a/src/test/messenger/CMakeLists.txt
+++ b/src/test/messenger/CMakeLists.txt
@@ -3,7 +3,7 @@ add_executable(simple_server
simple_dispatcher.cc
)
target_link_libraries(simple_server
- os global ceph-common
+ global ceph-common
${EXTRALIBS}
${CMAKE_DL_LIBS}
)
@@ -13,7 +13,7 @@ add_executable(simple_client
simple_dispatcher.cc
)
target_link_libraries(simple_client
- os global ceph-common
+ global ceph-common
${EXTRALIBS}
${CMAKE_DL_LIBS}
)
@@ -24,7 +24,7 @@ if(HAVE_XIO)
xio_dispatcher.cc
)
target_link_libraries(xio_server
- os global ceph-common
+ global ceph-common
${XIO_LIBRARY} pthread rt
${EXTRALIBS}
${CMAKE_DL_LIBS}
@@ -35,7 +35,7 @@ if(HAVE_XIO)
xio_dispatcher.cc
)
target_link_libraries(xio_client
- os global ceph-common
+ global ceph-common
${XIO_LIBRARY} pthread rt
${EXTRALIBS}
${CMAKE_DL_LIBS}
diff --git a/src/test/messenger/message_helper.h b/src/test/messenger/message_helper.h
index 769c9b4a418..bec2c685cf2 100644
--- a/src/test/messenger/message_helper.h
+++ b/src/test/messenger/message_helper.h
@@ -107,11 +107,12 @@ static inline Message* new_simple_ping_with_data(const char *tag,
for (uint32_t i = 0; i < nfrags; ++i) {
if (do_page_alignment) {
if (posix_memalign(&p, pagesize, segsize))
- p = NULL;
+ p = nullptr;
} else {
p = malloc(segsize);
}
-
+ if (!p)
+ throw std::bad_alloc();
strcpy((char*) p, tag);
uint32_t* t = (uint32_t* ) (((char*) p) + segsize - 32);
*t = counter;
diff --git a/src/test/mon/test_mon_workloadgen.cc b/src/test/mon/test_mon_workloadgen.cc
index 05df8e44db0..f64844c6643 100644
--- a/src/test/mon/test_mon_workloadgen.cc
+++ b/src/test/mon/test_mon_workloadgen.cc
@@ -495,9 +495,8 @@ class OSDStub : public TestStub
return;
}
- const map<int64_t,pg_pool_t> &osdmap_pools = osdmap.get_pools();
- map<int64_t,pg_pool_t>::const_iterator pit;
- for (pit = osdmap_pools.begin(); pit != osdmap_pools.end(); ++pit) {
+ auto& osdmap_pools = osdmap.get_pools();
+ for (auto pit = osdmap_pools.begin(); pit != osdmap_pools.end(); ++pit) {
const int64_t pool_id = pit->first;
const pg_pool_t &pool = pit->second;
int ruleno = pool.get_crush_ruleset();
@@ -674,9 +673,8 @@ class OSDStub : public TestStub
JSONFormatter f(true);
f.open_array_section("pools");
- const map<int64_t,pg_pool_t> &osdmap_pools = osdmap.get_pools();
- map<int64_t,pg_pool_t>::const_iterator pit;
- for (pit = osdmap_pools.begin(); pit != osdmap_pools.end(); ++pit) {
+ auto& osdmap_pools = osdmap.get_pools();
+ for (auto pit = osdmap_pools.begin(); pit != osdmap_pools.end(); ++pit) {
const int64_t pool_id = pit->first;
const pg_pool_t &pool = pit->second;
f.open_object_section("pool");
diff --git a/src/test/msgr/test_async_driver.cc b/src/test/msgr/test_async_driver.cc
index 697198a38de..a93cffa0f4e 100644
--- a/src/test/msgr/test_async_driver.cc
+++ b/src/test/msgr/test_async_driver.cc
@@ -148,6 +148,7 @@ void* echoclient(void *arg)
sa.sin_port = htons(port);
char addr[] = "127.0.0.1";
int r = inet_pton(AF_INET, addr, &sa.sin_addr);
+ assert(r == 1);
int connect_sd = ::socket(AF_INET, SOCK_STREAM, 0);
if (connect_sd >= 0) {
diff --git a/src/test/objectstore/Allocator_test.cc b/src/test/objectstore/Allocator_test.cc
index 3a408265962..65f8e47bd08 100644
--- a/src/test/objectstore/Allocator_test.cc
+++ b/src/test/objectstore/Allocator_test.cc
@@ -200,6 +200,22 @@ TEST_P(AllocTest, test_alloc_failure)
}
}
+TEST_P(AllocTest, test_alloc_big)
+{
+ int64_t block_size = 4096;
+ int64_t blocks = 104857600;
+ int64_t mas = 4096;
+ init_alloc(blocks*block_size, block_size);
+ alloc->init_add_free(2*block_size, (blocks-2)*block_size);
+ for (int64_t big = mas; big < 1048576*128; big*=2) {
+ cout << big << std::endl;
+ EXPECT_EQ(alloc->reserve(big), 0);
+ AllocExtentVector extents;
+ EXPECT_EQ(big,
+ alloc->allocate(big, mas, 0, &extents));
+ }
+}
+
TEST_P(AllocTest, test_alloc_hint_bmap)
{
if (GetParam() == std::string("stupid")) {
diff --git a/src/test/objectstore/chain_xattr.cc b/src/test/objectstore/chain_xattr.cc
index 9f98ed2934d..a30c57c8403 100644
--- a/src/test/objectstore/chain_xattr.cc
+++ b/src/test/objectstore/chain_xattr.cc
@@ -23,6 +23,7 @@
#include <signal.h>
#include "os/filestore/chain_xattr.h"
#include "include/Context.h"
+#include "include/coredumpctl.h"
#include "common/errno.h"
#include "common/ceph_argparse.h"
#include "global/global_init.h"
@@ -120,6 +121,7 @@ TEST(chain_xattr, get_and_set) {
{
int x;
const string name = user + string(CHAIN_XATTR_MAX_NAME_LEN * 2, '@');
+ PrCtl unset_dumpable;
ASSERT_DEATH(chain_setxattr(file, name.c_str(), &x, sizeof(x)), "");
ASSERT_DEATH(chain_fsetxattr(fd, name.c_str(), &x, sizeof(x)), "");
}
diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc
index 08ca9b4022d..eabe7f1c026 100644
--- a/src/test/objectstore/store_test.cc
+++ b/src/test/objectstore/store_test.cc
@@ -36,6 +36,7 @@
#include "common/Cond.h"
#include "common/errno.h"
#include "include/stringify.h"
+#include "include/coredumpctl.h"
#include "include/unordered_map.h"
#include "store_test_fixture.h"
@@ -461,6 +462,7 @@ TEST_P(StoreTest, FiemapHoles) {
ASSERT_EQ(r, 0);
}
{
+ // fiemap test from 0 to SKIP_STEP * (MAX_EXTENTS - 1) + 3
bufferlist bl;
store->fiemap(cid, oid, 0, SKIP_STEP * (MAX_EXTENTS - 1) + 3, bl);
map<uint64_t,uint64_t> m, e;
@@ -477,6 +479,26 @@ TEST_P(StoreTest, FiemapHoles) {
ASSERT_TRUE((m.size() == 1 &&
m[0] > SKIP_STEP * (MAX_EXTENTS - 1)) ||
(m.size() == MAX_EXTENTS && extents_exist));
+
+ // fiemap test from SKIP_STEP to SKIP_STEP * (MAX_EXTENTS - 2) + 3
+ // reset bufferlist and map
+ bl.clear();
+ m.clear();
+ e.clear();
+ store->fiemap(cid, oid, SKIP_STEP, SKIP_STEP * (MAX_EXTENTS - 2) + 3, bl);
+ p = bl.begin();
+ ::decode(m, p);
+ cout << " got " << m << std::endl;
+ ASSERT_TRUE(!m.empty());
+ ASSERT_GE(m[SKIP_STEP], 3u);
+ extents_exist = true;
+ if (m.size() == (MAX_EXTENTS - 2)) {
+ for (uint64_t i = 1; i < MAX_EXTENTS - 1; i++)
+ extents_exist = extents_exist && m.count(SKIP_STEP*i);
+ }
+ ASSERT_TRUE((m.size() == 1 &&
+ m[SKIP_STEP] > SKIP_STEP * (MAX_EXTENTS - 2)) ||
+ (m.size() == (MAX_EXTENTS - 1) && extents_exist));
}
{
ObjectStore::Transaction t;
@@ -2826,8 +2848,8 @@ TEST_P(StoreTest, SimpleCloneTest) {
ObjectStore::Transaction t;
t.remove_collection(cid);
cerr << "Invalid rm coll" << std::endl;
+ PrCtl unset_dumpable;
EXPECT_DEATH(apply_transaction(store, &osr, std::move(t)), ".*Directory not empty.*");
-
}
{
ObjectStore::Transaction t;
@@ -2848,6 +2870,7 @@ TEST_P(StoreTest, SimpleCloneTest) {
t.remove(cid, hoid);
t.remove(cid, hoid2);
t.remove_collection(cid);
+ PrCtl unset_dumpable;
EXPECT_DEATH(apply_transaction(store, &osr, std::move(t)), ".*Directory not empty.*");
}
{
@@ -4623,7 +4646,8 @@ TEST_P(StoreTest, OMapTest) {
t.omap_clear(cid, hoid);
map<string, bufferlist> start_set;
t.omap_setkeys(cid, hoid, start_set);
- apply_transaction(store, &osr, std::move(t));
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
}
for (int i = 0; i < 100; i++) {
@@ -4658,7 +4682,8 @@ TEST_P(StoreTest, OMapTest) {
to_add.insert(pair<string, bufferlist>("key-" + string(buf), bl));
attrs.insert(pair<string, bufferlist>("key-" + string(buf), bl));
t.omap_setkeys(cid, hoid, to_add);
- apply_transaction(store, &osr, std::move(t));
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
}
int i = 0;
@@ -4688,7 +4713,8 @@ TEST_P(StoreTest, OMapTest) {
set<string> keys_to_remove;
keys_to_remove.insert(to_remove);
t.omap_rmkeys(cid, hoid, keys_to_remove);
- apply_transaction(store, &osr, std::move(t));
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
attrs.erase(to_remove);
@@ -4700,7 +4726,8 @@ TEST_P(StoreTest, OMapTest) {
bl1.append("omap_header");
ObjectStore::Transaction t;
t.omap_setheader(cid, hoid, bl1);
- apply_transaction(store, &osr, std::move(t));
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
t = ObjectStore::Transaction();
bufferlist bl2;
@@ -4708,7 +4735,8 @@ TEST_P(StoreTest, OMapTest) {
map<string, bufferlist> to_add;
to_add.insert(pair<string, bufferlist>("key", bl2));
t.omap_setkeys(cid, hoid, to_add);
- apply_transaction(store, &osr, std::move(t));
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
bufferlist bl3;
map<string, bufferlist> cur_attrs;
@@ -4737,12 +4765,14 @@ TEST_P(StoreTest, OMapTest) {
t.touch(cid, hoid);
t.omap_setheader(cid, hoid, h);
t.omap_setkeys(cid, hoid, to_set);
- apply_transaction(store, &osr, std::move(t));
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
}
{
ObjectStore::Transaction t;
t.omap_rmkeyrange(cid, hoid, "3", "7");
- apply_transaction(store, &osr, std::move(t));
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
}
{
bufferlist hdr;
@@ -4760,7 +4790,8 @@ TEST_P(StoreTest, OMapTest) {
{
ObjectStore::Transaction t;
t.omap_clear(cid, hoid);
- apply_transaction(store, &osr, std::move(t));
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
}
{
bufferlist hdr;
@@ -4798,7 +4829,8 @@ TEST_P(StoreTest, OMapIterator) {
t.omap_clear(cid, hoid);
map<string, bufferlist> start_set;
t.omap_setkeys(cid, hoid, start_set);
- apply_transaction(store, &osr, std::move(t));
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
}
ObjectMap::ObjectMapIterator iter;
bool correct;
@@ -4841,7 +4873,8 @@ TEST_P(StoreTest, OMapIterator) {
attrs.insert(pair<string, bufferlist>("key-" + string(buf), bl));
ObjectStore::Transaction t;
t.omap_setkeys(cid, hoid, to_add);
- apply_transaction(store, &osr, std::move(t));
+ r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
}
iter = store->get_omap_iterator(cid, hoid);
@@ -6301,6 +6334,7 @@ TEST_P(StoreTestSpecificAUSize, SmallWriteOnShardedExtents) {
t.write(cid, hoid1, 0, bl2.length(), bl2, 0);
t.zero(cid, hoid1, 0, 0x50000);
r = apply_transaction(store, &osr, std::move(t));
+ ASSERT_EQ(r, 0);
}
store->umount();
diff --git a/src/test/objectstore/test_bluestore_types.cc b/src/test/objectstore/test_bluestore_types.cc
index 306b8c2c517..8f39ecbfd87 100644
--- a/src/test/objectstore/test_bluestore_types.cc
+++ b/src/test/objectstore/test_bluestore_types.cc
@@ -338,13 +338,13 @@ TEST(Blob, put_ref)
cout << b << std::endl;
PExtentVector r;
- b.put_ref(&coll, 0, 0x1200, &r);
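+ // put_ref() returns true once the blob is no longer referenced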
+ ASSERT_FALSE(b.put_ref(&coll, 0, 0x1200, &r));
ASSERT_EQ(0x4200u, b.get_referenced_bytes());
cout << " r " << r << std::endl;
cout << b << std::endl;
r.clear();
- b.put_ref(&coll, 0xae00, 0x4200, &r);
+ ASSERT_TRUE(b.put_ref(&coll, 0xae00, 0x4200, &r));
ASSERT_EQ(0u, b.get_referenced_bytes());
cout << " r " << r << std::endl;
cout << b << std::endl;
@@ -366,7 +366,7 @@ TEST(Blob, put_ref)
B.get_ref(coll.get(), 0, mas*2);
ASSERT_EQ(mas * 2, B.get_referenced_bytes());
ASSERT_TRUE(b.is_allocated(0, mas*2));
- B.put_ref(coll.get(), 0, mas*2, &r);
+ ASSERT_TRUE(B.put_ref(coll.get(), 0, mas*2, &r));
ASSERT_EQ(0u, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(1u, r.size());
@@ -387,12 +387,12 @@ TEST(Blob, put_ref)
b.allocated_test(bluestore_pextent_t(123, mas * 2));
B.get_ref(coll.get(), 0, mas*2);
ASSERT_EQ(mas * 2, B.get_referenced_bytes());
- B.put_ref(coll.get(), 0, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), 0, mas, &r));
ASSERT_EQ(mas, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(0, mas*2));
- B.put_ref(coll.get(), mas, mas, &r);
+ ASSERT_TRUE(B.put_ref(coll.get(), mas, mas, &r));
ASSERT_EQ(0u, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
@@ -415,19 +415,19 @@ TEST(Blob, put_ref)
b.allocated_test(bluestore_pextent_t(4, mas));
B.get_ref(coll.get(), 0, mas*4);
ASSERT_EQ(mas * 4, B.get_referenced_bytes());
- B.put_ref(coll.get(), mas, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r));
ASSERT_EQ(mas * 3, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(0, mas*4));
ASSERT_TRUE(b.is_allocated(mas, mas));
- B.put_ref(coll.get(), mas*2, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas, &r));
ASSERT_EQ(mas * 2, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(mas*2, mas));
ASSERT_TRUE(b.is_allocated(0, mas*4));
- B.put_ref(coll.get(), mas*3, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*3, mas, &r));
ASSERT_EQ(mas, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(2u, r.size());
@@ -456,17 +456,17 @@ TEST(Blob, put_ref)
b.allocated_test(bluestore_pextent_t(6, mas));
B.get_ref(coll.get(), 0, mas*6);
ASSERT_EQ(mas * 6, B.get_referenced_bytes());
- B.put_ref(coll.get(), mas, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r));
ASSERT_EQ(mas * 5, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(0, mas*6));
- B.put_ref(coll.get(), mas*2, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas, &r));
ASSERT_EQ(mas * 4, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(0, mas*6));
- B.put_ref(coll.get(), mas*3, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*3, mas, &r));
ASSERT_EQ(mas * 3, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(2u, r.size());
@@ -493,17 +493,17 @@ TEST(Blob, put_ref)
b.allocated_test(bluestore_pextent_t(1, mas * 6));
B.get_ref(coll.get(), 0, mas*6);
ASSERT_EQ(mas * 6, B.get_referenced_bytes());
- B.put_ref(coll.get(), mas, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r));
ASSERT_EQ(mas * 5, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(0, mas*6));
- B.put_ref(coll.get(), mas*2, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas, &r));
ASSERT_EQ(mas * 4, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(0, mas*6));
- B.put_ref(coll.get(), mas*3, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*3, mas, &r));
ASSERT_EQ(mas * 3, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(1u, r.size());
@@ -528,17 +528,17 @@ TEST(Blob, put_ref)
b.allocated_test(bluestore_pextent_t(3, mas * 4));
B.get_ref(coll.get(), 0, mas*12);
ASSERT_EQ(mas * 12, B.get_referenced_bytes());
- B.put_ref(coll.get(), mas, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r));
ASSERT_EQ(mas * 11, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(0, mas*12));
- B.put_ref(coll.get(), mas*9, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*9, mas, &r));
ASSERT_EQ(mas * 10, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(0, mas*12));
- B.put_ref(coll.get(), mas*2, mas*7, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas*7, &r));
ASSERT_EQ(mas * 3, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(3u, r.size());
@@ -567,17 +567,17 @@ TEST(Blob, put_ref)
b.allocated_test(bluestore_pextent_t(3, mas * 4));
B.get_ref(coll.get(), 0, mas*12);
ASSERT_EQ(mas * 12, B.get_referenced_bytes());
- B.put_ref(coll.get(), mas, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r));
ASSERT_EQ(mas * 11, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(0, mas*12));
- B.put_ref(coll.get(), mas*9, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*9, mas, &r));
ASSERT_EQ(mas * 10, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(0, mas*12));
- B.put_ref(coll.get(), mas*2, mas*7, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas*7, &r));
ASSERT_EQ(mas * 3, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(3u, r.size());
@@ -594,7 +594,7 @@ TEST(Blob, put_ref)
ASSERT_TRUE(b.get_extents()[0].is_valid());
ASSERT_FALSE(b.get_extents()[1].is_valid());
ASSERT_TRUE(b.get_extents()[2].is_valid());
- B.put_ref(coll.get(), 0, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), 0, mas, &r));
ASSERT_EQ(mas * 2, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(1u, r.size());
@@ -603,7 +603,7 @@ TEST(Blob, put_ref)
ASSERT_EQ(2u, b.get_extents().size());
ASSERT_FALSE(b.get_extents()[0].is_valid());
ASSERT_TRUE(b.get_extents()[1].is_valid());
- B.put_ref(coll.get(), mas*10, mas*2, &r);
+ ASSERT_TRUE(B.put_ref(coll.get(), mas*10, mas*2, &r));
ASSERT_EQ(mas * 0, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(1u, r.size());
@@ -623,17 +623,17 @@ TEST(Blob, put_ref)
b.allocated_test(bluestore_pextent_t(3, mas * 4));
B.get_ref(coll.get(), 0, mas*12);
ASSERT_EQ(mas * 12, B.get_referenced_bytes());
- B.put_ref(coll.get(), mas, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas, mas, &r));
ASSERT_EQ(mas * 11, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(0, mas*12));
- B.put_ref(coll.get(), mas*9, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*9, mas, &r));
ASSERT_EQ(mas * 10, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(0, mas*12));
- B.put_ref(coll.get(), mas*2, mas*7, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas*7, &r));
ASSERT_EQ(mas * 3, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(3u, r.size());
@@ -650,7 +650,7 @@ TEST(Blob, put_ref)
ASSERT_TRUE(b.get_extents()[0].is_valid());
ASSERT_FALSE(b.get_extents()[1].is_valid());
ASSERT_TRUE(b.get_extents()[2].is_valid());
- B.put_ref(coll.get(), mas*10, mas*2, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*10, mas*2, &r));
ASSERT_EQ(mas * 1, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(1u, r.size());
@@ -659,7 +659,7 @@ TEST(Blob, put_ref)
ASSERT_EQ(2u, b.get_extents().size());
ASSERT_TRUE(b.get_extents()[0].is_valid());
ASSERT_FALSE(b.get_extents()[1].is_valid());
- B.put_ref(coll.get(), 0, mas, &r);
+ ASSERT_TRUE(B.put_ref(coll.get(), 0, mas, &r));
ASSERT_EQ(mas * 0, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(1u, r.size());
@@ -677,22 +677,22 @@ TEST(Blob, put_ref)
b.allocated_test(bluestore_pextent_t(1, mas * 8));
B.get_ref(coll.get(), 0, mas*8);
ASSERT_EQ(mas * 8, B.get_referenced_bytes());
- B.put_ref(coll.get(), 0, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), 0, mas, &r));
ASSERT_EQ(mas * 7, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(0, mas*8));
- B.put_ref(coll.get(), mas*7, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*7, mas, &r));
ASSERT_EQ(mas * 6, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(0, mas*8));
- B.put_ref(coll.get(), mas*2, mas, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*2, mas, &r));
ASSERT_EQ(mas * 5, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
ASSERT_TRUE(b.is_allocated(0, mas*8));
- B.put_ref(coll.get(), mas*3, mas*4, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), mas*3, mas*4, &r));
ASSERT_EQ(mas * 1, B.get_referenced_bytes());
ASSERT_EQ(1u, r.size());
ASSERT_EQ(0x2001u, r[0].offset);
@@ -702,7 +702,7 @@ TEST(Blob, put_ref)
ASSERT_EQ(2u, b.get_extents().size());
ASSERT_TRUE(b.get_extents()[0].is_valid());
ASSERT_FALSE(b.get_extents()[1].is_valid());
- B.put_ref(coll.get(), mas, mas, &r);
+ ASSERT_TRUE(B.put_ref(coll.get(), mas, mas, &r));
ASSERT_EQ(mas * 0, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(1u, r.size());
@@ -723,7 +723,7 @@ TEST(Blob, put_ref)
B.get_ref(coll.get(), 0, mas*4);
ASSERT_EQ(mas * 4, B.get_referenced_bytes());
ASSERT_TRUE(b.is_allocated(0, mas*4));
- B.put_ref(coll.get(), 0, mas*3, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), 0, mas*3, &r));
ASSERT_EQ(mas * 1, B.get_referenced_bytes());
cout << "r " << r << " " << b << std::endl;
ASSERT_EQ(0u, r.size());
@@ -749,7 +749,7 @@ TEST(Blob, put_ref)
cout << "before: " << B << std::endl;
PExtentVector r;
- B.put_ref(coll.get(), 0x1800, 0x2000, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), 0x1800, 0x2000, &r));
ASSERT_EQ(0x3800u + 0x6400u - 0x2000u, B.get_referenced_bytes());
cout << "after: " << B << std::endl;
cout << "r " << r << std::endl;
@@ -765,7 +765,7 @@ TEST(Blob, put_ref)
ASSERT_EQ(0xa000u, B.get_referenced_bytes());
cout << "before: " << B << std::endl;
PExtentVector r;
- B.put_ref(coll.get(), 0x8000, 0x2000, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), 0x8000, 0x2000, &r));
cout << "after: " << B << std::endl;
cout << "r " << r << std::endl;
ASSERT_EQ(0x8000u, B.get_referenced_bytes());
@@ -784,7 +784,7 @@ TEST(Blob, put_ref)
ASSERT_EQ(0xe000u, B.get_referenced_bytes());
cout << "before: " << B << std::endl;
PExtentVector r;
- B.put_ref(coll.get(), 0, 0xb000, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), 0, 0xb000, &r));
ASSERT_EQ(0x3000u, B.get_referenced_bytes());
cout << "after: " << B << std::endl;
cout << "r " << r << std::endl;
@@ -811,7 +811,7 @@ TEST(Blob, put_ref)
ASSERT_EQ(0xc000u, B.get_referenced_bytes());
cout << "before: " << B << std::endl;
PExtentVector r;
- B.put_ref(coll.get(), 0x2000, 0xa000, &r);
+ ASSERT_FALSE(B.put_ref(coll.get(), 0x2000, 0xa000, &r));
cout << "after: " << B << std::endl;
cout << "r " << r << std::endl;
ASSERT_EQ(0x2000u, B.get_referenced_bytes());
diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc
index a8692d6043d..e833ab2e4ff 100644
--- a/src/test/osd/TestOSDMap.cc
+++ b/src/test/osd/TestOSDMap.cc
@@ -235,7 +235,8 @@ TEST_F(OSDMapTest, PGTempRespected) {
// apply pg_temp to osdmap
OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
- pgtemp_map.new_pg_temp[pgid] = new_acting_osds;
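+ // new_pg_temp holds a mempool::osdmap::vector<int>, so build one from the std::vector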
+ pgtemp_map.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
+ new_acting_osds.begin(), new_acting_osds.end());
osdmap.apply_incremental(pgtemp_map);
osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
@@ -276,7 +277,8 @@ TEST_F(OSDMapTest, CleanTemps) {
int up_primary, acting_primary;
osdmap.pg_to_up_acting_osds(pga, &up_osds, &up_primary,
&acting_osds, &acting_primary);
- pgtemp_map.new_pg_temp[pga] = up_osds;
+ pgtemp_map.new_pg_temp[pga] = mempool::osdmap::vector<int>(
+ up_osds.begin(), up_osds.end());
pgtemp_map.new_primary_temp[pga] = up_primary;
}
pg_t pgb = osdmap.raw_pg_to_pg(pg_t(1, 0));
@@ -285,7 +287,8 @@ TEST_F(OSDMapTest, CleanTemps) {
int up_primary, acting_primary;
osdmap.pg_to_up_acting_osds(pgb, &up_osds, &up_primary,
&acting_osds, &acting_primary);
- pending_inc.new_pg_temp[pgb] = up_osds;
+ pending_inc.new_pg_temp[pgb] = mempool::osdmap::vector<int>(
+ up_osds.begin(), up_osds.end());
pending_inc.new_primary_temp[pgb] = up_primary;
}
@@ -334,7 +337,8 @@ TEST_F(OSDMapTest, KeepsNecessaryTemps) {
if (i == (int)get_num_osds())
FAIL() << "did not find unused OSD for temp mapping";
- pgtemp_map.new_pg_temp[pgid] = up_osds;
+ pgtemp_map.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
+ up_osds.begin(), up_osds.end());
pgtemp_map.new_primary_temp[pgid] = up_osds[1];
osdmap.apply_incremental(pgtemp_map);
diff --git a/src/test/osd/TestPGLog.cc b/src/test/osd/TestPGLog.cc
index b052442e4f8..626c3cef57c 100644
--- a/src/test/osd/TestPGLog.cc
+++ b/src/test/osd/TestPGLog.cc
@@ -24,6 +24,7 @@
#include "gtest/gtest.h"
#include "osd/PGLog.h"
#include "osd/OSDMap.h"
+#include "include/coredumpctl.h"
class PGLogTest : public ::testing::Test, protected PGLog {
public:
@@ -1211,6 +1212,7 @@ TEST_F(PGLogTest, merge_log) {
olog.tail = eversion_t(1, 1);
TestHandler h(remove_snap);
+ PrCtl unset_dumpable;
ASSERT_DEATH(merge_log(oinfo, olog, fromosd, info, &h,
dirty_info, dirty_big_info), "");
}
diff --git a/src/test/osd/osd-dup.sh b/src/test/osd/osd-dup.sh
index 264d27ddfdb..f82a85b9080 100755
--- a/src/test/osd/osd-dup.sh
+++ b/src/test/osd/osd-dup.sh
@@ -12,6 +12,9 @@ function run() {
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
CEPH_ARGS+="--mon-host=$CEPH_MON "
CEPH_ARGS+="--enable-experimental-unrecoverable-data-corrupting-features bluestore "
+ # avoid running out of fds in rados bench
+ CEPH_ARGS+="--filestore_wbthrottle_xfs_ios_hard_limit=900 "
+ CEPH_ARGS+="--filestore_wbthrottle_btrfs_ios_hard_limit=900 "
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
for func in $funcs ; do
setup $dir || return 1
diff --git a/src/test/osd/osd-scrub-snaps.sh b/src/test/osd/osd-scrub-snaps.sh
index 5410e820c4a..1cd9b562e4c 100755
--- a/src/test/osd/osd-scrub-snaps.sh
+++ b/src/test/osd/osd-scrub-snaps.sh
@@ -309,6 +309,20 @@ function TEST_scrub_snaps() {
"name": "obj15"
},
{
+ "extra clones": [
+ 7,
+ 4
+ ],
+ "errors": [
+ "ss_attr_missing",
+ "extra_clones"
+ ],
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "obj2"
+ },
+ {
"errors": [
"size_mismatch"
],
@@ -319,6 +333,18 @@ function TEST_scrub_snaps() {
},
{
"missing": [
+ 7
+ ],
+ "errors": [
+ "clone_missing"
+ ],
+ "snap": "head",
+ "locator": "",
+ "nspace": "",
+ "name": "obj4"
+ },
+ {
+ "missing": [
2,
1
],
@@ -367,32 +393,6 @@ function TEST_scrub_snaps() {
"locator": "",
"nspace": "",
"name": "obj8"
- },
- {
- "extra clones": [
- 7,
- 4
- ],
- "errors": [
- "ss_attr_missing",
- "extra_clones"
- ],
- "snap": "snapdir",
- "locator": "",
- "nspace": "",
- "name": "obj2"
- },
- {
- "missing": [
- 7
- ],
- "errors": [
- "clone_missing"
- ],
- "snap": "snapdir",
- "locator": "",
- "nspace": "",
- "name": "obj4"
}
],
"epoch": 20
@@ -439,17 +439,18 @@ EOF
err_strings[10]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj7:1 is an unexpected clone"
err_strings[11]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj3:head on disk size [(]3840[)] does not match object info size [(]768[)] adjusted for ondisk to [(]768[)]"
err_strings[12]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj6:1 is an unexpected clone"
- err_strings[13]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:snapdir no 'snapset' attr"
+ err_strings[13]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:head no 'snapset' attr"
err_strings[14]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:7 clone ignored due to missing snapset"
err_strings[15]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:4 clone ignored due to missing snapset"
- err_strings[16]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj4:snapdir expected clone .*:::obj4:7"
- err_strings[17]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 .*:::obj4:snapdir 1 missing clone[(]s[)]"
+ err_strings[16]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj4:head expected clone .*:::obj4:7"
+ err_strings[17]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 .*:::obj4:head 1 missing clone[(]s[)]"
err_strings[18]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj1:1 is an unexpected clone"
err_strings[19]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj9:1 is missing in clone_size"
err_strings[20]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj11:1 is an unexpected clone"
err_strings[21]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj14:1 size 1032 != clone_size 1033"
- err_strings[22]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 22 errors"
+ err_strings[22]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 23 errors"
err_strings[23]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj15:head can't decode 'snapset' attr buffer"
+ err_strings[24]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj12:1 has no oi or legacy_snaps; cannot convert 1=[[]1[]]:[[]1[]].stray_clone_snaps=[{]1=[[]1[]][}]"
for i in `seq 0 ${#err_strings[@]}`
do
diff --git a/src/test/osd/types.cc b/src/test/osd/types.cc
index 3a03d606d38..b4616ea4f2b 100644
--- a/src/test/osd/types.cc
+++ b/src/test/osd/types.cc
@@ -1,4 +1,4 @@
-// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
@@ -19,6 +19,7 @@
#include "osd/osd_types.h"
#include "osd/OSDMap.h"
#include "gtest/gtest.h"
+#include "include/coredumpctl.h"
#include "common/Thread.h"
#include "include/stringify.h"
#include "osd/ReplicatedBackend.h"
@@ -122,6 +123,10 @@ TEST(hobject, prefixes5)
TEST(pg_interval_t, check_new_interval)
{
+// iterate through all 4 combinations of (compact, ec_pool)
+for (unsigned i = 0; i < 4; ++i) {
+ bool compact = i & 1;
+ bool ec_pool = i & 2;
//
// Create a situation where osdmaps are the same so that
// each test case can diverge from it using minimal code.
@@ -170,10 +175,10 @@ TEST(pg_interval_t, check_new_interval)
// being split
//
{
- map<epoch_t, pg_interval_t> past_intervals;
+ PastIntervals past_intervals; past_intervals.update_type(ec_pool, compact);
ASSERT_TRUE(past_intervals.empty());
- ASSERT_FALSE(pg_interval_t::check_new_interval(old_primary,
+ ASSERT_FALSE(PastIntervals::check_new_interval(old_primary,
new_primary,
old_acting,
new_acting,
@@ -199,10 +204,10 @@ TEST(pg_interval_t, check_new_interval)
int _new_primary = osd_id + 1;
new_acting.push_back(_new_primary);
- map<epoch_t, pg_interval_t> past_intervals;
+ PastIntervals past_intervals; past_intervals.update_type(ec_pool, compact);
ASSERT_TRUE(past_intervals.empty());
- ASSERT_TRUE(pg_interval_t::check_new_interval(old_primary,
+ ASSERT_TRUE(PastIntervals::check_new_interval(old_primary,
new_primary,
old_acting,
new_acting,
@@ -218,11 +223,6 @@ TEST(pg_interval_t, check_new_interval)
recoverable.get(),
&past_intervals));
old_primary = new_primary;
- ASSERT_EQ((unsigned int)1, past_intervals.size());
- ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
- ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last);
- ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]);
- ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]);
}
//
@@ -233,10 +233,10 @@ TEST(pg_interval_t, check_new_interval)
int _new_primary = osd_id + 1;
new_up.push_back(_new_primary);
- map<epoch_t, pg_interval_t> past_intervals;
+ PastIntervals past_intervals; past_intervals.update_type(ec_pool, compact);
ASSERT_TRUE(past_intervals.empty());
- ASSERT_TRUE(pg_interval_t::check_new_interval(old_primary,
+ ASSERT_TRUE(PastIntervals::check_new_interval(old_primary,
new_primary,
old_acting,
new_acting,
@@ -251,11 +251,6 @@ TEST(pg_interval_t, check_new_interval)
pgid,
recoverable.get(),
&past_intervals));
- ASSERT_EQ((unsigned int)1, past_intervals.size());
- ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
- ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last);
- ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]);
- ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]);
}
//
@@ -265,10 +260,10 @@ TEST(pg_interval_t, check_new_interval)
vector<int> new_up;
int _new_up_primary = osd_id + 1;
- map<epoch_t, pg_interval_t> past_intervals;
+ PastIntervals past_intervals; past_intervals.update_type(ec_pool, compact);
ASSERT_TRUE(past_intervals.empty());
- ASSERT_TRUE(pg_interval_t::check_new_interval(old_primary,
+ ASSERT_TRUE(PastIntervals::check_new_interval(old_primary,
new_primary,
old_acting,
new_acting,
@@ -283,11 +278,6 @@ TEST(pg_interval_t, check_new_interval)
pgid,
recoverable.get(),
&past_intervals));
- ASSERT_EQ((unsigned int)1, past_intervals.size());
- ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
- ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last);
- ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]);
- ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]);
}
//
@@ -304,10 +294,10 @@ TEST(pg_interval_t, check_new_interval)
inc.new_pools[pool_id].set_pg_num(new_pg_num);
osdmap->apply_incremental(inc);
- map<epoch_t, pg_interval_t> past_intervals;
+ PastIntervals past_intervals; past_intervals.update_type(ec_pool, compact);
ASSERT_TRUE(past_intervals.empty());
- ASSERT_TRUE(pg_interval_t::check_new_interval(old_primary,
+ ASSERT_TRUE(PastIntervals::check_new_interval(old_primary,
new_primary,
old_acting,
new_acting,
@@ -322,11 +312,6 @@ TEST(pg_interval_t, check_new_interval)
pgid,
recoverable.get(),
&past_intervals));
- ASSERT_EQ((unsigned int)1, past_intervals.size());
- ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
- ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last);
- ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]);
- ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]);
}
//
@@ -343,10 +328,10 @@ TEST(pg_interval_t, check_new_interval)
inc.new_pools[pool_id].set_pg_num(pg_num);
osdmap->apply_incremental(inc);
- map<epoch_t, pg_interval_t> past_intervals;
+ PastIntervals past_intervals; past_intervals.update_type(ec_pool, compact);
ASSERT_TRUE(past_intervals.empty());
- ASSERT_TRUE(pg_interval_t::check_new_interval(old_primary,
+ ASSERT_TRUE(PastIntervals::check_new_interval(old_primary,
new_primary,
old_acting,
new_acting,
@@ -361,11 +346,6 @@ TEST(pg_interval_t, check_new_interval)
pgid,
recoverable.get(),
&past_intervals));
- ASSERT_EQ((unsigned int)1, past_intervals.size());
- ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
- ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last);
- ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]);
- ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]);
}
//
@@ -375,12 +355,12 @@ TEST(pg_interval_t, check_new_interval)
{
vector<int> old_acting;
- map<epoch_t, pg_interval_t> past_intervals;
+ PastIntervals past_intervals; past_intervals.update_type(ec_pool, compact);
ostringstream out;
ASSERT_TRUE(past_intervals.empty());
- ASSERT_TRUE(pg_interval_t::check_new_interval(old_primary,
+ ASSERT_TRUE(PastIntervals::check_new_interval(old_primary,
new_primary,
old_acting,
new_acting,
@@ -396,8 +376,6 @@ TEST(pg_interval_t, check_new_interval)
recoverable.get(),
&past_intervals,
&out));
- ASSERT_EQ((unsigned int)1, past_intervals.size());
- ASSERT_FALSE(past_intervals[same_interval_since].maybe_went_rw);
ASSERT_NE(string::npos, out.str().find("acting set is too small"));
}
@@ -429,10 +407,10 @@ TEST(pg_interval_t, check_new_interval)
ostringstream out;
- map<epoch_t, pg_interval_t> past_intervals;
+ PastIntervals past_intervals; past_intervals.update_type(ec_pool, compact);
ASSERT_TRUE(past_intervals.empty());
- ASSERT_TRUE(pg_interval_t::check_new_interval(old_primary,
+ ASSERT_TRUE(PastIntervals::check_new_interval(old_primary,
new_primary,
old_acting,
new_acting,
@@ -448,8 +426,6 @@ TEST(pg_interval_t, check_new_interval)
recoverable.get(),
&past_intervals,
&out));
- ASSERT_EQ((unsigned int)1, past_intervals.size());
- ASSERT_FALSE(past_intervals[same_interval_since].maybe_went_rw);
ASSERT_NE(string::npos, out.str().find("acting set is too small"));
}
@@ -464,10 +440,10 @@ TEST(pg_interval_t, check_new_interval)
ostringstream out;
- map<epoch_t, pg_interval_t> past_intervals;
+ PastIntervals past_intervals; past_intervals.update_type(ec_pool, compact);
ASSERT_TRUE(past_intervals.empty());
- ASSERT_TRUE(pg_interval_t::check_new_interval(old_primary,
+ ASSERT_TRUE(PastIntervals::check_new_interval(old_primary,
new_primary,
old_acting,
new_acting,
@@ -483,8 +459,6 @@ TEST(pg_interval_t, check_new_interval)
recoverable.get(),
&past_intervals,
&out));
- ASSERT_EQ((unsigned int)1, past_intervals.size());
- ASSERT_TRUE(past_intervals[same_interval_since].maybe_went_rw);
ASSERT_NE(string::npos, out.str().find("includes interval"));
}
//
@@ -509,10 +483,10 @@ TEST(pg_interval_t, check_new_interval)
ostringstream out;
- map<epoch_t, pg_interval_t> past_intervals;
+ PastIntervals past_intervals; past_intervals.update_type(ec_pool, compact);
ASSERT_TRUE(past_intervals.empty());
- ASSERT_TRUE(pg_interval_t::check_new_interval(old_primary,
+ ASSERT_TRUE(PastIntervals::check_new_interval(old_primary,
new_primary,
old_acting,
new_acting,
@@ -528,8 +502,6 @@ TEST(pg_interval_t, check_new_interval)
recoverable.get(),
&past_intervals,
&out));
- ASSERT_EQ((unsigned int)1, past_intervals.size());
- ASSERT_TRUE(past_intervals[same_interval_since].maybe_went_rw);
ASSERT_NE(string::npos, out.str().find("presumed to have been rw"));
}
@@ -558,10 +530,10 @@ TEST(pg_interval_t, check_new_interval)
ostringstream out;
- map<epoch_t, pg_interval_t> past_intervals;
+ PastIntervals past_intervals; past_intervals.update_type(ec_pool, compact);
ASSERT_TRUE(past_intervals.empty());
- ASSERT_TRUE(pg_interval_t::check_new_interval(old_primary,
+ ASSERT_TRUE(PastIntervals::check_new_interval(old_primary,
new_primary,
old_acting,
new_acting,
@@ -577,10 +549,9 @@ TEST(pg_interval_t, check_new_interval)
recoverable.get(),
&past_intervals,
&out));
- ASSERT_EQ((unsigned int)1, past_intervals.size());
- ASSERT_FALSE(past_intervals[same_interval_since].maybe_went_rw);
ASSERT_NE(string::npos, out.str().find("does not include interval"));
}
+} // end for, didn't want to reindent
}
TEST(pg_t, get_ancestor)
@@ -882,6 +853,7 @@ TEST(pg_missing_t, add_next_event)
EXPECT_TRUE(e.object_is_indexed());
EXPECT_FALSE(e.reqid_is_indexed());
EXPECT_FALSE(missing.is_missing(oid));
+ PrCtl unset_dumpable;
EXPECT_DEATH(missing.add_next_event(e), "");
}
@@ -1034,14 +1006,20 @@ TEST(pg_missing_t, got)
hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
pg_missing_t missing;
// assert if the oid does not exist
- EXPECT_DEATH(missing.got(oid, eversion_t()), "");
+ {
+ PrCtl unset_dumpable;
+ EXPECT_DEATH(missing.got(oid, eversion_t()), "");
+ }
EXPECT_FALSE(missing.is_missing(oid));
epoch_t epoch = 10;
eversion_t need(epoch,10);
missing.add(oid, need, eversion_t());
EXPECT_TRUE(missing.is_missing(oid));
// assert if the version to be removed is lower than the version of the object
- EXPECT_DEATH(missing.got(oid, eversion_t(epoch / 2,20)), "");
+ {
+ PrCtl unset_dumpable;
+ EXPECT_DEATH(missing.got(oid, eversion_t(epoch / 2,20)), "");
+ }
// remove of a later version removes the object
missing.got(oid, eversion_t(epoch * 2,20));
EXPECT_FALSE(missing.is_missing(oid));
@@ -1485,6 +1463,7 @@ TEST(ghobject_t, parse) {
TEST(pool_opts_t, invalid_opt) {
EXPECT_FALSE(pool_opts_t::is_opt_name("INVALID_OPT"));
+ PrCtl unset_dumpable;
EXPECT_DEATH(pool_opts_t::get_opt_desc("INVALID_OPT"), "");
}
@@ -1496,7 +1475,10 @@ TEST(pool_opts_t, scrub_min_interval) {
pool_opts_t opts;
EXPECT_FALSE(opts.is_set(pool_opts_t::SCRUB_MIN_INTERVAL));
- EXPECT_DEATH(opts.get(pool_opts_t::SCRUB_MIN_INTERVAL), "");
+ {
+ PrCtl unset_dumpable;
+ EXPECT_DEATH(opts.get(pool_opts_t::SCRUB_MIN_INTERVAL), "");
+ }
double val;
EXPECT_FALSE(opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &val));
opts.set(pool_opts_t::SCRUB_MIN_INTERVAL, static_cast<double>(2015));
@@ -1514,7 +1496,10 @@ TEST(pool_opts_t, scrub_max_interval) {
pool_opts_t opts;
EXPECT_FALSE(opts.is_set(pool_opts_t::SCRUB_MAX_INTERVAL));
- EXPECT_DEATH(opts.get(pool_opts_t::SCRUB_MAX_INTERVAL), "");
+ {
+ PrCtl unset_dumpable;
+ EXPECT_DEATH(opts.get(pool_opts_t::SCRUB_MAX_INTERVAL), "");
+ }
double val;
EXPECT_FALSE(opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &val));
opts.set(pool_opts_t::SCRUB_MAX_INTERVAL, static_cast<double>(2015));
@@ -1532,7 +1517,10 @@ TEST(pool_opts_t, deep_scrub_interval) {
pool_opts_t opts;
EXPECT_FALSE(opts.is_set(pool_opts_t::DEEP_SCRUB_INTERVAL));
- EXPECT_DEATH(opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL), "");
+ {
+ PrCtl unset_dumpable;
+ EXPECT_DEATH(opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL), "");
+ }
double val;
EXPECT_FALSE(opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &val));
opts.set(pool_opts_t::DEEP_SCRUB_INTERVAL, static_cast<double>(2015));
@@ -1542,6 +1530,299 @@ TEST(pool_opts_t, deep_scrub_interval) {
EXPECT_FALSE(opts.is_set(pool_opts_t::DEEP_SCRUB_INTERVAL));
}
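+// recoverability predicate for the PITest harness below: a PG counts as
+// recoverable once at least required_size shards are available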
+struct RequiredPredicate : IsPGRecoverablePredicate {
+ unsigned required_size;
+ RequiredPredicate(unsigned required_size) : required_size(required_size) {}
+ bool operator()(const set<pg_shard_t> &have) const override {
+ return have.size() >= required_size;
+ }
+};
+
+using namespace std;
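+// stands in for an OSDMap lookup: returns each osd's recorded (state, lost_at) pair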
+struct MapPredicate {
+ map<int, pair<PastIntervals::osd_state_t, epoch_t>> states;
+ MapPredicate(
+ vector<pair<int, pair<PastIntervals::osd_state_t, epoch_t>>> _states)
+ : states(_states.begin(), _states.end()) {}
+ PastIntervals::osd_state_t operator()(epoch_t start, int osd, epoch_t *lost_at) {
+ auto val = states.at(osd);
+ if (lost_at)
+ *lost_at = val.second;
+ return val.first;
+ }
+};
+
+using sit = shard_id_t;
+using PI = PastIntervals;
+using pst = pg_shard_t;
+using ival = PastIntervals::pg_interval_t;
+using ivallst = std::list<ival>;
+const int N = 0x7fffffff /* CRUSH_ITEM_NONE, can't import crush.h here */;
+
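+// run() feeds the same interval history to both the simple and the compact
+// PastIntervals representations and checks that each computes the expected
+// PriorSet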
+struct PITest : ::testing::Test {
+ PITest() {}
+ void run(
+ bool ec_pool,
+ ivallst intervals,
+ epoch_t last_epoch_started,
+ unsigned min_to_peer,
+ vector<pair<int, pair<PastIntervals::osd_state_t, epoch_t>>> osd_states,
+ vector<int> up,
+ vector<int> acting,
+ set<pg_shard_t> probe,
+ set<int> down,
+ map<int, epoch_t> blocked_by,
+ bool pg_down) {
+ RequiredPredicate rec_pred(min_to_peer);
+ MapPredicate map_pred(osd_states);
+
+ PI::PriorSet correct(
+ ec_pool,
+ probe,
+ down,
+ blocked_by,
+ pg_down,
+ new RequiredPredicate(rec_pred));
+
+ PastIntervals simple, compact;
+ simple.update_type(ec_pool, false);
+ compact.update_type(ec_pool, true);
+ for (auto &&i: intervals) {
+ simple.add_interval(ec_pool, i);
+ compact.add_interval(ec_pool, i);
+ }
+ PI::PriorSet simple_ps = simple.get_prior_set(
+ ec_pool,
+ last_epoch_started,
+ new RequiredPredicate(rec_pred),
+ map_pred,
+ up,
+ acting,
+ nullptr);
+ PI::PriorSet compact_ps = compact.get_prior_set(
+ ec_pool,
+ last_epoch_started,
+ new RequiredPredicate(rec_pred),
+ map_pred,
+ up,
+ acting,
+ nullptr);
+ ASSERT_EQ(correct, simple_ps);
+ ASSERT_EQ(correct, compact_ps);
+ }
+};
+
+TEST_F(PITest, past_intervals_rep) {
+ run(
+ /* ec_pool */ false,
+ /* intervals */
+ { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
+ , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
+ , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
+ , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
+ },
+ /* les */ 5,
+ /* min_peer */ 1,
+ /* osd states at end */
+ { make_pair(0, make_pair(PI::UP , 0))
+ , make_pair(1, make_pair(PI::UP , 0))
+ , make_pair(2, make_pair(PI::DOWN , 0))
+ },
+ /* up */ {0, 1 },
+ /* acting */ {0, 1 },
+ /* probe */ {pst(0), pst(1)},
+ /* down */ {2},
+ /* blocked_by */ {},
+ /* pg_down */ false);
+}
+
+TEST_F(PITest, past_intervals_ec) {
+ run(
+ /* ec_pool */ true,
+ /* intervals */
+ { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
+ , ival{{N, 1, 2}, {N, 1, 2}, 21, 30, true, 1, 1}
+ },
+ /* les */ 5,
+ /* min_peer */ 2,
+ /* osd states at end */
+ { make_pair(0, make_pair(PI::DOWN , 0))
+ , make_pair(1, make_pair(PI::UP , 0))
+ , make_pair(2, make_pair(PI::UP , 0))
+ },
+ /* up */ {N, 1, 2},
+ /* acting */ {N, 1, 2},
+ /* probe */ {pst(1, sit(1)), pst(2, sit(2))},
+ /* down */ {0},
+ /* blocked_by */ {},
+ /* pg_down */ false);
+}
+
+TEST_F(PITest, past_intervals_rep_down) {
+ run(
+ /* ec_pool */ false,
+ /* intervals */
+ { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
+ , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
+ , ival{{ 2}, { 2}, 31, 35, true, 2, 2}
+ , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
+ },
+ /* les */ 5,
+ /* min_peer */ 1,
+ /* osd states at end */
+ { make_pair(0, make_pair(PI::UP , 0))
+ , make_pair(1, make_pair(PI::UP , 0))
+ , make_pair(2, make_pair(PI::DOWN , 0))
+ },
+ /* up */ {0, 1 },
+ /* acting */ {0, 1 },
+ /* probe */ {pst(0), pst(1)},
+ /* down */ {2},
+ /* blocked_by */ {{2, 0}},
+ /* pg_down */ true);
+}
+
+TEST_F(PITest, past_intervals_ec_down) {
+ run(
+ /* ec_pool */ true,
+ /* intervals */
+ { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
+ , ival{{N, 1, 2}, {N, 1, 2}, 21, 30, true, 1, 1}
+ , ival{{N, N, 2}, {N, N, 2}, 31, 35, false, 2, 2}
+ },
+ /* les */ 5,
+ /* min_peer */ 2,
+ /* osd states at end */
+ { make_pair(0, make_pair(PI::UP , 0))
+ , make_pair(1, make_pair(PI::DOWN , 0))
+ , make_pair(2, make_pair(PI::UP , 0))
+ },
+ /* up */ {0, N, 2},
+ /* acting */ {0, N, 2},
+ /* probe */ {pst(0, sit(0)), pst(2, sit(2))},
+ /* down */ {1},
+ /* blocked_by */ {{1, 0}},
+ /* pg_down */ true);
+}
+
+TEST_F(PITest, past_intervals_rep_no_subsets) {
+ run(
+ /* ec_pool */ false,
+ /* intervals */
+ { ival{{0, 2}, {0, 2}, 10, 20, true, 0, 0}
+ , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
+ , ival{{0, 1 }, {0, 1 }, 31, 35, true, 0, 0}
+ },
+ /* les */ 5,
+ /* min_peer */ 1,
+ /* osd states at end */
+ { make_pair(0, make_pair(PI::UP , 0))
+ , make_pair(1, make_pair(PI::UP , 0))
+ , make_pair(2, make_pair(PI::DOWN , 0))
+ },
+ /* up */ {0, 1 },
+ /* acting */ {0, 1 },
+ /* probe */ {pst(0), pst(1)},
+ /* down */ {2},
+ /* blocked_by */ {},
+ /* pg_down */ false);
+}
+
+TEST_F(PITest, past_intervals_ec_no_subsets) {
+ run(
+ /* ec_pool */ true,
+ /* intervals */
+ { ival{{0, N, 2}, {0, N, 2}, 10, 20, true, 0, 0}
+ , ival{{N, 1, 2}, {N, 1, 2}, 21, 30, true, 1, 1}
+ , ival{{0, 1, N}, {0, 1, N}, 31, 35, true, 0, 0}
+ },
+ /* les */ 5,
+ /* min_peer */ 2,
+ /* osd states at end */
+ { make_pair(0, make_pair(PI::UP , 0))
+ , make_pair(1, make_pair(PI::DOWN , 0))
+ , make_pair(2, make_pair(PI::UP , 0))
+ },
+ /* up */ {0, N, 2},
+ /* acting */ {0, N, 2},
+ /* probe */ {pst(0, sit(0)), pst(2, sit(2))},
+ /* down */ {1},
+ /* blocked_by */ {{1, 0}},
+ /* pg_down */ true);
+}
+
+TEST_F(PITest, past_intervals_ec_no_subsets2) {
+ run(
+ /* ec_pool */ true,
+ /* intervals */
+ { ival{{N, 1, 2}, {N, 1, 2}, 10, 20, true, 0, 0}
+ , ival{{0, N, 2}, {0, N, 2}, 21, 30, true, 1, 1}
+ , ival{{0, 3, N}, {0, 3, N}, 31, 35, true, 0, 0}
+ },
+ /* les */ 31,
+ /* min_peer */ 2,
+ /* osd states at end */
+ { make_pair(0, make_pair(PI::UP , 0))
+ , make_pair(1, make_pair(PI::DOWN , 0))
+ , make_pair(2, make_pair(PI::UP , 0))
+ , make_pair(3, make_pair(PI::UP , 0))
+ },
+ /* up */ {0, N, 2},
+ /* acting */ {0, N, 2},
+ /* probe */ {pst(0, sit(0)), pst(2, sit(2)), pst(3, sit(1))},
+ /* down */ {1},
+ /* blocked_by */ {},
+ /* pg_down */ false);
+}
+
+TEST_F(PITest, past_intervals_rep_lost) {
+ run(
+ /* ec_pool */ false,
+ /* intervals */
+ { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
+ , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
+ , ival{{ 2}, { 2}, 31, 35, true, 2, 2}
+ , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
+ },
+ /* les */ 5,
+ /* min_peer */ 1,
+ /* osd states at end */
+ { make_pair(0, make_pair(PI::UP , 0))
+ , make_pair(1, make_pair(PI::UP , 0))
+ , make_pair(2, make_pair(PI::LOST , 55))
+ },
+ /* up */ {0, 1 },
+ /* acting */ {0, 1 },
+ /* probe */ {pst(0), pst(1)},
+ /* down */ {2},
+ /* blocked_by */ {},
+ /* pg_down */ false);
+}
+
+TEST_F(PITest, past_intervals_ec_lost) {
+ run(
+ /* ec_pool */ true,
+ /* intervals */
+ { ival{{0, N, 2}, {0, N, 2}, 10, 20, true, 0, 0}
+ , ival{{N, 1, 2}, {N, 1, 2}, 21, 30, true, 1, 1}
+ , ival{{0, 1, N}, {0, 1, N}, 31, 35, true, 0, 0}
+ },
+ /* les */ 5,
+ /* min_peer */ 2,
+ /* osd states at end */
+ { make_pair(0, make_pair(PI::UP , 0))
+ , make_pair(1, make_pair(PI::LOST , 36))
+ , make_pair(2, make_pair(PI::UP , 0))
+ },
+ /* up */ {0, N, 2},
+ /* acting */ {0, N, 2},
+ /* probe */ {pst(0, sit(0)), pst(2, sit(2))},
+ /* down */ {1},
+ /* blocked_by */ {},
+ /* pg_down */ false);
+}
+
+
/*
* Local Variables:
* compile-command: "cd ../.. ;
diff --git a/src/test/osdc/FakeWriteback.cc b/src/test/osdc/FakeWriteback.cc
index 47e778ef021..72e80e433f6 100644
--- a/src/test/osdc/FakeWriteback.cc
+++ b/src/test/osdc/FakeWriteback.cc
@@ -62,7 +62,9 @@ void FakeWriteback::read(const object_t& oid, uint64_t object_no,
const object_locator_t& oloc,
uint64_t off, uint64_t len, snapid_t snapid,
bufferlist *pbl, uint64_t trunc_size,
- __u32 trunc_seq, int op_flags, Context *onfinish)
+ __u32 trunc_seq, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *onfinish)
{
C_Delay *wrapper = new C_Delay(m_cct, onfinish, m_lock, off, pbl,
m_delay_ns);
@@ -75,7 +77,9 @@ ceph_tid_t FakeWriteback::write(const object_t& oid,
const SnapContext& snapc,
const bufferlist &bl, ceph::real_time mtime,
uint64_t trunc_size, __u32 trunc_seq,
- ceph_tid_t journal_tid, Context *oncommit)
+ ceph_tid_t journal_tid,
+ const ZTracer::Trace &parent_trace,
+ Context *oncommit)
{
C_Delay *wrapper = new C_Delay(m_cct, oncommit, m_lock, off, NULL,
m_delay_ns);
diff --git a/src/test/osdc/FakeWriteback.h b/src/test/osdc/FakeWriteback.h
index d6d0235a4bf..6112eb72082 100644
--- a/src/test/osdc/FakeWriteback.h
+++ b/src/test/osdc/FakeWriteback.h
@@ -20,13 +20,16 @@ public:
void read(const object_t& oid, uint64_t object_no,
const object_locator_t& oloc, uint64_t off, uint64_t len,
snapid_t snapid, bufferlist *pbl, uint64_t trunc_size,
- __u32 trunc_seq, int op_flags, Context *onfinish) override;
+ __u32 trunc_seq, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *onfinish) override;
ceph_tid_t write(const object_t& oid, const object_locator_t& oloc,
uint64_t off, uint64_t len,
const SnapContext& snapc, const bufferlist &bl,
ceph::real_time mtime, uint64_t trunc_size,
__u32 trunc_seq, ceph_tid_t journal_tid,
+ const ZTracer::Trace &parent_trace,
Context *oncommit) override;
using WritebackHandler::write;
diff --git a/src/test/osdc/MemWriteback.cc b/src/test/osdc/MemWriteback.cc
index 801d1556082..e9e1f9fe345 100644
--- a/src/test/osdc/MemWriteback.cc
+++ b/src/test/osdc/MemWriteback.cc
@@ -92,7 +92,9 @@ void MemWriteback::read(const object_t& oid, uint64_t object_no,
const object_locator_t& oloc,
uint64_t off, uint64_t len, snapid_t snapid,
bufferlist *pbl, uint64_t trunc_size,
- __u32 trunc_seq, int op_flags, Context *onfinish)
+ __u32 trunc_seq, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *onfinish)
{
assert(snapid == CEPH_NOSNAP);
C_DelayRead *wrapper = new C_DelayRead(this, m_cct, onfinish, m_lock, oid,
@@ -106,7 +108,9 @@ ceph_tid_t MemWriteback::write(const object_t& oid,
const SnapContext& snapc,
const bufferlist &bl, ceph::real_time mtime,
uint64_t trunc_size, __u32 trunc_seq,
- ceph_tid_t journal_tid, Context *oncommit)
+ ceph_tid_t journal_tid,
+ const ZTracer::Trace &parent_trace,
+ Context *oncommit)
{
assert(snapc.seq == 0);
C_DelayWrite *wrapper = new C_DelayWrite(this, m_cct, oncommit, m_lock, oid,
diff --git a/src/test/osdc/MemWriteback.h b/src/test/osdc/MemWriteback.h
index 726f297d4ca..a073cbf1760 100644
--- a/src/test/osdc/MemWriteback.h
+++ b/src/test/osdc/MemWriteback.h
@@ -20,13 +20,16 @@ public:
void read(const object_t& oid, uint64_t object_no,
const object_locator_t& oloc, uint64_t off, uint64_t len,
snapid_t snapid, bufferlist *pbl, uint64_t trunc_size,
- __u32 trunc_seq, int op_flags, Context *onfinish) override;
+ __u32 trunc_seq, int op_flags,
+ const ZTracer::Trace &parent_trace,
+ Context *onfinish) override;
ceph_tid_t write(const object_t& oid, const object_locator_t& oloc,
uint64_t off, uint64_t len,
const SnapContext& snapc, const bufferlist &bl,
ceph::real_time mtime, uint64_t trunc_size,
__u32 trunc_seq, ceph_tid_t journal_tid,
+ const ZTracer::Trace &parent_trace,
Context *oncommit) override;
using WritebackHandler::write;
diff --git a/src/test/perf_counters.cc b/src/test/perf_counters.cc
index bb451ea1f1d..3cf987fd0e9 100644
--- a/src/test/perf_counters.cc
+++ b/src/test/perf_counters.cc
@@ -182,8 +182,7 @@ TEST(PerfCounters, MultiplePerfCounters) {
ASSERT_EQ(sd("{\"test_perfcounter_1\":{\"element1\":13,\"element2\":0.000000000,"
"\"element3\":{\"avgcount\":0,\"sum\":0.000000000}}}"), msg);
ASSERT_EQ("", client.do_request("{ \"prefix\": \"perf schema\", \"format\": \"json\" }", &msg));
- ASSERT_EQ(sd("{\"test_perfcounter_1\":{\"element1\":{\"type\":2,\"description\":\"\",\"nick\":\"\"},"
- "\"element2\":{\"type\":1,\"description\":\"\",\"nick\":\"\"},\"element3\":{\"type\":5,\"description\":\"\",\"nick\":\"\"}}}"), msg);
+ ASSERT_EQ(sd("{\"test_perfcounter_1\":{\"element1\":{\"type\":2,\"metric_type\":\"gauge\",\"value_type\":\"integer\",\"description\":\"\",\"nick\":\"\"},\"element2\":{\"type\":1,\"metric_type\":\"gauge\",\"value_type\":\"real\",\"description\":\"\",\"nick\":\"\"},\"element3\":{\"type\":5,\"metric_type\":\"gauge\",\"value_type\":\"real-integer-pair\",\"description\":\"\",\"nick\":\"\"}}}"), msg);
coll->clear();
ASSERT_EQ("", client.do_request("{ \"prefix\": \"perf dump\", \"format\": \"json\" }", &msg));
ASSERT_EQ("{}", msg);
diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py
index 0c9cc7524c5..33de0b3a4f5 100755
--- a/src/test/pybind/test_ceph_argparse.py
+++ b/src/test/pybind/test_ceph_argparse.py
@@ -1178,6 +1178,9 @@ class TestConfigKey(TestArgparse):
def test_exists(self):
self.check_1_string_arg('config-key', 'exists')
+ def test_dump(self):
+ self.check_no_arg('config-key', 'dump')
+
def test_list(self):
self.check_no_arg('config-key', 'list')
# Local Variables:
diff --git a/src/test/pybind/test_rados.py b/src/test/pybind/test_rados.py
index 50b09053cdd..a4e1efd852e 100644
--- a/src/test/pybind/test_rados.py
+++ b/src/test/pybind/test_rados.py
@@ -11,7 +11,7 @@ import errno
import sys
# Are we running Python 2.x
-_python2 = sys.hexversion < 0x03000000
+_python2 = sys.version_info[0] < 3
def test_rados_init_error():
assert_raises(Error, Rados, conffile='', rados_id='admin',
diff --git a/src/test/pybind/test_rbd.py b/src/test/pybind/test_rbd.py
index c3eb371d5e6..b01b7390bfd 100644
--- a/src/test/pybind/test_rbd.py
+++ b/src/test/pybind/test_rbd.py
@@ -938,6 +938,7 @@ class TestClone(object):
eq(pool, pool_name)
eq(image, image_name)
eq(snap, 'snap1')
+ eq(self.image.id(), self.clone.parent_id())
# create a new pool...
pool_name2 = get_temp_pool_name()
@@ -954,6 +955,7 @@ class TestClone(object):
eq(pool, pool_name)
eq(image, image_name)
eq(snap, 'snap1')
+ eq(self.image.id(), self.other_clone.parent_id())
# can't unprotect snap with children
assert_raises(ImageBusy, self.image.unprotect_snap, 'snap1')
@@ -1127,6 +1129,8 @@ class TestClone(object):
clone.flatten()
assert_raises(ImageNotFound, clone.parent_info)
assert_raises(ImageNotFound, clone2.parent_info)
+ assert_raises(ImageNotFound, clone.parent_id)
+ assert_raises(ImageNotFound, clone2.parent_id)
after_flatten = clone.read(IMG_SIZE // 2, 256)
eq(data, after_flatten)
after_flatten = clone2.read(IMG_SIZE // 2, 256)
@@ -1220,6 +1224,7 @@ class TestExclusiveLock(object):
image1.write(data, 0)
image2.flatten()
assert_raises(ImageNotFound, image1.parent_info)
+ assert_raises(ImageNotFound, image1.parent_id)
parent = True
for x in range(30):
try:
@@ -1496,6 +1501,21 @@ class TestTrash(object):
RBD().trash_move(ioctx, image_name, 0)
RBD().trash_remove(ioctx, image_id)
+ def test_get(self):
+ create_image()
+ with Image(ioctx, image_name) as image:
+ image_id = image.id()
+
+ RBD().trash_move(ioctx, image_name, 1000)
+
+ info = RBD().trash_get(ioctx, image_id)
+ eq(image_id, info['id'])
+ eq(image_name, info['name'])
+ eq('USER', info['source'])
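+        # trash_move() above used a non-zero delay (1000s), so deferment must end after deletion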
+ assert(info['deferment_end_time'] > info['deletion_time'])
+
+ RBD().trash_remove(ioctx, image_id, True)
+
def test_list(self):
create_image()
with Image(ioctx, image_name) as image:
diff --git a/src/test/rbd_mirror/CMakeLists.txt b/src/test/rbd_mirror/CMakeLists.txt
index 291b457bb60..ba5e13f3e1f 100644
--- a/src/test/rbd_mirror/CMakeLists.txt
+++ b/src/test/rbd_mirror/CMakeLists.txt
@@ -26,6 +26,7 @@ add_executable(unittest_rbd_mirror
image_replayer/test_mock_BootstrapRequest.cc
image_replayer/test_mock_CreateImageRequest.cc
image_replayer/test_mock_EventPreprocessor.cc
+ image_replayer/test_mock_PrepareLocalImageRequest.cc
image_sync/test_mock_ImageCopyRequest.cc
image_sync/test_mock_ObjectCopyRequest.cc
image_sync/test_mock_SnapshotCopyRequest.cc
diff --git a/src/test/rbd_mirror/image_replayer/test_mock_BootstrapRequest.cc b/src/test/rbd_mirror/image_replayer/test_mock_BootstrapRequest.cc
index 4da8665799b..a7c7b27b7a4 100644
--- a/src/test/rbd_mirror/image_replayer/test_mock_BootstrapRequest.cc
+++ b/src/test/rbd_mirror/image_replayer/test_mock_BootstrapRequest.cc
@@ -3,7 +3,6 @@
#include "test/rbd_mirror/test_mock_fixture.h"
#include "librbd/journal/TypeTraits.h"
-#include "tools/rbd_mirror/ImageSync.h"
#include "tools/rbd_mirror/ImageSyncThrottler.h"
#include "tools/rbd_mirror/Threads.h"
#include "tools/rbd_mirror/image_replayer/BootstrapRequest.h"
@@ -45,40 +44,6 @@ namespace mirror {
class ProgressContext;
template<>
-struct ImageSync<librbd::MockTestImageCtx> {
- static ImageSync* s_instance;
- Context *on_finish = nullptr;
-
- static ImageSync* create(librbd::MockTestImageCtx *local_image_ctx,
- librbd::MockTestImageCtx *remote_image_ctx,
- SafeTimer *timer, Mutex *timer_lock,
- const std::string &mirror_uuid,
- ::journal::MockJournaler *journaler,
- librbd::journal::MirrorPeerClientMeta *client_meta,
- ContextWQ *work_queue, Context *on_finish,
- ProgressContext *progress_ctx = nullptr) {
- assert(s_instance != nullptr);
- return s_instance;
- }
-
- ImageSync() {
- assert(s_instance == nullptr);
- s_instance = this;
- }
-
- void put() {
- }
-
- void get() {
- }
-
- MOCK_METHOD0(send, void());
- MOCK_METHOD0(cancel, void());
-};
-
-ImageSync<librbd::MockTestImageCtx>* ImageSync<librbd::MockTestImageCtx>::s_instance = nullptr;
-
-template<>
struct ImageSyncThrottler<librbd::MockTestImageCtx> {
MOCK_METHOD10(start_sync, void(librbd::MockTestImageCtx *local_image_ctx,
librbd::MockTestImageCtx *remote_image_ctx,
@@ -124,6 +89,7 @@ struct CloseImageRequest<librbd::MockTestImageCtx> {
template<>
struct CreateImageRequest<librbd::MockTestImageCtx> {
static CreateImageRequest* s_instance;
+ std::string *local_image_id = nullptr;
Context *on_finish = nullptr;
static CreateImageRequest* create(librados::IoCtx &local_io_ctx,
@@ -135,7 +101,7 @@ struct CreateImageRequest<librbd::MockTestImageCtx> {
std::string *local_image_id,
Context *on_finish) {
assert(s_instance != nullptr);
- *local_image_id = "local image id";
+ s_instance->local_image_id = local_image_id;
s_instance->on_finish = on_finish;
return s_instance;
}
@@ -278,6 +244,7 @@ public:
typedef ImageSyncThrottlerRef<librbd::MockTestImageCtx> MockImageSyncThrottler;
typedef BootstrapRequest<librbd::MockTestImageCtx> MockBootstrapRequest;
typedef CloseImageRequest<librbd::MockTestImageCtx> MockCloseImageRequest;
+ typedef CreateImageRequest<librbd::MockTestImageCtx> MockCreateImageRequest;
typedef IsPrimaryRequest<librbd::MockTestImageCtx> MockIsPrimaryRequest;
typedef OpenImageRequest<librbd::MockTestImageCtx> MockOpenImageRequest;
typedef OpenLocalImageRequest<librbd::MockTestImageCtx> MockOpenLocalImageRequest;
@@ -297,25 +264,6 @@ public:
ASSERT_EQ(0, open_image(m_local_io_ctx, m_image_name, &m_local_image_ctx));
}
- void expect_mirror_image_get_image_id(librados::IoCtx &io_ctx,
- const std::string &global_image_id,
- const std::string &image_id, int r) {
- bufferlist in_bl;
- ::encode(global_image_id, in_bl);
-
- bufferlist bl;
- ::encode(image_id, bl);
-
- EXPECT_CALL(get_mock_io_ctx(io_ctx),
- exec(RBD_MIRRORING, _, StrEq("rbd"),
- StrEq("mirror_image_get_image_id"), ContentsEqual(in_bl),
- _, _))
- .WillOnce(DoAll(WithArg<5>(Invoke([bl](bufferlist *out_bl) {
- *out_bl = bl;
- })),
- Return(r)));
- }
-
void expect_journaler_get_client(::journal::MockJournaler &mock_journaler,
const std::string &client_id,
cls::journal::Client &client, int r) {
@@ -340,6 +288,18 @@ public:
}))));
}
+ void expect_journaler_register_client(::journal::MockJournaler &mock_journaler,
+ const librbd::journal::ClientData &client_data,
+ int r) {
+ bufferlist bl;
+ ::encode(client_data, bl);
+
+ EXPECT_CALL(mock_journaler, register_client(ContentsEqual(bl), _))
+ .WillOnce(WithArg<1>(Invoke([this, r](Context *on_finish) {
+ m_threads->work_queue->queue(on_finish, r);
+ })));
+ }
+
void expect_journaler_update_client(::journal::MockJournaler &mock_journaler,
const librbd::journal::ClientData &client_data,
int r) {
@@ -365,12 +325,12 @@ public:
void expect_open_local_image(MockOpenLocalImageRequest &mock_open_local_image_request,
librados::IoCtx &io_ctx, const std::string &image_id,
- librbd::MockTestImageCtx &mock_image_ctx, int r) {
+ librbd::MockTestImageCtx *mock_image_ctx, int r) {
EXPECT_CALL(mock_open_local_image_request,
construct(IsSameIoCtx(&io_ctx), image_id));
EXPECT_CALL(mock_open_local_image_request, send())
- .WillOnce(Invoke([this, &mock_open_local_image_request, &mock_image_ctx, r]() {
- *mock_open_local_image_request.image_ctx = &mock_image_ctx;
+ .WillOnce(Invoke([this, &mock_open_local_image_request, mock_image_ctx, r]() {
+ *mock_open_local_image_request.image_ctx = mock_image_ctx;
m_threads->work_queue->queue(mock_open_local_image_request.on_finish,
r);
}));
@@ -412,6 +372,25 @@ public:
Return(r)));
}
+ void expect_create_image(MockCreateImageRequest &mock_create_image_request,
+ const std::string &image_id, int r) {
+ EXPECT_CALL(mock_create_image_request, send())
+ .WillOnce(Invoke([this, &mock_create_image_request, image_id, r]() {
+ *mock_create_image_request.local_image_id = image_id;
+ m_threads->work_queue->queue(mock_create_image_request.on_finish, r);
+ }));
+ }
+
+ void expect_image_sync(MockImageSyncThrottler image_sync_throttler,
+ int r) {
+ EXPECT_CALL(*image_sync_throttler, start_sync(_, _, _, _,
+ StrEq("local mirror uuid"),
+ _, _, _, _, _))
+ .WillOnce(WithArg<8>(Invoke([this, r](Context *on_finish) {
+ m_threads->work_queue->queue(on_finish, r);
+ })));
+ }
+
bufferlist encode_tag_data(const librbd::journal::TagData &tag_data) {
bufferlist bl;
::encode(tag_data, bl);
@@ -420,6 +399,7 @@ public:
MockBootstrapRequest *create_request(MockImageSyncThrottler mock_image_sync_throttler,
::journal::MockJournaler &mock_journaler,
+ const std::string &local_image_id,
const std::string &remote_image_id,
const std::string &global_image_id,
const std::string &local_mirror_uuid,
@@ -428,7 +408,8 @@ public:
return new MockBootstrapRequest(m_local_io_ctx,
m_remote_io_ctx,
mock_image_sync_throttler,
- &m_local_test_image_ctx, "",
+ &m_local_test_image_ctx,
+ local_image_id,
remote_image_id,
global_image_id,
m_threads->work_queue,
@@ -453,11 +434,6 @@ TEST_F(TestMockImageReplayerBootstrapRequest, NonPrimaryRemoteSyncingState) {
InSequence seq;
- // look up local image by global image id
- librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
- expect_mirror_image_get_image_id(m_local_io_ctx, "global image id",
- mock_local_image_ctx.id, 0);
-
// lookup remote image tag class
cls::journal::Client client;
librbd::journal::ClientData client_data{
@@ -469,6 +445,7 @@ TEST_F(TestMockImageReplayerBootstrapRequest, NonPrimaryRemoteSyncingState) {
client, 0);
// lookup local peer in remote journal
+ librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta{
mock_local_image_ctx.id};
mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_SYNCING;
@@ -499,9 +476,9 @@ TEST_F(TestMockImageReplayerBootstrapRequest, NonPrimaryRemoteSyncingState) {
MockImageSyncThrottler mock_image_sync_throttler(
new ImageSyncThrottler<librbd::MockTestImageCtx>());
MockBootstrapRequest *request = create_request(
- mock_image_sync_throttler, mock_journaler, mock_remote_image_ctx.id,
- "global image id", "local mirror uuid", "remote mirror uuid",
- &ctx);
+ mock_image_sync_throttler, mock_journaler, mock_local_image_ctx.id,
+ mock_remote_image_ctx.id, "global image id", "local mirror uuid",
+ "remote mirror uuid", &ctx);
request->send();
ASSERT_EQ(-EREMOTEIO, ctx.wait());
}
@@ -511,11 +488,6 @@ TEST_F(TestMockImageReplayerBootstrapRequest, RemoteDemotePromote) {
InSequence seq;
- // look up local image by global image id
- librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
- expect_mirror_image_get_image_id(m_local_io_ctx, "global image id",
- mock_local_image_ctx.id, 0);
-
// lookup remote image tag class
cls::journal::Client client;
librbd::journal::ClientData client_data{
@@ -527,6 +499,7 @@ TEST_F(TestMockImageReplayerBootstrapRequest, RemoteDemotePromote) {
client, 0);
// lookup local peer in remote journal
+ librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta{
mock_local_image_ctx.id};
mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
@@ -549,7 +522,7 @@ TEST_F(TestMockImageReplayerBootstrapRequest, RemoteDemotePromote) {
mock_local_image_ctx.journal = &mock_journal;
MockOpenLocalImageRequest mock_open_local_image_request;
expect_open_local_image(mock_open_local_image_request, m_local_io_ctx,
- mock_local_image_ctx.id, mock_local_image_ctx, 0);
+ mock_local_image_ctx.id, &mock_local_image_ctx, 0);
expect_is_resync_requested(mock_journal, false, 0);
// remote demotion / promotion event
@@ -578,9 +551,9 @@ TEST_F(TestMockImageReplayerBootstrapRequest, RemoteDemotePromote) {
MockImageSyncThrottler mock_image_sync_throttler(
new ImageSyncThrottler<librbd::MockTestImageCtx>());
MockBootstrapRequest *request = create_request(
- mock_image_sync_throttler, mock_journaler, mock_remote_image_ctx.id,
- "global image id", "local mirror uuid", "remote mirror uuid",
- &ctx);
+ mock_image_sync_throttler, mock_journaler, mock_local_image_ctx.id,
+ mock_remote_image_ctx.id, "global image id", "local mirror uuid",
+ "remote mirror uuid", &ctx);
request->send();
ASSERT_EQ(0, ctx.wait());
}
@@ -590,11 +563,6 @@ TEST_F(TestMockImageReplayerBootstrapRequest, MultipleRemoteDemotePromotes) {
InSequence seq;
- // look up local image by global image id
- librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
- expect_mirror_image_get_image_id(m_local_io_ctx, "global image id",
- mock_local_image_ctx.id, 0);
-
// lookup remote image tag class
cls::journal::Client client;
librbd::journal::ClientData client_data{
@@ -606,6 +574,7 @@ TEST_F(TestMockImageReplayerBootstrapRequest, MultipleRemoteDemotePromotes) {
client, 0);
// lookup local peer in remote journal
+ librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta{
mock_local_image_ctx.id};
mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
@@ -628,7 +597,7 @@ TEST_F(TestMockImageReplayerBootstrapRequest, MultipleRemoteDemotePromotes) {
mock_local_image_ctx.journal = &mock_journal;
MockOpenLocalImageRequest mock_open_local_image_request;
expect_open_local_image(mock_open_local_image_request, m_local_io_ctx,
- mock_local_image_ctx.id, mock_local_image_ctx, 0);
+ mock_local_image_ctx.id, &mock_local_image_ctx, 0);
expect_is_resync_requested(mock_journal, false, 0);
// remote demotion / promotion event
@@ -667,9 +636,9 @@ TEST_F(TestMockImageReplayerBootstrapRequest, MultipleRemoteDemotePromotes) {
MockImageSyncThrottler mock_image_sync_throttler(
new ImageSyncThrottler<librbd::MockTestImageCtx>());
MockBootstrapRequest *request = create_request(
- mock_image_sync_throttler, mock_journaler, mock_remote_image_ctx.id,
- "global image id", "local mirror uuid", "remote mirror uuid",
- &ctx);
+ mock_image_sync_throttler, mock_journaler, mock_local_image_ctx.id,
+ mock_remote_image_ctx.id, "global image id", "local mirror uuid",
+ "remote mirror uuid", &ctx);
request->send();
ASSERT_EQ(0, ctx.wait());
}
@@ -679,11 +648,6 @@ TEST_F(TestMockImageReplayerBootstrapRequest, LocalDemoteRemotePromote) {
InSequence seq;
- // look up local image by global image id
- librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
- expect_mirror_image_get_image_id(m_local_io_ctx, "global image id",
- mock_local_image_ctx.id, 0);
-
// lookup remote image tag class
cls::journal::Client client;
librbd::journal::ClientData client_data{
@@ -695,6 +659,7 @@ TEST_F(TestMockImageReplayerBootstrapRequest, LocalDemoteRemotePromote) {
client, 0);
// lookup local peer in remote journal
+ librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta{
mock_local_image_ctx.id};
mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
@@ -717,7 +682,7 @@ TEST_F(TestMockImageReplayerBootstrapRequest, LocalDemoteRemotePromote) {
mock_local_image_ctx.journal = &mock_journal;
MockOpenLocalImageRequest mock_open_local_image_request;
expect_open_local_image(mock_open_local_image_request, m_local_io_ctx,
- mock_local_image_ctx.id, mock_local_image_ctx, 0);
+ mock_local_image_ctx.id, &mock_local_image_ctx, 0);
expect_is_resync_requested(mock_journal, false, 0);
// remote demotion / promotion event
@@ -744,9 +709,9 @@ TEST_F(TestMockImageReplayerBootstrapRequest, LocalDemoteRemotePromote) {
MockImageSyncThrottler mock_image_sync_throttler(
new ImageSyncThrottler<librbd::MockTestImageCtx>());
MockBootstrapRequest *request = create_request(
- mock_image_sync_throttler, mock_journaler, mock_remote_image_ctx.id,
- "global image id", "local mirror uuid", "remote mirror uuid",
- &ctx);
+ mock_image_sync_throttler, mock_journaler, mock_local_image_ctx.id,
+ mock_remote_image_ctx.id, "global image id", "local mirror uuid",
+ "remote mirror uuid", &ctx);
request->send();
ASSERT_EQ(0, ctx.wait());
}
@@ -756,11 +721,6 @@ TEST_F(TestMockImageReplayerBootstrapRequest, SplitBrainForcePromote) {
InSequence seq;
- // look up local image by global image id
- librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
- expect_mirror_image_get_image_id(m_local_io_ctx, "global image id",
- mock_local_image_ctx.id, 0);
-
// lookup remote image tag class
cls::journal::Client client;
librbd::journal::ClientData client_data{
@@ -772,6 +732,7 @@ TEST_F(TestMockImageReplayerBootstrapRequest, SplitBrainForcePromote) {
client, 0);
// lookup local peer in remote journal
+ librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta{
mock_local_image_ctx.id};
mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
@@ -794,7 +755,7 @@ TEST_F(TestMockImageReplayerBootstrapRequest, SplitBrainForcePromote) {
mock_local_image_ctx.journal = &mock_journal;
MockOpenLocalImageRequest mock_open_local_image_request;
expect_open_local_image(mock_open_local_image_request, m_local_io_ctx,
- mock_local_image_ctx.id, mock_local_image_ctx, 0);
+ mock_local_image_ctx.id, &mock_local_image_ctx, 0);
expect_is_resync_requested(mock_journal, false, 0);
// remote demotion / promotion event
@@ -820,9 +781,9 @@ TEST_F(TestMockImageReplayerBootstrapRequest, SplitBrainForcePromote) {
MockImageSyncThrottler mock_image_sync_throttler(
new ImageSyncThrottler<librbd::MockTestImageCtx>());
MockBootstrapRequest *request = create_request(
- mock_image_sync_throttler, mock_journaler, mock_remote_image_ctx.id,
- "global image id", "local mirror uuid", "remote mirror uuid",
- &ctx);
+ mock_image_sync_throttler, mock_journaler, mock_local_image_ctx.id,
+ mock_remote_image_ctx.id, "global image id", "local mirror uuid",
+ "remote mirror uuid", &ctx);
request->send();
ASSERT_EQ(-EEXIST, ctx.wait());
ASSERT_EQ(NULL, m_local_test_image_ctx);
@@ -833,11 +794,6 @@ TEST_F(TestMockImageReplayerBootstrapRequest, ResyncRequested) {
InSequence seq;
- // look up local image by global image id
- librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
- expect_mirror_image_get_image_id(m_local_io_ctx, "global image id",
- mock_local_image_ctx.id, 0);
-
// lookup remote image tag class
cls::journal::Client client;
librbd::journal::ClientData client_data{
@@ -849,6 +805,7 @@ TEST_F(TestMockImageReplayerBootstrapRequest, ResyncRequested) {
client, 0);
// lookup local peer in remote journal
+ librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta{
mock_local_image_ctx.id};
mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
@@ -871,7 +828,7 @@ TEST_F(TestMockImageReplayerBootstrapRequest, ResyncRequested) {
mock_local_image_ctx.journal = &mock_journal;
MockOpenLocalImageRequest mock_open_local_image_request;
expect_open_local_image(mock_open_local_image_request, m_local_io_ctx,
- mock_local_image_ctx.id, mock_local_image_ctx, 0);
+ mock_local_image_ctx.id, &mock_local_image_ctx, 0);
// resync is requested
expect_is_resync_requested(mock_journal, true, 0);
@@ -884,15 +841,163 @@ TEST_F(TestMockImageReplayerBootstrapRequest, ResyncRequested) {
MockImageSyncThrottler mock_image_sync_throttler(
new ImageSyncThrottler<librbd::MockTestImageCtx>());
MockBootstrapRequest *request = create_request(
- mock_image_sync_throttler, mock_journaler, mock_remote_image_ctx.id,
- "global image id", "local mirror uuid", "remote mirror uuid",
- &ctx);
+ mock_image_sync_throttler, mock_journaler, mock_local_image_ctx.id,
+ mock_remote_image_ctx.id, "global image id", "local mirror uuid",
+ "remote mirror uuid", &ctx);
m_do_resync = false;
request->send();
ASSERT_EQ(0, ctx.wait());
ASSERT_TRUE(m_do_resync);
}
+TEST_F(TestMockImageReplayerBootstrapRequest, PrimaryRemote) {
+ create_local_image();
+
+ InSequence seq;
+
+ // lookup remote image tag class
+ cls::journal::Client client;
+ librbd::journal::ClientData client_data{
+ librbd::journal::ImageClientMeta{123}};
+ ::encode(client_data, client.data);
+ ::journal::MockJournaler mock_journaler;
+ expect_journaler_get_client(mock_journaler,
+ librbd::Journal<>::IMAGE_CLIENT_ID,
+ client, 0);
+
+ // lookup local peer in remote journal
+ client = {};
+ expect_journaler_get_client(mock_journaler, "local mirror uuid",
+ client, -ENOENT);
+
+ // register missing client in remote journal
+ librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta;
+ client_data.client_meta = mirror_peer_client_meta;
+ expect_journaler_register_client(mock_journaler, client_data, 0);
+
+ // open the remote image
+ librbd::MockJournal mock_journal;
+ librbd::MockTestImageCtx mock_remote_image_ctx(*m_remote_image_ctx);
+ MockOpenImageRequest mock_open_image_request;
+ expect_open_image(mock_open_image_request, m_remote_io_ctx,
+ mock_remote_image_ctx.id, mock_remote_image_ctx, 0);
+ MockIsPrimaryRequest mock_is_primary_request;
+ expect_is_primary(mock_is_primary_request, true, 0);
+
+ // create the local image
+ librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
+ mock_local_image_ctx.journal = &mock_journal;
+
+ MockCreateImageRequest mock_create_image_request;
+ expect_create_image(mock_create_image_request, mock_local_image_ctx.id, 0);
+
+ // open the local image
+ MockOpenLocalImageRequest mock_open_local_image_request;
+ expect_open_local_image(mock_open_local_image_request, m_local_io_ctx,
+ mock_local_image_ctx.id, &mock_local_image_ctx, 0);
+ expect_is_resync_requested(mock_journal, false, 0);
+
+ // update client state back to syncing
+ mirror_peer_client_meta = {mock_local_image_ctx.id};
+ mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_SYNCING;
+ client_data.client_meta = mirror_peer_client_meta;
+ client.data.clear();
+ ::encode(client_data, client.data);
+ expect_journaler_update_client(mock_journaler, client_data, 0);
+
+ // sync the remote image to the local image
+ MockImageSyncThrottler mock_image_sync_throttler(
+ new ImageSyncThrottler<librbd::MockTestImageCtx>());
+ expect_image_sync(mock_image_sync_throttler, 0);
+
+ MockCloseImageRequest mock_close_image_request;
+ expect_close_image(mock_close_image_request, mock_remote_image_ctx, 0);
+
+ C_SaferCond ctx;
+ MockBootstrapRequest *request = create_request(
+ mock_image_sync_throttler, mock_journaler, "",
+ mock_remote_image_ctx.id, "global image id", "local mirror uuid",
+ "remote mirror uuid", &ctx);
+ request->send();
+ ASSERT_EQ(0, ctx.wait());
+}
+
+TEST_F(TestMockImageReplayerBootstrapRequest, PrimaryRemoteLocalDeleted) {
+ create_local_image();
+
+ InSequence seq;
+
+ // lookup remote image tag class
+ cls::journal::Client client;
+ librbd::journal::ClientData client_data{
+ librbd::journal::ImageClientMeta{123}};
+ ::encode(client_data, client.data);
+ ::journal::MockJournaler mock_journaler;
+ expect_journaler_get_client(mock_journaler,
+ librbd::Journal<>::IMAGE_CLIENT_ID,
+ client, 0);
+
+ // lookup local peer in remote journal
+ librbd::journal::MirrorPeerClientMeta mirror_peer_client_meta{
+ "missing image id"};
+ mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_REPLAYING;
+ client_data.client_meta = mirror_peer_client_meta;
+ client.data.clear();
+ ::encode(client_data, client.data);
+ expect_journaler_get_client(mock_journaler, "local mirror uuid",
+ client, 0);
+
+ // open the remote image
+ librbd::MockJournal mock_journal;
+ librbd::MockTestImageCtx mock_remote_image_ctx(*m_remote_image_ctx);
+ MockOpenImageRequest mock_open_image_request;
+ expect_open_image(mock_open_image_request, m_remote_io_ctx,
+ mock_remote_image_ctx.id, mock_remote_image_ctx, 0);
+ MockIsPrimaryRequest mock_is_primary_request;
+ expect_is_primary(mock_is_primary_request, true, 0);
+
+ // open the missing local image
+ MockOpenLocalImageRequest mock_open_local_image_request;
+ expect_open_local_image(mock_open_local_image_request, m_local_io_ctx,
+ "missing image id", nullptr, -ENOENT);
+
+ // create the missing local image
+ librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
+ mock_local_image_ctx.journal = &mock_journal;
+
+ MockCreateImageRequest mock_create_image_request;
+ expect_create_image(mock_create_image_request, mock_local_image_ctx.id, 0);
+
+ // open the local image
+ expect_open_local_image(mock_open_local_image_request, m_local_io_ctx,
+ mock_local_image_ctx.id, &mock_local_image_ctx, 0);
+ expect_is_resync_requested(mock_journal, false, 0);
+
+ // update client state back to syncing
+ mirror_peer_client_meta = {mock_local_image_ctx.id};
+ mirror_peer_client_meta.state = librbd::journal::MIRROR_PEER_STATE_SYNCING;
+ client_data.client_meta = mirror_peer_client_meta;
+ client.data.clear();
+ ::encode(client_data, client.data);
+ expect_journaler_update_client(mock_journaler, client_data, 0);
+
+ // sync the remote image to the local image
+ MockImageSyncThrottler mock_image_sync_throttler(
+ new ImageSyncThrottler<librbd::MockTestImageCtx>());
+ expect_image_sync(mock_image_sync_throttler, 0);
+
+ MockCloseImageRequest mock_close_image_request;
+ expect_close_image(mock_close_image_request, mock_remote_image_ctx, 0);
+
+ C_SaferCond ctx;
+ MockBootstrapRequest *request = create_request(
+ mock_image_sync_throttler, mock_journaler, "",
+ mock_remote_image_ctx.id, "global image id", "local mirror uuid",
+ "remote mirror uuid", &ctx);
+ request->send();
+ ASSERT_EQ(0, ctx.wait());
+}
+
} // namespace image_replayer
} // namespace mirror
} // namespace rbd
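
Editor's note: the expectations in the bootstrap-request hunks above all follow one pattern — the mocked async request captures its Context-style completion callback with WithArg<N>(Invoke(...)) and finishes it with a chosen return code. Below is a minimal standalone sketch of that pattern; Context and MockRequest here are hypothetical stand-ins, and the callback is completed inline rather than queued on m_threads->work_queue as the real tests do.

    #include <functional>
    #include <gmock/gmock.h>
    #include <gtest/gtest.h>

    using ::testing::_;
    using ::testing::Invoke;
    using ::testing::WithArg;

    // Stand-in for ceph's Context completion callback.
    struct Context {
      std::function<void(int)> fn;
      void complete(int r) { fn(r); }
    };

    // Hypothetical async request that takes a trailing completion argument.
    struct MockRequest {
      MOCK_METHOD1(send, void(Context*));
    };

    TEST(CompletionPattern, CapturesAndCompletesCallback) {
      MockRequest request;
      // Capture the Context* argument and complete it with the desired code.
      EXPECT_CALL(request, send(_))
          .WillOnce(WithArg<0>(Invoke([](Context* on_finish) {
            on_finish->complete(-2 /* -ENOENT */);
          })));

      int result = 1;
      Context ctx{[&result](int r) { result = r; }};
      request.send(&ctx);
      EXPECT_EQ(-2, result);
    }
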
diff --git a/src/test/rbd_mirror/image_replayer/test_mock_PrepareLocalImageRequest.cc b/src/test/rbd_mirror/image_replayer/test_mock_PrepareLocalImageRequest.cc
new file mode 100644
index 00000000000..b79d4de9e18
--- /dev/null
+++ b/src/test/rbd_mirror/image_replayer/test_mock_PrepareLocalImageRequest.cc
@@ -0,0 +1,196 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/rbd_mirror/test_mock_fixture.h"
+#include "cls/rbd/cls_rbd_types.h"
+#include "librbd/journal/TypeTraits.h"
+#include "tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h"
+#include "test/journal/mock/MockJournaler.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "test/librbd/mock/MockImageCtx.h"
+#include "test/librbd/mock/MockJournal.h"
+
+namespace librbd {
+
+namespace {
+
+struct MockTestImageCtx : public librbd::MockImageCtx {
+ MockTestImageCtx(librbd::ImageCtx &image_ctx)
+ : librbd::MockImageCtx(image_ctx) {
+ }
+};
+
+} // anonymous namespace
+} // namespace librbd
+
+// template definitions
+#include "tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc"
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using ::testing::_;
+using ::testing::DoAll;
+using ::testing::InSequence;
+using ::testing::Invoke;
+using ::testing::Return;
+using ::testing::StrEq;
+using ::testing::WithArg;
+using ::testing::WithArgs;
+
+class TestMockImageReplayerPrepareLocalImageRequest : public TestMockFixture {
+public:
+ typedef PrepareLocalImageRequest<librbd::MockTestImageCtx> MockPrepareLocalImageRequest;
+
+ void expect_mirror_image_get_image_id(librados::IoCtx &io_ctx,
+ const std::string &image_id, int r) {
+ bufferlist bl;
+ ::encode(image_id, bl);
+
+ EXPECT_CALL(get_mock_io_ctx(io_ctx),
+ exec(RBD_MIRRORING, _, StrEq("rbd"), StrEq("mirror_image_get_image_id"), _, _, _))
+ .WillOnce(DoAll(WithArg<5>(Invoke([bl](bufferlist *out_bl) {
+ *out_bl = bl;
+ })),
+ Return(r)));
+ }
+
+ void expect_mirror_image_get(librados::IoCtx &io_ctx,
+ cls::rbd::MirrorImageState state,
+ const std::string &global_id, int r) {
+ cls::rbd::MirrorImage mirror_image;
+ mirror_image.state = state;
+ mirror_image.global_image_id = global_id;
+
+ bufferlist bl;
+ ::encode(mirror_image, bl);
+
+ EXPECT_CALL(get_mock_io_ctx(io_ctx),
+ exec(RBD_MIRRORING, _, StrEq("rbd"), StrEq("mirror_image_get"), _, _, _))
+ .WillOnce(DoAll(WithArg<5>(Invoke([bl](bufferlist *out_bl) {
+ *out_bl = bl;
+ })),
+ Return(r)));
+ }
+
+ void expect_get_tag_owner(librbd::MockJournal &mock_journal,
+ const std::string &local_image_id,
+ const std::string &tag_owner, int r) {
+ EXPECT_CALL(mock_journal, get_tag_owner(local_image_id, _, _, _))
+ .WillOnce(WithArgs<1, 3>(Invoke([tag_owner, r](std::string *owner, Context *on_finish) {
+ *owner = tag_owner;
+ on_finish->complete(r);
+ })));
+ }
+
+};
+
+TEST_F(TestMockImageReplayerPrepareLocalImageRequest, Success) {
+ InSequence seq;
+ expect_mirror_image_get_image_id(m_local_io_ctx, "local image id", 0);
+ expect_mirror_image_get(m_local_io_ctx, cls::rbd::MIRROR_IMAGE_STATE_ENABLED,
+ "global image id", 0);
+
+ librbd::MockJournal mock_journal;
+ expect_get_tag_owner(mock_journal, "local image id", "remote mirror uuid", 0);
+
+ std::string local_image_id;
+ std::string tag_owner;
+ C_SaferCond ctx;
+ auto req = MockPrepareLocalImageRequest::create(m_local_io_ctx,
+ "global image id",
+ &local_image_id,
+ &tag_owner,
+ m_threads->work_queue,
+ &ctx);
+ req->send();
+
+ ASSERT_EQ(0, ctx.wait());
+ ASSERT_EQ(std::string("local image id"), local_image_id);
+ ASSERT_EQ(std::string("remote mirror uuid"), tag_owner);
+}
+
+TEST_F(TestMockImageReplayerPrepareLocalImageRequest, MirrorImageIdDNE) {
+ InSequence seq;
+ expect_mirror_image_get_image_id(m_local_io_ctx, "", -ENOENT);
+
+ std::string local_image_id;
+ std::string tag_owner;
+ C_SaferCond ctx;
+ auto req = MockPrepareLocalImageRequest::create(m_local_io_ctx,
+ "global image id",
+ &local_image_id,
+ &tag_owner,
+ m_threads->work_queue,
+ &ctx);
+ req->send();
+
+ ASSERT_EQ(-ENOENT, ctx.wait());
+}
+
+TEST_F(TestMockImageReplayerPrepareLocalImageRequest, MirrorImageIdError) {
+ InSequence seq;
+ expect_mirror_image_get_image_id(m_local_io_ctx, "", -EINVAL);
+
+ std::string local_image_id;
+ std::string tag_owner;
+ C_SaferCond ctx;
+ auto req = MockPrepareLocalImageRequest::create(m_local_io_ctx,
+ "global image id",
+ &local_image_id,
+ &tag_owner,
+ m_threads->work_queue,
+ &ctx);
+ req->send();
+
+ ASSERT_EQ(-EINVAL, ctx.wait());
+}
+
+TEST_F(TestMockImageReplayerPrepareLocalImageRequest, MirrorImageError) {
+ InSequence seq;
+ expect_mirror_image_get_image_id(m_local_io_ctx, "local image id", 0);
+ expect_mirror_image_get(m_local_io_ctx, cls::rbd::MIRROR_IMAGE_STATE_DISABLED,
+ "", -EINVAL);
+
+ std::string local_image_id;
+ std::string tag_owner;
+ C_SaferCond ctx;
+ auto req = MockPrepareLocalImageRequest::create(m_local_io_ctx,
+ "global image id",
+ &local_image_id,
+ &tag_owner,
+ m_threads->work_queue,
+ &ctx);
+ req->send();
+
+ ASSERT_EQ(-EINVAL, ctx.wait());
+}
+
+TEST_F(TestMockImageReplayerPrepareLocalImageRequest, TagOwnerError) {
+ InSequence seq;
+ expect_mirror_image_get_image_id(m_local_io_ctx, "local image id", 0);
+ expect_mirror_image_get(m_local_io_ctx, cls::rbd::MIRROR_IMAGE_STATE_ENABLED,
+ "global image id", 0);
+
+ librbd::MockJournal mock_journal;
+ expect_get_tag_owner(mock_journal, "local image id", "remote mirror uuid",
+ -ENOENT);
+
+ std::string local_image_id;
+ std::string tag_owner;
+ C_SaferCond ctx;
+ auto req = MockPrepareLocalImageRequest::create(m_local_io_ctx,
+ "global image id",
+ &local_image_id,
+ &tag_owner,
+ m_threads->work_queue,
+ &ctx);
+ req->send();
+
+ ASSERT_EQ(-ENOENT, ctx.wait());
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
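
Editor's note: the new PrepareLocalImageRequest tests stub the cls calls by filling the caller's output buffer and returning an error code in a single composite action. A reduced sketch of that DoAll/WithArg/Return combination follows, with an illustrative two-argument MockIoCtx in place of the real librados test stub and its exec() signature.

    #include <string>
    #include <gmock/gmock.h>
    #include <gtest/gtest.h>

    using ::testing::_;
    using ::testing::DoAll;
    using ::testing::Invoke;
    using ::testing::Return;
    using ::testing::WithArg;

    // Hypothetical stand-in for the rados exec() call being stubbed.
    struct MockIoCtx {
      MOCK_METHOD2(exec, int(const std::string&, std::string*));
    };

    TEST(OutParamPattern, FillsOutputAndReturnsCode) {
      MockIoCtx io_ctx;
      // DoAll() first fills the caller's output argument, then returns r.
      EXPECT_CALL(io_ctx, exec("mirror_image_get_image_id", _))
          .WillOnce(DoAll(WithArg<1>(Invoke([](std::string* out) {
                            *out = "local image id";
                          })),
                          Return(0)));

      std::string out;
      ASSERT_EQ(0, io_ctx.exec("mirror_image_get_image_id", &out));
      ASSERT_EQ("local image id", out);
    }
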
diff --git a/src/test/rbd_mirror/image_sync/test_mock_ObjectCopyRequest.cc b/src/test/rbd_mirror/image_sync/test_mock_ObjectCopyRequest.cc
index bfebabdc97a..370c25fb555 100644
--- a/src/test/rbd_mirror/image_sync/test_mock_ObjectCopyRequest.cc
+++ b/src/test/rbd_mirror/image_sync/test_mock_ObjectCopyRequest.cc
@@ -200,18 +200,18 @@ public:
librados::snap_t snap_id, uint8_t state,
int r) {
if (mock_image_ctx.image_ctx->object_map != nullptr) {
- auto &expect = EXPECT_CALL(mock_object_map, aio_update(snap_id, 0, 1, state, _, _));
+ auto &expect = EXPECT_CALL(mock_object_map, aio_update(snap_id, 0, 1, state, _, _, _));
if (r < 0) {
- expect.WillOnce(DoAll(WithArg<5>(Invoke([this, r](Context *ctx) {
+ expect.WillOnce(DoAll(WithArg<6>(Invoke([this, r](Context *ctx) {
m_threads->work_queue->queue(ctx, r);
})),
Return(true)));
} else {
- expect.WillOnce(DoAll(WithArg<5>(Invoke([&mock_image_ctx, snap_id, state, r](Context *ctx) {
+ expect.WillOnce(DoAll(WithArg<6>(Invoke([&mock_image_ctx, snap_id, state, r](Context *ctx) {
assert(mock_image_ctx.image_ctx->snap_lock.is_locked());
assert(mock_image_ctx.image_ctx->object_map_lock.is_wlocked());
mock_image_ctx.image_ctx->object_map->aio_update<Context>(
- snap_id, 0, 1, state, boost::none, ctx);
+ snap_id, 0, 1, state, boost::none, {}, ctx);
})),
Return(true)));
}
diff --git a/src/test/rbd_mirror/test_ImageDeleter.cc b/src/test/rbd_mirror/test_ImageDeleter.cc
index 606bec8d378..2aa0b3e9319 100644
--- a/src/test/rbd_mirror/test_ImageDeleter.cc
+++ b/src/test/rbd_mirror/test_ImageDeleter.cc
@@ -217,8 +217,7 @@ int64_t TestImageDeleter::m_local_pool_id;
TEST_F(TestImageDeleter, Delete_NonPrimary_Image) {
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, m_local_image_id,
- GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
@@ -234,8 +233,7 @@ TEST_F(TestImageDeleter, Delete_NonPrimary_Image) {
TEST_F(TestImageDeleter, Fail_Delete_Primary_Image) {
promote_image();
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, m_local_image_id,
- GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
@@ -246,27 +244,10 @@ TEST_F(TestImageDeleter, Fail_Delete_Primary_Image) {
ASSERT_EQ(0u, m_deleter->get_failed_queue_items().size());
}
-TEST_F(TestImageDeleter, Fail_Delete_Diff_GlobalId) {
- // This test case represents a case that should never happen, unless
- // there is bug in the implementation
-
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, m_local_image_id,
- "diff global id");
-
- C_SaferCond ctx;
- m_deleter->wait_for_scheduled_deletion(m_local_pool_id, "diff global id",
- &ctx);
- EXPECT_EQ(-EINVAL, ctx.wait());
-
- ASSERT_EQ(0u, m_deleter->get_delete_queue_items().size());
- ASSERT_EQ(0u, m_deleter->get_failed_queue_items().size());
-}
-
TEST_F(TestImageDeleter, Delete_Image_With_Child) {
create_snapshot();
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, m_local_image_id,
- GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
@@ -281,8 +262,7 @@ TEST_F(TestImageDeleter, Delete_Image_With_Children) {
create_snapshot("snap1");
create_snapshot("snap2");
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, m_local_image_id,
- GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
@@ -296,8 +276,7 @@ TEST_F(TestImageDeleter, Delete_Image_With_Children) {
TEST_F(TestImageDeleter, Delete_Image_With_ProtectedChild) {
create_snapshot("snap1", true);
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, m_local_image_id,
- GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
@@ -312,8 +291,7 @@ TEST_F(TestImageDeleter, Delete_Image_With_ProtectedChildren) {
create_snapshot("snap1", true);
create_snapshot("snap2", true);
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, m_local_image_id,
- GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
@@ -327,8 +305,7 @@ TEST_F(TestImageDeleter, Delete_Image_With_ProtectedChildren) {
TEST_F(TestImageDeleter, Delete_Image_With_Clone) {
std::string clone_id = create_clone();
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, m_local_image_id,
- GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
@@ -338,7 +315,7 @@ TEST_F(TestImageDeleter, Delete_Image_With_Clone) {
ASSERT_EQ(1u, m_deleter->get_delete_queue_items().size());
ASSERT_EQ(0u, m_deleter->get_failed_queue_items().size());
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, clone_id,
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id,
GLOBAL_CLONE_IMAGE_ID);
C_SaferCond ctx2;
@@ -363,8 +340,7 @@ TEST_F(TestImageDeleter, Delete_NonExistent_Image) {
EXPECT_EQ(0, cls_client::mirror_image_set(&m_local_io_ctx, m_local_image_id,
mirror_image));
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, m_local_image_id,
- GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
@@ -388,8 +364,7 @@ TEST_F(TestImageDeleter, Delete_NonExistent_Image_With_MirroringState) {
EXPECT_EQ(0, cls_client::mirror_image_set(&m_local_io_ctx, m_local_image_id,
mirror_image));
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, m_local_image_id,
- GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
@@ -405,8 +380,7 @@ TEST_F(TestImageDeleter, Delete_NonExistent_Image_With_MirroringState) {
TEST_F(TestImageDeleter, Delete_NonExistent_Image_Without_MirroringState) {
remove_image();
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, m_local_image_id,
- GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
@@ -424,8 +398,7 @@ TEST_F(TestImageDeleter, Fail_Delete_NonPrimary_Image) {
false);
EXPECT_EQ(0, ictx->state->open(false));
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, m_local_image_id,
- GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
@@ -445,8 +418,7 @@ TEST_F(TestImageDeleter, Retry_Failed_Deletes) {
m_deleter->set_failed_timer_interval(2);
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, m_local_image_id,
- GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
@@ -471,8 +443,7 @@ TEST_F(TestImageDeleter, Delete_Is_Idempotent) {
false);
EXPECT_EQ(0, ictx->state->open(false));
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, m_local_image_id,
- GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
C_SaferCond ctx;
m_deleter->wait_for_scheduled_deletion(m_local_pool_id, GLOBAL_IMAGE_ID,
@@ -482,8 +453,7 @@ TEST_F(TestImageDeleter, Delete_Is_Idempotent) {
ASSERT_EQ(0u, m_deleter->get_delete_queue_items().size());
ASSERT_EQ(1u, m_deleter->get_failed_queue_items().size());
- m_deleter->schedule_image_delete(_rados, m_local_pool_id, m_local_image_id,
- GLOBAL_IMAGE_ID);
+ m_deleter->schedule_image_delete(_rados, m_local_pool_id, GLOBAL_IMAGE_ID);
ASSERT_EQ(0u, m_deleter->get_delete_queue_items().size());
ASSERT_EQ(1u, m_deleter->get_failed_queue_items().size());
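
Editor's note: the reworked assertions above exercise deletions keyed only by (pool, global image id), and Delete_Is_Idempotent in particular schedules the same deletion twice while expecting a single queue entry. A toy sketch of that idempotency property — not the ImageDeleter implementation, just the set-keyed behaviour the test relies on:

    #include <cstdint>
    #include <set>
    #include <string>
    #include <utility>
    #include <gtest/gtest.h>

    // Illustrative queue keyed by (pool id, global image id).
    class DeleteQueue {
     public:
      void schedule(int64_t pool_id, const std::string& global_image_id) {
        // std::set::insert ignores duplicates, which is what makes
        // re-scheduling the same deletion a no-op.
        m_items.insert({pool_id, global_image_id});
      }
      size_t size() const { return m_items.size(); }

     private:
      std::set<std::pair<int64_t, std::string>> m_items;
    };

    TEST(DeleteQueue, ScheduleIsIdempotent) {
      DeleteQueue q;
      q.schedule(1, "global image id");
      q.schedule(1, "global image id");  // duplicate entry is ignored
      ASSERT_EQ(1u, q.size());
    }
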
diff --git a/src/test/rbd_mirror/test_ImageReplayer.cc b/src/test/rbd_mirror/test_ImageReplayer.cc
index 6dc064debd7..9ca82925322 100644
--- a/src/test/rbd_mirror/test_ImageReplayer.cc
+++ b/src/test/rbd_mirror/test_ImageReplayer.cc
@@ -622,8 +622,6 @@ TEST_F(TestImageReplayer, Resync)
start();
- std::string image_id = m_replayer->get_local_image_id();
-
generate_test_data();
open_remote_image(&ictx);
@@ -653,7 +651,6 @@ TEST_F(TestImageReplayer, Resync)
m_replayer->start(&cond);
ASSERT_EQ(0, cond.wait());
- ASSERT_NE(image_id, m_replayer->get_local_image_id());
ASSERT_TRUE(m_replayer->is_replaying());
wait_for_replay_complete();
@@ -674,8 +671,6 @@ TEST_F(TestImageReplayer, Resync_While_Stop)
start();
- std::string image_id = m_replayer->get_local_image_id();
-
generate_test_data();
librbd::ImageCtx *ictx;
@@ -718,7 +713,6 @@ TEST_F(TestImageReplayer, Resync_While_Stop)
m_replayer->start(&cond3);
ASSERT_EQ(0, cond3.wait());
- ASSERT_NE(image_id, m_replayer->get_local_image_id());
ASSERT_TRUE(m_replayer->is_replaying());
wait_for_replay_complete();
@@ -737,8 +731,6 @@ TEST_F(TestImageReplayer, Resync_StartInterrupted)
bootstrap();
- std::string image_id = m_replayer->get_local_image_id();
-
librbd::ImageCtx *ictx;
open_local_image(&ictx);
librbd::Journal<>::request_resync(ictx);
@@ -764,8 +756,6 @@ TEST_F(TestImageReplayer, Resync_StartInterrupted)
m_watch_ctx = new C_WatchCtx(this, oid);
ASSERT_EQ(0, m_remote_ioctx.watch2(oid, &m_watch_handle, m_watch_ctx));
- ASSERT_NE(image_id, m_replayer->get_local_image_id());
-
ASSERT_TRUE(m_replayer->is_replaying());
open_remote_image(&ictx);
diff --git a/src/test/rbd_mirror/test_PoolWatcher.cc b/src/test/rbd_mirror/test_PoolWatcher.cc
index 9d11c715638..c1272466933 100644
--- a/src/test/rbd_mirror/test_PoolWatcher.cc
+++ b/src/test/rbd_mirror/test_PoolWatcher.cc
@@ -76,8 +76,8 @@ public:
}
void handle_update(const std::string &mirror_uuid,
- const ImageIds &added_image_ids,
- const ImageIds &removed_image_ids) override {
+ ImageIds &&added_image_ids,
+ ImageIds &&removed_image_ids) override {
Mutex::Locker locker(test->m_lock);
for (auto &image_id : removed_image_ids) {
image_ids.erase(image_id);
diff --git a/src/test/rbd_mirror/test_mock_ImageReplayer.cc b/src/test/rbd_mirror/test_mock_ImageReplayer.cc
index 01a31c950a7..430edaa16d3 100644
--- a/src/test/rbd_mirror/test_mock_ImageReplayer.cc
+++ b/src/test/rbd_mirror/test_mock_ImageReplayer.cc
@@ -8,6 +8,7 @@
#include "tools/rbd_mirror/image_replayer/BootstrapRequest.h"
#include "tools/rbd_mirror/image_replayer/CloseImageRequest.h"
#include "tools/rbd_mirror/image_replayer/EventPreprocessor.h"
+#include "tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h"
#include "tools/rbd_mirror/ImageSyncThrottler.h"
#include "test/rbd_mirror/test_mock_fixture.h"
#include "test/journal/mock/MockJournaler.h"
@@ -91,6 +92,33 @@ using ::testing::SetArgPointee;
using ::testing::WithArg;
template<>
+struct PrepareLocalImageRequest<librbd::MockTestImageCtx> {
+ static PrepareLocalImageRequest* s_instance;
+ std::string *local_image_id = nullptr;
+ std::string *tag_owner = nullptr;
+ Context *on_finish = nullptr;
+
+ static PrepareLocalImageRequest* create(librados::IoCtx &,
+ const std::string &global_image_id,
+ std::string *local_image_id,
+ std::string *tag_owner,
+ ContextWQ *work_queue,
+ Context *on_finish) {
+ assert(s_instance != nullptr);
+ s_instance->local_image_id = local_image_id;
+ s_instance->tag_owner = tag_owner;
+ s_instance->on_finish = on_finish;
+ return s_instance;
+ }
+
+ PrepareLocalImageRequest() {
+ s_instance = this;
+ }
+
+ MOCK_METHOD0(send, void());
+};
+
+template<>
struct BootstrapRequest<librbd::MockTestImageCtx> {
static BootstrapRequest* s_instance;
librbd::MockTestImageCtx **image_ctx = nullptr;
@@ -226,6 +254,7 @@ struct ReplayStatusFormatter<librbd::MockTestImageCtx> {
BootstrapRequest<librbd::MockTestImageCtx>* BootstrapRequest<librbd::MockTestImageCtx>::s_instance = nullptr;
CloseImageRequest<librbd::MockTestImageCtx>* CloseImageRequest<librbd::MockTestImageCtx>::s_instance = nullptr;
EventPreprocessor<librbd::MockTestImageCtx>* EventPreprocessor<librbd::MockTestImageCtx>::s_instance = nullptr;
+PrepareLocalImageRequest<librbd::MockTestImageCtx>* PrepareLocalImageRequest<librbd::MockTestImageCtx>::s_instance = nullptr;
ReplayStatusFormatter<librbd::MockTestImageCtx>* ReplayStatusFormatter<librbd::MockTestImageCtx>::s_instance = nullptr;
} // namespace image_replayer
@@ -244,6 +273,7 @@ public:
typedef BootstrapRequest<librbd::MockTestImageCtx> MockBootstrapRequest;
typedef CloseImageRequest<librbd::MockTestImageCtx> MockCloseImageRequest;
typedef EventPreprocessor<librbd::MockTestImageCtx> MockEventPreprocessor;
+ typedef PrepareLocalImageRequest<librbd::MockTestImageCtx> MockPrepareLocalImageRequest;
typedef ReplayStatusFormatter<librbd::MockTestImageCtx> MockReplayStatusFormatter;
typedef librbd::journal::Replay<librbd::MockTestImageCtx> MockReplay;
typedef ImageReplayer<librbd::MockTestImageCtx> MockImageReplayer;
@@ -294,6 +324,20 @@ public:
Return(true)));
}
+ void expect_send(MockPrepareLocalImageRequest &mock_request,
+ const std::string &local_image_id,
+ const std::string &tag_owner,
+ int r) {
+ EXPECT_CALL(mock_request, send())
+ .WillOnce(Invoke([&mock_request, local_image_id, tag_owner, r]() {
+ if (r == 0) {
+ *mock_request.local_image_id = local_image_id;
+ *mock_request.tag_owner = tag_owner;
+ }
+ mock_request.on_finish->complete(r);
+ }));
+ }
+
void expect_send(MockBootstrapRequest &mock_bootstrap_request,
librbd::MockTestImageCtx &mock_local_image_ctx,
bool do_resync, int r) {
@@ -437,6 +481,7 @@ TEST_F(TestMockImageReplayer, StartStop) {
mock_local_image_ctx.journal = &mock_local_journal;
journal::MockJournaler mock_remote_journaler;
+ MockPrepareLocalImageRequest mock_prepare_local_image_request;
MockBootstrapRequest mock_bootstrap_request;
MockReplay mock_local_replay;
MockEventPreprocessor mock_event_preprocessor;
@@ -445,6 +490,8 @@ TEST_F(TestMockImageReplayer, StartStop) {
expect_get_or_send_update(mock_replay_status_formatter);
InSequence seq;
+ expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
+ "remote mirror uuid", 0);
EXPECT_CALL(mock_remote_journaler, construct());
expect_send(mock_bootstrap_request, mock_local_image_ctx, false, 0);
@@ -483,18 +530,81 @@ TEST_F(TestMockImageReplayer, StartStop) {
ASSERT_EQ(0, stop_ctx.wait());
}
+TEST_F(TestMockImageReplayer, LocalImagePrimary) {
+ create_local_image();
+ librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
+
+ MockPrepareLocalImageRequest mock_prepare_local_image_request;
+ MockReplayStatusFormatter mock_replay_status_formatter;
+
+ expect_get_or_send_update(mock_replay_status_formatter);
+
+ InSequence seq;
+ expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
+ "", 0);
+
+ C_SaferCond start_ctx;
+ m_image_replayer->start(&start_ctx);
+ ASSERT_EQ(0, start_ctx.wait());
+}
+
+TEST_F(TestMockImageReplayer, LocalImageDNE) {
+ create_local_image();
+ librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
+
+ journal::MockJournaler mock_remote_journaler;
+ MockPrepareLocalImageRequest mock_prepare_local_image_request;
+ MockBootstrapRequest mock_bootstrap_request;
+ MockReplayStatusFormatter mock_replay_status_formatter;
+
+ expect_get_or_send_update(mock_replay_status_formatter);
+
+ InSequence seq;
+ expect_send(mock_prepare_local_image_request, "", "", -ENOENT);
+ EXPECT_CALL(mock_remote_journaler, construct());
+ expect_send(mock_bootstrap_request, mock_local_image_ctx, false, -EREMOTEIO);
+
+ EXPECT_CALL(mock_remote_journaler, remove_listener(_));
+ expect_shut_down(mock_remote_journaler, 0);
+
+ C_SaferCond start_ctx;
+ m_image_replayer->start(&start_ctx);
+ ASSERT_EQ(0, start_ctx.wait());
+}
+
+TEST_F(TestMockImageReplayer, PrepareLocalImageError) {
+ create_local_image();
+ librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
+
+ MockPrepareLocalImageRequest mock_prepare_local_image_request;
+ MockReplayStatusFormatter mock_replay_status_formatter;
+
+ expect_get_or_send_update(mock_replay_status_formatter);
+
+ InSequence seq;
+ expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
+ "remote mirror uuid", -EINVAL);
+
+ C_SaferCond start_ctx;
+ m_image_replayer->start(&start_ctx);
+ ASSERT_EQ(-EINVAL, start_ctx.wait());
+}
+
TEST_F(TestMockImageReplayer, BootstrapError) {
create_local_image();
librbd::MockTestImageCtx mock_local_image_ctx(*m_local_image_ctx);
journal::MockJournaler mock_remote_journaler;
+ MockPrepareLocalImageRequest mock_prepare_local_image_request;
MockBootstrapRequest mock_bootstrap_request;
MockReplayStatusFormatter mock_replay_status_formatter;
expect_get_or_send_update(mock_replay_status_formatter);
InSequence seq;
+ expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
+ "remote mirror uuid", 0);
EXPECT_CALL(mock_remote_journaler, construct());
expect_send(mock_bootstrap_request, mock_local_image_ctx, false, -EINVAL);
@@ -516,6 +626,7 @@ TEST_F(TestMockImageReplayer, StartExternalReplayError) {
mock_local_image_ctx.journal = &mock_local_journal;
journal::MockJournaler mock_remote_journaler;
+ MockPrepareLocalImageRequest mock_prepare_local_image_request;
MockBootstrapRequest mock_bootstrap_request;
MockReplay mock_local_replay;
MockEventPreprocessor mock_event_preprocessor;
@@ -524,6 +635,8 @@ TEST_F(TestMockImageReplayer, StartExternalReplayError) {
expect_get_or_send_update(mock_replay_status_formatter);
InSequence seq;
+ expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
+ "remote mirror uuid", 0);
EXPECT_CALL(mock_remote_journaler, construct());
expect_send(mock_bootstrap_request, mock_local_image_ctx, false, 0);
@@ -560,6 +673,7 @@ TEST_F(TestMockImageReplayer, StopError) {
mock_local_image_ctx.journal = &mock_local_journal;
journal::MockJournaler mock_remote_journaler;
+ MockPrepareLocalImageRequest mock_prepare_local_image_request;
MockBootstrapRequest mock_bootstrap_request;
MockReplay mock_local_replay;
MockEventPreprocessor mock_event_preprocessor;
@@ -568,6 +682,8 @@ TEST_F(TestMockImageReplayer, StopError) {
expect_get_or_send_update(mock_replay_status_formatter);
InSequence seq;
+ expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
+ "remote mirror uuid", 0);
EXPECT_CALL(mock_remote_journaler, construct());
expect_send(mock_bootstrap_request, mock_local_image_ctx, false, 0);
@@ -616,6 +732,7 @@ TEST_F(TestMockImageReplayer, Replay) {
mock_local_image_ctx.journal = &mock_local_journal;
journal::MockJournaler mock_remote_journaler;
+ MockPrepareLocalImageRequest mock_prepare_local_image_request;
MockBootstrapRequest mock_bootstrap_request;
MockReplay mock_local_replay;
MockEventPreprocessor mock_event_preprocessor;
@@ -627,6 +744,8 @@ TEST_F(TestMockImageReplayer, Replay) {
expect_committed(mock_remote_journaler, 2);
InSequence seq;
+ expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
+ "remote mirror uuid", 0);
EXPECT_CALL(mock_remote_journaler, construct());
expect_send(mock_bootstrap_request, mock_local_image_ctx, false, 0);
@@ -714,6 +833,7 @@ TEST_F(TestMockImageReplayer, DecodeError) {
mock_local_image_ctx.journal = &mock_local_journal;
journal::MockJournaler mock_remote_journaler;
+ MockPrepareLocalImageRequest mock_prepare_local_image_request;
MockBootstrapRequest mock_bootstrap_request;
MockReplay mock_local_replay;
MockEventPreprocessor mock_event_preprocessor;
@@ -724,6 +844,8 @@ TEST_F(TestMockImageReplayer, DecodeError) {
expect_get_commit_tid_in_debug(mock_replay_entry);
InSequence seq;
+ expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
+ "remote mirror uuid", 0);
EXPECT_CALL(mock_remote_journaler, construct());
expect_send(mock_bootstrap_request, mock_local_image_ctx, false, 0);
@@ -802,6 +924,7 @@ TEST_F(TestMockImageReplayer, DelayedReplay) {
mock_local_image_ctx.journal = &mock_local_journal;
journal::MockJournaler mock_remote_journaler;
+ MockPrepareLocalImageRequest mock_prepare_local_image_request;
MockBootstrapRequest mock_bootstrap_request;
MockReplay mock_local_replay;
MockEventPreprocessor mock_event_preprocessor;
@@ -813,6 +936,8 @@ TEST_F(TestMockImageReplayer, DelayedReplay) {
expect_committed(mock_remote_journaler, 1);
InSequence seq;
+ expect_send(mock_prepare_local_image_request, mock_local_image_ctx.id,
+ "remote mirror uuid", 0);
EXPECT_CALL(mock_remote_journaler, construct());
expect_send(mock_bootstrap_request, mock_local_image_ctx, false, 0);
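
Editor's note: the PrepareLocalImageRequest mock above uses the suite's usual s_instance trick — a test-local template specialization registers itself in a static pointer, and create() records the caller's output locations so an expectation on send() can fill them later. A self-contained sketch of the mechanism with illustrative names:

    #include <cassert>
    #include <string>
    #include <gmock/gmock.h>
    #include <gtest/gtest.h>

    using ::testing::Invoke;

    struct MockRequest {
      static MockRequest* s_instance;
      std::string* out_value = nullptr;

      // Production code calls create() and gets the test's mock back.
      static MockRequest* create(std::string* out_value) {
        assert(s_instance != nullptr);
        s_instance->out_value = out_value;  // remember where to write results
        return s_instance;
      }

      MockRequest() { s_instance = this; }
      MOCK_METHOD0(send, void());
    };

    MockRequest* MockRequest::s_instance = nullptr;

    TEST(StaticInstancePattern, FillsOutputOnSend) {
      MockRequest mock_request;  // registers itself as s_instance
      EXPECT_CALL(mock_request, send())
          .WillOnce(Invoke([&mock_request]() {
            *mock_request.out_value = "local image id";
          }));

      std::string value;
      MockRequest::create(&value)->send();
      ASSERT_EQ("local image id", value);
    }
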
diff --git a/src/test/rbd_mirror/test_mock_InstanceReplayer.cc b/src/test/rbd_mirror/test_mock_InstanceReplayer.cc
index d406d9f57fa..53a9d0b6edd 100644
--- a/src/test/rbd_mirror/test_mock_InstanceReplayer.cc
+++ b/src/test/rbd_mirror/test_mock_InstanceReplayer.cc
@@ -72,8 +72,9 @@ struct ImageReplayer<librbd::MockTestImageCtx> {
MOCK_METHOD3(add_remote_image, void(const std::string &,
const std::string &,
librados::IoCtx &));
- MOCK_METHOD2(remove_remote_image, void(const std::string &,
- const std::string &));
+ MOCK_METHOD3(remove_remote_image, void(const std::string &,
+ const std::string &,
+ bool));
MOCK_METHOD0(remote_images_empty, bool());
MOCK_METHOD0(get_global_image_id, const std::string &());
MOCK_METHOD0(get_local_image_id, const std::string &());
@@ -174,7 +175,8 @@ TEST_F(TestMockInstanceReplayer, AcquireReleaseImage) {
C_SaferCond on_release;
EXPECT_CALL(mock_image_replayer,
- remove_remote_image("remote_mirror_uuid", "remote_image_id"));
+ remove_remote_image("remote_mirror_uuid", "remote_image_id",
+ false));
EXPECT_CALL(mock_image_replayer, remote_images_empty())
.WillOnce(Return(true));
EXPECT_CALL(mock_image_replayer, is_stopped())
diff --git a/src/test/rbd_mirror/test_mock_PoolWatcher.cc b/src/test/rbd_mirror/test_mock_PoolWatcher.cc
index b6e61516dce..1b7877434ad 100644
--- a/src/test/rbd_mirror/test_mock_PoolWatcher.cc
+++ b/src/test/rbd_mirror/test_mock_PoolWatcher.cc
@@ -163,8 +163,13 @@ public:
MockListener(TestMockPoolWatcher *test) : test(test) {
}
- MOCK_METHOD3(handle_update, void(const std::string &, const ImageIds &,
- const ImageIds &));
+ MOCK_METHOD3(mock_handle_update, void(const std::string &, const ImageIds &,
+ const ImageIds &));
+ void handle_update(const std::string &mirror_uuid,
+ ImageIds &&added_image_ids,
+ ImageIds &&removed_image_ids) override {
+ mock_handle_update(mirror_uuid, added_image_ids, removed_image_ids);
+ }
};
TestMockPoolWatcher() : m_lock("TestMockPoolWatcher::m_lock") {
@@ -208,8 +213,8 @@ public:
const std::string &mirror_uuid,
const ImageIds &added_image_ids,
const ImageIds &removed_image_ids) {
- EXPECT_CALL(mock_listener, handle_update(mirror_uuid, added_image_ids,
- removed_image_ids))
+ EXPECT_CALL(mock_listener, mock_handle_update(mirror_uuid, added_image_ids,
+ removed_image_ids))
.WillOnce(WithoutArgs(Invoke([this]() {
Mutex::Locker locker(m_lock);
++m_update_count;
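
Editor's note: because handle_update() now receives its ImageIds by rvalue reference, the MockListener above forwards to a separate mock_handle_update() taking const references, where gmock matchers can compare the container contents as lvalues. A compact reproduction of that delegation trick with stand-in types:

    #include <set>
    #include <string>
    #include <gmock/gmock.h>
    #include <gtest/gtest.h>

    using ImageIds = std::set<std::string>;

    struct Listener {
      virtual ~Listener() = default;
      virtual void handle_update(const std::string& mirror_uuid,
                                 ImageIds&& added, ImageIds&& removed) = 0;
    };

    struct MockListener : Listener {
      MOCK_METHOD3(mock_handle_update, void(const std::string&,
                                            const ImageIds&, const ImageIds&));
      void handle_update(const std::string& mirror_uuid,
                         ImageIds&& added, ImageIds&& removed) override {
        // Forward as lvalues so EXPECT_CALL can match on the contents.
        mock_handle_update(mirror_uuid, added, removed);
      }
    };

    TEST(DelegationPattern, MatchesMovedArguments) {
      MockListener listener;
      ImageIds added{"image id"};
      EXPECT_CALL(listener,
                  mock_handle_update("mirror uuid", added, ImageIds{}));
      listener.handle_update("mirror uuid", ImageIds{"image id"}, {});
    }
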
diff --git a/src/test/rgw/CMakeLists.txt b/src/test/rgw/CMakeLists.txt
index d8f9a5daf87..c85bda78826 100644
--- a/src/test/rgw/CMakeLists.txt
+++ b/src/test/rgw/CMakeLists.txt
@@ -99,12 +99,35 @@ target_link_libraries(unittest_rgw_crypto
cls_user_client
librados
global
- curl
- uuid
- expat
+ ${CURL_LIBRARIES}
+ ${EXPAT_LIBRARIES}
${CMAKE_DL_LIBS}
${UNITTEST_LIBS}
${CRYPTO_LIBS}
)
-set_target_properties(unittest_rgw_crypto PROPERTIES COMPILE_FLAGS
- ${UNITTEST_CXX_FLAGS})
+set_target_properties(unittest_rgw_crypto PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS})
+
+# ceph_test_rgw_iam_policy
+set(test_rgw_iam_policy_srcs test_rgw_iam_policy.cc)
+add_executable(ceph_test_rgw_iam_policy
+ ${test_rgw_iam_policy_srcs}
+ )
+target_link_libraries(ceph_test_rgw_iam_policy
+ rgw_a
+ cls_rgw_client
+ cls_lock_client
+ cls_refcount_client
+ cls_log_client
+ cls_statelog_client
+ cls_version_client
+ cls_replica_log_client
+ cls_user_client
+ librados
+ global
+ ${CURL_LIBRARIES}
+ ${EXPAT_LIBRARIES}
+ ${CMAKE_DL_LIBS}
+ ${UNITTEST_LIBS}
+ ${CRYPTO_LIBS}
+ )
+set_target_properties(ceph_test_rgw_iam_policy PROPERTIES COMPILE_FLAGS ${UNITTEST_CXX_FLAGS})
diff --git a/src/test/rgw/rgw_multi/multisite.py b/src/test/rgw/rgw_multi/multisite.py
index 061cfd3419f..3f8b727bbe8 100644
--- a/src/test/rgw/rgw_multi/multisite.py
+++ b/src/test/rgw/rgw_multi/multisite.py
@@ -6,7 +6,7 @@ class Cluster:
__metaclass__ = ABCMeta
@abstractmethod
- def admin(self, args = [], **kwargs):
+ def admin(self, args = None, **kwargs):
""" execute a radosgw-admin command """
pass
@@ -57,15 +57,15 @@ class SystemObject:
""" update internal state based on json data """
pass
- def command(self, cluster, cmd, args = [], **kwargs):
+ def command(self, cluster, cmd, args = None, **kwargs):
""" run the given command and return the output and retcode """
- args = self.build_command(cmd) + args
+ args = self.build_command(cmd) + (args or [])
return cluster.admin(args, **kwargs)
- def json_command(self, cluster, cmd, args = [], **kwargs):
+ def json_command(self, cluster, cmd, args = None, **kwargs):
""" run the given command, parse the output and return the resulting
data and retcode """
- (s, r) = self.command(cluster, cmd, args, **kwargs)
+ (s, r) = self.command(cluster, cmd, args or [], **kwargs)
if r == 0:
output = s.decode('utf-8')
output = output[output.find('{'):] # trim extra output before json
@@ -76,12 +76,12 @@ class SystemObject:
# mixins for supported commands
class Create(object):
- def create(self, cluster, args = [], **kwargs):
+ def create(self, cluster, args = None, **kwargs):
""" create the object with the given arguments """
return self.json_command(cluster, 'create', args, **kwargs)
class Delete(object):
- def delete(self, cluster, args = [], **kwargs):
+ def delete(self, cluster, args = None, **kwargs):
""" delete the object """
# not json_command() because delete has no output
(_, r) = self.command(cluster, 'delete', args, **kwargs)
@@ -90,19 +90,19 @@ class SystemObject:
return r
class Get(object):
- def get(self, cluster, args = [], **kwargs):
+ def get(self, cluster, args = None, **kwargs):
""" read the object from storage """
kwargs['read_only'] = True
return self.json_command(cluster, 'get', args, **kwargs)
class Set(object):
- def set(self, cluster, data, args = [], **kwargs):
+ def set(self, cluster, data, args = None, **kwargs):
""" set the object by json """
kwargs['stdin'] = StringIO(json.dumps(data))
return self.json_command(cluster, 'set', args, **kwargs)
class Modify(object):
- def modify(self, cluster, args = [], **kwargs):
+ def modify(self, cluster, args = None, **kwargs):
""" modify the object with the given arguments """
return self.json_command(cluster, 'modify', args, **kwargs)
@@ -110,11 +110,11 @@ class SystemObject:
class GetSet(Get, Set): pass
class Zone(SystemObject, SystemObject.CreateDelete, SystemObject.GetSet, SystemObject.Modify):
- def __init__(self, name, zonegroup = None, cluster = None, data = None, zone_id = None, gateways = []):
+ def __init__(self, name, zonegroup = None, cluster = None, data = None, zone_id = None, gateways = None):
self.name = name
self.zonegroup = zonegroup
self.cluster = cluster
- self.gateways = gateways
+ self.gateways = gateways or []
super(Zone, self).__init__(data, zone_id)
def zone_arg(self):
@@ -137,7 +137,7 @@ class Zone(SystemObject, SystemObject.CreateDelete, SystemObject.GetSet, SystemO
self.id = data['id']
self.name = data['name']
- def start(self, args = []):
+ def start(self, args = None):
""" start all gateways """
for g in self.gateways:
g.start(args)
@@ -154,10 +154,10 @@ class Zone(SystemObject, SystemObject.CreateDelete, SystemObject.GetSet, SystemO
return self.zonegroup.realm() if self.zonegroup else None
class ZoneGroup(SystemObject, SystemObject.CreateDelete, SystemObject.GetSet, SystemObject.Modify):
- def __init__(self, name, period = None, data = None, zonegroup_id = None, zones = [], master_zone = None):
+ def __init__(self, name, period = None, data = None, zonegroup_id = None, zones = None, master_zone = None):
self.name = name
self.period = period
- self.zones = zones
+ self.zones = zones or []
self.master_zone = master_zone
super(ZoneGroup, self).__init__(data, zonegroup_id)
@@ -192,18 +192,18 @@ class ZoneGroup(SystemObject, SystemObject.CreateDelete, SystemObject.GetSet, Sy
if not self.master_zone or master_id != self.master_zone.id:
self.master_zone = self.zone_by_id(master_id)
- def add(self, cluster, zone, args = [], **kwargs):
+ def add(self, cluster, zone, args = None, **kwargs):
""" add an existing zone to the zonegroup """
- args += zone.zone_arg()
+ args = zone.zone_arg() + (args or [])
(data, r) = self.json_command(cluster, 'add', args, **kwargs)
if r == 0:
zone.zonegroup = self
self.zones.append(zone)
return (data, r)
- def remove(self, cluster, zone, args = [], **kwargs):
+ def remove(self, cluster, zone, args = None, **kwargs):
""" remove an existing zone from the zonegroup """
- args += zone.zone_arg()
+ args = zone.zone_arg() + (args or [])
(data, r) = self.json_command(cluster, 'remove', args, **kwargs)
if r == 0:
zone.zonegroup = None
@@ -214,9 +214,9 @@ class ZoneGroup(SystemObject, SystemObject.CreateDelete, SystemObject.GetSet, Sy
return self.period.realm if self.period else None
class Period(SystemObject, SystemObject.Get):
- def __init__(self, realm = None, data = None, period_id = None, zonegroups = [], master_zonegroup = None):
+ def __init__(self, realm = None, data = None, period_id = None, zonegroups = None, master_zonegroup = None):
self.realm = realm
- self.zonegroups = zonegroups
+ self.zonegroups = zonegroups or []
self.master_zonegroup = master_zonegroup
super(Period, self).__init__(data, period_id)
@@ -238,18 +238,18 @@ class Period(SystemObject, SystemObject.Get):
if not self.master_zonegroup or master_id != self.master_zonegroup.id:
self.master_zonegroup = self.zonegroup_by_id(master_id)
- def update(self, zone, args = [], **kwargs):
+ def update(self, zone, args = None, **kwargs):
""" run 'radosgw-admin period update' on the given zone """
assert(zone.cluster)
- args = zone.zone_args() + args
+ args = zone.zone_args() + (args or [])
if kwargs.pop('commit', False):
args.append('--commit')
return self.json_command(zone.cluster, 'update', args, **kwargs)
- def commit(self, zone, args = [], **kwargs):
+ def commit(self, zone, args = None, **kwargs):
""" run 'radosgw-admin period commit' on the given zone """
assert(zone.cluster)
- args = zone.zone_args() + args
+ args = zone.zone_args() + (args or [])
return self.json_command(zone.cluster, 'commit', args, **kwargs)
class Realm(SystemObject, SystemObject.CreateDelete, SystemObject.GetSet):
@@ -298,9 +298,9 @@ class Credentials:
return ['--access-key', self.access_key, '--secret', self.secret]
class User(SystemObject):
- def __init__(self, uid, data = None, name = None, credentials = []):
+ def __init__(self, uid, data = None, name = None, credentials = None):
self.name = name
- self.credentials = credentials
+ self.credentials = credentials or []
super(User, self).__init__(data, uid)
def user_arg(self):
@@ -317,21 +317,21 @@ class User(SystemObject):
self.name = data['display_name']
self.credentials = [Credentials(k['access_key'], k['secret_key']) for k in data['keys']]
- def create(self, zone, args = [], **kwargs):
+ def create(self, zone, args = None, **kwargs):
""" create the user with the given arguments """
assert(zone.cluster)
- args += zone.zone_args()
+ args = zone.zone_args() + (args or [])
return self.json_command(zone.cluster, 'create', args, **kwargs)
- def info(self, zone, args = [], **kwargs):
+ def info(self, zone, args = None, **kwargs):
""" read the user from storage """
assert(zone.cluster)
- args += zone.zone_args()
+ args = zone.zone_args() + (args or [])
kwargs['read_only'] = True
return self.json_command(zone.cluster, 'info', args, **kwargs)
- def delete(self, zone, args = [], **kwargs):
+ def delete(self, zone, args = None, **kwargs):
""" delete the user """
assert(zone.cluster)
- args += zone.zone_args()
+ args = zone.zone_args() + (args or [])
return self.command(zone.cluster, 'delete', args, **kwargs)
diff --git a/src/test/rgw/rgw_multi/tests.py b/src/test/rgw/rgw_multi/tests.py
index 81677b1f37f..4b19a5ca256 100644
--- a/src/test/rgw/rgw_multi/tests.py
+++ b/src/test/rgw/rgw_multi/tests.py
@@ -8,6 +8,7 @@ try:
from itertools import izip_longest as zip_longest
except ImportError:
from itertools import zip_longest
+from itertools import combinations
import boto
import boto.s3.connection
@@ -52,6 +53,17 @@ def get_zone_connection(zone, credentials):
credentials = credentials[0]
return get_gateway_connection(zone.gateways[0], credentials)
+def mdlog_list(zone, period = None):
+ cmd = ['mdlog', 'list']
+ if period:
+ cmd += ['--period', period]
+ (mdlog_json, _) = zone.cluster.admin(cmd, read_only=True)
+ mdlog_json = mdlog_json.decode('utf-8')
+ return json.loads(mdlog_json)
+
+def mdlog_autotrim(zone):
+ zone.cluster.admin(['mdlog', 'autotrim'])
+
def meta_sync_status(zone):
while True:
cmd = ['metadata', 'sync', 'status'] + zone.zone_args()
@@ -65,15 +77,25 @@ def meta_sync_status(zone):
log.debug('current meta sync status=%s', meta_sync_status_json)
sync_status = json.loads(meta_sync_status_json)
- global_sync_status=sync_status['sync_status']['info']['status']
- num_shards=sync_status['sync_status']['info']['num_shards']
+ sync_info = sync_status['sync_status']['info']
+ global_sync_status = sync_info['status']
+ num_shards = sync_info['num_shards']
+ period = sync_info['period']
+ realm_epoch = sync_info['realm_epoch']
sync_markers=sync_status['sync_status']['markers']
log.debug('sync_markers=%s', sync_markers)
assert(num_shards == len(sync_markers))
- markers = {i: m['val']['marker'] for i, m in enumerate(sync_markers)}
- return (num_shards, markers)
+ markers={}
+ for i in range(num_shards):
+ # get marker, only if it's an incremental marker for the same realm epoch
+ if realm_epoch > sync_markers[i]['val']['realm_epoch'] or sync_markers[i]['val']['state'] == 0:
+ markers[i] = ''
+ else:
+ markers[i] = sync_markers[i]['val']['marker']
+
+ return period, realm_epoch, num_shards, markers
def meta_master_log_status(master_zone):
cmd = ['mdlog', 'status'] + master_zone.zone_args()
@@ -108,14 +130,21 @@ def zone_meta_checkpoint(zone, meta_master_zone = None, master_status = None):
if not master_status:
master_status = meta_master_log_status(meta_master_zone)
+ current_realm_epoch = realm.current_period.data['realm_epoch']
+
log.info('starting meta checkpoint for zone=%s', zone.name)
while True:
- num_shards, sync_status = meta_sync_status(zone)
- log.debug('log_status=%s', master_status)
- log.debug('sync_status=%s', sync_status)
- if compare_meta_status(zone, master_status, sync_status):
- break
+ period, realm_epoch, num_shards, sync_status = meta_sync_status(zone)
+ if realm_epoch < current_realm_epoch:
+ log.warning('zone %s is syncing realm epoch=%d, behind current realm epoch=%d',
+ zone.name, realm_epoch, current_realm_epoch)
+ else:
+ log.debug('log_status=%s', master_status)
+ log.debug('sync_status=%s', sync_status)
+ if compare_meta_status(zone, master_status, sync_status):
+ break
+
time.sleep(5)
log.info('finish meta checkpoint for zone=%s', zone.name)
@@ -312,6 +341,8 @@ def set_master_zone(zone):
zonegroup = zone.zonegroup
zonegroup.period.update(zone, commit=True)
zonegroup.master_zone = zone
+ # wait for reconfiguration, so that later metadata requests go to the new master
+ time.sleep(5)
def gen_bucket_name():
global num_buckets
@@ -634,12 +665,18 @@ def test_multi_period_incremental_sync():
if len(zonegroup.zones) < 3:
raise SkipTest("test_multi_period_incremental_sync skipped. Requires 3 or more zones in master zonegroup.")
- buckets, zone_bucket = create_bucket_per_zone(zonegroup)
+ # periods to include in mdlog comparison
+ mdlog_periods = [realm.current_period.id]
+
+ # create a bucket in each zone
+ buckets = []
+ for zone in zonegroup.zones:
+ conn = get_zone_connection(zone, user.credentials)
+ bucket_name = gen_bucket_name()
+ log.info('create bucket zone=%s name=%s', zone.name, bucket_name)
+ bucket = conn.create_bucket(bucket_name)
+ buckets.append(bucket_name)
- for zone, bucket_name in zone_bucket.items():
- for objname in [ 'p1', '_p1' ]:
- k = new_key(zone, bucket_name, objname)
- k.set_contents_from_string('asdasd')
zonegroup_meta_checkpoint(zonegroup)
z1, z2, z3 = zonegroup.zones[0:3]
@@ -650,39 +687,68 @@ def test_multi_period_incremental_sync():
# change master to zone 2 -> period 2
set_master_zone(z2)
+ mdlog_periods += [realm.current_period.id]
- for zone, bucket_name in zone_bucket.items():
+ # create another bucket in each zone, except for z3
+ for zone in zonegroup.zones:
if zone == z3:
continue
- for objname in [ 'p2', '_p2' ]:
- k = new_key(zone, bucket_name, objname)
- k.set_contents_from_string('qweqwe')
+ conn = get_zone_connection(zone, user.credentials)
+ bucket_name = gen_bucket_name()
+ log.info('create bucket zone=%s name=%s', zone.name, bucket_name)
+ bucket = conn.create_bucket(bucket_name)
+ buckets.append(bucket_name)
# wait for zone 1 to sync
zone_meta_checkpoint(z1)
# change master back to zone 1 -> period 3
set_master_zone(z1)
+ mdlog_periods += [realm.current_period.id]
- for zone, bucket_name in zone_bucket.items():
+ # create another bucket in each zone, except for z3
+ for zone in zonegroup.zones:
if zone == z3:
continue
- for objname in [ 'p3', '_p3' ]:
- k = new_key(zone, bucket_name, objname)
- k.set_contents_from_string('zxczxc')
+ conn = get_zone_connection(zone, user.credentials)
+ bucket_name = gen_bucket_name()
+ log.info('create bucket zone=%s name=%s', zone.name, bucket_name)
+ bucket = conn.create_bucket(bucket_name)
+ buckets.append(bucket_name)
# restart zone 3 gateway and wait for sync
z3.start()
zonegroup_meta_checkpoint(zonegroup)
- # verify that we end up with the same objects
- for source_zone, bucket in zone_bucket.items():
- for target_zone in zonegroup.zones:
- if source_zone == target_zone:
+ # verify that we end up with the same buckets
+ for bucket_name in buckets:
+ for source_zone, target_zone in combinations(zonegroup.zones, 2):
+ check_bucket_eq(source_zone, target_zone, bucket_name)
+
+ # verify that mdlogs are not empty and match for each period
+ for period in mdlog_periods:
+ master_mdlog = mdlog_list(z1, period)
+ assert len(master_mdlog) > 0
+ for zone in zonegroup.zones:
+ if zone == z1:
continue
+ mdlog = mdlog_list(zone, period)
+ assert len(mdlog) == len(master_mdlog)
- zone_bucket_checkpoint(target_zone, source_zone, bucket.name)
- check_bucket_eq(source_zone, target_zone, bucket)
+ # autotrim mdlogs for master zone
+ mdlog_autotrim(z1)
+
+ # autotrim mdlogs for peers
+ for zone in zonegroup.zones:
+ if zone == z1:
+ continue
+ mdlog_autotrim(zone)
+
+ # verify that mdlogs are empty for each period
+ for period in mdlog_periods:
+ for zone in zonegroup.zones:
+ mdlog = mdlog_list(zone, period)
+ assert len(mdlog) == 0
def test_zonegroup_remove():
zonegroup = realm.master_zonegroup()
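The pairwise bucket check above relies on itertools.combinations, which yields each unordered pair of zones exactly once, so no zone is compared against itself and no pair is checked twice:

    from itertools import combinations

    zones = ['z1', 'z2', 'z3']
    print(list(combinations(zones, 2)))
    # [('z1', 'z2'), ('z1', 'z3'), ('z2', 'z3')]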
diff --git a/src/test/rgw/test_multi.py b/src/test/rgw/test_multi.py
index 9f289861ebf..b9f3f5ed09e 100644
--- a/src/test/rgw/test_multi.py
+++ b/src/test/rgw/test_multi.py
@@ -43,9 +43,11 @@ class Cluster(multisite.Cluster):
self.cluster_id = cluster_id
self.needs_reset = True
- def admin(self, args = [], **kwargs):
+ def admin(self, args = None, **kwargs):
""" radosgw-admin command """
- cmd = [test_path + 'test-rgw-call.sh', 'call_rgw_admin', self.cluster_id] + args
+ cmd = [test_path + 'test-rgw-call.sh', 'call_rgw_admin', self.cluster_id]
+ if args:
+ cmd += args
if kwargs.pop('read_only', False):
cmd += ['--rgw-cache-enabled', 'false']
return bash(cmd, **kwargs)
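The switch from args = [] to args = None above is the standard Python defense against mutable default arguments: a default list is evaluated once at definition time and then shared across every call. A toy illustration of the pitfall being avoided (hypothetical function, not from this patch):

    def record(item, seen=[]):     # anti-pattern: one shared default list
        seen.append(item)
        return seen

    record('a')                    # ['a']
    record('b')                    # ['a', 'b'] -- leaked from the first call

    def record_fixed(item, seen=None):
        seen = [] if seen is None else seen
        seen.append(item)
        return seen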
@@ -67,14 +69,15 @@ class Gateway(multisite.Gateway):
super(Gateway, self).__init__(*args, **kwargs)
self.id = client_id
- def start(self, args = []):
+ def start(self, args = None):
""" start the gateway """
assert(self.cluster)
cmd = [mstart_path + 'mrgw.sh', self.cluster.cluster_id, str(self.port)]
if self.id:
cmd += ['-i', self.id]
cmd += ['--debug-rgw=20', '--debug-ms=1']
- cmd += args
+ if args:
+ cmd += args
bash(cmd)
def stop(self):
diff --git a/src/test/rgw/test_rgw_iam_policy.cc b/src/test/rgw/test_rgw_iam_policy.cc
new file mode 100644
index 00000000000..cc512e0b0c3
--- /dev/null
+++ b/src/test/rgw/test_rgw_iam_policy.cc
@@ -0,0 +1,507 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string>
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/optional.hpp>
+
+#include <gtest/gtest.h>
+
+#include "common/code_environment.h"
+#include "common/ceph_context.h"
+#include "global/global_init.h"
+#include "rgw/rgw_auth.h"
+#include "rgw/rgw_iam_policy.h"
+
+
+using std::string;
+using std::vector;
+
+using boost::container::flat_set;
+using boost::intrusive_ptr;
+using boost::make_optional;
+using boost::none;
+using boost::optional;
+
+using rgw::auth::Identity;
+using rgw::auth::Principal;
+
+using rgw::IAM::ARN;
+using rgw::IAM::Effect;
+using rgw::IAM::Environment;
+using rgw::IAM::Partition;
+using rgw::IAM::Policy;
+using rgw::IAM::s3All;
+using rgw::IAM::s3Count;
+using rgw::IAM::s3GetAccelerateConfiguration;
+using rgw::IAM::s3GetBucketAcl;
+using rgw::IAM::s3GetBucketCORS;
+using rgw::IAM::s3GetBucketLocation;
+using rgw::IAM::s3GetBucketLogging;
+using rgw::IAM::s3GetBucketNotification;
+using rgw::IAM::s3GetBucketPolicy;
+using rgw::IAM::s3GetBucketRequestPayment;
+using rgw::IAM::s3GetBucketTagging;
+using rgw::IAM::s3GetBucketVersioning;
+using rgw::IAM::s3GetBucketWebsite;
+using rgw::IAM::s3GetLifecycleConfiguration;
+using rgw::IAM::s3GetObject;
+using rgw::IAM::s3GetObjectAcl;
+using rgw::IAM::s3GetObjectVersionAcl;
+using rgw::IAM::s3GetObjectTorrent;
+using rgw::IAM::s3GetObjectVersion;
+using rgw::IAM::s3GetObjectVersionTorrent;
+using rgw::IAM::s3GetReplicationConfiguration;
+using rgw::IAM::s3ListAllMyBuckets;
+using rgw::IAM::s3ListBucket;
+using rgw::IAM::s3ListBucketMultiPartUploads;
+using rgw::IAM::s3ListBucketVersions;
+using rgw::IAM::s3ListMultipartUploadParts;
+using rgw::IAM::s3None;
+using rgw::IAM::s3PutBucketAcl;
+using rgw::IAM::s3PutBucketPolicy;
+using rgw::IAM::Service;
+using rgw::IAM::TokenID;
+using rgw::IAM::Version;
+
+class FakeIdentity : public Identity {
+ const Principal id;
+public:
+
+ FakeIdentity(Principal&& id) : id(std::move(id)) {}
+ uint32_t get_perms_from_aclspec(const aclspec_t& aclspec) const override {
+ abort();
+ return 0;
+ };
+
+ bool is_admin_of(const rgw_user& uid) const override {
+ abort();
+ return false;
+ }
+
+ bool is_owner_of(const rgw_user& uid) const override {
+ abort();
+ return false;
+ }
+
+ virtual uint32_t get_perm_mask() const override {
+ abort();
+ return 0;
+ }
+
+ void to_str(std::ostream& out) const override {
+ abort();
+ }
+
+ bool is_identity(const flat_set<Principal>& ids) const override {
+ return ids.find(id) != ids.end();
+ }
+};
+
+class PolicyTest : public ::testing::Test {
+protected:
+ intrusive_ptr<CephContext> cct;
+ static const string arbitrary_tenant;
+ static string example1;
+ static string example2;
+ static string example3;
+public:
+ PolicyTest() {
+ cct = new CephContext(CEPH_ENTITY_TYPE_CLIENT);
+ }
+};
+
+TEST_F(PolicyTest, Parse1) {
+ optional<Policy> p;
+
+ ASSERT_NO_THROW(p = Policy(cct.get(), arbitrary_tenant,
+ bufferlist::static_from_string(example1)));
+ ASSERT_TRUE(p);
+
+ EXPECT_EQ(p->text, example1);
+ EXPECT_EQ(p->version, Version::v2012_10_17);
+ EXPECT_FALSE(p->id);
+ EXPECT_FALSE(p->statements[0].sid);
+ EXPECT_FALSE(p->statements.empty());
+ EXPECT_EQ(p->statements.size(), 1U);
+ EXPECT_TRUE(p->statements[0].princ.empty());
+ EXPECT_TRUE(p->statements[0].noprinc.empty());
+ EXPECT_EQ(p->statements[0].effect, Effect::Allow);
+ EXPECT_EQ(p->statements[0].action, s3ListBucket);
+ EXPECT_EQ(p->statements[0].notaction, s3None);
+ ASSERT_FALSE(p->statements[0].resource.empty());
+ ASSERT_EQ(p->statements[0].resource.size(), 1U);
+ EXPECT_EQ(p->statements[0].resource.begin()->partition, Partition::aws);
+ EXPECT_EQ(p->statements[0].resource.begin()->service, Service::s3);
+ EXPECT_TRUE(p->statements[0].resource.begin()->region.empty());
+ EXPECT_EQ(p->statements[0].resource.begin()->account, arbitrary_tenant);
+ EXPECT_EQ(p->statements[0].resource.begin()->resource, "example_bucket");
+ EXPECT_TRUE(p->statements[0].notresource.empty());
+ EXPECT_TRUE(p->statements[0].conditions.empty());
+}
+
+TEST_F(PolicyTest, Eval1) {
+ auto p = Policy(cct.get(), arbitrary_tenant,
+ bufferlist::static_from_string(example1));
+ Environment e;
+
+ EXPECT_EQ(p.eval(e, none, s3ListBucket,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "example_bucket")),
+ Effect::Allow);
+
+ EXPECT_EQ(p.eval(e, none, s3PutBucketAcl,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "example_bucket")),
+ Effect::Pass);
+
+ EXPECT_EQ(p.eval(e, none, s3ListBucket,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "erroneous_bucket")),
+ Effect::Pass);
+
+}
+
+TEST_F(PolicyTest, Parse2) {
+ optional<Policy> p;
+
+ ASSERT_NO_THROW(p = Policy(cct.get(), arbitrary_tenant,
+ bufferlist::static_from_string(example2)));
+ ASSERT_TRUE(p);
+
+ EXPECT_EQ(p->text, example2);
+ EXPECT_EQ(p->version, Version::v2012_10_17);
+ EXPECT_EQ(*p->id, "S3-Account-Permissions");
+ ASSERT_FALSE(p->statements.empty());
+ EXPECT_EQ(p->statements.size(), 1U);
+ EXPECT_EQ(*p->statements[0].sid, "1");
+ EXPECT_FALSE(p->statements[0].princ.empty());
+ EXPECT_EQ(p->statements[0].princ.size(), 1U);
+ EXPECT_EQ(*p->statements[0].princ.begin(),
+ Principal::tenant("ACCOUNT-ID-WITHOUT-HYPHENS"));
+ EXPECT_TRUE(p->statements[0].noprinc.empty());
+ EXPECT_EQ(p->statements[0].effect, Effect::Allow);
+ EXPECT_EQ(p->statements[0].action, s3All);
+ EXPECT_EQ(p->statements[0].notaction, s3None);
+ ASSERT_FALSE(p->statements[0].resource.empty());
+ ASSERT_EQ(p->statements[0].resource.size(), 2U);
+ EXPECT_EQ(p->statements[0].resource.begin()->partition, Partition::aws);
+ EXPECT_EQ(p->statements[0].resource.begin()->service, Service::s3);
+ EXPECT_TRUE(p->statements[0].resource.begin()->region.empty());
+ EXPECT_EQ(p->statements[0].resource.begin()->account, arbitrary_tenant);
+ EXPECT_EQ(p->statements[0].resource.begin()->resource, "mybucket");
+ EXPECT_EQ((p->statements[0].resource.begin() + 1)->partition,
+ Partition::aws);
+ EXPECT_EQ((p->statements[0].resource.begin() + 1)->service,
+ Service::s3);
+ EXPECT_TRUE((p->statements[0].resource.begin() + 1)->region.empty());
+ EXPECT_EQ((p->statements[0].resource.begin() + 1)->account,
+ arbitrary_tenant);
+ EXPECT_EQ((p->statements[0].resource.begin() + 1)->resource, "mybucket/*");
+ EXPECT_TRUE(p->statements[0].notresource.empty());
+ EXPECT_TRUE(p->statements[0].conditions.empty());
+}
+
+TEST_F(PolicyTest, Eval2) {
+ auto p = Policy(cct.get(), arbitrary_tenant,
+ bufferlist::static_from_string(example2));
+ Environment e;
+
+ auto trueacct = FakeIdentity(
+ Principal::tenant("ACCOUNT-ID-WITHOUT-HYPHENS"));
+
+ auto notacct = FakeIdentity(
+ Principal::tenant("some-other-account"));
+ for (auto i = 0ULL; i < s3Count; ++i) {
+ EXPECT_EQ(p.eval(e, trueacct, 1ULL << i,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "mybucket")),
+ Effect::Allow);
+ EXPECT_EQ(p.eval(e, trueacct, 1ULL << i,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "mybucket/myobject")),
+ Effect::Allow);
+
+ EXPECT_EQ(p.eval(e, notacct, 1ULL << i,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "mybucket")),
+ Effect::Pass);
+ EXPECT_EQ(p.eval(e, notacct, 1ULL << i,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "mybucket/myobject")),
+ Effect::Pass);
+
+ EXPECT_EQ(p.eval(e, trueacct, 1ULL << i,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "notyourbucket")),
+ Effect::Pass);
+ EXPECT_EQ(p.eval(e, trueacct, 1ULL << i,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "notyourbucket/notyourobject")),
+ Effect::Pass);
+
+ }
+}
+
+TEST_F(PolicyTest, Parse3) {
+ optional<Policy> p;
+
+ ASSERT_NO_THROW(p = Policy(cct.get(), arbitrary_tenant,
+ bufferlist::static_from_string(example3)));
+ ASSERT_TRUE(p);
+
+ EXPECT_EQ(p->text, example3);
+ EXPECT_EQ(p->version, Version::v2012_10_17);
+ EXPECT_FALSE(p->id);
+ ASSERT_FALSE(p->statements.empty());
+ EXPECT_EQ(p->statements.size(), 3U);
+
+ EXPECT_EQ(*p->statements[0].sid, "FirstStatement");
+ EXPECT_TRUE(p->statements[0].princ.empty());
+ EXPECT_TRUE(p->statements[0].noprinc.empty());
+ EXPECT_EQ(p->statements[0].effect, Effect::Allow);
+ EXPECT_EQ(p->statements[0].action, s3PutBucketPolicy);
+ EXPECT_EQ(p->statements[0].notaction, s3None);
+ ASSERT_FALSE(p->statements[0].resource.empty());
+ ASSERT_EQ(p->statements[0].resource.size(), 1U);
+ EXPECT_EQ(p->statements[0].resource.begin()->partition, Partition::wildcard);
+ EXPECT_EQ(p->statements[0].resource.begin()->service, Service::wildcard);
+ EXPECT_EQ(p->statements[0].resource.begin()->region, "*");
+ EXPECT_EQ(p->statements[0].resource.begin()->account, arbitrary_tenant);
+ EXPECT_EQ(p->statements[0].resource.begin()->resource, "*");
+ EXPECT_TRUE(p->statements[0].notresource.empty());
+ EXPECT_TRUE(p->statements[0].conditions.empty());
+
+ EXPECT_EQ(*p->statements[1].sid, "SecondStatement");
+ EXPECT_TRUE(p->statements[1].princ.empty());
+ EXPECT_TRUE(p->statements[1].noprinc.empty());
+ EXPECT_EQ(p->statements[1].effect, Effect::Allow);
+ EXPECT_EQ(p->statements[1].action, s3ListAllMyBuckets);
+ EXPECT_EQ(p->statements[1].notaction, s3None);
+ ASSERT_FALSE(p->statements[1].resource.empty());
+ ASSERT_EQ(p->statements[1].resource.size(), 1U);
+ EXPECT_EQ(p->statements[1].resource.begin()->partition, Partition::wildcard);
+ EXPECT_EQ(p->statements[1].resource.begin()->service, Service::wildcard);
+ EXPECT_EQ(p->statements[1].resource.begin()->region, "*");
+ EXPECT_EQ(p->statements[1].resource.begin()->account, arbitrary_tenant);
+ EXPECT_EQ(p->statements[1].resource.begin()->resource, "*");
+ EXPECT_TRUE(p->statements[1].notresource.empty());
+ EXPECT_TRUE(p->statements[1].conditions.empty());
+
+ EXPECT_EQ(*p->statements[2].sid, "ThirdStatement");
+ EXPECT_TRUE(p->statements[2].princ.empty());
+ EXPECT_TRUE(p->statements[2].noprinc.empty());
+ EXPECT_EQ(p->statements[2].effect, Effect::Allow);
+ EXPECT_EQ(p->statements[2].action, (s3ListMultipartUploadParts |
+ s3ListBucket | s3ListBucketVersions |
+ s3ListAllMyBuckets |
+ s3ListBucketMultiPartUploads |
+ s3GetObject | s3GetObjectVersion |
+ s3GetObjectAcl | s3GetObjectVersionAcl |
+ s3GetObjectTorrent |
+ s3GetObjectVersionTorrent |
+ s3GetAccelerateConfiguration |
+ s3GetBucketAcl | s3GetBucketCORS |
+ s3GetBucketVersioning |
+ s3GetBucketRequestPayment |
+ s3GetBucketLocation |
+ s3GetBucketPolicy |
+ s3GetBucketNotification |
+ s3GetBucketLogging |
+ s3GetBucketTagging |
+ s3GetBucketWebsite |
+ s3GetLifecycleConfiguration |
+ s3GetReplicationConfiguration));
+ EXPECT_EQ(p->statements[2].notaction, s3None);
+ ASSERT_FALSE(p->statements[2].resource.empty());
+ ASSERT_EQ(p->statements[2].resource.size(), 2U);
+ EXPECT_EQ(p->statements[2].resource.begin()->partition, Partition::aws);
+ EXPECT_EQ(p->statements[2].resource.begin()->service, Service::s3);
+ EXPECT_TRUE(p->statements[2].resource.begin()->region.empty());
+ EXPECT_EQ(p->statements[2].resource.begin()->account, arbitrary_tenant);
+ EXPECT_EQ(p->statements[2].resource.begin()->resource, "confidential-data");
+ EXPECT_EQ((p->statements[2].resource.begin() + 1)->partition,
+ Partition::aws);
+ EXPECT_EQ((p->statements[2].resource.begin() + 1)->service, Service::s3);
+ EXPECT_TRUE((p->statements[2].resource.begin() + 1)->region.empty());
+ EXPECT_EQ((p->statements[2].resource.begin() + 1)->account,
+ arbitrary_tenant);
+ EXPECT_EQ((p->statements[2].resource.begin() + 1)->resource,
+ "confidential-data/*");
+ EXPECT_TRUE(p->statements[2].notresource.empty());
+ ASSERT_FALSE(p->statements[2].conditions.empty());
+ ASSERT_EQ(p->statements[2].conditions.size(), 1U);
+ EXPECT_EQ(p->statements[2].conditions[0].op, TokenID::Bool);
+ EXPECT_EQ(p->statements[2].conditions[0].key, "aws:MultiFactorAuthPresent");
+ EXPECT_FALSE(p->statements[2].conditions[0].ifexists);
+ ASSERT_FALSE(p->statements[2].conditions[0].vals.empty());
+ EXPECT_EQ(p->statements[2].conditions[0].vals.size(), 1U);
+ EXPECT_EQ(p->statements[2].conditions[0].vals[0], "true");
+}
+
+TEST_F(PolicyTest, Eval3) {
+ auto p = Policy(cct.get(), arbitrary_tenant,
+ bufferlist::static_from_string(example3));
+ Environment em;
+ Environment tr = { { "aws:MultiFactorAuthPresent", "true" } };
+ Environment fa = { { "aws:MultiFactorAuthPresent", "false" } };
+
+ auto s3allow = (s3ListMultipartUploadParts | s3ListBucket |
+ s3ListBucketVersions | s3ListAllMyBuckets |
+ s3ListBucketMultiPartUploads | s3GetObject |
+ s3GetObjectVersion | s3GetObjectAcl | s3GetObjectVersionAcl |
+ s3GetObjectTorrent | s3GetObjectVersionTorrent |
+ s3GetAccelerateConfiguration | s3GetBucketAcl |
+ s3GetBucketCORS | s3GetBucketVersioning |
+ s3GetBucketRequestPayment | s3GetBucketLocation |
+ s3GetBucketPolicy | s3GetBucketNotification |
+ s3GetBucketLogging | s3GetBucketTagging |
+ s3GetBucketWebsite | s3GetLifecycleConfiguration |
+ s3GetReplicationConfiguration);
+
+ EXPECT_EQ(p.eval(em, none, s3PutBucketPolicy,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "mybucket")),
+ Effect::Allow);
+
+  EXPECT_EQ(p.eval(em, none, s3ListAllMyBuckets,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "mybucket")),
+ Effect::Allow);
+
+
+ for (auto i = 0ULL; i < s3Count; ++i) {
+ auto op = 1ULL << i;
+ if ((op == s3ListAllMyBuckets) || (op == s3PutBucketPolicy)) {
+ continue;
+ }
+
+ EXPECT_EQ(p.eval(em, none, op,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "confidential-data")),
+ Effect::Pass);
+ EXPECT_EQ(p.eval(tr, none, op,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "confidential-data")),
+ op & s3allow ? Effect::Allow : Effect::Pass);
+ EXPECT_EQ(p.eval(fa, none, op,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "confidential-data")),
+ Effect::Pass);
+
+ EXPECT_EQ(p.eval(em, none, op,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "confidential-data/moo")),
+ Effect::Pass);
+ EXPECT_EQ(p.eval(tr, none, op,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "confidential-data/moo")),
+ op & s3allow ? Effect::Allow : Effect::Pass);
+ EXPECT_EQ(p.eval(fa, none, op,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "confidential-data/moo")),
+ Effect::Pass);
+
+ EXPECT_EQ(p.eval(em, none, op,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "really-confidential-data")),
+ Effect::Pass);
+ EXPECT_EQ(p.eval(tr, none, op,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "really-confidential-data")),
+ Effect::Pass);
+ EXPECT_EQ(p.eval(fa, none, op,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant, "really-confidential-data")),
+ Effect::Pass);
+
+ EXPECT_EQ(p.eval(em, none, op,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant,
+ "really-confidential-data/moo")), Effect::Pass);
+ EXPECT_EQ(p.eval(tr, none, op,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant,
+ "really-confidential-data/moo")), Effect::Pass);
+ EXPECT_EQ(p.eval(fa, none, op,
+ ARN(Partition::aws, Service::s3,
+ "", arbitrary_tenant,
+ "really-confidential-data/moo")), Effect::Pass);
+
+ }
+}
+
+const string PolicyTest::arbitrary_tenant = "arbitrary_tenant";
+string PolicyTest::example1 = R"(
+{
+ "Version": "2012-10-17",
+ "Statement": {
+ "Effect": "Allow",
+ "Action": "s3:ListBucket",
+ "Resource": "arn:aws:s3:::example_bucket"
+ }
+}
+)";
+
+string PolicyTest::example2 = R"(
+{
+ "Version": "2012-10-17",
+ "Id": "S3-Account-Permissions",
+ "Statement": [{
+ "Sid": "1",
+ "Effect": "Allow",
+ "Principal": {"AWS": ["arn:aws:iam::ACCOUNT-ID-WITHOUT-HYPHENS:root"]},
+ "Action": "s3:*",
+ "Resource": [
+ "arn:aws:s3:::mybucket",
+ "arn:aws:s3:::mybucket/*"
+ ]
+ }]
+}
+)";
+
+string PolicyTest::example3 = R"(
+{
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Sid": "FirstStatement",
+ "Effect": "Allow",
+ "Action": ["s3:PutBucketPolicy"],
+ "Resource": "*"
+ },
+ {
+ "Sid": "SecondStatement",
+ "Effect": "Allow",
+ "Action": "s3:ListAllMyBuckets",
+ "Resource": "*"
+ },
+ {
+ "Sid": "ThirdStatement",
+ "Effect": "Allow",
+ "Action": [
+ "s3:List*",
+ "s3:Get*"
+ ],
+ "Resource": [
+ "arn:aws:s3:::confidential-data",
+ "arn:aws:s3:::confidential-data/*"
+ ],
+ "Condition": {"Bool": {"aws:MultiFactorAuthPresent": "true"}}
+ }
+ ]
+}
+)";
diff --git a/src/test/signals.cc b/src/test/signals.cc
index afcfbb7d645..e4b1b171234 100644
--- a/src/test/signals.cc
+++ b/src/test/signals.cc
@@ -2,6 +2,7 @@
#include "common/signal.h"
#include "global/signal_handler.h"
#include "common/debug.h"
+#include "include/coredumpctl.h"
#include "gtest/gtest.h"
@@ -118,7 +119,10 @@ TEST(SignalHandler, Multiple)
TEST(SignalHandler, LogInternal)
{
g_ceph_context->_log->inject_segv();
- ASSERT_DEATH(derr << "foo" << dendl, ".*");
+ {
+ PrCtl unset_dumpable;
+ ASSERT_DEATH(derr << "foo" << dendl, ".*");
+ }
g_ceph_context->_log->reset_segv();
}
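The PrCtl guard wrapped around ASSERT_DEATH above marks the process non-dumpable for the duration of the intentional crash, so death tests do not litter core files. The same idea sketched in Python via the core-size rlimit (illustration only; the C++ helper in include/coredumpctl.h is the authoritative mechanism):

    import resource

    soft, hard = resource.getrlimit(resource.RLIMIT_CORE)
    resource.setrlimit(resource.RLIMIT_CORE, (0, hard))  # no core dumps
    try:
        pass  # run the code that is expected to abort()
    finally:
        resource.setrlimit(resource.RLIMIT_CORE, (soft, hard))  # restore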
diff --git a/src/test/simple_spin.cc b/src/test/simple_spin.cc
index 6857a51ef36..04b4fc07e5f 100644
--- a/src/test/simple_spin.cc
+++ b/src/test/simple_spin.cc
@@ -33,21 +33,19 @@ TEST(SimpleSpin, Test1)
pthread_t thread1;
pthread_t thread2;
ret = pthread_create(&thread1, NULL, mythread, NULL);
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ret = pthread_create(&thread2, NULL, mythread, NULL);
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ret = pthread_join(thread1, NULL);
- ASSERT_EQ(ret, 0);
+ ASSERT_EQ(0, ret);
ret = pthread_join(thread2, NULL);
- ASSERT_EQ(ret, 0);
- ASSERT_EQ(counter, n);
+ ASSERT_EQ(0, ret);
+ ASSERT_EQ(n, counter);
// Should also work with pass-by-reference:
// (Note that we don't care about cross-threading here as-such.)
counter = 0;
- uint32_t urhs = 0;
- ASSERT_EQ(counter, urhs);
async(std::launch::async, []() {
for(int i = 0; n != i; ++i) {
simple_spin_lock(lock);
@@ -55,6 +53,6 @@ TEST(SimpleSpin, Test1)
simple_spin_unlock(lock);
}
});
- ASSERT_EQ(counter, n);
+ ASSERT_EQ(n, counter);
}
diff --git a/src/test/test_ipaddr.cc b/src/test/test_ipaddr.cc
index ab43407ec5e..f6ecd2de51f 100644
--- a/src/test/test_ipaddr.cc
+++ b/src/test/test_ipaddr.cc
@@ -6,6 +6,7 @@
#include <sys/socket.h>
#endif
#include <arpa/inet.h>
+#include <ifaddrs.h>
static void ipv4(struct sockaddr_in *addr, const char *s) {
int err;
diff --git a/src/test/test_texttable.cc b/src/test/test_texttable.cc
index 60b0a1ceb9b..0d439d16c9c 100644
--- a/src/test/test_texttable.cc
+++ b/src/test/test_texttable.cc
@@ -15,6 +15,7 @@
#include "common/TextTable.h"
#include <iostream>
#include "gtest/gtest.h"
+#include "include/coredumpctl.h"
TEST(TextTable, Alignment) {
TextTable t;
@@ -72,5 +73,6 @@ TEST(TextTable, TooManyItems) {
t.define_column("3", TextTable::LEFT, TextTable::LEFT);
// expect assertion failure on this, which throws FailedAssertion
+ PrCtl unset_dumpable;
ASSERT_DEATH((t << "1" << "2" << "3" << "4" << TextTable::endrow), "");
}
diff --git a/src/test/vstart_wrapper.sh b/src/test/vstart_wrapper.sh
index 17fd9836f66..748fc406723 100755
--- a/src/test/vstart_wrapper.sh
+++ b/src/test/vstart_wrapper.sh
@@ -38,7 +38,7 @@ function vstart_setup()
--short \
$OBJSTORE_ARGS \
-o 'paxos propose interval = 0.01' \
- -n -l || return 1
+ -d -n -l || return 1
export CEPH_CONF=$CEPH_DIR/ceph.conf
crit=$(expr 100 - $(ceph-conf --show-config-value mon_data_avail_crit))
diff --git a/src/tools/RadosDump.h b/src/tools/RadosDump.h
index 13bb78b646a..6ad43f7aa63 100644
--- a/src/tools/RadosDump.h
+++ b/src/tools/RadosDump.h
@@ -284,7 +284,7 @@ struct metadata_section {
epoch_t map_epoch;
pg_info_t info;
pg_log_t log;
- map<epoch_t,pg_interval_t> past_intervals;
+ PastIntervals past_intervals;
OSDMap osdmap;
bufferlist osdmap_bl; // Used in lieu of encoding osdmap due to crc checking
map<eversion_t, hobject_t> divergent_priors;
@@ -295,7 +295,7 @@ struct metadata_section {
epoch_t map_epoch,
const pg_info_t &info,
const pg_log_t &log,
- const map<epoch_t,pg_interval_t> &past_intervals,
+ const PastIntervals &past_intervals,
const pg_missing_t &missing)
: struct_ver(struct_ver),
map_epoch(map_epoch),
@@ -308,7 +308,7 @@ struct metadata_section {
map_epoch(0) { }
void encode(bufferlist& bl) const {
- ENCODE_START(5, 1, bl);
+ ENCODE_START(6, 6, bl);
::encode(struct_ver, bl);
::encode(map_epoch, bl);
::encode(info, bl);
@@ -322,13 +322,15 @@ struct metadata_section {
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator& bl) {
- DECODE_START(5, bl);
+ DECODE_START(6, bl);
::decode(struct_ver, bl);
::decode(map_epoch, bl);
::decode(info, bl);
::decode(log, bl);
- if (struct_v > 1) {
+ if (struct_v >= 6) {
::decode(past_intervals, bl);
+ } else if (struct_v > 1) {
+ past_intervals.decode_classic(bl);
} else {
cout << "NOTICE: Older export without past_intervals" << std::endl;
}
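The decode side above follows the usual versioned-encoding pattern: bump the struct version when the on-disk format changes, and gate each decoder on the version actually found, so older exports remain importable. The branching, restated as a Python sketch (hypothetical decode callbacks):

    def decode_past_intervals(struct_v, decode_new, decode_classic):
        if struct_v >= 6:
            return decode_new()       # current PastIntervals encoding
        elif struct_v > 1:
            return decode_classic()   # legacy map<epoch_t, pg_interval_t>
        else:
            return None               # very old export: field absent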
diff --git a/src/tools/ceph_kvstore_tool.cc b/src/tools/ceph_kvstore_tool.cc
index df9ef469cc4..adca8a71be2 100644
--- a/src/tools/ceph_kvstore_tool.cc
+++ b/src/tools/ceph_kvstore_tool.cc
@@ -28,6 +28,10 @@
#include "kv/KeyValueDB.h"
#include "common/url_escape.h"
+#ifdef HAVE_LIBAIO
+#include "os/bluestore/BlueStore.h"
+#endif
+
using namespace std;
class StoreTool
@@ -37,12 +41,28 @@ class StoreTool
public:
StoreTool(string type, const string &path) : store_path(path) {
- KeyValueDB *db_ptr = KeyValueDB::create(g_ceph_context, type, path);
- int r = db_ptr->open(std::cerr);
- if (r < 0) {
- cerr << "failed to open type " << type << " path " << path << ": "
- << cpp_strerror(r) << std::endl;
+ KeyValueDB *db_ptr;
+ if (type == "bluestore-kv") {
+#ifdef HAVE_LIBAIO
+ // note: we'll leak this! the only user is ceph-kvstore-tool and
+ // we don't care.
+ BlueStore *bluestore = new BlueStore(g_ceph_context, path);
+ int r = bluestore->start_kv_only(&db_ptr);
+ if (r < 0) {
+ exit(1);
+ }
+#else
+ cerr << "bluestore not compiled in" << std::endl;
exit(1);
+#endif
+ } else {
+ db_ptr = KeyValueDB::create(g_ceph_context, type, path);
+ int r = db_ptr->open(std::cerr);
+ if (r < 0) {
+ cerr << "failed to open type " << type << " path " << path << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(1);
+ }
}
db.reset(db_ptr);
}
diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc
index c4897805c38..23e06ecf04e 100644
--- a/src/tools/ceph_objectstore_tool.cc
+++ b/src/tools/ceph_objectstore_tool.cc
@@ -401,7 +401,7 @@ int mark_pg_for_removal(ObjectStore *fs, spg_t pgid, ObjectStore::Transaction *t
int r = PG::peek_map_epoch(fs, pgid, &map_epoch, &bl);
if (r < 0)
cerr << __func__ << " warning: peek_map_epoch reported error" << std::endl;
- map<epoch_t,pg_interval_t> past_intervals;
+ PastIntervals past_intervals;
__u8 struct_v;
r = PG::read_info(fs, pgid, coll, bl, info, past_intervals, struct_v);
if (r < 0) {
@@ -442,7 +442,7 @@ int initiate_new_remove_pg(ObjectStore *store, spg_t r_pgid,
}
int write_info(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info,
- map<epoch_t,pg_interval_t> &past_intervals)
+ PastIntervals &past_intervals)
{
//Empty for this
coll_t coll(info.pgid);
@@ -464,7 +464,7 @@ int write_info(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info,
typedef map<eversion_t, hobject_t> divergent_priors_t;
int write_pg(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info,
- pg_log_t &log, map<epoch_t,pg_interval_t> &past_intervals,
+ pg_log_t &log, PastIntervals &past_intervals,
divergent_priors_t &divergent,
pg_missing_t &missing)
{
@@ -528,7 +528,7 @@ int ObjectStoreTool::export_file(ObjectStore *store, coll_t cid, ghobject_t &obj
cerr << "object_info: " << objb.oi << std::endl;
}
- // XXX: Should we be checking for WHITEOUT or LOST in objb.oi.flags and skip?
+ // NOTE: we include whiteouts, lost, etc.
ret = write_section(TYPE_OBJECT_BEGIN, objb, file_fd);
if (ret < 0)
@@ -747,7 +747,7 @@ int add_osdmap(ObjectStore *store, metadata_section &ms)
int ObjectStoreTool::do_export(ObjectStore *fs, coll_t coll, spg_t pgid,
pg_info_t &info, epoch_t map_epoch, __u8 struct_ver,
const OSDSuperblock& superblock,
- map<epoch_t,pg_interval_t> &past_intervals)
+ PastIntervals &past_intervals)
{
PGLog::IndexedLog log;
pg_missing_t missing;
@@ -817,9 +817,12 @@ int get_data(ObjectStore *store, coll_t coll, ghobject_t hoid,
return 0;
}
-int get_attrs(ObjectStore *store, coll_t coll, ghobject_t hoid,
- ObjectStore::Transaction *t, bufferlist &bl,
- OSDriver &driver, SnapMapper &snap_mapper)
+int get_attrs(
+ ObjectStore *store, coll_t coll, ghobject_t hoid,
+ ObjectStore::Transaction *t, bufferlist &bl,
+ OSDriver &driver, SnapMapper &snap_mapper,
+ const ghobject_t& last_head,
+ const set<ghobject_t>& last_clones)
{
bufferlist::iterator ebliter = bl.begin();
attr_section as;
@@ -831,17 +834,47 @@ int get_attrs(ObjectStore *store, coll_t coll, ghobject_t hoid,
// This could have been handled in the caller if we didn't need to
// support exports that didn't include object_info_t in object_begin.
- if (hoid.hobj.snap < CEPH_MAXSNAP && hoid.generation == ghobject_t::NO_GEN) {
- map<string,bufferlist>::iterator mi = as.data.find(OI_ATTR);
- if (mi != as.data.end()) {
- object_info_t oi(mi->second);
-
- if (debug)
- cerr << "object_info " << oi << std::endl;
-
- OSDriver::OSTransaction _t(driver.get_transaction(t));
- set<snapid_t> oi_snaps(oi.snaps.begin(), oi.snaps.end());
- snap_mapper.add_oid(hoid.hobj, oi_snaps, &_t);
+ if (hoid.generation == ghobject_t::NO_GEN) {
+ if (hoid.hobj.snap < CEPH_MAXSNAP) {
+ map<string,bufferlist>::iterator mi = as.data.find(OI_ATTR);
+ if (mi != as.data.end()) {
+ object_info_t oi(mi->second);
+
+ if (debug)
+ cerr << "object_info " << oi << std::endl;
+
+ OSDriver::OSTransaction _t(driver.get_transaction(t));
+ set<snapid_t> oi_snaps(oi.legacy_snaps.begin(), oi.legacy_snaps.end());
+ if (!oi_snaps.empty()) {
+ if (debug)
+ cerr << "\tsetting legacy snaps " << oi_snaps << std::endl;
+ snap_mapper.add_oid(hoid.hobj, oi_snaps, &_t);
+ }
+ }
+ } else {
+ if (hoid == last_head) {
+ map<string,bufferlist>::iterator mi = as.data.find(SS_ATTR);
+ if (mi != as.data.end()) {
+ SnapSet snapset;
+ auto p = mi->second.begin();
+ snapset.decode(p);
+ cout << "snapset " << snapset << std::endl;
+ if (!snapset.is_legacy()) {
+ for (auto& p : snapset.clone_snaps) {
+ hobject_t clone = hoid.hobj;
+ clone.snap = p.first;
+ set<snapid_t> snaps(p.second.begin(), p.second.end());
+ if (debug)
+ cerr << "\tsetting " << clone << " snaps " << snaps << std::endl;
+ OSDriver::OSTransaction _t(driver.get_transaction(t));
+ assert(!snaps.empty());
+ snap_mapper.add_oid(clone, snaps, &_t);
+ }
+ }
+ } else {
+ cerr << "missing SS_ATTR on " << hoid << std::endl;
+ }
+ }
}
}
@@ -878,7 +911,9 @@ int get_omap(ObjectStore *store, coll_t coll, ghobject_t hoid,
int ObjectStoreTool::get_object(ObjectStore *store, coll_t coll,
bufferlist &bl, OSDMap &curmap,
bool *skipped_objects,
- ObjectStore::Sequencer &osr)
+ ObjectStore::Sequencer &osr,
+ ghobject_t *last_head,
+ set<ghobject_t> *last_clones)
{
ObjectStore::Transaction tran;
ObjectStore::Transaction *t = &tran;
@@ -928,6 +963,19 @@ int ObjectStoreTool::get_object(ObjectStore *store, coll_t coll,
cout << "Write " << ob.hoid << std::endl;
+ // manage snap collection
+ if (ob.hoid.hobj.is_snap()) {
+ ghobject_t head = ob.hoid;
+ head.hobj = head.hobj.get_head();
+ if (head == *last_head) {
+ last_clones->insert(ob.hoid);
+ } else {
+ *last_head = head;
+ last_clones->clear();
+ }
+ last_clones->insert(ob.hoid);
+ }
+
bufferlist ebl;
bool done = false;
while(!done) {
@@ -950,7 +998,8 @@ int ObjectStoreTool::get_object(ObjectStore *store, coll_t coll,
break;
case TYPE_ATTRS:
if (dry_run) break;
- ret = get_attrs(store, coll, ob.hoid, t, ebl, driver, mapper);
+ ret = get_attrs(store, coll, ob.hoid, t, ebl, driver, mapper,
+ *last_head, *last_clones);
if (ret) return ret;
break;
case TYPE_OMAP_HDR:
@@ -1279,6 +1328,8 @@ int ObjectStoreTool::do_import(ObjectStore *store, OSDSuperblock& sb,
bool done = false;
bool found_metadata = false;
metadata_section ms;
+ ghobject_t last_head;
+ set<ghobject_t> last_clones;
while(!done) {
ret = read_section(&type, &ebl);
if (ret)
@@ -1291,7 +1342,8 @@ int ObjectStoreTool::do_import(ObjectStore *store, OSDSuperblock& sb,
}
switch(type) {
case TYPE_OBJECT_BEGIN:
- ret = get_object(store, coll, ebl, curmap, &skipped_objects, osr);
+ ret = get_object(store, coll, ebl, curmap, &skipped_objects, osr,
+ &last_head, &last_clones);
if (ret) return ret;
break;
case TYPE_PG_METADATA:
@@ -3412,7 +3464,7 @@ int main(int argc, char **argv)
cerr << "map_epoch " << map_epoch << std::endl;
pg_info_t info(pgid);
- map<epoch_t,pg_interval_t> past_intervals;
+ PastIntervals past_intervals;
__u8 struct_ver;
ret = PG::read_info(fs, pgid, coll, bl, info, past_intervals,
struct_ver);
diff --git a/src/tools/ceph_objectstore_tool.h b/src/tools/ceph_objectstore_tool.h
index db279881c4e..77fa968879d 100644
--- a/src/tools/ceph_objectstore_tool.h
+++ b/src/tools/ceph_objectstore_tool.h
@@ -30,10 +30,13 @@ class ObjectStoreTool : public RadosDump
int do_export(ObjectStore *fs, coll_t coll, spg_t pgid,
pg_info_t &info, epoch_t map_epoch, __u8 struct_ver,
const OSDSuperblock& superblock,
- map<epoch_t,pg_interval_t> &past_intervals);
- int get_object(ObjectStore *store, coll_t coll,
- bufferlist &bl, OSDMap &curmap, bool *skipped_objects,
- ObjectStore::Sequencer &osr);
+ PastIntervals &past_intervals);
+ int get_object(
+ ObjectStore *store, coll_t coll,
+ bufferlist &bl, OSDMap &curmap, bool *skipped_objects,
+ ObjectStore::Sequencer &osr,
+ ghobject_t *last_head,
+ set<ghobject_t> *last_clones);
int export_file(
ObjectStore *store, coll_t cid, ghobject_t &obj);
int export_files(ObjectStore *store, coll_t coll);
diff --git a/src/tools/osdmaptool.cc b/src/tools/osdmaptool.cc
index 1301f4f542b..2738412e538 100644
--- a/src/tools/osdmaptool.cc
+++ b/src/tools/osdmaptool.cc
@@ -454,9 +454,8 @@ int main(int argc, const char **argv)
vector<int> size(30, 0);
if (test_random)
srand(getpid());
- map<int64_t,pg_pool_t>& pools = osdmap.get_pools();
- for (map<int64_t,pg_pool_t>::iterator p = pools.begin();
- p != pools.end(); ++p) {
+ auto& pools = osdmap.get_pools();
+ for (auto p = pools.begin(); p != pools.end(); ++p) {
if (pool != -1 && p->first != pool)
continue;
if (pg_num > 0)
diff --git a/src/tools/rbd/action/Group.cc b/src/tools/rbd/action/Group.cc
index 9bfb58fc588..40ebd4d9927 100644
--- a/src/tools/rbd/action/Group.cc
+++ b/src/tools/rbd/action/Group.cc
@@ -69,9 +69,6 @@ int execute_list(const po::variables_map &vm) {
librbd::RBD rbd;
std::vector<std::string> names;
r = rbd.group_list(io_ctx, &names);
-
- if (r == -ENOENT)
- r = 0;
if (r < 0)
return r;
diff --git a/src/tools/rbd/action/Import.cc b/src/tools/rbd/action/Import.cc
index 348915dbcce..5bea7805da7 100644
--- a/src/tools/rbd/action/Import.cc
+++ b/src/tools/rbd/action/Import.cc
@@ -871,15 +871,21 @@ int execute(const po::variables_map &vm) {
std::string deprecated_image_name;
if (vm.count(at::IMAGE_NAME)) {
- utils::extract_spec(vm[at::IMAGE_NAME].as<std::string>(),
- &deprecated_pool_name, &deprecated_image_name, nullptr,
- utils::SPEC_VALIDATION_FULL);
+ deprecated_image_name = vm[at::IMAGE_NAME].as<std::string>();
std::cerr << "rbd: --image is deprecated for import, use --dest"
<< std::endl;
} else {
deprecated_image_name = path.substr(path.find_last_of("/") + 1);
}
+ std::string deprecated_snap_name;
+ r = utils::extract_spec(deprecated_image_name, &deprecated_pool_name,
+ &deprecated_image_name, &deprecated_snap_name,
+ utils::SPEC_VALIDATION_FULL);
+ if (r < 0) {
+ return r;
+ }
+
size_t sparse_size = utils::RBD_DEFAULT_SPARSE_SIZE;
if (vm.count(at::IMAGE_SPARSE_SIZE)) {
sparse_size = vm[at::IMAGE_SPARSE_SIZE].as<size_t>();
@@ -888,7 +894,7 @@ int execute(const po::variables_map &vm) {
size_t arg_index = 1;
std::string pool_name = deprecated_pool_name;
std::string image_name;
- std::string snap_name;
+ std::string snap_name = deprecated_snap_name;
r = utils::get_pool_image_snapshot_names(
vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &pool_name, &image_name,
&snap_name, utils::SNAPSHOT_PRESENCE_NONE, utils::SPEC_VALIDATION_FULL,
diff --git a/src/tools/rbd/action/Info.cc b/src/tools/rbd/action/Info.cc
index 3bfdeeb7d67..9f754cc8c27 100644
--- a/src/tools/rbd/action/Info.cc
+++ b/src/tools/rbd/action/Info.cc
@@ -69,7 +69,6 @@ static int do_show_info(librados::IoCtx &io_ctx, librbd::Image& image,
const std::string &snapname, Formatter *f)
{
librbd::image_info_t info;
- std::string parent_pool, parent_name, parent_snapname;
uint8_t old_format;
uint64_t overlap, features, flags, snap_limit;
bool snap_protected = false;
@@ -207,18 +206,33 @@ static int do_show_info(librados::IoCtx &io_ctx, librbd::Image& image,
}
// parent info, if present
- if ((image.parent_info(&parent_pool, &parent_name, &parent_snapname) == 0) &&
+ std::string parent_pool, parent_name, parent_id, parent_snapname;
+ if ((image.parent_info2(&parent_pool, &parent_name, &parent_id,
+ &parent_snapname) == 0) &&
parent_name.length() > 0) {
+
+ librbd::trash_image_info_t trash_image_info;
+ librbd::RBD rbd;
+ r = rbd.trash_get(io_ctx, parent_id.c_str(), &trash_image_info);
+ bool trash_image_info_valid = (r == 0);
+
if (f) {
f->open_object_section("parent");
f->dump_string("pool", parent_pool);
f->dump_string("image", parent_name);
f->dump_string("snapshot", parent_snapname);
+ if (trash_image_info_valid) {
+ f->dump_string("trash", parent_id);
+ }
f->dump_unsigned("overlap", overlap);
f->close_section();
} else {
std::cout << "\tparent: " << parent_pool << "/" << parent_name
- << "@" << parent_snapname << std::endl;
+ << "@" << parent_snapname;
+ if (trash_image_info_valid) {
+ std::cout << " (trash " << parent_id << ")";
+ }
+ std::cout << std::endl;
std::cout << "\toverlap: " << prettybyte_t(overlap) << std::endl;
}
}
diff --git a/src/tools/rbd/action/Kernel.cc b/src/tools/rbd/action/Kernel.cc
index 3db901d3da7..336dd597d86 100644
--- a/src/tools/rbd/action/Kernel.cc
+++ b/src/tools/rbd/action/Kernel.cc
@@ -133,6 +133,8 @@ static int parse_map_options(char *options)
return -EINVAL;
} else if (!strcmp(this_char, "lock_on_read")) {
put_map_option("lock_on_read", this_char);
+ } else if (!strcmp(this_char, "exclusive")) {
+ put_map_option("exclusive", this_char);
} else {
std::cerr << "rbd: unknown map option '" << this_char << "'" << std::endl;
return -EINVAL;
@@ -385,7 +387,8 @@ void get_map_arguments(po::options_description *positional,
at::ARGUMENT_MODIFIER_NONE);
options->add_options()
("options,o", po::value<std::string>(), "map options")
- ("read-only", po::bool_switch(), "map read-only");
+ ("read-only", po::bool_switch(), "map read-only")
+ ("exclusive", po::bool_switch(), "disable automatic exclusive lock transitions");
}
int execute_map(const po::variables_map &vm) {
@@ -404,6 +407,9 @@ int execute_map(const po::variables_map &vm) {
if (vm["read-only"].as<bool>()) {
put_map_option("rw", "ro");
}
+ if (vm["exclusive"].as<bool>()) {
+ put_map_option("exclusive", "exclusive");
+ }
// parse default options first so they can be overwritten by cli options
char *default_map_options = strdup(g_conf->rbd_default_map_options.c_str());
@@ -503,7 +509,7 @@ int execute_unmap(const po::variables_map &vm) {
return 0;
}
-Shell::SwitchArguments switched_arguments({"read-only"});
+Shell::SwitchArguments switched_arguments({"read-only", "exclusive"});
Shell::Action action_show(
{"showmapped"}, {}, "Show the rbd images mapped by the kernel.", "",
&get_show_arguments, &execute_show);
diff --git a/src/tools/rbd/action/List.cc b/src/tools/rbd/action/List.cc
index 45ca812a5ae..53bebd83d8c 100644
--- a/src/tools/rbd/action/List.cc
+++ b/src/tools/rbd/action/List.cc
@@ -23,8 +23,6 @@ int do_list(librbd::RBD &rbd, librados::IoCtx& io_ctx, bool lflag,
Formatter *f) {
std::vector<std::string> names;
int r = rbd.list(io_ctx, names);
- if (r == -ENOENT)
- r = 0;
if (r < 0)
return r;
diff --git a/src/tools/rbd/action/MirrorPool.cc b/src/tools/rbd/action/MirrorPool.cc
index d3bc63b277c..929ac446d02 100644
--- a/src/tools/rbd/action/MirrorPool.cc
+++ b/src/tools/rbd/action/MirrorPool.cc
@@ -4,7 +4,6 @@
#include "tools/rbd/ArgumentTypes.h"
#include "tools/rbd/Shell.h"
#include "tools/rbd/Utils.h"
-#include "include/atomic.h"
#include "include/Context.h"
#include "include/stringify.h"
#include "include/rbd/librbd.hpp"
@@ -21,6 +20,8 @@
#include <boost/regex.hpp>
#include "include/assert.h"
+#include <atomic>
+
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_rbd
#undef dout_prefix
@@ -319,7 +320,7 @@ private:
class PromoteImageRequest : public ImageRequestBase {
public:
PromoteImageRequest(librados::IoCtx &io_ctx, OrderedThrottle &throttle,
- const std::string &image_name, atomic_t *counter,
+ const std::string &image_name, std::atomic<unsigned> *counter,
bool force)
: ImageRequestBase(io_ctx, throttle, image_name), m_counter(counter),
m_force(force) {
@@ -334,11 +335,11 @@ protected:
librbd::RBD::AioCompletion *aio_comp) override {
image.aio_mirror_image_promote(m_force, aio_comp);
}
+
void handle_execute_action(int r) override {
if (r >= 0) {
- m_counter->inc();
+ (*m_counter)++;
}
- ImageRequestBase::handle_execute_action(r);
}
std::string get_action_type() const override {
@@ -346,14 +347,14 @@ protected:
}
private:
- atomic_t *m_counter;
+ std::atomic<unsigned> *m_counter = nullptr;
bool m_force;
};
class DemoteImageRequest : public ImageRequestBase {
public:
DemoteImageRequest(librados::IoCtx &io_ctx, OrderedThrottle &throttle,
- const std::string &image_name, atomic_t *counter)
+ const std::string &image_name, std::atomic<unsigned> *counter)
: ImageRequestBase(io_ctx, throttle, image_name), m_counter(counter) {
}
@@ -368,7 +369,7 @@ protected:
}
void handle_execute_action(int r) override {
if (r >= 0) {
- m_counter->inc();
+ (*m_counter)++;
}
ImageRequestBase::handle_execute_action(r);
}
@@ -378,7 +379,7 @@ protected:
}
private:
- atomic_t *m_counter;
+ std::atomic<unsigned> *m_counter = nullptr;
};
class StatusImageRequest : public ImageRequestBase {
@@ -466,7 +467,7 @@ public:
// mirror image operations
librbd::RBD rbd;
int r = rbd.list(m_io_ctx, m_image_names);
- if (r < 0) {
+ if (r < 0 && r != -ENOENT) {
std::cerr << "rbd: failed to list images within pool" << std::endl;
return r;
}
@@ -915,12 +916,12 @@ int execute_promote(const po::variables_map &vm) {
return r;
}
- atomic_t counter;
+ std::atomic<unsigned> counter = { 0 };
ImageRequestGenerator<PromoteImageRequest> generator(io_ctx, &counter,
vm["force"].as<bool>());
r = generator.execute();
- std::cout << "Promoted " << counter.read() << " mirrored images" << std::endl;
+ std::cout << "Promoted " << counter.load() << " mirrored images" << std::endl;
return r;
}
@@ -940,11 +941,11 @@ int execute_demote(const po::variables_map &vm) {
return r;
}
- atomic_t counter;
+ std::atomic<unsigned> counter { 0 };
ImageRequestGenerator<DemoteImageRequest> generator(io_ctx, &counter);
r = generator.execute();
- std::cout << "Demoted " << counter.read() << " mirrored images" << std::endl;
+ std::cout << "Demoted " << counter.load() << " mirrored images" << std::endl;
return r;
}
diff --git a/src/tools/rbd/action/Trash.cc b/src/tools/rbd/action/Trash.cc
index 2c381dccc2a..47d4808d0db 100644
--- a/src/tools/rbd/action/Trash.cc
+++ b/src/tools/rbd/action/Trash.cc
@@ -160,10 +160,9 @@ int do_list(librbd::RBD &rbd, librados::IoCtx& io_ctx, bool long_flag,
bool all_flag, Formatter *f) {
std::vector<librbd::trash_image_info_t> trash_entries;
int r = rbd.trash_list(io_ctx, trash_entries);
- if (r < 0 && r != -ENOENT) {
+ if (r < 0) {
return r;
}
- r = 0;
if (!long_flag) {
if (f) {
diff --git a/src/tools/rbd_mirror/CMakeLists.txt b/src/tools/rbd_mirror/CMakeLists.txt
index 3fb0536c762..1a9d0fb1aab 100644
--- a/src/tools/rbd_mirror/CMakeLists.txt
+++ b/src/tools/rbd_mirror/CMakeLists.txt
@@ -25,6 +25,7 @@ set(rbd_mirror_internal
image_replayer/IsPrimaryRequest.cc
image_replayer/OpenImageRequest.cc
image_replayer/OpenLocalImageRequest.cc
+ image_replayer/PrepareLocalImageRequest.cc
image_replayer/ReplayStatusFormatter.cc
image_sync/ImageCopyRequest.cc
image_sync/ObjectCopyRequest.cc
diff --git a/src/tools/rbd_mirror/ImageDeleter.cc b/src/tools/rbd_mirror/ImageDeleter.cc
index 00c4c3952f2..b2eb615a7e4 100644
--- a/src/tools/rbd_mirror/ImageDeleter.cc
+++ b/src/tools/rbd_mirror/ImageDeleter.cc
@@ -134,7 +134,7 @@ private:
ImageDeleter::ImageDeleter(ContextWQ *work_queue, SafeTimer *timer,
Mutex *timer_lock)
- : m_running(1),
+ : m_running(true),
m_work_queue(work_queue),
m_delete_lock("rbd::mirror::ImageDeleter::Delete"),
m_image_deleter_thread(this),
@@ -149,7 +149,7 @@ ImageDeleter::ImageDeleter(ContextWQ *work_queue, SafeTimer *timer,
ImageDeleter::~ImageDeleter() {
dout(20) << "enter" << dendl;
- m_running.set(0);
+ m_running = false;
{
Mutex::Locker l (m_delete_lock);
m_delete_queue_cond.Signal();
@@ -164,13 +164,13 @@ ImageDeleter::~ImageDeleter() {
void ImageDeleter::run() {
dout(20) << "enter" << dendl;
- while(m_running.read()) {
+ while(m_running) {
m_delete_lock.Lock();
while (m_delete_queue.empty()) {
dout(20) << "waiting for delete requests" << dendl;
m_delete_queue_cond.Wait(m_delete_lock);
- if (!m_running.read()) {
+ if (!m_running) {
m_delete_lock.Unlock();
dout(20) << "return" << dendl;
return;
@@ -183,7 +183,7 @@ void ImageDeleter::run() {
bool move_to_next = process_image_delete();
if (!move_to_next) {
- if (!m_running.read()) {
+ if (!m_running) {
dout(20) << "return" << dendl;
return;
}
@@ -198,7 +198,6 @@ void ImageDeleter::run() {
void ImageDeleter::schedule_image_delete(RadosRef local_rados,
int64_t local_pool_id,
- const std::string& local_image_id,
const std::string& global_image_id) {
dout(20) << "enter" << dendl;
@@ -206,14 +205,14 @@ void ImageDeleter::schedule_image_delete(RadosRef local_rados,
auto del_info = find_delete_info(local_pool_id, global_image_id);
if (del_info != nullptr) {
- dout(20) << "image " << local_image_id << " (" << global_image_id << ") "
+ dout(20) << "image " << global_image_id << " "
<< "was already scheduled for deletion" << dendl;
return;
}
- m_delete_queue.push_front(unique_ptr<DeleteInfo>(
- new DeleteInfo(local_rados, local_pool_id, local_image_id,
- global_image_id)));
+ m_delete_queue.push_front(
+ unique_ptr<DeleteInfo>(new DeleteInfo(local_rados, local_pool_id,
+ global_image_id)));
m_delete_queue_cond.Signal();
}
@@ -272,25 +271,42 @@ bool ImageDeleter::process_image_delete() {
r = m_active_delete->local_rados->ioctx_create2(
m_active_delete->local_pool_id, ioctx);
if (r < 0) {
- derr << "error accessing local pool: " << cpp_strerror(r) << dendl;
+ derr << "error accessing local pool " << m_active_delete->local_pool_id
+ << ": " << cpp_strerror(r) << dendl;
enqueue_failed_delete(r);
return true;
}
dout(20) << "connected to local pool: " << ioctx.get_pool_name() << dendl;
+ auto &global_image_id = m_active_delete->global_image_id;
+ std::string local_image_id;
+ r = librbd::cls_client::mirror_image_get_image_id(
+ &ioctx, global_image_id, &local_image_id);
+ if (r == -ENOENT) {
+ dout(10) << "image " << global_image_id << " is not mirrored" << dendl;
+ complete_active_delete(r);
+ return true;
+ } else if (r < 0) {
+ derr << "error retrieving local id for image " << global_image_id
+ << ": " << cpp_strerror(r) << dendl;
+ enqueue_failed_delete(r);
+ return true;
+ }
+
bool is_primary = false;
- r = Journal<>::is_tag_owner(ioctx, m_active_delete->local_image_id,
- &is_primary, m_work_queue);
+ C_SaferCond tag_owner_ctx;
+ Journal<>::is_tag_owner(ioctx, local_image_id, &is_primary,
+ m_work_queue, &tag_owner_ctx);
+ r = tag_owner_ctx.wait();
if (r < 0 && r != -ENOENT) {
- derr << "error retrieving image primary info: " << cpp_strerror(r)
- << dendl;
+ derr << "error retrieving image primary info for image " << global_image_id
+ << ": " << cpp_strerror(r) << dendl;
enqueue_failed_delete(r);
return true;
}
if (is_primary) {
- dout(10) << "local image is the primary image, aborting deletion..."
- << dendl;
+ dout(10) << "image " << global_image_id << " is local primary" << dendl;
complete_active_delete(-EISPRM);
return true;
}
@@ -298,31 +314,28 @@ bool ImageDeleter::process_image_delete() {
dout(20) << "local image is not the primary" << dendl;
bool has_snapshots;
- r = image_has_snapshots_and_children(&ioctx, m_active_delete->local_image_id,
- &has_snapshots);
+ r = image_has_snapshots_and_children(&ioctx, local_image_id, &has_snapshots);
if (r < 0) {
enqueue_failed_delete(r);
return true;
}
- mirror_image.global_image_id = m_active_delete->global_image_id;
+ mirror_image.global_image_id = global_image_id;
mirror_image.state = cls::rbd::MIRROR_IMAGE_STATE_DISABLING;
- r = cls_client::mirror_image_set(&ioctx, m_active_delete->local_image_id,
- mirror_image);
+ r = cls_client::mirror_image_set(&ioctx, local_image_id, mirror_image);
if (r == -ENOENT) {
dout(10) << "local image is not mirrored, aborting deletion..." << dendl;
complete_active_delete(r);
return true;
} else if (r == -EEXIST || r == -EINVAL) {
- derr << "cannot disable mirroring for image id "
- << m_active_delete->local_image_id
- << ": global_image_id has changed/reused, aborting deletion: "
+ derr << "cannot disable mirroring for image " << global_image_id
+ << ": global_image_id has changed/reused: "
<< cpp_strerror(r) << dendl;
complete_active_delete(r);
return true;
} else if (r < 0) {
- derr << "cannot disable mirroring for image id "
- << m_active_delete->local_image_id << ": " << cpp_strerror(r) << dendl;
+ derr << "cannot disable mirroring for image " << global_image_id
+ << ": " << cpp_strerror(r) << dendl;
enqueue_failed_delete(r);
return true;
}
@@ -332,12 +345,11 @@ bool ImageDeleter::process_image_delete() {
if (has_snapshots) {
dout(20) << "local image has snapshots" << dendl;
- ImageCtx *imgctx = new ImageCtx("", m_active_delete->local_image_id,
- nullptr, ioctx, false);
+ ImageCtx *imgctx = new ImageCtx("", local_image_id, nullptr, ioctx, false);
r = imgctx->state->open(false);
if (r < 0) {
- derr << "error opening image id " << m_active_delete->local_image_id
- << ": " << cpp_strerror(r) << dendl;
+ derr << "error opening image " << global_image_id << " ("
+ << local_image_id << "): " << cpp_strerror(r) << dendl;
enqueue_failed_delete(r);
return true;
}
@@ -375,12 +387,12 @@ bool ImageDeleter::process_image_delete() {
dout(20) << "snapshot " << imgctx->name << "@" << snap.name
<< " is protected, issuing unprotect command" << dendl;
- r = imgctx->operations->snap_unprotect(cls::rbd::UserSnapshotNamespace(),
- snap.name.c_str());
+ r = imgctx->operations->snap_unprotect(
+ cls::rbd::UserSnapshotNamespace(), snap.name.c_str());
if (r == -EBUSY) {
// there are still clones of snapshots of this image, therefore send
// the delete request to the end of the queue
- dout(10) << "local image id " << m_active_delete->local_image_id << " has "
+ dout(10) << "local image id " << local_image_id << " has "
<< "snapshots with cloned children, postponing deletion..."
<< dendl;
imgctx->state->close();
@@ -415,10 +427,10 @@ bool ImageDeleter::process_image_delete() {
}
librbd::NoOpProgressContext ctx;
- r = librbd::remove(ioctx, "", m_active_delete->local_image_id, ctx, true);
+ r = librbd::remove(ioctx, "", local_image_id, ctx, true);
if (r < 0 && r != -ENOENT) {
- derr << "error removing image " << m_active_delete->local_image_id << " "
- << "(" << m_active_delete->global_image_id << ") from local pool: "
+ derr << "error removing image " << global_image_id << " "
+ << "(" << local_image_id << ") from local pool: "
<< cpp_strerror(r) << dendl;
enqueue_failed_delete(r);
return true;
@@ -431,7 +443,7 @@ bool ImageDeleter::process_image_delete() {
<< dendl;
}
- r = cls_client::mirror_image_remove(&ioctx, m_active_delete->local_image_id);
+ r = cls_client::mirror_image_remove(&ioctx, local_image_id);
if (r < 0 && r != -ENOENT) {
derr << "error removing image from mirroring directory: "
<< cpp_strerror(r) << dendl;
@@ -440,8 +452,7 @@ bool ImageDeleter::process_image_delete() {
}
dout(10) << "Successfully deleted image "
- << m_active_delete->local_image_id << " "
- << "(" << m_active_delete->global_image_id << ")" << dendl;
+ << global_image_id << " " << "(" << local_image_id << ")" << dendl;
complete_active_delete(0);
return true;
@@ -581,7 +592,6 @@ void ImageDeleter::DeleteInfo::notify(int r) {
void ImageDeleter::DeleteInfo::to_string(stringstream& ss) {
ss << "[" << "local_pool_id=" << local_pool_id << ", ";
- ss << "local_image_id=" << local_image_id << ", ";
ss << "global_image_id=" << global_image_id << "]";
}
@@ -590,7 +600,6 @@ void ImageDeleter::DeleteInfo::print_status(Formatter *f, stringstream *ss,
if (f) {
f->open_object_section("delete_info");
f->dump_int("local_pool_id", local_pool_id);
- f->dump_string("local_image_id", local_image_id);
f->dump_string("global_image_id", global_image_id);
if (print_failure_info) {
f->dump_string("error_code", cpp_strerror(error_code));
@@ -608,7 +617,7 @@ vector<string> ImageDeleter::get_delete_queue_items() {
Mutex::Locker l(m_delete_lock);
for (const auto& del_info : m_delete_queue) {
- items.push_back(del_info->local_image_id);
+ items.push_back(del_info->global_image_id);
}
return items;
@@ -619,7 +628,7 @@ vector<pair<string, int> > ImageDeleter::get_failed_queue_items() {
Mutex::Locker l(m_delete_lock);
for (const auto& del_info : m_failed_queue) {
- items.push_back(make_pair(del_info->local_image_id,
+ items.push_back(make_pair(del_info->global_image_id,
del_info->error_code));
}
diff --git a/src/tools/rbd_mirror/ImageDeleter.h b/src/tools/rbd_mirror/ImageDeleter.h
index ff5f4e535f4..c91f2fbc6ff 100644
--- a/src/tools/rbd_mirror/ImageDeleter.h
+++ b/src/tools/rbd_mirror/ImageDeleter.h
@@ -15,15 +15,16 @@
#ifndef CEPH_RBD_MIRROR_IMAGEDELETER_H
#define CEPH_RBD_MIRROR_IMAGEDELETER_H
-#include <deque>
-#include <vector>
-#include "include/atomic.h"
#include "common/Mutex.h"
#include "common/Cond.h"
#include "common/Thread.h"
#include "common/Timer.h"
#include "types.h"
+#include <deque>
+#include <vector>
+#include <atomic>
+
class ContextWQ;
namespace rbd {
@@ -45,7 +46,6 @@ public:
void schedule_image_delete(RadosRef local_rados,
int64_t local_pool_id,
- const std::string& local_image_id,
const std::string& global_image_id);
void wait_for_scheduled_deletion(int64_t local_pool_id,
const std::string &global_image_id,
@@ -77,20 +77,16 @@ private:
struct DeleteInfo {
RadosRef local_rados;
int64_t local_pool_id;
- std::string local_image_id;
std::string global_image_id;
- int error_code;
- int retries;
- bool notify_on_failed_retry;
- Context *on_delete;
+ int error_code = 0;
+ int retries = 0;
+ bool notify_on_failed_retry = true;
+ Context *on_delete = nullptr;
DeleteInfo(RadosRef local_rados, int64_t local_pool_id,
- const std::string& local_image_id,
const std::string& global_image_id) :
local_rados(local_rados), local_pool_id(local_pool_id),
- local_image_id(local_image_id), global_image_id(global_image_id),
- error_code(0), retries(0), notify_on_failed_retry(true),
- on_delete(nullptr) {
+ global_image_id(global_image_id) {
}
bool match(int64_t local_pool_id, const std::string &global_image_id) {
@@ -103,7 +99,7 @@ private:
bool print_failure_info=false);
};
- atomic_t m_running;
+ std::atomic<unsigned> m_running { 0 };
ContextWQ *m_work_queue;
diff --git a/src/tools/rbd_mirror/ImageReplayer.cc b/src/tools/rbd_mirror/ImageReplayer.cc
index fa4cb470b1f..3276dba3b8f 100644
--- a/src/tools/rbd_mirror/ImageReplayer.cc
+++ b/src/tools/rbd_mirror/ImageReplayer.cc
@@ -26,6 +26,7 @@
#include "tools/rbd_mirror/image_replayer/BootstrapRequest.h"
#include "tools/rbd_mirror/image_replayer/CloseImageRequest.h"
#include "tools/rbd_mirror/image_replayer/EventPreprocessor.h"
+#include "tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h"
#include "tools/rbd_mirror/image_replayer/ReplayStatusFormatter.h"
#define dout_context g_ceph_context
@@ -335,7 +336,8 @@ void ImageReplayer<I>::add_remote_image(const std::string &mirror_uuid,
template <typename I>
void ImageReplayer<I>::remove_remote_image(const std::string &mirror_uuid,
- const std::string &image_id) {
+ const std::string &image_id,
+ bool schedule_delete) {
Mutex::Locker locker(m_lock);
m_remote_images.erase({mirror_uuid, image_id});
}
@@ -370,18 +372,12 @@ void ImageReplayer<I>::start(Context *on_finish, bool manual)
dout(5) << "stopped manually, ignoring start without manual flag"
<< dendl;
r = -EPERM;
- } else if (m_remote_images.empty()) {
- derr << "no remote images associated with replayer" << dendl;
- r = -EINVAL;
} else {
m_state = STATE_STARTING;
m_last_r = 0;
m_state_desc.clear();
m_manual_stop = false;
- // TODO bootstrap will need to support multiple remote images
- m_remote_image = *m_remote_images.begin();
-
if (on_finish != nullptr) {
assert(m_on_start_finish == nullptr);
m_on_start_finish = on_finish;
@@ -405,6 +401,52 @@ void ImageReplayer<I>::start(Context *on_finish, bool manual)
return;
}
+ prepare_local_image();
+}
+
+template <typename I>
+void ImageReplayer<I>::prepare_local_image() {
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ ImageReplayer, &ImageReplayer<I>::handle_prepare_local_image>(this);
+ auto req = PrepareLocalImageRequest<I>::create(
+ m_local_ioctx, m_global_image_id, &m_local_image_id,
+ &m_local_image_tag_owner, m_threads->work_queue, ctx);
+ req->send();
+}
+
+template <typename I>
+void ImageReplayer<I>::handle_prepare_local_image(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ if (r == -ENOENT) {
+ dout(20) << "local image does not exist" << dendl;
+ } else if (r < 0) {
+ on_start_fail(r, "error preparing local image for replay");
+ return;
+ } else if (m_local_image_tag_owner == librbd::Journal<>::LOCAL_MIRROR_UUID) {
+ dout(5) << "local image is primary" << dendl;
+ on_start_fail(0, "local image is primary");
+ return;
+ }
+
+ // local image doesn't exist or is non-primary
+ bootstrap();
+}
+
+template <typename I>
+void ImageReplayer<I>::bootstrap() {
+ dout(20) << dendl;
+
+ if (m_remote_images.empty()) {
+ on_start_fail(0, "waiting for primary remote image");
+ return;
+ }
+
+ // TODO bootstrap will need to support multiple remote images
+ m_remote_image = *m_remote_images.begin();
+
CephContext *cct = static_cast<CephContext *>(m_local->cct());
journal::Settings settings;
settings.commit_interval = cct->_conf->rbd_mirror_journal_commit_age;
@@ -412,16 +454,10 @@ void ImageReplayer<I>::start(Context *on_finish, bool manual)
m_remote_journaler = new Journaler(m_threads->work_queue,
m_threads->timer,
- &m_threads->timer_lock,
+ &m_threads->timer_lock,
m_remote_image.io_ctx,
m_remote_image.image_id,
m_local_mirror_uuid, settings);
- bootstrap();
-}
-
-template <typename I>
-void ImageReplayer<I>::bootstrap() {
- dout(20) << dendl;
Context *ctx = create_context_callback<
ImageReplayer, &ImageReplayer<I>::handle_bootstrap>(this);
@@ -458,10 +494,12 @@ void ImageReplayer<I>::handle_bootstrap(int r) {
}
if (r == -EREMOTEIO) {
+ m_local_image_tag_owner = "";
dout(5) << "remote image is non-primary or local image is primary" << dendl;
on_start_fail(0, "remote image is non-primary or local image is primary");
return;
} else if (r == -EEXIST) {
+ m_local_image_tag_owner = "";
on_start_fail(r, "split-brain detected");
return;
} else if (r < 0) {
@@ -1511,7 +1549,6 @@ void ImageReplayer<I>::handle_shut_down(int r) {
if (m_stopping_for_resync) {
m_image_deleter->schedule_image_delete(m_local,
m_local_pool_id,
- m_local_image_id,
m_global_image_id);
m_stopping_for_resync = false;
}
diff --git a/src/tools/rbd_mirror/ImageReplayer.h b/src/tools/rbd_mirror/ImageReplayer.h
index 79a744d9842..72f03f77b6c 100644
--- a/src/tools/rbd_mirror/ImageReplayer.h
+++ b/src/tools/rbd_mirror/ImageReplayer.h
@@ -4,11 +4,6 @@
#ifndef CEPH_RBD_MIRROR_IMAGE_REPLAYER_H
#define CEPH_RBD_MIRROR_IMAGE_REPLAYER_H
-#include <map>
-#include <string>
-#include <vector>
-
-#include "include/atomic.h"
#include "common/AsyncOpTracker.h"
#include "common/Mutex.h"
#include "common/WorkQueue.h"
@@ -23,10 +18,16 @@
#include "ImageDeleter.h"
#include "ProgressContext.h"
#include "types.h"
-#include <set>
+
#include <boost/noncopyable.hpp>
#include <boost/optional.hpp>
+#include <set>
+#include <map>
+#include <atomic>
+#include <string>
+#include <vector>
+
class AdminSocketHook;
namespace journal {
@@ -109,7 +110,8 @@ public:
const std::string &remote_image_id,
librados::IoCtx &remote_io_ctx);
void remove_remote_image(const std::string &remote_mirror_uuid,
- const std::string &remote_image_id);
+ const std::string &remote_image_id,
+ bool schedule_delete);
bool remote_images_empty() const;
inline int64_t get_local_pool_id() const {
@@ -118,10 +120,6 @@ public:
inline const std::string& get_global_image_id() const {
return m_global_image_id;
}
- inline std::string get_local_image_id() {
- Mutex::Locker locker(m_lock);
- return m_local_image_id;
- }
void start(Context *on_finish = nullptr, bool manual = false);
void stop(Context *on_finish = nullptr, bool manual = false,
@@ -146,6 +144,9 @@ protected:
* <starting> *
* | *
* v (error) *
+ * PREPARE_LOCAL_IMAGE * * * * * * * * * * * * * * * * * *
+ * | *
+ * v (error) *
* BOOTSTRAP_IMAGE * * * * * * * * * * * * * * * * * * * *
* | *
* v (error) *
@@ -304,6 +305,7 @@ private:
nullptr;
librados::IoCtx m_local_ioctx;
ImageCtxT *m_local_image_ctx = nullptr;
+ std::string m_local_image_tag_owner;
decltype(ImageCtxT::journal) m_local_journal = nullptr;
librbd::journal::Replay<ImageCtxT> *m_local_replay = nullptr;
@@ -388,6 +390,9 @@ private:
void handle_shut_down(int r);
void handle_remote_journal_metadata_updated();
+ void prepare_local_image();
+ void handle_prepare_local_image(int r);
+
void bootstrap();
void handle_bootstrap(int r);
diff --git a/src/tools/rbd_mirror/InstanceReplayer.cc b/src/tools/rbd_mirror/InstanceReplayer.cc
index 359e2c66bbb..d2426d0d0e4 100644
--- a/src/tools/rbd_mirror/InstanceReplayer.cc
+++ b/src/tools/rbd_mirror/InstanceReplayer.cc
@@ -156,13 +156,13 @@ void InstanceReplayer<I>::acquire_image(const std::string &global_image_id,
}
auto image_replayer = it->second;
+ if (!peer_mirror_uuid.empty()) {
+ auto iter = m_peers.find(Peer(peer_mirror_uuid));
+ assert(iter != m_peers.end());
+ auto io_ctx = iter->io_ctx;
- auto iter = m_peers.find(Peer(peer_mirror_uuid));
- assert(iter != m_peers.end());
- auto io_ctx = iter->io_ctx;
-
- image_replayer->add_remote_image(peer_mirror_uuid, peer_image_id, io_ctx);
-
+ image_replayer->add_remote_image(peer_mirror_uuid, peer_image_id, io_ctx);
+ }
start_image_replayer(image_replayer);
m_threads->work_queue->queue(on_finish, 0);
@@ -190,11 +190,13 @@ void InstanceReplayer<I>::release_image(const std::string &global_image_id,
}
auto image_replayer = it->second;
-
- image_replayer->remove_remote_image(peer_mirror_uuid, peer_image_id);
+ if (!peer_mirror_uuid.empty()) {
+ image_replayer->remove_remote_image(peer_mirror_uuid, peer_image_id,
+ schedule_delete);
+ }
if (!image_replayer->remote_images_empty()) {
- dout(20) << global_image_id << ": still has remote images" << dendl;
+ dout(20) << global_image_id << ": still has peer images" << dendl;
m_threads->work_queue->queue(on_finish, 0);
return;
}
@@ -211,15 +213,8 @@ void InstanceReplayer<I>::release_image(const std::string &global_image_id,
on_finish = new FunctionContext(
[this, image_replayer, on_finish] (int r) {
auto global_image_id = image_replayer->get_global_image_id();
- auto local_image_id = image_replayer->get_local_image_id();
- if (local_image_id.empty()) {
- dout(20) << global_image_id << ": unknown local_image_id"
- << " (image does not exist or primary), skipping delete"
- << dendl;
- } else {
- m_image_deleter->schedule_image_delete(
- m_local_rados, m_local_pool_id, local_image_id, global_image_id);
- }
+ m_image_deleter->schedule_image_delete(
+ m_local_rados, m_local_pool_id, global_image_id);
on_finish->complete(0);
});
}
diff --git a/src/tools/rbd_mirror/Mirror.cc b/src/tools/rbd_mirror/Mirror.cc
index 86c6939182f..f37d2559922 100644
--- a/src/tools/rbd_mirror/Mirror.cc
+++ b/src/tools/rbd_mirror/Mirror.cc
@@ -215,7 +215,7 @@ Mirror::~Mirror()
void Mirror::handle_signal(int signum)
{
- m_stopping.set(1);
+ m_stopping = true;
{
Mutex::Locker l(m_lock);
m_cond.Signal();
@@ -250,7 +250,7 @@ int Mirror::init()
void Mirror::run()
{
dout(20) << "enter" << dendl;
- while (!m_stopping.read()) {
+ while (!m_stopping) {
m_local_cluster_watcher->refresh_pools();
Mutex::Locker l(m_lock);
if (!m_manual_stop) {
@@ -275,7 +275,7 @@ void Mirror::print_status(Formatter *f, stringstream *ss)
Mutex::Locker l(m_lock);
- if (m_stopping.read()) {
+ if (m_stopping) {
return;
}
@@ -314,7 +314,7 @@ void Mirror::start()
dout(20) << "enter" << dendl;
Mutex::Locker l(m_lock);
- if (m_stopping.read()) {
+ if (m_stopping) {
return;
}
@@ -330,7 +330,7 @@ void Mirror::stop()
dout(20) << "enter" << dendl;
Mutex::Locker l(m_lock);
- if (m_stopping.read()) {
+ if (m_stopping) {
return;
}
@@ -346,7 +346,7 @@ void Mirror::restart()
dout(20) << "enter" << dendl;
Mutex::Locker l(m_lock);
- if (m_stopping.read()) {
+ if (m_stopping) {
return;
}
@@ -362,7 +362,7 @@ void Mirror::flush()
dout(20) << "enter" << dendl;
Mutex::Locker l(m_lock);
- if (m_stopping.read() || m_manual_stop) {
+ if (m_stopping || m_manual_stop) {
return;
}
@@ -376,7 +376,7 @@ void Mirror::release_leader()
dout(20) << "enter" << dendl;
Mutex::Locker l(m_lock);
- if (m_stopping.read()) {
+ if (m_stopping) {
return;
}
diff --git a/src/tools/rbd_mirror/Mirror.h b/src/tools/rbd_mirror/Mirror.h
index 4ff9d512399..2253156d660 100644
--- a/src/tools/rbd_mirror/Mirror.h
+++ b/src/tools/rbd_mirror/Mirror.h
@@ -4,19 +4,19 @@
#ifndef CEPH_RBD_MIRROR_H
#define CEPH_RBD_MIRROR_H
-#include <map>
-#include <memory>
-#include <set>
-
#include "common/ceph_context.h"
#include "common/Mutex.h"
-#include "include/atomic.h"
#include "include/rados/librados.hpp"
#include "ClusterWatcher.h"
#include "PoolReplayer.h"
#include "ImageDeleter.h"
#include "types.h"
+#include <set>
+#include <map>
+#include <memory>
+#include <atomic>
+
namespace librbd { struct ImageCtx; }
namespace rbd {
@@ -67,7 +67,7 @@ private:
std::shared_ptr<ImageDeleter> m_image_deleter;
ImageSyncThrottlerRef<> m_image_sync_throttler;
std::map<PoolPeer, std::unique_ptr<PoolReplayer> > m_pool_replayers;
- atomic_t m_stopping;
+ std::atomic<bool> m_stopping = { false };
bool m_manual_stop = false;
MirrorAdminSocketHook *m_asok_hook;
};
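
The atomic_t conversions in Mirror (and below in PoolReplayer and rbd-nbd) all follow one pattern: the legacy wrapper's set()/read()/compare_and_swap() calls become std::atomic's assignment, implicit load, and compare_exchange_strong(). A compilable sketch of the three translations, assuming the semantics shown in the hunks above:

    #include <atomic>

    std::atomic<bool> m_stopping = { false };

    void request_stop() {
      m_stopping = true;                       // was: m_stopping.set(1)
    }

    bool is_stopping() {
      return m_stopping;                       // was: m_stopping.read() != 0
    }

    bool shutdown_once() {
      bool expected = false;                   // was: compare_and_swap(false, true)
      return m_stopping.compare_exchange_strong(expected, true);
    }
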
diff --git a/src/tools/rbd_mirror/PoolReplayer.cc b/src/tools/rbd_mirror/PoolReplayer.cc
index 6747ddc15aa..0bd06f624ae 100644
--- a/src/tools/rbd_mirror/PoolReplayer.cc
+++ b/src/tools/rbd_mirror/PoolReplayer.cc
@@ -21,7 +21,6 @@
#include "InstanceWatcher.h"
#include "LeaderWatcher.h"
#include "Threads.h"
-#include "pool_watcher/RefreshImagesRequest.h"
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_rbd_mirror
@@ -206,20 +205,6 @@ private:
} // anonymous namespace
-struct PoolReplayer::C_RefreshLocalImages : public Context {
- PoolReplayer *pool_replayer;
- Context *on_finish;
- ImageIds image_ids;
-
- C_RefreshLocalImages(PoolReplayer *pool_replayer, Context *on_finish)
- : pool_replayer(pool_replayer), on_finish(on_finish) {
- }
-
- void finish(int r) override {
- pool_replayer->handle_refresh_local_images(r, std::move(image_ids), on_finish);
- }
-};
-
PoolReplayer::PoolReplayer(Threads<librbd::ImageCtx> *threads,
std::shared_ptr<ImageDeleter> image_deleter,
ImageSyncThrottlerRef<> image_sync_throttler,
@@ -232,7 +217,8 @@ PoolReplayer::PoolReplayer(Threads<librbd::ImageCtx> *threads,
m_peer(peer),
m_args(args),
m_local_pool_id(local_pool_id),
- m_pool_watcher_listener(this),
+ m_local_pool_watcher_listener(this, true),
+ m_remote_pool_watcher_listener(this, false),
m_asok_hook(nullptr),
m_pool_replayer_thread(this),
m_leader_listener(this)
@@ -243,7 +229,7 @@ PoolReplayer::~PoolReplayer()
{
delete m_asok_hook;
- m_stopping.set(1);
+ m_stopping = true;
{
Mutex::Locker l(m_lock);
m_cond.Signal();
@@ -261,7 +247,8 @@ PoolReplayer::~PoolReplayer()
m_instance_replayer->shut_down();
}
- assert(!m_pool_watcher);
+ assert(!m_local_pool_watcher);
+ assert(!m_remote_pool_watcher);
}
bool PoolReplayer::is_blacklisted() const {
@@ -400,6 +387,11 @@ int PoolReplayer::init_rados(const std::string &cluster_name,
}
}
+ if (!g_ceph_context->_conf->admin_socket.empty()) {
+ cct->_conf->set_val_or_die("admin_socket",
+ "$run_dir/$name.$pid.$cluster.$cctid.asok");
+ }
+
// disable unnecessary librbd cache
cct->_conf->set_val_or_die("rbd_cache", "false");
cct->_conf->apply_changes(nullptr);
@@ -423,7 +415,7 @@ void PoolReplayer::run()
{
dout(20) << "enter" << dendl;
- while (!m_stopping.read()) {
+ while (!m_stopping) {
std::string asok_hook_name = m_local_io_ctx.get_pool_name() + " " +
m_peer.cluster_name;
if (m_asok_hook_name != asok_hook_name || m_asok_hook == nullptr) {
@@ -435,9 +427,10 @@ void PoolReplayer::run()
}
Mutex::Locker locker(m_lock);
- if (m_pool_watcher && m_pool_watcher->is_blacklisted()) {
+ if ((m_local_pool_watcher && m_local_pool_watcher->is_blacklisted()) ||
+ (m_remote_pool_watcher && m_remote_pool_watcher->is_blacklisted())) {
m_blacklisted = true;
- m_stopping.set(1);
+ m_stopping = true;
break;
}
@@ -476,6 +469,13 @@ void PoolReplayer::print_status(Formatter *f, stringstream *ss)
f->close_section();
}
+ f->dump_string("local_cluster_admin_socket",
+ reinterpret_cast<CephContext *>(m_local_io_ctx.cct())->_conf->
+ admin_socket);
+ f->dump_string("remote_cluster_admin_socket",
+ reinterpret_cast<CephContext *>(m_remote_io_ctx.cct())->_conf->
+ admin_socket);
+
m_instance_replayer->print_status(f, ss);
f->close_section();
@@ -488,7 +488,7 @@ void PoolReplayer::start()
Mutex::Locker l(m_lock);
- if (m_stopping.read()) {
+ if (m_stopping) {
return;
}
@@ -501,10 +501,10 @@ void PoolReplayer::stop(bool manual)
Mutex::Locker l(m_lock);
if (!manual) {
- m_stopping.set(1);
+ m_stopping = true;
m_cond.Signal();
return;
- } else if (m_stopping.read()) {
+ } else if (m_stopping) {
return;
}
@@ -517,7 +517,7 @@ void PoolReplayer::restart()
Mutex::Locker l(m_lock);
- if (m_stopping.read()) {
+ if (m_stopping) {
return;
}
@@ -530,7 +530,7 @@ void PoolReplayer::flush()
Mutex::Locker l(m_lock);
- if (m_stopping.read() || m_manual_stop) {
+ if (m_stopping || m_manual_stop) {
return;
}
@@ -543,7 +543,7 @@ void PoolReplayer::release_leader()
Mutex::Locker l(m_lock);
- if (m_stopping.read() || !m_leader_watcher) {
+ if (m_stopping || !m_leader_watcher) {
return;
}
@@ -551,44 +551,46 @@ void PoolReplayer::release_leader()
}
void PoolReplayer::handle_update(const std::string &mirror_uuid,
- const ImageIds &added_image_ids,
- const ImageIds &removed_image_ids) {
- assert(!mirror_uuid.empty());
- if (m_stopping.read()) {
+ ImageIds &&added_image_ids,
+ ImageIds &&removed_image_ids) {
+ if (m_stopping) {
return;
}
- dout(10) << dendl;
+ dout(10) << "mirror_uuid=" << mirror_uuid << ", "
+ << "added_count=" << added_image_ids.size() << ", "
+ << "removed_count=" << removed_image_ids.size() << dendl;
Mutex::Locker locker(m_lock);
if (!m_leader_watcher->is_leader()) {
return;
}
- if (m_peer.uuid != mirror_uuid) {
- m_instance_replayer->remove_peer(m_peer.uuid);
- m_instance_replayer->add_peer(mirror_uuid, m_remote_io_ctx);
- m_peer.uuid = mirror_uuid;
- }
+ if (m_initial_mirror_image_ids.find(mirror_uuid) ==
+ m_initial_mirror_image_ids.end() &&
+ m_initial_mirror_image_ids.size() < 2) {
+ m_initial_mirror_image_ids[mirror_uuid] = added_image_ids;
- // first callback will be a full directory -- so see if we need to remove
- // any local images that no longer exist on the remote side
- if (!m_init_image_ids.empty()) {
- dout(20) << "scanning initial local image set" << dendl;
- for (auto &image_id : added_image_ids) {
- auto it = m_init_image_ids.find(image_id);
- if (it != m_init_image_ids.end()) {
- m_init_image_ids.erase(it);
+ if (m_initial_mirror_image_ids.size() == 2) {
+ dout(10) << "local and remote pools refreshed" << dendl;
+
+ // both local and remote initial pool listing received. derive
+ // removal notifications for the remote pool
+ auto &local_image_ids = m_initial_mirror_image_ids.begin()->second;
+ auto &remote_image_ids = m_initial_mirror_image_ids.rbegin()->second;
+ for (auto &local_image_id : local_image_ids) {
+ if (remote_image_ids.find(local_image_id) == remote_image_ids.end()) {
+ removed_image_ids.emplace(local_image_id.global_id, "");
+ }
}
+ local_image_ids.clear();
+ remote_image_ids.clear();
}
+ }
- // the remaining images in m_init_image_ids must be deleted
- for (auto &image_id : m_init_image_ids) {
- dout(20) << "scheduling the deletion of init image: "
- << image_id.global_id << " (" << image_id.id << ")" << dendl;
- m_image_deleter->schedule_image_delete(m_local_rados, m_local_pool_id,
- image_id.id, image_id.global_id);
- }
- m_init_image_ids.clear();
+ if (!mirror_uuid.empty() && m_peer.uuid != mirror_uuid) {
+ m_instance_replayer->remove_peer(m_peer.uuid);
+ m_instance_replayer->add_peer(mirror_uuid, m_remote_io_ctx);
+ m_peer.uuid = mirror_uuid;
}
m_update_op_tracker.start_op();
@@ -599,19 +601,19 @@ void PoolReplayer::handle_update(const std::string &mirror_uuid,
C_Gather *gather_ctx = new C_Gather(g_ceph_context, ctx);
- for (auto &image_id : removed_image_ids) {
+ for (auto &image_id : added_image_ids) {
// for now always send to myself (the leader)
std::string &instance_id = m_instance_watcher->get_instance_id();
- m_instance_watcher->notify_image_release(instance_id, image_id.global_id,
- mirror_uuid, image_id.id, true,
+ m_instance_watcher->notify_image_acquire(instance_id, image_id.global_id,
+ mirror_uuid, image_id.id,
gather_ctx->new_sub());
}
- for (auto &image_id : added_image_ids) {
+ for (auto &image_id : removed_image_ids) {
// for now always send to myself (the leader)
std::string &instance_id = m_instance_watcher->get_instance_id();
- m_instance_watcher->notify_image_acquire(instance_id, image_id.global_id,
- mirror_uuid, image_id.id,
+ m_instance_watcher->notify_image_release(instance_id, image_id.global_id,
+ mirror_uuid, image_id.id, true,
gather_ctx->new_sub());
}
@@ -620,68 +622,73 @@ void PoolReplayer::handle_update(const std::string &mirror_uuid,
void PoolReplayer::handle_post_acquire_leader(Context *on_finish) {
dout(20) << dendl;
- refresh_local_images(on_finish);
+ init_local_pool_watcher(on_finish);
}
void PoolReplayer::handle_pre_release_leader(Context *on_finish) {
dout(20) << dendl;
- shut_down_pool_watcher(on_finish);
+ shut_down_pool_watchers(on_finish);
}
-void PoolReplayer::refresh_local_images(Context *on_finish) {
+void PoolReplayer::init_local_pool_watcher(Context *on_finish) {
dout(20) << dendl;
+ Mutex::Locker locker(m_lock);
+ assert(!m_local_pool_watcher);
+ m_local_pool_watcher.reset(new PoolWatcher<>(
+ m_threads, m_local_io_ctx, m_local_pool_watcher_listener));
+ m_initial_mirror_image_ids.clear();
+
// ensure the initial set of local images is up-to-date
// after acquiring the leader role
- auto ctx = new C_RefreshLocalImages(this, on_finish);
- auto req = pool_watcher::RefreshImagesRequest<>::create(
- m_local_io_ctx, &ctx->image_ids, ctx);
- req->send();
+ auto ctx = new FunctionContext([this, on_finish](int r) {
+ handle_init_local_pool_watcher(r, on_finish);
+ });
+ m_local_pool_watcher->init(create_async_context_callback(
+ m_threads->work_queue, ctx));
}
-void PoolReplayer::handle_refresh_local_images(int r, ImageIds &&image_ids,
- Context *on_finish) {
+void PoolReplayer::handle_init_local_pool_watcher(int r, Context *on_finish) {
dout(20) << "r=" << r << dendl;
-
- {
- Mutex::Locker locker(m_lock);
- m_init_image_ids = std::move(image_ids);
- }
-
if (r < 0) {
derr << "failed to retrieve local images: " << cpp_strerror(r) << dendl;
on_finish->complete(r);
return;
}
- init_pool_watcher(on_finish);
+ init_remote_pool_watcher(on_finish);
}
-void PoolReplayer::init_pool_watcher(Context *on_finish) {
+void PoolReplayer::init_remote_pool_watcher(Context *on_finish) {
dout(20) << dendl;
Mutex::Locker locker(m_lock);
- assert(!m_pool_watcher);
- m_pool_watcher.reset(new PoolWatcher<>(
- m_threads, m_remote_io_ctx, m_pool_watcher_listener));
- m_pool_watcher->init(create_async_context_callback(
+ assert(!m_remote_pool_watcher);
+ m_remote_pool_watcher.reset(new PoolWatcher<>(
+ m_threads, m_remote_io_ctx, m_remote_pool_watcher_listener));
+ m_remote_pool_watcher->init(create_async_context_callback(
m_threads->work_queue, on_finish));
m_cond.Signal();
}
-void PoolReplayer::shut_down_pool_watcher(Context *on_finish) {
+void PoolReplayer::shut_down_pool_watchers(Context *on_finish) {
dout(20) << dendl;
{
Mutex::Locker locker(m_lock);
- if (m_pool_watcher) {
+ if (m_local_pool_watcher) {
Context *ctx = new FunctionContext([this, on_finish](int r) {
- handle_shut_down_pool_watcher(r, on_finish);
- });
+ handle_shut_down_pool_watchers(r, on_finish);
+ });
ctx = create_async_context_callback(m_threads->work_queue, ctx);
- m_pool_watcher->shut_down(ctx);
+ auto gather_ctx = new C_Gather(g_ceph_context, ctx);
+ m_local_pool_watcher->shut_down(gather_ctx->new_sub());
+ if (m_remote_pool_watcher) {
+ m_remote_pool_watcher->shut_down(gather_ctx->new_sub());
+ }
+ gather_ctx->activate();
return;
}
}
@@ -689,13 +696,17 @@ void PoolReplayer::shut_down_pool_watcher(Context *on_finish) {
on_finish->complete(0);
}
-void PoolReplayer::handle_shut_down_pool_watcher(int r, Context *on_finish) {
+void PoolReplayer::handle_shut_down_pool_watchers(int r, Context *on_finish) {
dout(20) << "r=" << r << dendl;
{
Mutex::Locker locker(m_lock);
- assert(m_pool_watcher);
- m_pool_watcher.reset();
+ assert(m_local_pool_watcher);
+ m_local_pool_watcher.reset();
+
+ if (m_remote_pool_watcher) {
+ m_remote_pool_watcher.reset();
+ }
}
wait_for_update_ops(on_finish);
}
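
The handle_update() reconciliation above boils down to a set difference keyed on the global image id: once both initial pool listings have arrived, any image the local pool knows about that the remote pool does not is synthesized into a removal notification. A simplified sketch with ImageId reduced to its global id string:

    #include <set>
    #include <string>

    // local images with no remote counterpart become removal notifications
    std::set<std::string> derive_removals(const std::set<std::string> &local_ids,
                                          const std::set<std::string> &remote_ids) {
      std::set<std::string> removed;
      for (const auto &id : local_ids) {
        if (remote_ids.find(id) == remote_ids.end()) {
          removed.insert(id);
        }
      }
      return removed;
    }
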
diff --git a/src/tools/rbd_mirror/PoolReplayer.h b/src/tools/rbd_mirror/PoolReplayer.h
index 7de10a5c6a5..87a6589356e 100644
--- a/src/tools/rbd_mirror/PoolReplayer.h
+++ b/src/tools/rbd_mirror/PoolReplayer.h
@@ -4,16 +4,10 @@
#ifndef CEPH_RBD_MIRROR_POOL_REPLAYER_H
#define CEPH_RBD_MIRROR_POOL_REPLAYER_H
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-
#include "common/AsyncOpTracker.h"
#include "common/Cond.h"
#include "common/Mutex.h"
#include "common/WorkQueue.h"
-#include "include/atomic.h"
#include "include/rados/librados.hpp"
#include "ClusterWatcher.h"
@@ -22,6 +16,12 @@
#include "ImageDeleter.h"
#include "types.h"
+#include <set>
+#include <map>
+#include <memory>
+#include <atomic>
+#include <string>
+
class AdminSocketHook;
namespace librbd { class ImageCtx; }
@@ -63,24 +63,24 @@ public:
private:
struct PoolWatcherListener : public PoolWatcher<>::Listener {
PoolReplayer *pool_replayer;
+ bool local;
- PoolWatcherListener(PoolReplayer *pool_replayer)
- : pool_replayer(pool_replayer) {
+ PoolWatcherListener(PoolReplayer *pool_replayer, bool local)
+ : pool_replayer(pool_replayer), local(local) {
}
void handle_update(const std::string &mirror_uuid,
- const ImageIds &added_image_ids,
- const ImageIds &removed_image_ids) override {
- pool_replayer->handle_update(mirror_uuid, added_image_ids,
- removed_image_ids);
+ ImageIds &&added_image_ids,
+ ImageIds &&removed_image_ids) override {
+ pool_replayer->handle_update((local ? "" : mirror_uuid),
+ std::move(added_image_ids),
+ std::move(removed_image_ids));
}
};
- struct C_RefreshLocalImages;
-
void handle_update(const std::string &mirror_uuid,
- const ImageIds &added_image_ids,
- const ImageIds &removed_image_ids);
+ ImageIds &&added_image_ids,
+ ImageIds &&removed_image_ids);
int init_rados(const std::string &cluster_name,
const std::string &client_name,
@@ -89,13 +89,13 @@ private:
void handle_post_acquire_leader(Context *on_finish);
void handle_pre_release_leader(Context *on_finish);
- void refresh_local_images(Context *on_finish);
- void handle_refresh_local_images(int r, ImageIds &&image_ids,
- Context *on_finish);
+ void init_local_pool_watcher(Context *on_finish);
+ void handle_init_local_pool_watcher(int r, Context *on_finish);
+
+ void init_remote_pool_watcher(Context *on_finish);
- void init_pool_watcher(Context *on_finish);
- void shut_down_pool_watcher(Context *on_finish);
- void handle_shut_down_pool_watcher(int r, Context *on_finish);
+ void shut_down_pool_watchers(Context *on_finish);
+ void handle_shut_down_pool_watchers(int r, Context *on_finish);
void wait_for_update_ops(Context *on_finish);
void handle_wait_for_update_ops(int r, Context *on_finish);
@@ -105,7 +105,7 @@ private:
ImageSyncThrottlerRef<> m_image_sync_throttler;
mutable Mutex m_lock;
Cond m_cond;
- atomic_t m_stopping;
+ std::atomic<bool> m_stopping = { false };
bool m_manual_stop = false;
bool m_blacklisted = false;
@@ -119,15 +119,18 @@ private:
int64_t m_local_pool_id = -1;
- PoolWatcherListener m_pool_watcher_listener;
- std::unique_ptr<PoolWatcher<> > m_pool_watcher;
+ PoolWatcherListener m_local_pool_watcher_listener;
+ std::unique_ptr<PoolWatcher<> > m_local_pool_watcher;
+
+ PoolWatcherListener m_remote_pool_watcher_listener;
+ std::unique_ptr<PoolWatcher<> > m_remote_pool_watcher;
std::unique_ptr<InstanceReplayer<librbd::ImageCtx>> m_instance_replayer;
std::string m_asok_hook_name;
AdminSocketHook *m_asok_hook;
- std::set<ImageId> m_init_image_ids;
+ std::map<std::string, ImageIds> m_initial_mirror_image_ids;
class PoolReplayerThread : public Thread {
PoolReplayer *m_pool_replayer;
diff --git a/src/tools/rbd_mirror/PoolWatcher.cc b/src/tools/rbd_mirror/PoolWatcher.cc
index 6a855ff2185..18c6df3840f 100644
--- a/src/tools/rbd_mirror/PoolWatcher.cc
+++ b/src/tools/rbd_mirror/PoolWatcher.cc
@@ -479,7 +479,7 @@ void PoolWatcher<I>::notify_listener() {
}
if (!removed_image_ids.empty()) {
- m_listener.handle_update(mirror_uuid, {}, removed_image_ids);
+ m_listener.handle_update(mirror_uuid, {}, std::move(removed_image_ids));
removed_image_ids.clear();
}
@@ -529,7 +529,8 @@ void PoolWatcher<I>::notify_listener() {
mirror_uuid = m_mirror_uuid;
}
- m_listener.handle_update(mirror_uuid, added_image_ids, removed_image_ids);
+ m_listener.handle_update(mirror_uuid, std::move(added_image_ids),
+ std::move(removed_image_ids));
{
Mutex::Locker locker(m_lock);
diff --git a/src/tools/rbd_mirror/PoolWatcher.h b/src/tools/rbd_mirror/PoolWatcher.h
index aebd981b864..aec063b3e7c 100644
--- a/src/tools/rbd_mirror/PoolWatcher.h
+++ b/src/tools/rbd_mirror/PoolWatcher.h
@@ -37,8 +37,8 @@ public:
}
virtual void handle_update(const std::string &mirror_uuid,
- const ImageIds &added_image_ids,
- const ImageIds &removed_image_ids) = 0;
+ ImageIds &&added_image_ids,
+ ImageIds &&removed_image_ids) = 0;
};
PoolWatcher(Threads<ImageCtxT> *threads, librados::IoCtx &remote_io_ctx,
diff --git a/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc
index 7d7d5ed1a6c..8818945b10f 100644
--- a/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc
+++ b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.cc
@@ -20,7 +20,6 @@
#include "librbd/Journal.h"
#include "librbd/Utils.h"
#include "librbd/journal/Types.h"
-#include "tools/rbd_mirror/ImageSync.h"
#include "tools/rbd_mirror/ProgressContext.h"
#include "tools/rbd_mirror/ImageSyncThrottler.h"
@@ -79,7 +78,7 @@ template <typename I>
void BootstrapRequest<I>::send() {
*m_do_resync = false;
- get_local_image_id();
+ get_remote_tag_class();
}
template <typename I>
@@ -93,45 +92,6 @@ void BootstrapRequest<I>::cancel() {
}
template <typename I>
-void BootstrapRequest<I>::get_local_image_id() {
- dout(20) << dendl;
-
- update_progress("GET_LOCAL_IMAGE_ID");
-
- // attempt to cross-reference a local image by the global image id
- librados::ObjectReadOperation op;
- librbd::cls_client::mirror_image_get_image_id_start(&op, m_global_image_id);
-
- librados::AioCompletion *aio_comp = create_rados_callback<
- BootstrapRequest<I>, &BootstrapRequest<I>::handle_get_local_image_id>(
- this);
- int r = m_local_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
- assert(r == 0);
- aio_comp->release();
-}
-
-template <typename I>
-void BootstrapRequest<I>::handle_get_local_image_id(int r) {
- dout(20) << ": r=" << r << dendl;
-
- if (r == 0) {
- bufferlist::iterator iter = m_out_bl.begin();
- r = librbd::cls_client::mirror_image_get_image_id_finish(
- &iter, &m_local_image_id);
- }
-
- if (r == -ENOENT) {
- dout(10) << ": image not registered locally" << dendl;
- } else if (r < 0) {
- derr << ": failed to retrieve local image id: " << cpp_strerror(r) << dendl;
- finish(r);
- return;
- }
-
- get_remote_tag_class();
-}
-
-template <typename I>
void BootstrapRequest<I>::get_remote_tag_class() {
dout(20) << dendl;
@@ -453,7 +413,6 @@ void BootstrapRequest<I>::handle_create_local_image(int r) {
return;
}
- m_created_local_image = true;
open_local_image();
}
@@ -472,8 +431,8 @@ void BootstrapRequest<I>::update_client_image() {
dout(20) << dendl;
- librbd::journal::MirrorPeerClientMeta client_meta;
- client_meta.image_id = m_local_image_id;
+ librbd::journal::MirrorPeerClientMeta client_meta{m_local_image_id};
+ client_meta.state = librbd::journal::MIRROR_PEER_STATE_SYNCING;
librbd::journal::ClientData client_data(client_meta);
bufferlist data_bl;
@@ -503,7 +462,8 @@ void BootstrapRequest<I>::handle_update_client_image(int r) {
return;
}
- m_client_meta->image_id = m_local_image_id;
+ *m_client_meta = {m_local_image_id};
+ m_client_meta->state = librbd::journal::MIRROR_PEER_STATE_SYNCING;
get_remote_tags();
}
@@ -513,8 +473,7 @@ void BootstrapRequest<I>::get_remote_tags() {
update_progress("GET_REMOTE_TAGS");
- if (m_created_local_image ||
- m_client_meta->state == librbd::journal::MIRROR_PEER_STATE_SYNCING) {
+ if (m_client_meta->state == librbd::journal::MIRROR_PEER_STATE_SYNCING) {
// optimization -- no need to compare remote tags if we just created
// the image locally or sync was interrupted
image_sync();
diff --git a/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h
index e367b2ae482..6e755689488 100644
--- a/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h
+++ b/src/tools/rbd_mirror/image_replayer/BootstrapRequest.h
@@ -87,9 +87,6 @@ private:
* <start>
* |
* v
- * GET_LOCAL_IMAGE_ID * * * * * * * * * * * * * * * * *
- * | *
- * v *
* GET_REMOTE_TAG_CLASS * * * * * * * * * * * * * * * *
* | *
* v *
@@ -170,14 +167,10 @@ private:
uint64_t m_remote_tag_class = 0;
ImageCtxT *m_remote_image_ctx = nullptr;
bool m_primary = false;
- bool m_created_local_image = false;
int m_ret_val = 0;
bufferlist m_out_bl;
- void get_local_image_id();
- void handle_get_local_image_id(int r);
-
void get_remote_tag_class();
void handle_get_remote_tag_class(int r);
diff --git a/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc
new file mode 100644
index 00000000000..b26ac05e942
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.cc
@@ -0,0 +1,160 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h"
+#include "include/rados/librados.hpp"
+#include "cls/rbd/cls_rbd_client.h"
+#include "common/errno.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/Journal.h"
+#include "librbd/Utils.h"
+#include "tools/rbd_mirror/Threads.h"
+#include <type_traits>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rbd_mirror
+#undef dout_prefix
+#define dout_prefix *_dout << "rbd::mirror::image_replayer::" \
+ << "PrepareLocalImageRequest: " << this << " " \
+ << __func__ << ": "
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+using librbd::util::create_context_callback;
+using librbd::util::create_rados_callback;
+
+template <typename I>
+void PrepareLocalImageRequest<I>::send() {
+ dout(20) << dendl;
+ get_local_image_id();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::get_local_image_id() {
+ dout(20) << dendl;
+
+ // attempt to cross-reference a local image by the global image id
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_image_get_image_id_start(&op, m_global_image_id);
+
+ m_out_bl.clear();
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ PrepareLocalImageRequest<I>,
+ &PrepareLocalImageRequest<I>::handle_get_local_image_id>(
+ this);
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+ assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::handle_get_local_image_id(int r) {
+ if (r == 0) {
+ bufferlist::iterator iter = m_out_bl.begin();
+ r = librbd::cls_client::mirror_image_get_image_id_finish(
+ &iter, m_local_image_id);
+ }
+
+ dout(20) << "r=" << r << ", "
+ << "local_image_id=" << *m_local_image_id << dendl;
+
+ if (r < 0) {
+ if (r == -ENOENT) {
+ dout(10) << "image not registered locally" << dendl;
+ } else {
+ derr << "failed to retrieve local image id: " << cpp_strerror(r)
+ << dendl;
+ }
+ finish(r);
+ return;
+ }
+
+ get_mirror_state();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::get_mirror_state() {
+ dout(20) << dendl;
+
+ librados::ObjectReadOperation op;
+ librbd::cls_client::mirror_image_get_start(&op, *m_local_image_id);
+
+ m_out_bl.clear();
+ librados::AioCompletion *aio_comp = create_rados_callback<
+ PrepareLocalImageRequest<I>,
+ &PrepareLocalImageRequest<I>::handle_get_mirror_state>(this);
+ int r = m_io_ctx.aio_operate(RBD_MIRRORING, aio_comp, &op, &m_out_bl);
+ assert(r == 0);
+ aio_comp->release();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::handle_get_mirror_state(int r) {
+ dout(20) << ": r=" << r << dendl;
+
+ cls::rbd::MirrorImage mirror_image;
+ if (r == 0) {
+ bufferlist::iterator iter = m_out_bl.begin();
+ r = librbd::cls_client::mirror_image_get_finish(&iter, &mirror_image);
+ }
+
+ if (r < 0) {
+ derr << "failed to retrieve image mirror state: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ // TODO save current mirror state to determine if we should
+ // delete a partially formed image
+ // (e.g. MIRROR_IMAGE_STATE_CREATING/DELETING)
+
+ get_tag_owner();
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::get_tag_owner() {
+ // deduce the class type for the journal to support unit tests
+ using Journal = typename std::decay<
+ typename std::remove_pointer<decltype(std::declval<I>().journal)>
+ ::type>::type;
+
+ dout(20) << dendl;
+
+ Context *ctx = create_context_callback<
+ PrepareLocalImageRequest<I>,
+ &PrepareLocalImageRequest<I>::handle_get_tag_owner>(this);
+ Journal::get_tag_owner(m_io_ctx, *m_local_image_id, m_tag_owner,
+ m_work_queue, ctx);
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::handle_get_tag_owner(int r) {
+ dout(20) << "r=" << r << ", "
+ << "tag_owner=" << *m_tag_owner << dendl;
+
+ if (r < 0) {
+ derr << "failed to retrieve journal tag owner: " << cpp_strerror(r)
+ << dendl;
+ finish(r);
+ return;
+ }
+
+ finish(0);
+}
+
+template <typename I>
+void PrepareLocalImageRequest<I>::finish(int r) {
+ dout(20) << "r=" << r << dendl;
+
+ m_on_finish->complete(r);
+ delete this;
+}
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+template class rbd::mirror::image_replayer::PrepareLocalImageRequest<librbd::ImageCtx>;
diff --git a/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h
new file mode 100644
index 00000000000..913bfd1c242
--- /dev/null
+++ b/src/tools/rbd_mirror/image_replayer/PrepareLocalImageRequest.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef RBD_MIRROR_IMAGE_REPLAYER_PREPARE_LOCAL_IMAGE_REQUEST_H
+#define RBD_MIRROR_IMAGE_REPLAYER_PREPARE_LOCAL_IMAGE_REQUEST_H
+
+#include "include/buffer.h"
+#include <string>
+
+namespace librados { struct IoCtx; }
+namespace librbd { struct ImageCtx; }
+
+struct Context;
+struct ContextWQ;
+
+namespace rbd {
+namespace mirror {
+namespace image_replayer {
+
+template <typename ImageCtxT = librbd::ImageCtx>
+class PrepareLocalImageRequest {
+public:
+ static PrepareLocalImageRequest *create(librados::IoCtx &io_ctx,
+ const std::string &global_image_id,
+ std::string *local_image_id,
+ std::string *tag_owner,
+ ContextWQ *work_queue,
+ Context *on_finish) {
+ return new PrepareLocalImageRequest(io_ctx, global_image_id, local_image_id,
+ tag_owner, work_queue, on_finish);
+ }
+
+ PrepareLocalImageRequest(librados::IoCtx &io_ctx,
+ const std::string &global_image_id,
+ std::string *local_image_id,
+ std::string *tag_owner,
+ ContextWQ *work_queue,
+ Context *on_finish)
+ : m_io_ctx(io_ctx), m_global_image_id(global_image_id),
+ m_local_image_id(local_image_id), m_tag_owner(tag_owner),
+ m_work_queue(work_queue), m_on_finish(on_finish) {
+ }
+
+ void send();
+
+private:
+ /**
+ * @verbatim
+ *
+ * <start>
+ * |
+ * v
+ * GET_LOCAL_IMAGE_ID
+ * |
+ * v
+ * GET_MIRROR_STATE
+ * |
+ * v
+ * GET_TAG_OWNER
+ * |
+ * v
+ * <finish>
+ *
+ * @endverbatim
+ */
+
+ librados::IoCtx &m_io_ctx;
+ std::string m_global_image_id;
+ std::string *m_local_image_id;
+ std::string *m_tag_owner;
+ ContextWQ *m_work_queue;
+ Context *m_on_finish;
+
+ bufferlist m_out_bl;
+
+ void get_local_image_id();
+ void handle_get_local_image_id(int r);
+
+ void get_mirror_state();
+ void handle_get_mirror_state(int r);
+
+ void get_tag_owner();
+ void handle_get_tag_owner(int r);
+
+ void finish(int r);
+
+};
+
+} // namespace image_replayer
+} // namespace mirror
+} // namespace rbd
+
+extern template class rbd::mirror::image_replayer::PrepareLocalImageRequest<librbd::ImageCtx>;
+
+#endif // RBD_MIRROR_IMAGE_REPLAYER_PREPARE_LOCAL_IMAGE_REQUEST_H
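
PrepareLocalImageRequest follows the usual rbd-mirror request shape: a static create(), a send() that starts a callback chain, and a finish() that completes the supplied context and deletes the request. A self-contained toy version of that lifecycle (standard library only; the chained rados lookups are stubbed out):

    #include <functional>
    #include <iostream>
    #include <string>

    class Request {
    public:
      static Request *create(std::string *out, std::function<void(int)> on_finish) {
        return new Request(out, std::move(on_finish));
      }
      void send() {
        *m_out = "local-image-id";   // stand-in for the chained rados calls
        finish(0);
      }
    private:
      Request(std::string *out, std::function<void(int)> on_finish)
        : m_out(out), m_on_finish(std::move(on_finish)) {}
      void finish(int r) {
        m_on_finish(r);
        delete this;                 // the request owns its own lifetime
      }
      std::string *m_out;
      std::function<void(int)> m_on_finish;
    };

    int main() {
      std::string id;
      Request::create(&id, [](int r) { std::cout << "r=" << r << "\n"; })->send();
      std::cout << id << "\n";
    }
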
diff --git a/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.cc b/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.cc
index faca72b47d4..e86765c2d4f 100644
--- a/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.cc
+++ b/src/tools/rbd_mirror/image_replayer/ReplayStatusFormatter.cc
@@ -196,7 +196,13 @@ void ReplayStatusFormatter<I>::handle_update_tag_cache(uint64_t master_tag_tid,
}
}
- if (tag_data.predecessor.tag_tid == 0) {
+ if (tag_data.predecessor.mirror_uuid !=
+ librbd::Journal<>::LOCAL_MIRROR_UUID &&
+ tag_data.predecessor.mirror_uuid !=
+ librbd::Journal<>::ORPHAN_MIRROR_UUID) {
+ dout(20) << "hit remote image non-primary epoch" << dendl;
+ tag_data.predecessor.tag_tid = mirror_tag_tid;
+ } else if (tag_data.predecessor.tag_tid == 0) {
// We failed. Don't consider this fatal, just terminate retrieving.
dout(20) << "making fake tag" << dendl;
tag_data.predecessor.tag_tid = mirror_tag_tid;
diff --git a/src/tools/rbd_mirror/image_sync/ObjectCopyRequest.cc b/src/tools/rbd_mirror/image_sync/ObjectCopyRequest.cc
index 2d5f1470e8a..92817b7a788 100644
--- a/src/tools/rbd_mirror/image_sync/ObjectCopyRequest.cc
+++ b/src/tools/rbd_mirror/image_sync/ObjectCopyRequest.cc
@@ -334,7 +334,7 @@ void ObjectCopyRequest<I>::send_update_object_map() {
bool sent = m_local_image_ctx->object_map->template aio_update<
ObjectCopyRequest<I>, &ObjectCopyRequest<I>::handle_update_object_map>(
snap_object_state.first, m_object_number, snap_object_state.second, {},
- this);
+ {}, this);
assert(sent);
m_local_image_ctx->snap_lock.put_read();
}
diff --git a/src/tools/rbd_nbd/rbd-nbd.cc b/src/tools/rbd_nbd/rbd-nbd.cc
index 8ba83208b2e..a8e4bf60395 100644
--- a/src/tools/rbd_nbd/rbd-nbd.cc
+++ b/src/tools/rbd_nbd/rbd-nbd.cc
@@ -33,6 +33,7 @@
#include <sys/socket.h>
#include <iostream>
+#include <fstream>
#include <boost/regex.hpp>
#include "mon/MonClient.h"
@@ -85,7 +86,7 @@ static int nbd = -1;
#ifdef CEPH_BIG_ENDIAN
#define ntohll(a) (a)
#elif defined(CEPH_LITTLE_ENDIAN)
-#define ntohll(a) swab64(a)
+#define ntohll(a) swab(a)
#else
#error "Could not determine endianess"
#endif
@@ -113,7 +114,6 @@ public:
NBDServer(int _fd, librbd::Image& _image)
: fd(_fd)
, image(_image)
- , terminated(false)
, lock("NBDServer::Locker")
, reader_thread(*this, &NBDServer::reader_entry)
, writer_thread(*this, &NBDServer::writer_entry)
@@ -121,11 +121,12 @@ public:
{}
private:
- atomic_t terminated;
+ std::atomic<bool> terminated = { false };
void shutdown()
{
- if (terminated.compare_and_swap(false, true)) {
+ bool expected = false;
+ if (terminated.compare_exchange_strong(expected, true)) {
::shutdown(fd, SHUT_RDWR);
Mutex::Locker l(lock);
@@ -172,7 +173,7 @@ private:
IOContext *wait_io_finish()
{
Mutex::Locker l(lock);
- while(io_finished.empty() && !terminated.read())
+ while(io_finished.empty() && !terminated)
cond.Wait(lock);
if (io_finished.empty())
@@ -234,7 +235,7 @@ private:
void reader_entry()
{
- while (!terminated.read()) {
+ while (!terminated) {
ceph::unique_ptr<IOContext> ctx(new IOContext());
ctx->server = this;
@@ -309,7 +310,7 @@ private:
void writer_entry()
{
- while (!terminated.read()) {
+ while (!terminated) {
dout(20) << __func__ << ": waiting for io request" << dendl;
ceph::unique_ptr<IOContext> ctx(wait_io_finish());
if (!ctx) {
@@ -500,6 +501,10 @@ static int open_device(const char* path, bool try_load_module = false)
static int check_device_size(int nbd_index, unsigned long expected_size)
{
+ // There are bugs with some older kernel versions that result in an
+ // overflow for large image sizes. This check is to ensure we are
+ // not affected.
+
unsigned long size = 0;
std::string path = "/sys/block/nbd" + stringify(nbd_index) + "/size";
std::ifstream ifs;
@@ -511,6 +516,12 @@ static int check_device_size(int nbd_index, unsigned long expected_size)
ifs >> size;
size *= RBD_NBD_BLKSIZE;
+ if (size == 0) {
+ // Newer kernel versions will report real size only after nbd
+ // connect. Assume this is the case and return success.
+ return 0;
+ }
+
if (size != expected_size) {
cerr << "rbd-nbd: kernel reported invalid device size (" << size
<< ", expected " << expected_size << ")" << std::endl;
diff --git a/src/tools/rebuild_mondb.cc b/src/tools/rebuild_mondb.cc
index a53af400f47..1d070fc230d 100644
--- a/src/tools/rebuild_mondb.cc
+++ b/src/tools/rebuild_mondb.cc
@@ -364,7 +364,7 @@ int update_pgmap_pg(ObjectStore& fs, MonitorDBStore& ms)
continue;
bufferlist bl;
pg_info_t info(pgid);
- map<epoch_t, pg_interval_t> past_intervals;
+ PastIntervals past_intervals;
__u8 struct_v;
r = PG::read_info(&fs, pgid, coll, bl, info, past_intervals, struct_v);
if (r < 0) {
diff --git a/src/tracing/librados.tp b/src/tracing/librados.tp
index e2feabb06d5..451d04d4b69 100644
--- a/src/tracing/librados.tp
+++ b/src/tracing/librados.tp
@@ -1488,6 +1488,29 @@ TRACEPOINT_EVENT(librados, rados_ioctx_snap_get_stamp_exit,
)
)
+TRACEPOINT_EVENT(librados, rados_cmpext_enter,
+ TP_ARGS(
+ rados_ioctx_t, ioctx,
+ const char*, oid,
+ const char*, cmp_buf,
+ size_t, cmp_len,
+ uint64_t, off),
+ TP_FIELDS(
+ ctf_integer_hex(rados_ioctx_t, ioctx, ioctx)
+ ctf_string(oid, oid)
+ ceph_ctf_sequence(unsigned char, cmp_buf, cmp_buf, size_t, cmp_len)
+ ctf_integer(uint64_t, off, off)
+ )
+)
+
+TRACEPOINT_EVENT(librados, rados_cmpext_exit,
+ TP_ARGS(
+ int, retval),
+ TP_FIELDS(
+ ctf_integer(int, retval, retval)
+ )
+)
+
TRACEPOINT_EVENT(librados, rados_getxattr_enter,
TP_ARGS(
rados_ioctx_t, ioctx,
@@ -2466,6 +2489,28 @@ TRACEPOINT_EVENT(librados, rados_aio_exec_exit,
)
)
+TRACEPOINT_EVENT(librados, rados_aio_cmpext_enter,
+ TP_ARGS(
+ rados_ioctx_t, ioctx,
+ const char*, oid,
+ rados_completion_t, completion,
+ const char*, cmp_buf,
+ size_t, cmp_len,
+ uint64_t, off),
+ TP_FIELDS(
+ ceph_ctf_sequence(unsigned char, cmp_buf, cmp_buf, size_t, cmp_len)
+ ctf_integer(uint64_t, off, off)
+ )
+)
+
+TRACEPOINT_EVENT(librados, rados_aio_cmpext_exit,
+ TP_ARGS(
+ int, retval),
+ TP_FIELDS(
+ ctf_integer(int, retval, retval)
+ )
+)
+
TRACEPOINT_EVENT(librados, rados_watch_enter,
TP_ARGS(
rados_ioctx_t, ioctx,
@@ -3051,6 +3096,27 @@ TRACEPOINT_EVENT(librados, rados_write_op_assert_exists_exit,
TP_FIELDS()
)
+TRACEPOINT_EVENT(librados, rados_write_op_cmpext_enter,
+ TP_ARGS(
+ rados_write_op_t, op,
+ const char*, cmp_buffer,
+ size_t, cmp_len,
+ uint64_t, offset,
+ int*, prval),
+ TP_FIELDS(
+ ctf_integer_hex(rados_write_op_t, op, op)
+ ceph_ctf_sequence(unsigned char, cmp_buffer, cmp_buffer, size_t, cmp_len)
+ ctf_integer(size_t, cmp_len, cmp_len)
+ ctf_integer(uint64_t, offset, offset)
+ ctf_integer_hex(void*, prval, prval)
+ )
+)
+
+TRACEPOINT_EVENT(librados, rados_write_op_cmpext_exit,
+ TP_ARGS(),
+ TP_FIELDS()
+)
+
TRACEPOINT_EVENT(librados, rados_write_op_cmpxattr_enter,
TP_ARGS(
rados_write_op_t, op,
@@ -3511,6 +3577,27 @@ TRACEPOINT_EVENT(librados, rados_read_op_assert_exists_exit,
TP_FIELDS()
)
+TRACEPOINT_EVENT(librados, rados_read_op_cmpext_enter,
+ TP_ARGS(
+ rados_read_op_t, op,
+ const char*, cmp_buffer,
+ size_t, cmp_len,
+ uint64_t, offset,
+ int*, prval),
+ TP_FIELDS(
+ ctf_integer_hex(rados_read_op_t, op, op)
+ ceph_ctf_sequence(unsigned char, cmp_buffer, cmp_buffer, size_t, cmp_len)
+ ctf_integer(size_t, cmp_len, cmp_len)
+ ctf_integer(uint64_t, offset, offset)
+ ctf_integer_hex(void*, prval, prval)
+ )
+)
+
+TRACEPOINT_EVENT(librados, rados_read_op_cmpext_exit,
+ TP_ARGS(),
+ TP_FIELDS()
+)
+
TRACEPOINT_EVENT(librados, rados_read_op_cmpxattr_enter,
TP_ARGS(
rados_read_op_t, read_op,
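
Each enter/exit pair added here is fired from the matching librados entry point via LTTng-UST's tracepoint() macro; the call sites take roughly the following shape (illustrative fragment only, not the actual librados code):

    // before performing the operation
    tracepoint(librados, rados_cmpext_enter, io, oid, cmp_buf, cmp_len, off);
    // ... perform the compare-extent operation, producing retval ...
    tracepoint(librados, rados_cmpext_exit, retval);
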
diff --git a/src/tracing/librbd.tp b/src/tracing/librbd.tp
index 95ccaa66955..7078390daed 100644
--- a/src/tracing/librbd.tp
+++ b/src/tracing/librbd.tp
@@ -1851,11 +1851,13 @@ TRACEPOINT_EVENT(librbd, get_parent_info_exit,
int, retval,
const char*, parent_pool_name,
const char*, parent_name,
+ const char*, parent_id,
const char*, parent_snap_name),
TP_FIELDS(
ctf_integer(int, retval, retval)
ceph_ctf_string(parent_pool_name, parent_pool_name)
ceph_ctf_string(parent_name, parent_name)
+ ceph_ctf_string(parent_id, parent_id)
ceph_ctf_string(parent_snap_name, parent_snap_name)
)
)
diff --git a/src/tracing/osd.tp b/src/tracing/osd.tp
index 3582ce63b8b..6f199fa2edb 100644
--- a/src/tracing/osd.tp
+++ b/src/tracing/osd.tp
@@ -91,6 +91,28 @@ TRACEPOINT_EVENT(osd, do_osd_op_pre,
)
)
+TRACEPOINT_EVENT(osd, do_osd_op_pre_extent_cmp,
+ TP_ARGS(
+ const char*, oid,
+ uint64_t, snap,
+ uint64_t, osize,
+ uint32_t, oseq,
+ uint64_t, offset,
+ uint64_t, length,
+ uint64_t, truncate_size,
+ uint32_t, truncate_seq),
+ TP_FIELDS(
+ ctf_string(oid, oid)
+ ctf_integer(uint64_t, snap, snap)
+ ctf_integer(uint64_t, osize, osize)
+ ctf_integer(uint32_t, oseq, oseq)
+ ctf_integer(uint64_t, offset, offset)
+ ctf_integer(uint64_t, length, length)
+ ctf_integer(uint64_t, truncate_size, truncate_size)
+ ctf_integer(uint32_t, truncate_seq, truncate_seq)
+ )
+)
+
TRACEPOINT_EVENT(osd, do_osd_op_pre_read,
TP_ARGS(
const char*, oid,
diff --git a/src/vstart.sh b/src/vstart.sh
index 0a8c7620f5e..a1d4f2d65b7 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -602,7 +602,7 @@ start_mgr() {
mkdir -p $CEPH_DEV_DIR/mgr.$name
key_fn=$CEPH_DEV_DIR/mgr.$name/keyring
$SUDO $CEPH_BIN/ceph-authtool --create-keyring --gen-key --name=mgr.$name $key_fn
- ceph_adm -i $key_fn auth add mgr.$name mon 'allow profile mgr'
+ ceph_adm -i $key_fn auth add mgr.$name mon 'allow profile mgr' mds 'allow *' osd 'allow *'
fi
wconf <<EOF
@@ -657,7 +657,7 @@ EOF
EOF
fi
prun $SUDO "$CEPH_BIN/ceph-authtool" --create-keyring --gen-key --name="mds.$name" "$key_fn"
- ceph_adm -i "$key_fn" auth add "mds.$name" mon 'allow profile mds' osd 'allow *' mds 'allow' mgr 'allow'
+ ceph_adm -i "$key_fn" auth add "mds.$name" mon 'allow profile mds' osd 'allow *' mds 'allow' mgr 'allow profile mds'
if [ "$standby" -eq 1 ]; then
prun $SUDO "$CEPH_BIN/ceph-authtool" --create-keyring --gen-key --name="mds.${name}s" \
"$CEPH_DEV_DIR/mds.${name}s/keyring"
@@ -685,6 +685,8 @@ if [ "$debug" -eq 0 ]; then
debug ms = 1'
CMDSDEBUG='
debug ms = 1'
+ CMGRDEBUG='
+ debug ms = 1'
else
echo "** going verbose **"
CMONDEBUG='
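
Both vstart.sh and the ceph-mgr systemd unit (below) now grant the manager OSD and MDS caps when its key is created; on a cluster whose mgr key predates this change, the equivalent manual fix would be along the lines of `ceph auth caps mgr.<id> mon 'allow profile mgr' osd 'allow *' mds 'allow *'`, with the daemon id as a placeholder.
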
diff --git a/systemd/ceph-mgr@.service b/systemd/ceph-mgr@.service
index b6e9fb687f2..42186016313 100644
--- a/systemd/ceph-mgr@.service
+++ b/systemd/ceph-mgr@.service
@@ -20,7 +20,7 @@ Environment=CLUSTER=ceph
ExecStartPre=-/bin/sh -c "exec mkdir -p /var/lib/ceph/mgr/${CLUSTER}-%i"
ExecStartPre=-/bin/sh -c "[ -f /var/lib/ceph/mgr/${CLUSTER}-%i/keyring ] || /usr/bin/ceph-authtool --create-keyring --gen-key --name=mgr.%i /var/lib/ceph/mgr/${CLUSTER}-%i/keyring"
ExecStartPre=-/bin/sh -c "exec chown -R ceph.ceph /var/lib/ceph/mgr/${CLUSTER}-%i"
-ExecStartPre=-/usr/bin/ceph -i /var/lib/ceph/mgr/${CLUSTER}-%i/keyring auth add mgr.%i mon 'allow profile mgr' --keyring=/var/lib/ceph/mon/${CLUSTER}-%i/keyring --name=mon.
+ExecStartPre=-/usr/bin/ceph -i /var/lib/ceph/mgr/${CLUSTER}-%i/keyring auth add mgr.%i mon 'allow profile mgr' osd 'allow *' mds 'allow *' --keyring=/var/lib/ceph/mon/${CLUSTER}-%i/keyring --name=mon.
ExecStart=/usr/bin/ceph-mgr -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph
ExecReload=/bin/kill -HUP $MAINPID