summaryrefslogtreecommitdiffstats
path: root/src/blk
diff options
context:
space:
mode:
authorAdam Kupczyk <akupczyk@redhat.com>2022-11-29 16:38:08 +0100
committerAdam Kupczyk <akupczyk@ibm.com>2023-01-25 12:27:14 +0100
commit5dc01efd97109fcb7fc3ded5ffbe502c8e7439b2 (patch)
treef58be8b7c3c8313cfd0e3b33f0deb7e12c73ca64 /src/blk
parentMerge pull request #49107 from zdover23/wip-doc-2022-11-29-rados-balancer-pro... (diff)
downloadceph-5dc01efd97109fcb7fc3ded5ffbe502c8e7439b2.tar.xz
ceph-5dc01efd97109fcb7fc3ded5ffbe502c8e7439b2.zip
blk/kernel: Add O_EXCL for block devices
Change the behaviour when the target file is a block device (as created by "mknod name b major minor"): add the O_EXCL flag to the first open of the block device. The problem is that if two different files are created for the same block device, each of them can be locked with ::flock by a separate process. In some container setups, when the bluestore OSD directory is recreated with the ceph-bluestore-tool prime-osd command, we can end up with completely different files. Opening with O_EXCL is immune to that. Signed-off-by: Adam Kupczyk <akupczyk@redhat.com>
Diffstat (limited to 'src/blk')
-rw-r--r--src/blk/kernel/KernelDevice.cc23
1 files changed, 22 insertions, 1 deletions
diff --git a/src/blk/kernel/KernelDevice.cc b/src/blk/kernel/KernelDevice.cc
index d9c1e529c07..e15b727e0a6 100644
--- a/src/blk/kernel/KernelDevice.cc
+++ b/src/blk/kernel/KernelDevice.cc
@@ -133,8 +133,25 @@ int KernelDevice::open(const string& p)
int r = 0, i = 0;
dout(1) << __func__ << " path " << path << dendl;
+ struct stat statbuf;
+ bool is_block;
+ r = stat(path.c_str(), &statbuf);
+ if (r != 0) {
+ derr << __func__ << " stat got: " << cpp_strerror(r) << dendl;
+ goto out_fail;
+ }
+ is_block = (statbuf.st_mode & S_IFMT) == S_IFBLK;
for (i = 0; i < WRITE_LIFE_MAX; i++) {
- int fd = ::open(path.c_str(), O_RDWR | O_DIRECT);
+ int flags = 0;
+ if (lock_exclusive && is_block && (i == 0)) {
+ // If opening block device use O_EXCL flag. It gives us best protection,
+ // as no other process can overwrite the data for as long as we are running.
+ // For block devices ::flock is not enough,
+ // since 2 different inodes with same major/minor can be locked.
+ // Exclusion by O_EXCL works in containers too.
+ flags |= O_EXCL;
+ }
+ int fd = ::open(path.c_str(), O_RDWR | O_DIRECT | flags);
if (fd < 0) {
r = -errno;
break;
@@ -187,6 +204,10 @@ int KernelDevice::open(const string& p)
}
if (lock_exclusive) {
+ // We need to keep soft locking (via flock()) because O_EXCL does not work for regular files.
+ // This is as good as we can get. Other processes can still overwrite the data,
+ // but at least we are protected from mounting same device twice in ceph processes.
+ // We also apply soft locking for block devices, as it populates /proc/locks. (see lslocks)
r = _lock();
if (r < 0) {
derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r)