btrfs: do not hold the extent lock for entire read

Historically we've held the extent lock throughout the entire read. There have been a few reasons for this, but it has mostly just caused us problems. For example, it prevents us from allowing page faults during direct IO reads, because we could deadlock. This has forced us to only allow 4k reads at a time for io_uring NOWAIT requests, because we have no idea if we'll be forced to page fault and thus have to do a whole lot of work.

On the buffered side we are protected by the page lock: as long as we're reading, things like buffered writes, punch hole, and even direct IO to a certain degree will get hung up on the page lock while the page is in flight.

On the direct side we have the dio extent lock, which acts much like the way the extent lock worked prior to this patch, but only for direct reads. This protects direct reads from concurrent direct writes, while we're protected from buffered writes via the inode lock.

Now that we're protected in all cases, narrow the extent lock to the part where we're getting the extent map to submit the reads, no longer holding the extent lock for the entire read operation. Push the extent lock down into do_readpage() so that we're only grabbing it when looking up the extent map. This portion was contributed by Goldwyn.

Co-developed-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Reviewed-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
author: Josef Bacik <josef@toxicpanda.com> 2024-08-16 21:16:24 +0200
committer: David Sterba <dsterba@suse.com> 2024-09-10 16:51:20 +0200
commit: ac325fc2aad513072722387a71bf857c938aae4e (patch)
tree: f00b90af5f961b99f3dc2b4af9b28af2d752e5b1 /fs/btrfs/direct-io.c
parent: btrfs: take the dio extent lock during O_DIRECT operations (diff)
download: linux-ac325fc2aad513072722387a71bf857c938aae4e.tar.xz
linux-ac325fc2aad513072722387a71bf857c938aae4e.zip
1 file changed, 24 insertions, 25 deletions
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index 4a5f9b2632f2..bd38df5647e3 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -365,7 +365,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 	int ret = 0;
 	u64 len = length;
 	const u64 data_alloc_len = length;
-	bool unlock_extents = false;
+	u32 unlock_bits = EXTENT_LOCKED;
 
 	/*
 	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
@@ -526,7 +526,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 						    start, &len, flags);
 		if (ret < 0)
 			goto unlock_err;
-		unlock_extents = true;
 		/* Recalc len in case the new em is smaller than requested */
 		len = min(len, em->len - (start - em->start));
 		if (dio_data->data_space_reserved) {
@@ -547,23 +546,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 							       release_offset,
 							       release_len);
 		}
-	} else {
-		/*
-		 * We need to unlock only the end area that we aren't using.
-		 * The rest is going to be unlocked by the endio routine.
-		 */
-		lockstart = start + len;
-		if (lockstart < lockend)
-			unlock_extents = true;
 	}
 
-	if (unlock_extents)
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 EXTENT_LOCKED | EXTENT_DIO_LOCKED,
-				 &cached_state);
-	else
-		free_extent_state(cached_state);
-
 	/*
 	 * Translate extent map information to iomap.
 	 * We trim the extents (and move the addr) even though iomap code does
@@ -582,6 +566,23 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
 	iomap->length = len;
 	free_extent_map(em);
 
+	/*
+	 * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
+	 * writes only hold it for this part.  We hold the extent lock until
+	 * we're completely done with the extent map to make sure it remains
+	 * valid.
+	 */
+	if (write)
+		unlock_bits |= EXTENT_DIO_LOCKED;
+
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			 unlock_bits, &cached_state);
+
+	/* We didn't use everything, unlock the dio extent for the remainder. */
+	if (!write && (start + len) < lockend)
+		unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
+				  lockend, NULL);
+
 	return 0;
 
 unlock_err:
@@ -614,8 +615,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 
 	if (!write && (iomap->type == IOMAP_HOLE)) {
 		/* If reading from a hole, unlock and return */
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
-				 EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL);
+		unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
+				  pos + length - 1, NULL);
 		return 0;
 	}
 
@@ -626,9 +627,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
 						    pos, length, false);
 		else
-			clear_extent_bit(&BTRFS_I(inode)->io_tree, pos,
-					 pos + length - 1,
-					 EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL);
+			unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
+					  pos + length - 1, NULL);
 		ret = -ENOTBLK;
 	}
 	if (write) {
@@ -660,9 +660,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio)
 					    dip->file_offset, dip->bytes,
 					    !bio->bi_status);
 	} else {
-		clear_extent_bit(&inode->io_tree, dip->file_offset,
-				 dip->file_offset + dip->bytes - 1,
-				 EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL);
+		unlock_dio_extent(&inode->io_tree, dip->file_offset,
+				  dip->file_offset + dip->bytes - 1, NULL);
 	}
 
 	bbio->bio.bi_private = bbio->private;
author	Josef Bacik <josef@toxicpanda.com>	2024-08-16 21:16:24 +0200
committer	David Sterba <dsterba@suse.com>	2024-09-10 16:51:20 +0200
commit	ac325fc2aad513072722387a71bf857c938aae4e (patch)
tree	f00b90af5f961b99f3dc2b4af9b28af2d752e5b1 /fs/btrfs/direct-io.c
parent	btrfs: take the dio extent lock during O_DIRECT operations (diff)
download	linux-ac325fc2aad513072722387a71bf857c938aae4e.tar.xz linux-ac325fc2aad513072722387a71bf857c938aae4e.zip