From 9726891fe753910b8d7db712781438ad229091b3 Mon Sep 17 00:00:00 2001 From: zihan zhou <15645113830zzh@gmail.com> Date: Wed, 25 Dec 2024 10:10:35 +0800 Subject: mm: page_alloc: fix missed updates of lowmem_reserve in adjust_managed_page_count In the kernel, the zone's lowmem_reserve and _watermark, and the global variable 'totalreserve_pages' depend on the value of managed_pages, but after running adjust_managed_page_count, these values aren't updated, which causes some problems. For example, in a system with six 1GB large pages, we found that the value of protection in zoneinfo (zone->lowmem_reserve), is not right. Its value seems to be calculated from the initial managed_pages, but after the managed_pages changed, was not updated. Only after reading the file /proc/sys/vm/lowmem_reserve_ratio, updates happen. read file /proc/sys/vm/lowmem_reserve_ratio: lowmem_reserve_ratio_sysctl_handler ----setup_per_zone_lowmem_reserve --------calculate_totalreserve_pages protection changed after reading file: [root@test ~]# cat /proc/zoneinfo | grep protection protection: (0, 2719, 57360, 0) protection: (0, 0, 54640, 0) protection: (0, 0, 0, 0) protection: (0, 0, 0, 0) [root@test ~]# cat /proc/sys/vm/lowmem_reserve_ratio 256 256 32 0 [root@test ~]# cat /proc/zoneinfo | grep protection protection: (0, 2735, 63524, 0) protection: (0, 0, 60788, 0) protection: (0, 0, 0, 0) protection: (0, 0, 0, 0) lowmem_reserve increased also makes the totalreserve_pages increased, which causes a decrease in available memory. The one above is just a test machine, and the increase is not significant. On our online machine, the reserved memory will increase by several GB due to reading this file. It is clearly unreasonable to cause a sharp drop in available memory just by reading a file. In this patch, we update reserve memory when update managed_pages, The size of reserved memory becomes stable. But it seems that the _watermark should also be updated along with the managed_pages. We have not done it because we are unsure if it is reasonable to set the watermark through the initial managed_pages. If it is not reasonable, we will propose new patch. Link: https://lkml.kernel.org/r/20241225021034.45693-1-15645113830zzh@gmail.com Signed-off-by: zihan zhou <15645113830zzh@gmail.com> Signed-off-by: yaowenchao Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cae7b93864c2..01eab25edf89 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5692,10 +5692,13 @@ __meminit void zone_pcp_init(struct zone *zone) zone->present_pages, zone_batchsize(zone)); } +static void setup_per_zone_lowmem_reserve(void); + void adjust_managed_page_count(struct page *page, long count) { atomic_long_add(count, &page_zone(page)->managed_pages); totalram_pages_add(count); + setup_per_zone_lowmem_reserve(); } EXPORT_SYMBOL(adjust_managed_page_count); -- cgit v1.2.3 From 05c82ee363f64c64b87a0cfd744298e9333475f5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Dec 2024 13:16:39 -0800 Subject: alloc_tag: skip pgalloc_tag_swap if profiling is disabled When memory allocation profiling is disabled, there is no need to swap allocation tags during migration. Skip it to avoid unnecessary overhead. Once I added these checks, the overhead of the mode when memory profiling is enabled but turned off went down by about 50%. Link: https://lkml.kernel.org/r/20241226211639.1357704-2-surenb@google.com Fixes: e0a955bf7f61 ("mm/codetag: add pgalloc_tag_copy()") Signed-off-by: Suren Baghdasaryan Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Yu Zhao Cc: Zhenhua Huang Cc: Signed-off-by: Andrew Morton --- lib/alloc_tag.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 7dcebf118a3e..65e706e1bc19 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -195,6 +195,9 @@ void pgalloc_tag_swap(struct folio *new, struct folio *old) union codetag_ref ref_old, ref_new; struct alloc_tag *tag_old, *tag_new; + if (!mem_alloc_profiling_enabled()) + return; + tag_old = pgalloc_tag_get(&old->page); if (!tag_old) return; -- cgit v1.2.3 From b071cc35469ea44392222fe8de69b431a0778a5f Mon Sep 17 00:00:00 2001 From: Karan Sanghavi Date: Sat, 11 Jan 2025 15:31:30 +0000 Subject: mm: shmem: use signed int for version handling in casefold option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes an issue where the use of an unsigned data type in `shmem_parse_opt_casefold()` caused incorrect evaluation of negative conditions. Link: https://lkml.kernel.org/r/20250111-unsignedcompare1601569-v3-1-c861b4221831@gmail.com Fixes: 58e55efd6c72 ("tmpfs: Add casefold lookup support") Reviewed-by: André Almeida Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Karan Sanghavi Cc: Christian Brauner Cc: Hugh Dickens Cc: Shuah khan Signed-off-by: Andrew Morton --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index ac58d4fb2e6f..fdb5afa1cfe9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4368,7 +4368,7 @@ static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter * bool latest_version) { struct shmem_options *ctx = fc->fs_private; - unsigned int version = UTF8_LATEST; + int version = UTF8_LATEST; struct unicode_map *encoding; char *version_str = param->string + 5; -- cgit v1.2.3 From f1897f2f08b28ae59476d8b73374b08f856973af Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Sat, 11 Jan 2025 11:45:11 +0800 Subject: mm: khugepaged: fix call hpage_collapse_scan_file() for anonymous vma syzkaller reported such a BUG_ON(): ------------[ cut here ]------------ kernel BUG at mm/khugepaged.c:1835! Internal error: Oops - BUG: 00000000f2000800 [#1] SMP ... CPU: 6 UID: 0 PID: 8009 Comm: syz.15.106 Kdump: loaded Tainted: G W 6.13.0-rc6 #22 Tainted: [W]=WARN Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 pstate: 00400005 (nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : collapse_file+0xa44/0x1400 lr : collapse_file+0x88/0x1400 sp : ffff80008afe3a60 ... Call trace: collapse_file+0xa44/0x1400 (P) hpage_collapse_scan_file+0x278/0x400 madvise_collapse+0x1bc/0x678 madvise_vma_behavior+0x32c/0x448 madvise_walk_vmas.constprop.0+0xbc/0x140 do_madvise.part.0+0xdc/0x2c8 __arm64_sys_madvise+0x68/0x88 invoke_syscall+0x50/0x120 el0_svc_common.constprop.0+0xc8/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x34/0x128 el0t_64_sync_handler+0xc8/0xd0 el0t_64_sync+0x190/0x198 This indicates that the pgoff is unaligned. After analysis, I confirm the vma is mapped to /dev/zero. Such a vma certainly has vm_file, but it is set to anonymous by mmap_zero(). So even if it's mmapped by 2m-unaligned, it can pass the check in thp_vma_allowable_order() as it is an anonymous-mmap, but then be collapsed as a file-mmap. It seems the problem has existed for a long time, but actually, since we have khugepaged_max_ptes_none check before, we will skip collapse it as it is /dev/zero and so has no present page. But commit d8ea7cc8547c limit the check for only khugepaged, so the BUG_ON() can be triggered by madvise_collapse(). Add vma_is_anonymous() check to make such vma be processed by hpage_collapse_scan_pmd(). Link: https://lkml.kernel.org/r/20250111034511.2223353-1-liushixin2@huawei.com Fixes: d8ea7cc8547c ("mm/khugepaged: add flag to predicate khugepaged-only behavior") Signed-off-by: Liu Shixin Reviewed-by: Yang Shi Acked-by: David Hildenbrand Cc: Chengming Zhou Cc: Johannes Weiner Cc: Kefeng Wang Cc: Mattew Wilcox Cc: Muchun Song Cc: Nanyong Sun Cc: Qi Zheng Signed-off-by: Andrew Morton --- mm/khugepaged.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 653dbb1ff05c..bad1e130eda8 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2422,7 +2422,7 @@ skip: VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); - if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { + if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); @@ -2768,7 +2768,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, mmap_assert_locked(mm); memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); - if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { + if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, addr); -- cgit v1.2.3 From 779b9955f64327c339a16f68055af98252fd3315 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Mon, 13 Jan 2025 21:44:58 +0000 Subject: mm: zswap: move allocations during CPU init outside the lock In zswap_cpu_comp_prepare(), allocations are made and assigned to various members of acomp_ctx under acomp_ctx->mutex. However, allocations may recurse into zswap through reclaim, trying to acquire the same mutex and deadlocking. Move the allocations before the mutex critical section. Only the initialization of acomp_ctx needs to be done with the mutex held. Link: https://lkml.kernel.org/r/20250113214458.2123410-1-yosryahmed@google.com Fixes: 12dcb0ef5406 ("mm: zswap: properly synchronize freeing resources during CPU hotunplug") Signed-off-by: Yosry Ahmed Reviewed-by: Chengming Zhou Cc: Johannes Weiner Cc: Nhat Pham Cc: Signed-off-by: Andrew Morton --- mm/zswap.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 30f5a27a6862..b84c20d889b1 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -820,15 +820,15 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); - struct crypto_acomp *acomp; - struct acomp_req *req; + struct crypto_acomp *acomp = NULL; + struct acomp_req *req = NULL; + u8 *buffer = NULL; int ret; - mutex_lock(&acomp_ctx->mutex); - acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!acomp_ctx->buffer) { + buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!buffer) { ret = -ENOMEM; - goto buffer_fail; + goto fail; } acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); @@ -836,21 +836,25 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) pr_err("could not alloc crypto acomp %s : %ld\n", pool->tfm_name, PTR_ERR(acomp)); ret = PTR_ERR(acomp); - goto acomp_fail; + goto fail; } - acomp_ctx->acomp = acomp; - acomp_ctx->is_sleepable = acomp_is_async(acomp); - req = acomp_request_alloc(acomp_ctx->acomp); + req = acomp_request_alloc(acomp); if (!req) { pr_err("could not alloc crypto acomp_request %s\n", pool->tfm_name); ret = -ENOMEM; - goto req_fail; + goto fail; } - acomp_ctx->req = req; + /* + * Only hold the mutex after completing allocations, otherwise we may + * recurse into zswap through reclaim and attempt to hold the mutex + * again resulting in a deadlock. + */ + mutex_lock(&acomp_ctx->mutex); crypto_init_wait(&acomp_ctx->wait); + /* * if the backend of acomp is async zip, crypto_req_done() will wakeup * crypto_wait_req(); if the backend of acomp is scomp, the callback @@ -859,15 +863,17 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &acomp_ctx->wait); + acomp_ctx->buffer = buffer; + acomp_ctx->acomp = acomp; + acomp_ctx->is_sleepable = acomp_is_async(acomp); + acomp_ctx->req = req; mutex_unlock(&acomp_ctx->mutex); return 0; -req_fail: - crypto_free_acomp(acomp_ctx->acomp); -acomp_fail: - kfree(acomp_ctx->buffer); -buffer_fail: - mutex_unlock(&acomp_ctx->mutex); +fail: + if (acomp) + crypto_free_acomp(acomp); + kfree(buffer); return ret; } -- cgit v1.2.3 From 3e1a9371e40ee92fbdf22752538e95497bf9152d Mon Sep 17 00:00:00 2001 From: Ethan Carter Edwards Date: Mon, 13 Jan 2025 11:22:22 -0500 Subject: mailmap: update entry for Ethan Carter Edwards Map old gmail + name to my current full name and email. Link: https://lkml.kernel.org/r/xbfkmvmp4wyxrvlan57bjnul5icrwfyt67vnhhw2cyr5rzbnee@mfvihhd6s7l5 Signed-off-by: Ethan Carter Edwards Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index f5f97f947020..17f491cebc8c 100644 --- a/.mailmap +++ b/.mailmap @@ -200,6 +200,7 @@ Elliot Berman Enric Balletbo i Serra Enric Balletbo i Serra Erik Kaneda +Ethan Carter Edwards Ethan Edwards Eugen Hristev Eugen Hristev Evgeniy Polyakov -- cgit v1.2.3 From b0fce54b8c0d8e5f2b4c243c803c5996e73baee8 Mon Sep 17 00:00:00 2001 From: Su Yue Date: Mon, 6 Jan 2025 22:06:40 +0800 Subject: ocfs2: check dir i_size in ocfs2_find_entry syz reports an out of bounds read: ================================================================== BUG: KASAN: slab-out-of-bounds in ocfs2_match fs/ocfs2/dir.c:334 [inline] BUG: KASAN: slab-out-of-bounds in ocfs2_search_dirblock+0x283/0x6e0 fs/ocfs2/dir.c:367 Read of size 1 at addr ffff88804d8b9982 by task syz-executor.2/14802 CPU: 0 UID: 0 PID: 14802 Comm: syz-executor.2 Not tainted 6.13.0-rc4 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Sched_ext: serialise (enabled+all), task: runnable_at=-10ms Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x229/0x350 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x164/0x530 mm/kasan/report.c:489 kasan_report+0x147/0x180 mm/kasan/report.c:602 ocfs2_match fs/ocfs2/dir.c:334 [inline] ocfs2_search_dirblock+0x283/0x6e0 fs/ocfs2/dir.c:367 ocfs2_find_entry_id fs/ocfs2/dir.c:414 [inline] ocfs2_find_entry+0x1143/0x2db0 fs/ocfs2/dir.c:1078 ocfs2_find_files_on_disk+0x18e/0x530 fs/ocfs2/dir.c:1981 ocfs2_lookup_ino_from_name+0xb6/0x110 fs/ocfs2/dir.c:2003 ocfs2_lookup+0x30a/0xd40 fs/ocfs2/namei.c:122 lookup_open fs/namei.c:3627 [inline] open_last_lookups fs/namei.c:3748 [inline] path_openat+0x145a/0x3870 fs/namei.c:3984 do_filp_open+0xe9/0x1c0 fs/namei.c:4014 do_sys_openat2+0x135/0x1d0 fs/open.c:1402 do_sys_open fs/open.c:1417 [inline] __do_sys_openat fs/open.c:1433 [inline] __se_sys_openat fs/open.c:1428 [inline] __x64_sys_openat+0x15d/0x1c0 fs/open.c:1428 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf6/0x210 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f01076903ad Code: c3 e8 a7 2b 00 00 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f01084acfc8 EFLAGS: 00000246 ORIG_RAX: 0000000000000101 RAX: ffffffffffffffda RBX: 00007f01077cbf80 RCX: 00007f01076903ad RDX: 0000000000105042 RSI: 0000000020000080 RDI: ffffffffffffff9c RBP: 00007f01077cbf80 R08: 0000000000000000 R09: 0000000000000000 R10: 00000000000001ff R11: 0000000000000246 R12: 0000000000000000 R13: 00007f01077cbf80 R14: 00007f010764fc90 R15: 00007f010848d000 ================================================================== And a general protection fault in ocfs2_prepare_dir_for_insert: ================================================================== loop0: detected capacity change from 0 to 32768 JBD2: Ignoring recovery information on journal ocfs2: Mounting device (7,0) on (node local, slot 0) with ordered data mode. Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 0 UID: 0 PID: 5096 Comm: syz-executor792 Not tainted 6.11.0-rc4-syzkaller-00002-gb0da640826ba #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:ocfs2_find_dir_space_id fs/ocfs2/dir.c:3406 [inline] RIP: 0010:ocfs2_prepare_dir_for_insert+0x3309/0x5c70 fs/ocfs2/dir.c:4280 Code: 00 00 e8 2a 25 13 fe e9 ba 06 00 00 e8 20 25 13 fe e9 4f 01 00 00 e8 16 25 13 fe 49 8d 7f 08 49 8d 5f 09 48 89 f8 48 c1 e8 03 <42> 0f b6 04 20 84 c0 0f 85 bd 23 00 00 48 89 d8 48 c1 e8 03 42 0f RSP: 0018:ffffc9000af9f020 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000009 RCX: ffff88801e27a440 RDX: 0000000000000000 RSI: 0000000000000400 RDI: 0000000000000008 RBP: ffffc9000af9f830 R08: ffffffff8380395b R09: ffffffff838090a7 R10: 0000000000000002 R11: ffff88801e27a440 R12: dffffc0000000000 R13: ffff88803c660878 R14: f700000000000088 R15: 0000000000000000 FS: 000055555a677380(0000) GS:ffff888020800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000560bce569178 CR3: 000000001de5a000 CR4: 0000000000350ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ocfs2_mknod+0xcaf/0x2b40 fs/ocfs2/namei.c:292 vfs_mknod+0x36d/0x3b0 fs/namei.c:4088 do_mknodat+0x3ec/0x5b0 __do_sys_mknodat fs/namei.c:4166 [inline] __se_sys_mknodat fs/namei.c:4163 [inline] __x64_sys_mknodat+0xa7/0xc0 fs/namei.c:4163 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f2dafda3a99 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 f1 17 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffe336a6658 EFLAGS: 00000246 ORIG_RAX: 0000000000000103 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f2dafda3a99 RDX: 00000000000021c0 RSI: 0000000020000040 RDI: 00000000ffffff9c RBP: 00007f2dafe1b5f0 R08: 0000000000004480 R09: 000055555a6784c0 R10: 0000000000000103 R11: 0000000000000246 R12: 00007ffe336a6680 R13: 00007ffe336a68a8 R14: 431bde82d7b634db R15: 00007f2dafdec03b ================================================================== The two reports are all caused invalid negative i_size of dir inode. For ocfs2, dir_inode can't be negative or zero. Here add a check in which is called by ocfs2_check_dir_for_entry(). It fixes the second report as ocfs2_check_dir_for_entry() must be called before ocfs2_prepare_dir_for_insert(). Also set a up limit for dir with OCFS2_INLINE_DATA_FL. The i_size can't be great than blocksize. Link: https://lkml.kernel.org/r/20250106140640.92260-1-glass.su@suse.com Reported-by: Jiacheng Xu Link: https://lore.kernel.org/ocfs2-devel/17a04f01.1ae74.19436d003fc.Coremail.stitch@zju.edu.cn/T/#u Reported-by: syzbot+5a64828fcc4c2ad9b04f@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/0000000000005894f3062018caf1@google.com/T/ Signed-off-by: Su Yue Reviewed-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/dir.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 213206ebdd58..7799f4d16ce9 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1065,26 +1065,39 @@ int ocfs2_find_entry(const char *name, int namelen, { struct buffer_head *bh; struct ocfs2_dir_entry *res_dir = NULL; + int ret = 0; if (ocfs2_dir_indexed(dir)) return ocfs2_find_entry_dx(name, namelen, dir, lookup); + if (unlikely(i_size_read(dir) <= 0)) { + ret = -EFSCORRUPTED; + mlog_errno(ret); + goto out; + } /* * The unindexed dir code only uses part of the lookup * structure, so there's no reason to push it down further * than this. */ - if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (unlikely(i_size_read(dir) > dir->i_sb->s_blocksize)) { + ret = -EFSCORRUPTED; + mlog_errno(ret); + goto out; + } bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir); - else + } else { bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir); + } if (bh == NULL) return -ENOENT; lookup->dl_leaf_bh = bh; lookup->dl_entry = res_dir; - return 0; +out: + return ret; } /* @@ -2010,6 +2023,7 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, * * Return 0 if the name does not exist * Return -EEXIST if the directory contains the name + * Return -EFSCORRUPTED if found corruption * * Callers should have i_rwsem + a cluster lock on dir */ @@ -2023,9 +2037,12 @@ int ocfs2_check_dir_for_entry(struct inode *dir, trace_ocfs2_check_dir_for_entry( (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); - if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) { + ret = ocfs2_find_entry(name, namelen, dir, &lookup); + if (ret == 0) { ret = -EEXIST; mlog_errno(ret); + } else if (ret == -ENOENT) { + ret = 0; } ocfs2_free_dir_lookup_result(&lookup); -- cgit v1.2.3