diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2024-01-02 19:16:29 +0100 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2024-01-02 19:16:29 +0100 |
commit | 136292522e43da46bee4c0fef80b2602f79525a2 (patch) | |
tree | 47c892c46e01fa4a3ef014f3737ecee3776969ee /mm | |
parent | KVM: clean up directives to compile out irqfds (diff) | |
parent | LoongArch: KVM: Add LASX (256bit SIMD) support (diff) | |
download | linux-136292522e43da46bee4c0fef80b2602f79525a2.tar.xz linux-136292522e43da46bee4c0fef80b2602f79525a2.zip |
Merge tag 'loongarch-kvm-6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson into HEAD
LoongArch KVM changes for v6.8
1. Optimization for memslot hugepage checking.
2. Cleanup and fix some HW/SW timer issues.
3. Add LSX/LASX (128bit/256bit SIMD) support.
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 16 | ||||
-rw-r--r-- | mm/damon/core.c | 9 | ||||
-rw-r--r-- | mm/damon/sysfs-schemes.c | 54 | ||||
-rw-r--r-- | mm/damon/sysfs.c | 6 | ||||
-rw-r--r-- | mm/filemap.c | 4 | ||||
-rw-r--r-- | mm/huge_memory.c | 16 | ||||
-rw-r--r-- | mm/hugetlb.c | 7 | ||||
-rw-r--r-- | mm/kmemleak.c | 40 | ||||
-rw-r--r-- | mm/ksm.c | 2 | ||||
-rw-r--r-- | mm/madvise.c | 11 | ||||
-rw-r--r-- | mm/memcontrol.c | 5 | ||||
-rw-r--r-- | mm/memory.c | 1 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 15 | ||||
-rw-r--r-- | mm/page-writeback.c | 2 | ||||
-rw-r--r-- | mm/shmem.c | 19 | ||||
-rw-r--r-- | mm/userfaultfd.c | 2 | ||||
-rw-r--r-- | mm/util.c | 10 | ||||
-rw-r--r-- | mm/vmscan.c | 92 | ||||
-rw-r--r-- | mm/workingset.c | 6 |
19 files changed, 226 insertions, 91 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 89971a894b60..57cd378c73d6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1201,13 +1201,6 @@ config ANON_VMA_NAME area from being merged with adjacent virtual memory areas due to the difference in their name. -config USERFAULTFD - bool "Enable userfaultfd() system call" - depends on MMU - help - Enable the userfaultfd() system call that allows to intercept and - handle page faults in userland. - config HAVE_ARCH_USERFAULTFD_WP bool help @@ -1218,6 +1211,14 @@ config HAVE_ARCH_USERFAULTFD_MINOR help Arch has userfaultfd minor fault support +menuconfig USERFAULTFD + bool "Enable userfaultfd() system call" + depends on MMU + help + Enable the userfaultfd() system call that allows to intercept and + handle page faults in userland. + +if USERFAULTFD config PTE_MARKER_UFFD_WP bool "Userfaultfd write protection support for shmem/hugetlbfs" default y @@ -1227,6 +1228,7 @@ config PTE_MARKER_UFFD_WP Allows to create marker PTEs for userfaultfd write protection purposes. It is required to enable userfaultfd write protection on file-backed memory types like shmem and hugetlbfs. 
+endif # USERFAULTFD # multi-gen LRU { config LRU_GEN diff --git a/mm/damon/core.c b/mm/damon/core.c index 630077d95dc6..3a05e71509b9 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -445,6 +445,8 @@ struct damon_ctx *damon_new_ctx(void) if (!ctx) return NULL; + init_completion(&ctx->kdamond_started); + ctx->attrs.sample_interval = 5 * 1000; ctx->attrs.aggr_interval = 100 * 1000; ctx->attrs.ops_update_interval = 60 * 1000 * 1000; @@ -668,11 +670,14 @@ static int __damon_start(struct damon_ctx *ctx) mutex_lock(&ctx->kdamond_lock); if (!ctx->kdamond) { err = 0; + reinit_completion(&ctx->kdamond_started); ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d", nr_running_ctxs); if (IS_ERR(ctx->kdamond)) { err = PTR_ERR(ctx->kdamond); ctx->kdamond = NULL; + } else { + wait_for_completion(&ctx->kdamond_started); } } mutex_unlock(&ctx->kdamond_lock); @@ -924,7 +929,7 @@ static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, matched = true; break; default: - break; + return false; } return matched == filter->matching; @@ -1225,6 +1230,7 @@ static void damon_split_region_at(struct damon_target *t, new->age = r->age; new->last_nr_accesses = r->last_nr_accesses; new->nr_accesses_bp = r->nr_accesses_bp; + new->nr_accesses = r->nr_accesses; damon_insert_region(new, r, damon_next_region(r), t); } @@ -1432,6 +1438,7 @@ static int kdamond_fn(void *data) pr_debug("kdamond (%d) starts\n", current->pid); + complete(&ctx->kdamond_started); kdamond_init_intervals_sis(ctx); if (ctx->ops.init) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 45bd0fd4a8b1..fe0fe2562000 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -139,6 +139,13 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = { * damon_sysfs_before_damos_apply() understands the situation by showing the * 'finished' status and do nothing. 
* + * If DAMOS is not applied to any region due to any reasons including the + * access pattern, the watermarks, the quotas, and the filters, + * ->before_damos_apply() will not be called back. Until the situation is + * changed, the update will not be finished. To avoid this, + * damon_sysfs_after_sampling() set the status as 'finished' if more than two + * apply intervals of the scheme is passed while the state is 'idle'. + * * Finally, the tried regions request handling finisher function * (damon_sysfs_schemes_update_regions_stop()) unregisters the callbacks. */ @@ -154,6 +161,7 @@ struct damon_sysfs_scheme_regions { int nr_regions; unsigned long total_bytes; enum damos_sysfs_regions_upd_status upd_status; + unsigned long upd_timeout_jiffies; }; static struct damon_sysfs_scheme_regions * @@ -162,6 +170,9 @@ damon_sysfs_scheme_regions_alloc(void) struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions), GFP_KERNEL); + if (!regions) + return NULL; + regions->kobj = (struct kobject){}; INIT_LIST_HEAD(&regions->regions_list); regions->nr_regions = 0; @@ -1823,6 +1834,8 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; region = damon_sysfs_scheme_region_alloc(r); + if (!region) + return 0; list_add_tail(&region->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; if (kobject_init_and_add(&region->kobj, @@ -1849,7 +1862,9 @@ static int damon_sysfs_after_sampling(struct damon_ctx *ctx) for (i = 0; i < sysfs_schemes->nr; i++) { sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; if (sysfs_regions->upd_status == - DAMOS_TRIED_REGIONS_UPD_STARTED) + DAMOS_TRIED_REGIONS_UPD_STARTED || + time_after(jiffies, + sysfs_regions->upd_timeout_jiffies)) sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_FINISHED; } @@ -1880,14 +1895,41 @@ int damon_sysfs_schemes_clear_regions( return 0; } +static struct damos *damos_sysfs_nth_scheme(int n, struct damon_ctx *ctx) +{ + struct damos *scheme; + int i = 0; + + 
damon_for_each_scheme(scheme, ctx) { + if (i == n) + return scheme; + i++; + } + return NULL; +} + static void damos_tried_regions_init_upd_status( - struct damon_sysfs_schemes *sysfs_schemes) + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) { int i; + struct damos *scheme; + struct damon_sysfs_scheme_regions *sysfs_regions; - for (i = 0; i < sysfs_schemes->nr; i++) - sysfs_schemes->schemes_arr[i]->tried_regions->upd_status = - DAMOS_TRIED_REGIONS_UPD_IDLE; + for (i = 0; i < sysfs_schemes->nr; i++) { + sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; + scheme = damos_sysfs_nth_scheme(i, ctx); + if (!scheme) { + sysfs_regions->upd_status = + DAMOS_TRIED_REGIONS_UPD_FINISHED; + continue; + } + sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE; + sysfs_regions->upd_timeout_jiffies = jiffies + + 2 * usecs_to_jiffies(scheme->apply_interval_us ? + scheme->apply_interval_us : + ctx->attrs.sample_interval); + } } /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ @@ -1897,7 +1939,7 @@ int damon_sysfs_schemes_update_regions_start( { damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); damon_sysfs_schemes_for_damos_callback = sysfs_schemes; - damos_tried_regions_init_upd_status(sysfs_schemes); + damos_tried_regions_init_upd_status(sysfs_schemes, ctx); damos_regions_upd_total_bytes_only = total_bytes_only; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; ctx->callback.after_sampling = damon_sysfs_after_sampling; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index e27846708b5a..7472404456aa 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1172,7 +1172,7 @@ static int damon_sysfs_update_target(struct damon_target *target, struct damon_ctx *ctx, struct damon_sysfs_target *sys_target) { - int err; + int err = 0; if (damon_target_has_pid(ctx)) { err = damon_sysfs_update_target_pid(target, sys_target->pid); @@ -1203,8 +1203,10 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, 
damon_for_each_target_safe(t, next, ctx) { if (i < sysfs_targets->nr) { - damon_sysfs_update_target(t, ctx, + err = damon_sysfs_update_target(t, ctx, sysfs_targets->targets_arr[i]); + if (err) + return err; } else { if (damon_target_has_pid(ctx)) put_pid(t->pid); diff --git a/mm/filemap.c b/mm/filemap.c index 9710f43a89ac..f1c8c278310f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3371,7 +3371,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, } } - if (pmd_none(*vmf->pmd)) + if (pmd_none(*vmf->pmd) && vmf->prealloc_pte) pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); return false; @@ -3443,7 +3443,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ - if (!pte_none(vmf->pte[count])) + if (!pte_none(ptep_get(&vmf->pte[count]))) goto skip; count++; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f31f02472396..4f542444a91f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2769,13 +2769,15 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) int nr = folio_nr_pages(folio); xas_split(&xas, folio, folio_order(folio)); - if (folio_test_swapbacked(folio)) { - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, - -nr); - } else { - __lruvec_stat_mod_folio(folio, NR_FILE_THPS, - -nr); - filemap_nr_thps_dec(mapping); + if (folio_test_pmd_mappable(folio)) { + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, + NR_SHMEM_THPS, -nr); + } else { + __lruvec_stat_mod_folio(folio, + NR_FILE_THPS, -nr); + filemap_nr_thps_dec(mapping); + } } } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1169ef2f2176..6feb3e0630d1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1182,6 +1182,13 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } +bool __vma_private_lock(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & VM_MAYSHARE) && + 
get_vma_private_data(vma) & ~HPAGE_RESV_MASK && + is_vma_resv_set(vma, HPAGE_RESV_OWNER); +} + void hugetlb_dup_vma_private(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 1eacca03bedd..5501363d6b31 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -642,32 +642,16 @@ static struct kmemleak_object *__alloc_object(gfp_t gfp) if (!object) { pr_warn("Cannot allocate a kmemleak_object structure\n"); kmemleak_disable(); + return NULL; } - return object; -} - -static int __link_object(struct kmemleak_object *object, unsigned long ptr, - size_t size, int min_count, bool is_phys) -{ - - struct kmemleak_object *parent; - struct rb_node **link, *rb_parent; - unsigned long untagged_ptr; - unsigned long untagged_objp; - INIT_LIST_HEAD(&object->object_list); INIT_LIST_HEAD(&object->gray_list); INIT_HLIST_HEAD(&object->area_list); raw_spin_lock_init(&object->lock); atomic_set(&object->use_count, 1); - object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0); - object->pointer = ptr; - object->size = kfence_ksize((void *)ptr) ?: size; object->excess_ref = 0; - object->min_count = min_count; object->count = 0; /* white color initially */ - object->jiffies = jiffies; object->checksum = 0; object->del_state = 0; @@ -692,6 +676,24 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr, /* kernel backtrace */ object->trace_handle = set_track_prepare(); + return object; +} + +static int __link_object(struct kmemleak_object *object, unsigned long ptr, + size_t size, int min_count, bool is_phys) +{ + + struct kmemleak_object *parent; + struct rb_node **link, *rb_parent; + unsigned long untagged_ptr; + unsigned long untagged_objp; + + object->flags = OBJECT_ALLOCATED | (is_phys ? 
OBJECT_PHYS : 0); + object->pointer = ptr; + object->size = kfence_ksize((void *)ptr) ?: size; + object->min_count = min_count; + object->jiffies = jiffies; + untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); /* * Only update min_addr and max_addr with object @@ -1150,6 +1152,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free_percpu); void __ref kmemleak_update_trace(const void *ptr) { struct kmemleak_object *object; + depot_stack_handle_t trace_handle; unsigned long flags; pr_debug("%s(0x%px)\n", __func__, ptr); @@ -1166,8 +1169,9 @@ void __ref kmemleak_update_trace(const void *ptr) return; } + trace_handle = set_track_prepare(); raw_spin_lock_irqsave(&object->lock, flags); - object->trace_handle = set_track_prepare(); + object->trace_handle = trace_handle; raw_spin_unlock_irqrestore(&object->lock, flags); put_object(object); diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c @@ -468,7 +468,7 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex page = pfn_swap_entry_to_page(entry); } /* return 1 if the page is an normal ksm page or KSM-placed zero page */ - ret = (page && PageKsm(page)) || is_ksm_zero_pte(*pte); + ret = (page && PageKsm(page)) || is_ksm_zero_pte(ptent); pte_unmap_unlock(pte, ptl); return ret; } diff --git a/mm/madvise.c b/mm/madvise.c index cf4d694280e9..6214a1ab5654 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -335,6 +335,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, struct folio *folio = NULL; LIST_HEAD(folio_list); bool pageout_anon_only_filter; + unsigned int batch_count = 0; if (fatal_signal_pending(current)) return -EINTR; @@ -416,6 +417,7 @@ huge_unlock: regular_folio: #endif tlb_change_page_size(tlb, PAGE_SIZE); +restart: start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!start_pte) return 0; @@ -424,6 +426,15 @@ regular_folio: for (; addr < end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); + if (++batch_count == SWAP_CLUSTER_MAX) { + batch_count = 0; + if (need_resched()) { + pte_unmap_unlock(start_pte, ptl); + 
cond_resched(); + goto restart; + } + } + if (pte_none(ptent)) continue; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 774bd6e21e27..b226090fd906 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2936,7 +2936,8 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) * Moreover, it should not come from DMA buffer and is not readily * reclaimable. So those GFP bits should be masked off. */ -#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ + __GFP_ACCOUNT | __GFP_NOFAIL) /* * mod_objcg_mlstate() may be called with irq enabled, so @@ -3165,6 +3166,7 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void) return NULL; from_memcg: + objcg = NULL; for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { /* * Memcg pointer is protected by scope (see set_active_memcg()) @@ -3175,7 +3177,6 @@ from_memcg: objcg = rcu_dereference_check(memcg->objcg, 1); if (likely(objcg)) break; - objcg = NULL; } return objcg; diff --git a/mm/memory.c b/mm/memory.c index 1f18ed4a5497..5c757fba8858 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1517,6 +1517,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; } else { /* We should have covered all the swap entry types */ + pr_alert("unrecognized swap entry 0x%lx\n", entry.val); WARN_ON_ONCE(1); } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ab41a511e20a..7a5fc89a8652 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1129,6 +1129,9 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); } +/* + * Must be called with mem_hotplug_lock in write mode. 
+ */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { @@ -1149,7 +1152,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; - mem_hotplug_begin(); /* associate pfn range with the zone */ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); @@ -1208,7 +1210,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, writeback_set_ratelimit(); memory_notify(MEM_ONLINE, &arg); - mem_hotplug_done(); return 0; failed_addition: @@ -1217,7 +1218,6 @@ failed_addition: (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); remove_pfn_range_from_zone(zone, pfn, nr_pages); - mem_hotplug_done(); return ret; } @@ -1458,7 +1458,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) /* create memory block devices after memory was added */ ret = create_memory_block_devices(start, size, params.altmap, group); if (ret) { - arch_remove_memory(start, size, NULL); + arch_remove_memory(start, size, params.altmap); goto error_free; } @@ -1863,6 +1863,9 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, return 0; } +/* + * Must be called with mem_hotplug_lock in write mode. + */ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { @@ -1885,8 +1888,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; - mem_hotplug_begin(); - /* * Don't allow to offline memory blocks that contain holes. 
* Consequently, memory blocks with holes can never get onlined @@ -2031,7 +2032,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, memory_notify(MEM_OFFLINE, &arg); remove_pfn_range_from_zone(zone, start_pfn, nr_pages); - mem_hotplug_done(); return 0; failed_removal_isolated: @@ -2046,7 +2046,6 @@ failed_removal: (unsigned long long) start_pfn << PAGE_SHIFT, ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, reason); - mem_hotplug_done(); return ret; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 46f2f5d3d183..ee2fd6a6af40 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -3107,7 +3107,7 @@ EXPORT_SYMBOL_GPL(folio_wait_writeback_killable); */ void folio_wait_stable(struct folio *folio) { - if (folio_inode(folio)->i_sb->s_iflags & SB_I_STABLE_WRITES) + if (mapping_stable_writes(folio_mapping(folio))) folio_wait_writeback(folio); } EXPORT_SYMBOL_GPL(folio_wait_stable); diff --git a/mm/shmem.c b/mm/shmem.c index 91e2620148b2..0d1ce70bce38 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1080,7 +1080,24 @@ whole_folios: } VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); - truncate_inode_folio(mapping, folio); + + if (!folio_test_large(folio)) { + truncate_inode_folio(mapping, folio); + } else if (truncate_inode_partial_folio(folio, lstart, lend)) { + /* + * If we split a page, reset the loop so + * that we pick up the new sub pages. + * Otherwise the THP was entirely + * dropped or the target range was + * zeroed, so just continue the loop as + * is. + */ + if (!folio_test_large(folio)) { + folio_unlock(folio); + index = start; + break; + } + } } folio_unlock(folio); } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 96d9eae5c7cc..0b6ca553bebe 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -312,7 +312,7 @@ static int mfill_atomic_pte_poison(pmd_t *dst_pmd, ret = -EEXIST; /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). 
*/ - if (!pte_none(*dst_pte)) + if (!pte_none(ptep_get(dst_pte))) goto out_unlock; set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); diff --git a/mm/util.c b/mm/util.c index aa01f6ea5a75..744b4d7e3fae 100644 --- a/mm/util.c +++ b/mm/util.c @@ -414,6 +414,15 @@ static int mmap_is_legacy(struct rlimit *rlim_stack) static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { +#ifdef CONFIG_STACK_GROWSUP + /* + * For an upwards growing stack the calculation is much simpler. + * Memory for the maximum stack size is reserved at the top of the + * task. mmap_base starts directly below the stack and grows + * downwards. + */ + return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd); +#else unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_guard_gap; @@ -431,6 +440,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) gap = MAX_GAP; return PAGE_ALIGN(STACK_TOP - gap - rnd); +#endif } void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) diff --git a/mm/vmscan.c b/mm/vmscan.c index 506f8220c5fe..9dd8977de5a2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4089,6 +4089,9 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) else VM_WARN_ON_ONCE(true); + WRITE_ONCE(lruvec->lrugen.seg, seg); + WRITE_ONCE(lruvec->lrugen.gen, new); + hlist_nulls_del_rcu(&lruvec->lrugen.list); if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) @@ -4099,9 +4102,6 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) pgdat->memcg_lru.nr_memcgs[old]--; pgdat->memcg_lru.nr_memcgs[new]++; - lruvec->lrugen.gen = new; - WRITE_ONCE(lruvec->lrugen.seg, seg); - if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); @@ -4124,11 +4124,11 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg) gen = get_memcg_gen(pgdat->memcg_lru.seq); + lruvec->lrugen.gen = gen; + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, 
&pgdat->memcg_lru.fifo[gen][bin]); pgdat->memcg_lru.nr_memcgs[gen]++; - lruvec->lrugen.gen = gen; - spin_unlock_irq(&pgdat->memcg_lru.lock); } } @@ -4232,7 +4232,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* protected */ - if (tier > tier_idx) { + if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) { int hist = lru_hist_from_seq(lrugen->min_seq[type]); gen = folio_inc_gen(lruvec, folio, false); @@ -4598,7 +4598,12 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, } /* try to scrape all its memory if this memcg was deleted */ - *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; + if (!mem_cgroup_online(memcg)) { + *nr_to_scan = total; + return false; + } + + *nr_to_scan = total >> sc->priority; /* * The aging tries to be lazy to reduce the overhead, while the eviction @@ -4635,7 +4640,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool DEFINE_MAX_SEQ(lruvec); if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) - return 0; + return -1; if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan)) return nr_to_scan; @@ -4648,20 +4653,41 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; } -static unsigned long get_nr_to_reclaim(struct scan_control *sc) +static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) { + int i; + enum zone_watermarks mark; + /* don't abort memcg reclaim to ensure fairness */ if (!root_reclaim(sc)) - return -1; + return false; + + if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order))) + return true; + + /* check the order to exclude compaction-induced reclaim */ + if (!current_is_kswapd() || sc->order) + return false; + + mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ? 
+ WMARK_PROMO : WMARK_HIGH; + + for (i = 0; i <= sc->reclaim_idx; i++) { + struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; + unsigned long size = wmark_pages(zone, mark) + MIN_LRU_BATCH; + + if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0)) + return false; + } - return max(sc->nr_to_reclaim, compact_gap(sc->order)); + /* kswapd should abort if all eligible zones are safe */ + return true; } static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { long nr_to_scan; unsigned long scanned = 0; - unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); int swappiness = get_swappiness(lruvec, sc); /* clean file folios are more likely to exist */ @@ -4683,13 +4709,13 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) if (scanned >= nr_to_scan) break; - if (sc->nr_reclaimed >= nr_to_reclaim) + if (should_abort_scan(lruvec, sc)) break; cond_resched(); } - /* whether try_to_inc_max_seq() was successful */ + /* whether this lruvec should be rotated */ return nr_to_scan < 0; } @@ -4698,14 +4724,9 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) bool success; unsigned long scanned = sc->nr_scanned; unsigned long reclaimed = sc->nr_reclaimed; - int seg = lru_gen_memcg_seg(lruvec); struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); - /* see the comment on MEMCG_NR_GENS */ - if (!lruvec_is_sizable(lruvec, sc)) - return seg != MEMCG_LRU_TAIL ? 
MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; - mem_cgroup_calculate_protection(NULL, memcg); if (mem_cgroup_below_min(NULL, memcg)) @@ -4713,7 +4734,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) if (mem_cgroup_below_low(NULL, memcg)) { /* see the comment on MEMCG_NR_GENS */ - if (seg != MEMCG_LRU_TAIL) + if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_TAIL) return MEMCG_LRU_TAIL; memcg_memory_event(memcg, MEMCG_LOW); @@ -4729,7 +4750,15 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) flush_reclaim_state(sc); - return success ? MEMCG_LRU_YOUNG : 0; + if (success && mem_cgroup_online(memcg)) + return MEMCG_LRU_YOUNG; + + if (!success && lruvec_is_sizable(lruvec, sc)) + return 0; + + /* one retry if offlined or too small */ + return lru_gen_memcg_seg(lruvec) != MEMCG_LRU_TAIL ? + MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG; } #ifdef CONFIG_MEMCG @@ -4743,14 +4772,13 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) struct lruvec *lruvec; struct lru_gen_folio *lrugen; struct mem_cgroup *memcg; - const struct hlist_nulls_node *pos; - unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); + struct hlist_nulls_node *pos; + gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); restart: op = 0; memcg = NULL; - gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq)); rcu_read_lock(); @@ -4761,6 +4789,10 @@ restart: } mem_cgroup_put(memcg); + memcg = NULL; + + if (gen != READ_ONCE(lrugen->gen)) + continue; lruvec = container_of(lrugen, struct lruvec, lrugen); memcg = lruvec_memcg(lruvec); @@ -4777,7 +4809,7 @@ restart: rcu_read_lock(); - if (sc->nr_reclaimed >= nr_to_reclaim) + if (should_abort_scan(lruvec, sc)) break; } @@ -4788,7 +4820,7 @@ restart: mem_cgroup_put(memcg); - if (sc->nr_reclaimed >= nr_to_reclaim) + if (!is_a_nulls(pos)) return; /* restart if raced with lru_gen_rotate_memcg() */ @@ -4845,16 +4877,14 @@ static void set_initial_priority(struct pglist_data 
*pgdat, struct scan_control if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH) return; /* - * Determine the initial priority based on ((total / MEMCG_NR_GENS) >> - * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the - * estimated reclaimed_to_scanned_ratio = inactive / total. + * Determine the initial priority based on + * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, + * where reclaimed_to_scanned_ratio = inactive / total. */ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE); if (get_swappiness(lruvec, sc)) reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON); - reclaimable /= MEMCG_NR_GENS; - /* round down reclaimable and round up sc->nr_to_reclaim */ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1); diff --git a/mm/workingset.c b/mm/workingset.c index b192e44a0e7c..33baad203277 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -313,10 +313,10 @@ static void lru_gen_refault(struct folio *folio, void *shadow) * 1. For pages accessed through page tables, hotter pages pushed out * hot pages which refaulted immediately. * 2. For pages accessed multiple times through file descriptors, - * numbers of accesses might have been out of the range. + * they would have been protected by sort_folio(). */ - if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) { - folio_set_workingset(folio); + if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) { + set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset)); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); } unlock: |