summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c4
-rw-r--r--mm/huge_memory.c12
-rw-r--r--mm/hugetlb.c14
-rw-r--r--mm/kmemleak.c2
-rw-r--r--mm/mempolicy.c3
-rw-r--r--mm/mremap.c32
-rw-r--r--mm/page-writeback.c10
-rw-r--r--mm/vmscan.c3
-rw-r--r--mm/vmstat.c15
-rw-r--r--mm/zswap.c59
10 files changed, 123 insertions, 31 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 33b60d448fca..4f476411a9a2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1523,7 +1523,7 @@ void folio_end_read(struct folio *folio, bool success)
/* Must be in bottom byte for x86 to work */
BUILD_BUG_ON(PG_uptodate > 7);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);
+ VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio);
if (likely(success))
mask |= 1 << PG_uptodate;
@@ -2996,7 +2996,7 @@ static inline loff_t folio_seek_hole_data(struct xa_state *xas,
if (ops->is_partially_uptodate(folio, offset, bsz) ==
seek_data)
break;
- start = (start + bsz) & ~(bsz - 1);
+ start = (start + bsz) & ~((u64)bsz - 1);
offset += bsz;
} while (offset < folio_size(folio));
unlock:
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e53d83b3e5cf..db64116a4f84 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2206,6 +2206,16 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd)
return pmd;
}
+static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
+{
+ if (pmd_present(pmd))
+ pmd = pmd_clear_uffd_wp(pmd);
+ else if (is_swap_pmd(pmd))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
+
+ return pmd;
+}
+
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
@@ -2244,6 +2254,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
}
pmd = move_soft_dirty_pmd(pmd);
+ if (vma_has_uffd_without_event_remap(vma))
+ pmd = clear_uffd_wp_pmd(pmd);
set_pmd_at(mm, new_addr, new_pmd, pmd);
if (force_flush)
flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c498874a7170..eaaec19caa7c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5402,6 +5402,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
unsigned long sz)
{
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct hstate *h = hstate_vma(vma);
struct mm_struct *mm = vma->vm_mm;
spinlock_t *src_ptl, *dst_ptl;
@@ -5418,7 +5419,18 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
- set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ huge_pte_clear(mm, new_addr, dst_pte, sz);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = huge_pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+ }
if (src_ptl != dst_ptl)
spin_unlock(src_ptl);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 737af23f4f4e..820ba3b5cbfc 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1093,7 +1093,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
pr_debug("%s(0x%px, %zu)\n", __func__, ptr, size);
if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr))
- create_object_percpu((__force unsigned long)ptr, size, 0, gfp);
+ create_object_percpu((__force unsigned long)ptr, size, 1, gfp);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 04f35659717a..162407fbf2bc 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2268,7 +2268,8 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
page = __alloc_pages_noprof(gfp, order, nid, nodemask);
- if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) {
+ if (unlikely(pol->mode == MPOL_INTERLEAVE ||
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
if (static_branch_likely(&vm_numa_stat_key) &&
page_to_nid(page) == nid) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 60473413836b..cff7f552f909 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -138,6 +138,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
unsigned long new_addr, bool need_rmap_locks)
{
+ bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
pmd_t dummy_pmdval;
@@ -216,7 +217,18 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
force_flush = true;
pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
- set_pte_at(mm, new_addr, new_pte, pte);
+
+ if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ pte_clear(mm, new_addr, new_pte);
+ else {
+ if (need_clear_uffd_wp) {
+ if (pte_present(pte))
+ pte = pte_clear_uffd_wp(pte);
+ else if (is_swap_pte(pte))
+ pte = pte_swp_clear_uffd_wp(pte);
+ }
+ set_pte_at(mm, new_addr, new_pte, pte);
+ }
}
arch_leave_lazy_mmu_mode();
@@ -278,6 +290,15 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
return false;
+ /* If this pmd belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
@@ -333,6 +354,15 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
if (WARN_ON_ONCE(!pud_none(*new_pud)))
return false;
+ /* If this pud belongs to a uffd vma with remap events disabled, we need
+ * to ensure that the uffd-wp state is cleared from all pgtables. This
+ * means recursing into lower page tables in move_page_tables(), and we
+ * can reuse the existing code if we simply treat the entry as "not
+ * moved".
+ */
+ if (vma_has_uffd_without_event_remap(vma))
+ return false;
+
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_lock prevents deadlock.
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d213ead95675..d9861e42b2bd 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -692,6 +692,8 @@ static unsigned long bdi_ratio_from_pages(unsigned long pages)
unsigned long ratio;
global_dirty_limits(&background_thresh, &dirty_thresh);
+ if (!dirty_thresh)
+ return -EINVAL;
ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh);
return ratio;
@@ -790,13 +792,15 @@ int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes)
{
int ret;
unsigned long pages = min_bytes >> PAGE_SHIFT;
- unsigned long min_ratio;
+ long min_ratio;
ret = bdi_check_pages_limit(pages);
if (ret)
return ret;
min_ratio = bdi_ratio_from_pages(pages);
+ if (min_ratio < 0)
+ return min_ratio;
return __bdi_set_min_ratio(bdi, min_ratio);
}
@@ -809,13 +813,15 @@ int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes)
{
int ret;
unsigned long pages = max_bytes >> PAGE_SHIFT;
- unsigned long max_ratio;
+ long max_ratio;
ret = bdi_check_pages_limit(pages);
if (ret)
return ret;
max_ratio = bdi_ratio_from_pages(pages);
+ if (max_ratio < 0)
+ return max_ratio;
return __bdi_set_max_ratio(bdi, max_ratio);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9a859b7d18d7..b1ec5ece067e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4642,6 +4642,9 @@ retry:
reset_batch_size(walk);
}
+ __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+ stat.nr_demoted);
+
item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4d016314a56c..16bfe1c694dd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -2122,10 +2122,20 @@ static void __init start_shepherd_timer(void)
{
int cpu;
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
+ /*
+ * For secondary CPUs during CPU hotplug scenarios,
+ * vmstat_cpu_online() will enable the work.
+ * mm/vmstat:online enables and disables vmstat_work
+ * symmetrically during CPU hotplug events.
+ */
+ if (!cpu_online(cpu))
+ disable_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ }
+
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
}
@@ -2148,13 +2158,14 @@ static int vmstat_cpu_online(unsigned int cpu)
if (!node_state(cpu_to_node(cpu), N_CPU)) {
node_set_state(cpu_to_node(cpu), N_CPU);
}
+ enable_delayed_work(&per_cpu(vmstat_work, cpu));
return 0;
}
static int vmstat_cpu_down_prep(unsigned int cpu)
{
- cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+ disable_delayed_work_sync(&per_cpu(vmstat_work, cpu));
return 0;
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 5a27af8d86ea..30f5a27a6862 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -251,7 +251,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
struct zswap_pool *pool;
char name[38]; /* 'zswap' + 32 char (max) num + \0 */
gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
- int ret;
+ int ret, cpu;
if (!zswap_has_pool) {
/* if either are unset, pool initialization failed, and we
@@ -285,6 +285,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
goto error;
}
+ for_each_possible_cpu(cpu)
+ mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex);
+
ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
&pool->node);
if (ret)
@@ -821,11 +824,12 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
struct acomp_req *req;
int ret;
- mutex_init(&acomp_ctx->mutex);
-
+ mutex_lock(&acomp_ctx->mutex);
acomp_ctx->buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
- if (!acomp_ctx->buffer)
- return -ENOMEM;
+ if (!acomp_ctx->buffer) {
+ ret = -ENOMEM;
+ goto buffer_fail;
+ }
acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
if (IS_ERR(acomp)) {
@@ -855,12 +859,15 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
+ mutex_unlock(&acomp_ctx->mutex);
return 0;
req_fail:
crypto_free_acomp(acomp_ctx->acomp);
acomp_fail:
kfree(acomp_ctx->buffer);
+buffer_fail:
+ mutex_unlock(&acomp_ctx->mutex);
return ret;
}
@@ -869,27 +876,43 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ mutex_lock(&acomp_ctx->mutex);
if (!IS_ERR_OR_NULL(acomp_ctx)) {
if (!IS_ERR_OR_NULL(acomp_ctx->req))
acomp_request_free(acomp_ctx->req);
+ acomp_ctx->req = NULL;
if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
crypto_free_acomp(acomp_ctx->acomp);
kfree(acomp_ctx->buffer);
}
+ mutex_unlock(&acomp_ctx->mutex);
return 0;
}
-/* Prevent CPU hotplug from freeing up the per-CPU acomp_ctx resources */
-static struct crypto_acomp_ctx *acomp_ctx_get_cpu(struct crypto_acomp_ctx __percpu *acomp_ctx)
+static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool)
{
- cpus_read_lock();
- return raw_cpu_ptr(acomp_ctx);
+ struct crypto_acomp_ctx *acomp_ctx;
+
+ for (;;) {
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+ mutex_lock(&acomp_ctx->mutex);
+ if (likely(acomp_ctx->req))
+ return acomp_ctx;
+ /*
+ * It is possible that we were migrated to a different CPU after
+ * getting the per-CPU ctx but before the mutex was acquired. If
+ * the old CPU got offlined, zswap_cpu_comp_dead() could have
+ * already freed ctx->req (among other things) and set it to
+ * NULL. Just try again on the new CPU that we ended up on.
+ */
+ mutex_unlock(&acomp_ctx->mutex);
+ }
}
-static void acomp_ctx_put_cpu(void)
+static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
{
- cpus_read_unlock();
+ mutex_unlock(&acomp_ctx->mutex);
}
static bool zswap_compress(struct page *page, struct zswap_entry *entry,
@@ -905,9 +928,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
gfp_t gfp;
u8 *dst;
- acomp_ctx = acomp_ctx_get_cpu(pool->acomp_ctx);
- mutex_lock(&acomp_ctx->mutex);
-
+ acomp_ctx = acomp_ctx_get_cpu_lock(pool);
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
@@ -960,8 +981,7 @@ unlock:
else if (alloc_ret)
zswap_reject_alloc_fail++;
- mutex_unlock(&acomp_ctx->mutex);
- acomp_ctx_put_cpu();
+ acomp_ctx_put_unlock(acomp_ctx);
return comp_ret == 0 && alloc_ret == 0;
}
@@ -972,9 +992,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
struct crypto_acomp_ctx *acomp_ctx;
u8 *src;
- acomp_ctx = acomp_ctx_get_cpu(entry->pool->acomp_ctx);
- mutex_lock(&acomp_ctx->mutex);
-
+ acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool);
src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
/*
* If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer
@@ -998,11 +1016,10 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
- mutex_unlock(&acomp_ctx->mutex);
if (src != acomp_ctx->buffer)
zpool_unmap_handle(zpool, entry->handle);
- acomp_ctx_put_cpu();
+ acomp_ctx_put_unlock(acomp_ctx);
}
/*********************************