Diffstat (limited to 'mm')
-rw-r--r--  mm/madvise.c  239
-rw-r--r--  mm/mseal.c      1
2 files changed, 240 insertions, 0 deletions
diff --git a/mm/madvise.c b/mm/madvise.c
index e871a72a6c32..0ceae57da7da 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -37,6 +37,12 @@
#include "internal.h"
#include "swap.h"
+/*
+ * Maximum number of attempts we make to install guard pages before we give up
+ * and return -ERESTARTNOINTR to have userspace try again.
+ */
+#define MAX_MADVISE_GUARD_RETRIES 3
+
struct madvise_walk_private {
struct mmu_gather *tlb;
bool pageout;
@@ -60,6 +66,8 @@ static int madvise_need_mmap_write(int behavior)
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
case MADV_COLLAPSE:
+ case MADV_GUARD_INSTALL:
+ case MADV_GUARD_REMOVE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -1017,6 +1025,214 @@ static long madvise_remove(struct vm_area_struct *vma,
return error;
}
+static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
+{
+ vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB;
+
+ /*
+ * A user could mlock() the range after installing guard markers, but
+ * that's fine, as they'd not be able to fault pages in anyway. The issue
+ * arises when we try to zap existing locked VMAs - we don't want to do that.
+ */
+ if (!allow_locked)
+ disallowed |= VM_LOCKED;
+
+ if (!vma_is_anonymous(vma))
+ return false;
+
+ if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE)
+ return false;
+
+ return true;
+}
+
+static bool is_guard_pte_marker(pte_t ptent)
+{
+ return is_pte_marker(ptent) &&
+ is_guard_swp_entry(pte_to_swp_entry(ptent));
+}
+
+static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pud_t pudval = pudp_get(pud);
+
+ /* If huge, return > 0 so we abort the operation and zap. */
+ return pud_trans_huge(pudval) || pud_devmap(pudval);
+}
+
+static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pmd_t pmdval = pmdp_get(pmd);
+
+ /* If huge, return > 0 so we abort the operation and zap. */
+ return pmd_trans_huge(pmdval) || pmd_devmap(pmdval);
+}
+
+static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t pteval = ptep_get(pte);
+ unsigned long *nr_pages = (unsigned long *)walk->private;
+
+ /* If there is already a guard page marker, we have nothing to do. */
+ if (is_guard_pte_marker(pteval)) {
+ (*nr_pages)++;
+
+ return 0;
+ }
+
+ /* If populated return >0 so we abort the operation + zap. */
+ return 1;
+}
+
+static int guard_install_set_pte(unsigned long addr, unsigned long next,
+ pte_t *ptep, struct mm_walk *walk)
+{
+ unsigned long *nr_pages = (unsigned long *)walk->private;
+
+ /* Simply install a PTE marker; this causes a segfault on access. */
+ *ptep = make_pte_marker(PTE_MARKER_GUARD);
+ (*nr_pages)++;
+
+ return 0;
+}
+
+static const struct mm_walk_ops guard_install_walk_ops = {
+ .pud_entry = guard_install_pud_entry,
+ .pmd_entry = guard_install_pmd_entry,
+ .pte_entry = guard_install_pte_entry,
+ .install_pte = guard_install_set_pte,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static long madvise_guard_install(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ long err;
+ int i;
+
+ *prev = vma;
+ if (!is_valid_guard_vma(vma, /* allow_locked = */false))
+ return -EINVAL;
+
+ /*
+ * If we install guard markers, then the range is no longer
+ * empty from a page table perspective and therefore it's
+ * appropriate to have an anon_vma.
+ *
+ * This ensures that on fork, we copy page tables correctly.
+ */
+ err = anon_vma_prepare(vma);
+ if (err)
+ return err;
+
+ /*
+ * Optimistically try to install the guard marker pages first. If any
+ * non-guard pages are encountered, give up and zap the range before
+ * trying again.
+ *
+ * We try a few times before giving up and returning to userland to loop
+ * around, releasing locks in the process to avoid contention. This would
+ * only happen if there were a great many racing page faults.
+ *
+ * In most cases we should simply install the guard markers immediately
+ * with no zap or looping.
+ */
+ for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) {
+ unsigned long nr_pages = 0;
+
+ /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
+ err = walk_page_range_mm(vma->vm_mm, start, end,
+ &guard_install_walk_ops, &nr_pages);
+ if (err < 0)
+ return err;
+
+ if (err == 0) {
+ unsigned long nr_expected_pages = PHYS_PFN(end - start);
+
+ VM_WARN_ON(nr_pages != nr_expected_pages);
+ return 0;
+ }
+
+ /*
+ * OK, some of the range has non-guard pages mapped; zap
+ * them. This leaves existing guard pages in place.
+ */
+ zap_page_range_single(vma, start, end - start, NULL);
+ }
+
+ /*
+ * We were unable to install the guard pages due to being raced by page
+ * faults. This should not happen ordinarily. We return to userspace and
+ * immediately retry, relieving lock contention.
+ */
+ return restart_syscall();
+}
+
+static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pud_t pudval = pudp_get(pud);
+
+ /* If huge, cannot have guard pages present, so no-op - skip. */
+ if (pud_trans_huge(pudval) || pud_devmap(pudval))
+ walk->action = ACTION_CONTINUE;
+
+ return 0;
+}
+
+static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pmd_t pmdval = pmdp_get(pmd);
+
+ /* If huge, cannot have guard pages present, so no-op - skip. */
+ if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
+ walk->action = ACTION_CONTINUE;
+
+ return 0;
+}
+
+static int guard_remove_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t ptent = ptep_get(pte);
+
+ if (is_guard_pte_marker(ptent)) {
+ /* Simply clear the PTE marker. */
+ pte_clear_not_present_full(walk->mm, addr, pte, false);
+ update_mmu_cache(walk->vma, addr, pte);
+ }
+
+ return 0;
+}
+
+static const struct mm_walk_ops guard_remove_walk_ops = {
+ .pud_entry = guard_remove_pud_entry,
+ .pmd_entry = guard_remove_pmd_entry,
+ .pte_entry = guard_remove_pte_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static long madvise_guard_remove(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ *prev = vma;
+ /*
+ * We're ok with removing guards in mlock()'d ranges, as this is a
+ * non-destructive action.
+ */
+ if (!is_valid_guard_vma(vma, /* allow_locked = */true))
+ return -EINVAL;
+
+ return walk_page_range(vma->vm_mm, start, end,
+ &guard_remove_walk_ops, NULL);
+}
+
/*
* Apply an madvise behavior to a region of a vma. madvise_update_vma
* will handle splitting a vm area into separate areas, each area with its own
@@ -1098,6 +1314,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
break;
case MADV_COLLAPSE:
return madvise_collapse(vma, prev, start, end);
+ case MADV_GUARD_INSTALL:
+ return madvise_guard_install(vma, prev, start, end);
+ case MADV_GUARD_REMOVE:
+ return madvise_guard_remove(vma, prev, start, end);
}
anon_name = anon_vma_name(vma);
@@ -1197,6 +1417,8 @@ madvise_behavior_valid(int behavior)
case MADV_DODUMP:
case MADV_WIPEONFORK:
case MADV_KEEPONFORK:
+ case MADV_GUARD_INSTALL:
+ case MADV_GUARD_REMOVE:
#ifdef CONFIG_MEMORY_FAILURE
case MADV_SOFT_OFFLINE:
case MADV_HWPOISON:
@@ -1490,6 +1712,23 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
while (iov_iter_count(iter)) {
ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
iter_iov_len(iter), behavior);
+ /*
+ * An madvise operation is attempting to restart the syscall,
+ * but we cannot proceed as it would not be correct to repeat
+ * the operation in aggregate, and would be surprising to the
+ * user.
+ *
+ * As we have already dropped locks, it is safe to just loop and
+ * try again. We check for fatal signals in case we need to exit
+ * early anyway.
+ */
+ if (ret == -ERESTARTNOINTR) {
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ continue;
+ }
if (ret < 0)
break;
iov_iter_advance(iter, iter_iov_len(iter));
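
For reference (not part of the patch itself), here is a minimal userspace sketch of how the new advice values might be exercised: guarding the tail page of an anonymous mapping and later removing the marker. The numeric fallbacks for MADV_GUARD_INSTALL/MADV_GUARD_REMOVE below are assumptions; the authoritative definitions live in the UAPI change accompanying this patch (<asm-generic/mman-common.h>).

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_GUARD_INSTALL
#define MADV_GUARD_INSTALL 102	/* assumed value - check the UAPI headers */
#endif
#ifndef MADV_GUARD_REMOVE
#define MADV_GUARD_REMOVE 103	/* assumed value - check the UAPI headers */
#endif

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	/* Private anonymous, writable: satisfies is_valid_guard_vma(). */
	char *buf = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Install guard markers over the last page; touching it now raises SIGSEGV. */
	if (madvise(buf + 3 * page, page, MADV_GUARD_INSTALL)) {
		perror("madvise(MADV_GUARD_INSTALL)");
		return 1;
	}

	memset(buf, 0, 3 * page);	/* the unguarded pages remain usable */

	/* Remove the markers so the page can be faulted in again. */
	if (madvise(buf + 3 * page, page, MADV_GUARD_REMOVE)) {
		perror("madvise(MADV_GUARD_REMOVE)");
		return 1;
	}
	buf[3 * page] = 1;		/* now succeeds */

	munmap(buf, 4 * page);
	return 0;
}

Note that although the install path may return -ERESTARTNOINTR internally, restart_syscall() makes the retry transparent, so userspace never observes that error.
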
diff --git a/mm/mseal.c b/mm/mseal.c
index ece977bd21e1..81d6e980e8a9 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -30,6 +30,7 @@ static bool is_madv_discard(int behavior)
case MADV_REMOVE:
case MADV_DONTFORK:
case MADV_WIPEONFORK:
+ case MADV_GUARD_INSTALL:
return true;
}
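
The mm/mseal.c hunk above classifies MADV_GUARD_INSTALL as a discard-style operation, so it is expected to be refused (-EPERM) on a sealed, non-writable anonymous mapping. A hedged sketch of that interaction follows; the __NR_mseal number and the madvise constant are assumptions for illustration only.

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MADV_GUARD_INSTALL
#define MADV_GUARD_INSTALL 102	/* assumed value - check the UAPI headers */
#endif
#ifndef __NR_mseal
#define __NR_mseal 462		/* assumed syscall number (x86-64) */
#endif

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	/* Read-only anonymous mapping: discard advice is refused once sealed. */
	char *buf = mmap(NULL, page, PROT_READ,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	if (syscall(__NR_mseal, buf, page, 0)) {
		perror("mseal");	/* kernel without mseal support */
		return 1;
	}

	/* Expected to fail with EPERM now that the mapping is sealed. */
	if (madvise(buf, page, MADV_GUARD_INSTALL) == -1 && errno == EPERM)
		puts("guard install refused on sealed mapping, as expected");

	return 0;
}
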